-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathconfig_file_example
75 lines (56 loc) · 5.46 KB
/
config_file_example
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# ---------------- Global parameters ----------------
Ensembl_FTP_URL: ftp.ensembl.org/pub/release-108/
annotationFilename: Homo_sapiens.GRCh38.108.chr_patch_hapl_scaff
ensembl_release: 108
contaminants_fasta: "crap.fasta" # Path to the FASTA file with contaminant sequences
custom_transcript_list: "data/transcripts_reference_108.csv" # Path to the CSV file containing transcript IDs to be included in the datavase. Give an empty string ("") to ignore.
included_transcript_biotypes: "protein_coding" # Comma-separated list of transcript biotypes to be included (e.g. "protein_coding,retained_intron,protein_coding_LoF"). Ignored when custom_transcript_list is given. Default: "protein_coding". Refer to Gencode for biotype names: https://www.gencodegenes.org/pages/biotypes.html
only_MANE_select: False # boolean value - include only MANE Select transcripts? (This will exclude most biotypes other than protein coding transcripts.)
use_ProHap: True # boolean value - True or False - whether or not to compute haplotypes (run ProHap). Default: True
use_ProVar: False # boolean value - True or False - whether or not to compute individual variants (run ProVar). Default: False
final_fasta_file: <resulting fasta filename>
fasta_simplify_headers: False # boolean value - extract information from the FASTA headers into a separate file. Default: False
# ---------------- ProVar parameters ----------------
# The variant_vcf parameter contains a dictionary of objects describing different data sources of variants to be considered individually (not in haplotypes). Give an empty object if no extra variants are included.
# Dictionary is structured as follows:
# <dataset_name> : {
# file: <filename, expecting VCF format>,
# fasta_accession_prefix: <accession prefix for these variants in the resulting fasta file>,
# min_af: <minimal required minor allele frequency, specify -1 to include alleles without AF information>
# }
#
# Example of usage:
# variant_vcf:
# EnsemblVar: { file: "data/ensembl_vcf/homo_sapiens_incl_consequences.vcf", fasta_accession_prefix: "ensvar", min_af: 0.01 }
# RareVar: { file: "data/inhouse_vcf/inhouse_variation.vcf", fasta_accession_prefix: "rarevar", min_af: -1 }
variant_vcf: {}
working_dir_name_var: variants_tmp # directory for temp. files where individual variants will be processed - default: variants_tmp
var_require_start: 1 # Require annotation of canonical start codon for ProVar? 0 or 1 - default: 1
var_fasta_file: <filename - fasta with variant sequences, before any downstream optimization>
var_cdna_file: <filename - fasta with all variant cDNA sequences before translation (optional, give empty string ("") to skip)
var_table_file: <filename - supplementary table for variants>
# Option to add existing haplotype database to the output of ProVar
add_existing_haplo: False # boolean value - True or False
haplo_added_table: <path to the haplotype table file (one of the F2 files in the Zenodo repository)>
haplo_added_fasta: <path to the fasta file (one of the F3 files in the Zenodo repository)>
# ---------------- ProHap parameters ----------------
phased_FTP_URL: http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/
phased_local_path: "" # if not usinga data set available online, give the path to the directory containing all the VCF files here (otherwise give empty string: "")
phased_vcf_file_name: "ALL.chr{chr}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf"
sample_metadata_file: "igsr_samples.tsv" # samples metadata (see https://github.com/ProGenNo/ProHap/wiki/Input-&-Usage#prohap for details)
working_dir_name_haplo: haplotypes_tmp # directory for temp. files where haplotypes will be processed - default: haplotypes_tmp
haplo_min_freq: -1 # minimal accepted value of haplotype frequency (haplotypes below threshold will be discarded), specify -1 to use haplotype count instead. Default: -1
haplo_min_count: 10 # minimal accepted count of occurrences for haplotypes (haplotypes below threshold will be discarded). Default: 10
phased_min_af: 0.01 # minimal minor allele frequency (AF) threshold for 1000 Genomes Project variants. Default: 0.01
phased_af_field: "AF" # allele frequency field for thresholding - change if you want to use the frequency in a specific population within 1000 Genomes, or according to your own file
x_par1_to: 2781479 # End position of the first pseudo-autosomal region (PAR) on the X chromosome
x_par2_from: 155701383 # Start position of the second pseudo-autosomal region on the X chromosome
# parameters
haplo_require_start: 1 # Require annotation of canonical start codon for ProHap? 0 or 1 - default: 1
haplo_ignore_UTR: 1 # Ignore variation in UTRs in ProHap? 0 or 1. If 0, UTR sequences are still removed in the final optimized database, but retained in the haplotypes FASTA. - default: 1
haplo_skip_start_lost: 1 # Ignore haplotypes where the start codon is lost? 0 or 1. If 0, haplotype cDNA sequences are translated in 3 reading frames including UTR sequences. - default: 1
max_cores: 3 # Maximum number of cores used per chromosome, default: 3
# output files
haplo_fasta_file: <filename - fasta with haplotype sequences, before any downstream optimization>
haplo_cdna_file: <filename - fasta with all haplotype cDNA sequences before translation (optional, give empty string ("") to skip)
haplo_table_file: <filename - supplementary table for haplotypes>