config_file_example

# ---------------- Global parameters ----------------

Ensembl_FTP_URL: ftp.ensembl.org/pub/release-108/
annotationFilename: Homo_sapiens.GRCh38.108.chr_patch_hapl_scaff
ensembl_release: 108

contaminants_fasta: "crap.fasta"      # Path to the FASTA file with contaminant sequences

custom_transcript_list: "data/transcripts_reference_108.csv"  # Path to the CSV file containing transcript IDs to be included in the datavase. Give an empty string ("") to ignore.
included_transcript_biotypes: "protein_coding"                # Comma-separated list of transcript biotypes to be included (e.g. "protein_coding,retained_intron,protein_coding_LoF"). Ignored when custom_transcript_list is given. Default: "protein_coding". Refer to Gencode for biotype names: https://www.gencodegenes.org/pages/biotypes.html
only_MANE_select: False                                       # boolean value - include only MANE Select transcripts? (This will exclude most biotypes other than protein coding transcripts.)

use_ProHap: True # boolean value - True or False - whether or not to compute haplotypes (run ProHap). Default: True
use_ProVar: False  # boolean value - True or False - whether or not to compute individual variants (run ProVar). Default: False

final_fasta_file: <resulting fasta filename>
fasta_simplify_headers: False   # boolean value - extract information from the FASTA headers into a separate file. Default: False

# ---------------- ProVar parameters ----------------

# The variant_vcf parameter contains a dictionary of objects describing different data sources of variants to be considered individually (not in haplotypes). Give an empty object if no extra variants are included.
# Dictionary is structured as follows:
# <dataset_name> : { 
#   file: <filename, expecting VCF format>, 
#   fasta_accession_prefix: <accession prefix for these variants in the resulting fasta file>, 
#   min_af: <minimal required minor allele frequency, specify -1 to include alleles without AF information>   
# }
#
# Example of usage:
# variant_vcf:
#   EnsemblVar: { file: "data/ensembl_vcf/homo_sapiens_incl_consequences.vcf", fasta_accession_prefix: "ensvar", min_af: 0.01 }
#   RareVar: { file: "data/inhouse_vcf/inhouse_variation.vcf", fasta_accession_prefix: "rarevar", min_af: -1 }

variant_vcf: {}

working_dir_name_var: variants_tmp      # directory for temp. files where individual variants will be processed - default: variants_tmp

var_require_start: 1      # Require annotation of canonical start codon for ProVar? 0 or 1 - default: 1

var_fasta_file: <filename - fasta with variant sequences, before any downstream optimization>
var_cdna_file: <filename - fasta with all variant cDNA sequences before translation (optional, give empty string ("") to skip)
var_table_file: <filename - supplementary table for variants>

# Option to add existing haplotype database to the output of ProVar
add_existing_haplo: False # boolean value - True or False
haplo_added_table: <path to the haplotype table file (one of the F2 files in the Zenodo repository)> 
haplo_added_fasta: <path to the fasta file (one of the F3 files in the Zenodo repository)> 

# ---------------- ProHap parameters ----------------

phased_FTP_URL: http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/
phased_local_path: "" # if not usinga data set available online, give the path to the directory containing all the VCF files here (otherwise give empty string: "")
phased_vcf_file_name: "ALL.chr{chr}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf"
sample_metadata_file: "igsr_samples.tsv"  # samples metadata (see https://github.com/ProGenNo/ProHap/wiki/Input-&-Usage#prohap for details)

working_dir_name_haplo: haplotypes_tmp  # directory for temp. files where haplotypes will be processed - default: haplotypes_tmp

haplo_min_freq: -1      # minimal accepted value of haplotype frequency (haplotypes below threshold will be discarded), specify -1 to use haplotype count instead. Default: -1
haplo_min_count: 10     # minimal accepted count of occurrences for haplotypes (haplotypes below threshold will be discarded). Default: 10
phased_min_af: 0.01       # minimal minor allele frequency (AF) threshold for 1000 Genomes Project variants. Default: 0.01
phased_af_field: "AF"     # allele frequency field for thresholding - change if you want to use the frequency in a specific population within 1000 Genomes, or according to your own file

x_par1_to: 2781479        # End position of the first pseudo-autosomal region (PAR) on the X chromosome
x_par2_from: 155701383    # Start position of the second pseudo-autosomal region on the X chromosome

# parameters
haplo_require_start: 1    # Require annotation of canonical start codon for ProHap? 0 or 1 - default: 1
haplo_ignore_UTR: 1       # Ignore variation in UTRs in ProHap? 0 or 1. If 0, UTR sequences are still removed in the final optimized database, but retained in the haplotypes FASTA. - default: 1
haplo_skip_start_lost: 1  # Ignore haplotypes where the start codon is lost? 0 or 1. If 0, haplotype cDNA sequences are translated in 3 reading frames including UTR sequences. - default: 1
max_cores: 3              # Maximum number of cores used per chromosome, default: 3

# output files
haplo_fasta_file: <filename - fasta with haplotype sequences, before any downstream optimization>
haplo_cdna_file: <filename - fasta with all haplotype cDNA sequences before translation (optional, give empty string ("") to skip)
haplo_table_file: <filename - supplementary table for haplotypes>