# config_ase.yaml.example

# bin_dir is the directory containing the software binaries used in this pipeline:
#   trimgalore (version 0.6.5 with cutadapt (version 2.1))
#   tabix and bgzip (version 0.2.5)
#   STAR (version 2.6.0c)
#   GenomeAnalysisTK.jar (version 4.1.2.0)
#   samtools (version 1.9)
#   md5sum
#   phASER (version downloaded 23-03-2020) with:
#       python2.7 and the associated SciPy and NumPy libraries
#       bgzip (version 0.2.5, cf. tabix)
#       bcftools (version 1.9)

# if bin_dir is not provided, the binaries need to be available in the PATH
bin_dir : bin

# fasta_ref file is the masked genome reference FASTA file (it needs to be indexed with samtools faidx and picard CreateSequenceDictionary)
fasta_ref : data/reference.fa

# gtf_ref file is the genome reference GTF file
gtf_ref : data/reference.gtf

# VCF input files to analyse (including SNP only). The VCF files need to be indexed by tabix
vcf : data/variants.vcf.gz

# sample_config file is a tabular file describing the input fastq file(s) of each sample included in the VCF file above.
# header columns are:
#   idx  name  forward_read  reverse_read  sequencer read_length phred_scale   group
#
# forward_read and reverse_read are fastq file names. These files need to be in the data_dir directory.
# samples may be paired-end or single-end.
#       - If single-end, leave the reverse_read column empty
#       - If paired-end, file names need to end with _R1.fastq.gz or _R2.fastq.gz (or fq instead of fastq, and not necessarily compressed)
# sequencer needs to be chosen from the list: ["ILLUMINA","SLX","SOLEXA","SOLID","454","LS454","COMPLETE","PACBIO","IONTORRENT","CAPILLARY","HELICOS","UNKNOWN"]
# phred_scale indicates the Phred score scale used to encode base quality: either 33 (Sanger and Illumina 1.8+) or 64 (Solexa, Illumina 1.3 to 1.8 excluded)
sample_config : data/population.tsv.example

# data_dir contains fastq files described in sample_config file
data_dir : data

# Filter on variant type if not already done (True or False)
SNP_filter : True
# minimum calling depth
depth : 10
# minimum proportion (between 0 and 1) of samples with a known genotype
GTpopCR_th : 0.50
# minimum proportion (between 0 and 1) of samples with a known genotype and DP > 5
DPgt05rPopCR_th : 0.20
# minimum base quality
baseQuality : 20
# minimum mapping quality
mappingQuality : 10

# phASER option:
# Separator to use when generating unique IDs. It must not occur in any reference contig name and cannot include ':'. Default: "_"
id_separator : "" 

# computing resources; this file is also given to the snakemake --cluster-config option when executed on a cluster
# this YAML file defines default resources (at least mem and cpu) in a __default__ section, and rule-specific resources (all of them or only one) for any rule that differs from the default
resources: resources_SLURM.yaml