Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# bin_dir: directory containing the software binaries used in this pipeline:
# trimgalore (version 0.4.5, with cutadapt version 1.14)
# tabix and bgzip (version 0.2.5)
# STAR (version 2.6.0c)
# GenomeAnalysisTK.jar (version 3.8.1)
# bedtools (version 2.27.1)
# picard.jar (version 2.1.1)
# samtools (version 1.8)
# md5sum
# phASER (version downloaded 07-06-2019), with:
# python2.7 and the associated SciPy and NumPy libraries
# bgzip (version 0.2.5, cf. tabix)
# bcftools (version 1.8)
# If bin_dir is not provided, the binaries need to be available in the PATH.
bin_dir: bin
# fasta_ref: the masked genome reference FASTA file. It needs to be indexed
# with samtools faidx and picard CreateSequenceDictionary.
fasta_ref: data/reference.fa
# gtf_ref: the genome reference GTF file.
gtf_ref: data/reference.gtf
# vcf: input VCF file(s) to analyse (including SNPs only).
# The VCF file(s) need to be indexed with tabix.
vcf: data/variants.vcf.gz
# sample_config: tabular file describing the input fastq file(s) of each
# sample included in the VCF file given above.
# Header columns are:
# idx name forward_read reverse_read sequencer read_length phred_scale group
#
# forward_read and reverse_read are fastq file names. These files need to be
# in the data_dir directory.
# A sample may be paired-end or single-end:
# - if single-end, leave the reverse_read column empty
# - if paired-end, file names need to end with _R1.fastq.gz or _R2.fastq.gz
#   (or fq instead of fastq, and not necessarily compressed)
# sequencer needs to be chosen from the list: ["ILLUMINA","SLX","SOLEXA","SOLID","454","LS454","COMPLETE","PACBIO","IONTORRENT","CAPILLARY","HELICOS","UNKNOWN"]
# phred_scale indicates the Phred score scale used to encode base quality:
# either 33 (Sanger and Illumina 1.8+) or 64 (Solexa, Illumina 1.3 to 1.8 excluded)
sample_config: data/population.tsv.example
# data_dir: directory containing the fastq files described in the
# sample_config file.
data_dir: data
# SNP_filter: filter on variant type if not already done (true or false).
SNP_filter: true
# depth: minimum calling depth.
depth: 10
# GTpopCR_th: minimum fraction (between 0 and 1) of samples with a known
# genotype.
GTpopCR_th: 0.50
# DPgt05rPopCR_th: minimum fraction (between 0 and 1) of samples with a known
# genotype and DP > 5.
DPgt05rPopCR_th: 0.20
# baseQuality: minimum base quality.
baseQuality: 20
# mappingQuality: minimum mapping quality.
mappingQuality: 10
# phASER option:
# id_separator: separator to use when generating unique IDs. It must not be
# found in any reference contig name, and cannot include ':'. Default: "_".
# NOTE(review): the value below is an empty string, not the documented
# default "_" - confirm this is intentional.
id_separator: ""
# resources: computing resources file, also given to the snakemake
# --cluster-config option when executed on a cluster.
# This YAML file defines default resources (mem and cpu at least) in a
# __default__ section, and specific resources (either all of them or a single
# one) for any particular rule that differs from the default.
resources: resources_SLURM.yaml