Commit 9a561fc7 authored by Jacques Lagnel's avatar Jacques Lagnel
Browse files

update Snakefile

parent 72f243be
......@@ -67,7 +67,7 @@ rule final_outs:
input:
#expand("{outdir}/fastp/{sample}_merged.fastq.gz", outdir=config["outdir"], sample=samples['SampleName']),
#expand("{outdir}/mapped/{sample}_sorted.bam", outdir=config["outdir"], sample=samples['SampleName']),
"{outdir}/multiqc/multiqc_report_fastqc.html".format(outdir=config["outdir"]),
#"{outdir}/multiqc/multiqc_report_fastqc.html".format(outdir=config["outdir"]),
"{outdir}/multiqc/multiqc_report_bam.html".format(outdir=config["outdir"]),
#expand("{outdir}/variant/gatk_{mychr}.vcf.gz", outdir=config["outdir"], mychr=CHRS),
#expand("{outdir}/variant/gatk_gvcf/{sample}-{mychr}.g.vcf.gz",outdir=config["outdir"], sample=samples['SampleName'], mychr=CHRS),
......@@ -406,9 +406,11 @@ rule gatk4_hc:
input:
#"{outdir}/variant/chrslist.OK".format(outdir=config["outdir"]),
bam = "{outdir}/mapped/{{sample}}_sorted.bam".format(outdir=config["outdir"]),
bambai = "{outdir}/mapped/{{sample}}_sorted.bam.bai".format(outdir=config["outdir"]),
refdict = "{refp}/{ref}.dict".format(refp=config["REFPATH"],ref=config["GENOME"]),
output:
gvcf="{outdir}/variant/gatk_gvcf/{{sample}}-{{mychr}}.g.vcf.gz".format(outdir=config["outdir"])
gvcf="{outdir}/variant/gatk_gvcf/{{sample}}-{{mychr}}.g.vcf.gz".format(outdir=config["outdir"]),
gtbi="{outdir}/variant/gatk_gvcf/{{sample}}-{{mychr}}.g.vcf.gz.tbi".format(outdir=config["outdir"])
params:
ch="{mychr}",
ref = config["REFPATH"] + "/" + config["GENOME"],
......@@ -419,7 +421,7 @@ rule gatk4_hc:
shell:
"""
singularity exec {params.bind} {params.gatk4_bin} \
gatk --java-options '-Xmx24G -XX:ParallelGCThreads={threads}' HaplotypeCaller \
gatk --java-options '-Xmx8G -XX:ParallelGCThreads={threads}' HaplotypeCaller \
--reference {params.ref} \
--input {input.bam} \
-ERC GVCF -L {params.ch} \
......@@ -431,7 +433,9 @@ rule gatk4_hc:
rule gatk4_gvcf_map_file:
input:
gvcfs = expand("{outdir}/variant/gatk_gvcf/{sample}-{{mychr}}.g.vcf.gz", outdir=config["outdir"], sample=samples['SampleName']),
gtbi = expand("{outdir}/variant/gatk_gvcf/{sample}-{{mychr}}.g.vcf.gz.tbi", outdir=config["outdir"], sample=samples['SampleName']),
fai = "{refp}/{ref}".format(refp=config["REFPATH"],ref=config["GENOME_FAI"]),
refdict = "{refp}/{ref}.dict".format(refp=config["REFPATH"],ref=config["GENOME"]),
output:
gvcfmap = "{outdir}/variant/gvcf_{{mychr}}_list.map".format(outdir=config["outdir"]),
params:
......@@ -453,6 +457,7 @@ rule gatk4_gvcf_map_file:
rule gatk4_gdb:
input:
gvcf = expand("{outdir}/variant/gatk_gvcf/{sample}-{{mychr}}.g.vcf.gz",outdir=config["outdir"],sample=samples['SampleName']),
gtbi = expand("{outdir}/variant/gatk_gvcf/{sample}-{{mychr}}.g.vcf.gz.tbi",outdir=config["outdir"],sample=samples['SampleName']),
gvcfmap = "{outdir}/variant/gvcf_{{mychr}}_list.map".format(outdir=config["outdir"]),
output:
"{outdir}/variant/gatk_genomicsdb_{{mychr}}.ok".format(outdir=config["outdir"])
......@@ -464,12 +469,12 @@ rule gatk4_gdb:
bind = config["BIND"],
samtools_bin = config["samtools_bin"],
gatk4_bin = config["gatk4_bin"]
threads: 10
threads: 1
shell:
"""
mkdir -p {params.tempdir}
singularity exec {params.bind} {params.gatk4_bin} \
gatk --java-options '-Xmx8g -Xms8g' GenomicsDBImport \
gatk --java-options '-Xmx16g -Xms16g' GenomicsDBImport \
--genomicsdb-workspace-path {params.gdb} \
--batch-size 200 \
-L {params.ch} \
......@@ -485,7 +490,8 @@ rule gatk4_gc:
input:
gdb="{outdir}/variant/gatk_genomicsdb_{{mychr}}.ok".format(outdir=config["outdir"])
output:
vcf="{outdir}/variant/gatk_{{mychr}}_genotyped.vcf.gz".format(outdir=config["outdir"])
vcf="{outdir}/variant/gatk_{{mychr}}_genotyped.vcf.gz".format(outdir=config["outdir"]),
tbi="{outdir}/variant/gatk_{{mychr}}_genotyped.vcf.gz.tbi".format(outdir=config["outdir"])
# WARNING if {outdir}/variant/gatk_{{mychr}}.vcf.gz is NOT working we need {{mychr}}_something
params:
tempdir=config["outdir"] + "/tmpgatkdir",
......@@ -500,7 +506,7 @@ rule gatk4_gc:
"""
mkdir -p {params.tempdir}
singularity exec {params.bind} {params.gatk4_bin} \
gatk --java-options '-Xmx8G -XX:ParallelGCThreads={threads}' GenotypeGVCFs \
gatk --java-options '-Xmx12G -XX:ParallelGCThreads={threads}' GenotypeGVCFs \
--variant gendb://{params.gdb} \
--reference {params.ref} \
--output {output.vcf} \
......@@ -512,9 +518,11 @@ rule combinevcf:
input:
#"{outdir}/variant/chrslist.OK".format(outdir=config["outdir"]),
vcf=expand("{outdir}/variant/gatk_{mychr}_genotyped.vcf.gz",outdir=config["outdir"],mychr=CHRS),
tbi=expand("{outdir}/variant/gatk_{mychr}_genotyped.vcf.gz.tbi",outdir=config["outdir"],mychr=CHRS),
refdict = config["REFPATH"] + "/" + config["GENOME"] +".dict",
output:
gvcf = "{outdir}/variant/gatk_all.vcf.gz".format(outdir=config["outdir"])
vcf = "{outdir}/variant/gatk_all.vcf.gz".format(outdir=config["outdir"]),
tbi = "{outdir}/variant/gatk_all.vcf.gz.tbi".format(outdir=config["outdir"]),
params:
mlist = expand(" -I {outdir}/variant/gatk_{mychr}_genotyped.vcf.gz",outdir=config["outdir"],mychr=CHRS),
ref = config["REFPATH"] + "/" + config["GENOME"],
......@@ -524,9 +532,9 @@ rule combinevcf:
shell:
"""
singularity exec {params.bind} {params.gatk4_bin} \
gatk --java-options '-Xmx16G' GatherVcfs {params.mlist} -O {output.gvcf}
gatk --java-options '-Xmx16G' GatherVcfs {params.mlist} -O {output.vcf}
singularity exec {params.bind} {params.gatk4_bin} \
gatk --java-options '-Xmx16G' IndexFeatureFile -I {output.gvcf}
gatk --java-options '-Xmx16G' IndexFeatureFile -I {output.vcf}
#-I vcf_list_to_merge.txt
"""
......@@ -535,20 +543,22 @@ rule combinevcf:
# 3-7) select SNPs only
rule gatk4_select_snps_variants:
input:
vcf = "{outdir}/variant/gatk_all.vcf.gz".format(outdir=config["outdir"])
vcf = "{outdir}/variant/gatk_all.vcf.gz".format(outdir=config["outdir"]),
tbi = "{outdir}/variant/gatk_all.vcf.gz.tbi".format(outdir=config["outdir"]),
output:
vcf="{outdir}/variant/gatk_all.raw_snps.vcf.gz".format(outdir=config["outdir"])
vcf="{outdir}/variant/gatk_all.raw_snps.vcf.gz".format(outdir=config["outdir"]),
tbi="{outdir}/variant/gatk_all.raw_snps.vcf.gz.tbi".format(outdir=config["outdir"]),
params:
ref = config["REFPATH"] + "/" + config["GENOME"],
bind = config["BIND"],
samtools_bin = config["samtools_bin"],
gatk4_bin = config["gatk4_bin"]
threads: 10
threads: 1
message: "GATK4 select SNPs (only) variants vcf\n"
shell:
"""
singularity exec {params.bind} {params.gatk4_bin} \
gatk --java-options '-Xmx8G -XX:ParallelGCThreads={threads}' SelectVariants \
gatk --java-options '-Xmx12G -XX:ParallelGCThreads={threads}' SelectVariants \
-select-type SNP \
--variant {input.vcf} \
--reference {params.ref} \
......@@ -576,9 +586,11 @@ rule gatk4_select_snps_variants:
# ReadPosRankSum < -8.0
rule gatk4_filterSnps:
input:
vcf="{outdir}/variant/gatk_all.raw_snps.vcf.gz".format(outdir=config["outdir"])
vcf="{outdir}/variant/gatk_all.raw_snps.vcf.gz".format(outdir=config["outdir"]),
tbi="{outdir}/variant/gatk_all.raw_snps.vcf.gz.tbi".format(outdir=config["outdir"]),
output:
vcf="{outdir}/variant/gatk_all.filtered_snps.vcf.gz".format(outdir=config["outdir"])
vcf="{outdir}/variant/gatk_all.filtered_snps.vcf.gz".format(outdir=config["outdir"]),
tbi="{outdir}/variant/gatk_all.filtered_snps.vcf.gz.tbi".format(outdir=config["outdir"]),
params:
vcftmp ="{outdir}/variant/vcf.snp.tmp.vcf.gz".format(outdir=config["outdir"]),
ref = config["REFPATH"] + "/" + config["GENOME"],
......@@ -592,12 +604,12 @@ rule gatk4_filterSnps:
SOR_filter = config["snp_SOR_filter"],
MQRankSum_filter = config["snp_MQRankSum_filter"],
ReadPosRankSum_filter = config["snp_ReadPosRankSum_filter"]
threads: 10
threads: 1
message: "GATK4 select SNPs (only) variants vcf\n"
shell:
"""
singularity exec {params.bind} {params.gatk4_bin} \
gatk --java-options '-Xmx8G -XX:ParallelGCThreads={threads}' VariantFiltration \
gatk --java-options '-Xmx12G -XX:ParallelGCThreads={threads}' VariantFiltration \
--variant {input.vcf} \
--reference {params.ref} \
--output {params.vcftmp} \
......@@ -612,7 +624,7 @@ rule gatk4_filterSnps:
-filter-name "ReadPosRankSum_filter" -filter "ReadPosRankSum {params.ReadPosRankSum_filter}"
singularity exec {params.bind} {params.gatk4_bin} \
gatk --java-options '-Xmx8G -XX:ParallelGCThreads={threads}' SelectVariants \
gatk --java-options '-Xmx12G -XX:ParallelGCThreads={threads}' SelectVariants \
--variant {params.vcftmp} \
--set-filtered-gt-to-nocall \
--exclude-filtered \
......@@ -633,11 +645,11 @@ rule gatk4_snps_raw_score_table:
bind = config["BIND"],
samtools_bin = config["samtools_bin"],
gatk4_bin = config["gatk4_bin"]
threads: 2
threads: 1
shell:
"""
singularity exec {params.bind} {params.gatk4_bin} \
gatk --java-options '-Xmx8G -XX:ParallelGCThreads={threads}' VariantsToTable \
gatk --java-options '-Xmx12G -XX:ParallelGCThreads={threads}' VariantsToTable \
-V {input.vcf} \
-R {params.ref} \
-F CHROM -F POS -F QUAL -F QD -F DP -F MQ -F MQRankSum -F FS -F ReadPosRankSum -F SOR \
......@@ -656,11 +668,11 @@ rule gatk4_snps_filtered_score_table:
bind = config["BIND"],
samtools_bin = config["samtools_bin"],
gatk4_bin = config["gatk4_bin"]
threads: 2
threads: 1
shell:
"""
singularity exec {params.bind} {params.gatk4_bin} \
gatk --java-options '-Xmx8G -XX:ParallelGCThreads={threads}' VariantsToTable \
gatk --java-options '-Xmx12G -XX:ParallelGCThreads={threads}' VariantsToTable \
-V {input.vcf} \
-R {params.ref} \
-F CHROM -F POS -F QUAL -F QD -F DP -F MQ -F MQRankSum -F FS -F ReadPosRankSum -F SOR \
......@@ -697,12 +709,12 @@ rule gatk4_select_indels_variants:
bind = config["BIND"],
samtools_bin = config["samtools_bin"],
gatk4_bin = config["gatk4_bin"]
threads: 10
threads: 1
message: "GATK4 select INDELs (only) variants vcf\n"
shell:
"""
singularity exec {params.bind} {params.gatk4_bin} \
gatk --java-options '-Xmx8G -XX:ParallelGCThreads={threads}' SelectVariants \
gatk --java-options '-Xmx12G -XX:ParallelGCThreads={threads}' SelectVariants \
-select-type INDEL \
--variant {input.vcf} \
--reference {params.ref} \
......@@ -726,12 +738,12 @@ rule gatk4_filterIndels:
QD_filter = config["indel_QD_filter"],
FS_filter = config["indel_FS_filter"],
ReadPosRankSum_filter = config["indel_ReadPosRankSum_filter"]
threads: 10
threads: 1
message: "GATK4 select INDELs (only) variants vcf\n"
shell:
"""
singularity exec {params.bind} {params.gatk4_bin} \
gatk --java-options '-Xmx8G -XX:ParallelGCThreads={threads}' VariantFiltration \
gatk --java-options '-Xmx12G -XX:ParallelGCThreads={threads}' VariantFiltration \
--variant {input.vcf} \
--reference {params.ref} \
--output {output.vcf} \
......@@ -753,11 +765,11 @@ rule gatk4_indels_raw_score_table:
bind = config["BIND"],
samtools_bin = config["samtools_bin"],
gatk4_bin = config["gatk4_bin"]
threads: 2
threads: 1
shell:
"""
singularity exec {params.bind} {params.gatk4_bin} \
gatk --java-options '-Xmx8G -XX:ParallelGCThreads={threads}' VariantsToTable \
gatk --java-options '-Xmx12G -XX:ParallelGCThreads={threads}' VariantsToTable \
-V {input.vcf} \
-R {params.ref} \
-F CHROM -F POS -F QUAL -F QD -F DP -F MQ -F MQRankSum -F FS -F ReadPosRankSum -F SOR \
......@@ -776,11 +788,11 @@ rule gatk4_indels_filtered_score_table:
bind = config["BIND"],
samtools_bin = config["samtools_bin"],
gatk4_bin = config["gatk4_bin"]
threads: 2
threads: 1
shell:
"""
singularity exec {params.bind} {params.gatk4_bin} \
gatk --java-options '-Xmx8G -XX:ParallelGCThreads={threads}' VariantsToTable \
gatk --java-options '-Xmx12G -XX:ParallelGCThreads={threads}' VariantsToTable \
-V {input.vcf} \
-R {params.ref} \
-F CHROM -F POS -F QUAL -F QD -F DP -F MQ -F MQRankSum -F FS -F ReadPosRankSum -F SOR \
......
......@@ -302,15 +302,16 @@
"multiqc_bam" :
{
"time" : "12:00:00",
"cores" : 2,
"mem" : "4g"
"mem" : "6g"
},
"bams_list" :
{
"time" : "00:05:00",
"cores" : 1,
"mem" : "1g"
"time" : "00:10:00",
"cores" : 1,
"mem" : "2g"
},
"gatk4_hc" :
......@@ -325,15 +326,15 @@
"gatk4_combine_calls" :
{
"time" : "48:00:00",
"cores" : 10,
"mem" : "10g"
"cores" : 1,
"mem" : "16g"
},
"gatk4_gdb" :
{
"jobname" : "{rule}_{wildcards.mychr}",
"time" : "48:00:00",
"cores" : 10,
"cores" : 1,
"mem" : "16g",
"out" : "{rule}_{wildcards.mychr}.out"
},
......@@ -341,9 +342,9 @@
"gatk4_gc" :
{
"jobname" : "{rule}_{wildcards.mychr}",
"time" : "92:00:00",
"cores" : 2,
"mem" : "10g",
"time" : "72:00:00",
"cores" : 1,
"mem" : "12g",
"out" : "{rule}_{wildcards.mychr}.out"
},
......@@ -356,50 +357,50 @@
"combinevcf" :
{
"time" : "32:00:00",
"cores" : 2,
"cores" : 1,
"mem" : "16g"
},
"gatk4_select_snps_variants" :
{
"time" : "08:00:00",
"cores" : 10,
"mem" : "10g"
"time" : "12:00:00",
"cores" : 1,
"mem" : "12g"
},
"gatk4_select_indels_variants" :
{
"time" : "08:00:00",
"cores" : 10,
"mem" : "10g"
"time" : "12:00:00",
"cores" : 1,
"mem" : "12g"
},
"gatk4_filterSnps" :
{
"time" : "08:00:00",
"cores" : 2,
"mem" : "10g"
"time" : "24:00:00",
"cores" : 1,
"mem" : "12g"
},
"gatk4_filterIndels" :
{
"time" : "08:00:00",
"cores" : 2,
"mem" : "10g"
"time" : "12:00:00",
"cores" : 1,
"mem" : "12g"
},
"gatk4_snps_quality_graphe_pdf" :
{
"time" : "08:00:00",
"cores" : 2,
"time" : "12:00:00",
"cores" : 1,
"mem" : "32g"
},
"gatk4_indels_quality_graphe_pdf" :
{
"time" : "08:00:00",
"time" : "12:00:00",
"cores" : 1,
"mem" : "10g"
"mem" : "16g"
}
}
......
# path or URL to sample sheet (TSV format(tab , no quote), columns: SampleName fq1 fq2 ...)
samplesfile : "samples.csv"
samplesfile : "Reads_melo_chinois_epgv.csv"
fq_dir : "/work2/project/redd_vat_phyton/DurArmel/reads_melo_chinois"
outdir : "/work2/project/redd_vat_phyton/DurArmel/analysis_chrparfull/out"
GENOME : "Harukei3_pseudomol_v1.41.fasta"
GENOME_FAI : "Harukei3_pseudomol_v1.41.fasta.fai"
REFPATH : "/work2/project/redd_vat_phyton/DurArmel/ref_genomes"
outdir : "/work2/project/redd_vat_phyton/DurArmel/analysis_payzawat/out"
GENOME : "GCA_009760825.1_ASM976082v1_genomic_chr0_chr12.fasta"
GENOME_FAI : "GCA_009760825.1_ASM976082v1_genomic_chr0_chr12.fasta.fai"
REFPATH : "/work2/project/redd_vat_phyton/DurArmel/analysis_payzawat/ref_payzawat"
# adapter in fasta 4 cutadapt if used
cutadapt_adapters : "cutadapt_adapters.fas"
......
#!/bin/bash
#SBATCH --job-name=filt_10r # job name (-J)
#SBATCH --time="12:00:00" #max run time "hh:mm:ss" or "dd-hh:mm:ss" (-t)
#SBATCH --time="48:00:00" #max run time "hh:mm:ss" or "dd-hh:mm:ss" (-t)
#SBATCH --cpus-per-task=1 # max nb of cores (-c)
#SBATCH --ntasks=1 #nb of tasks
#SBATCH --mem=16G # max memory (-m)
......@@ -8,7 +8,7 @@
module load system/singularity-3.5.3
export SINGULARITY_BINDPATH="/work2/project/gafl"
export SINGULARITY_BINDPATH="/work2/project/redd_vat_phyton,/work2/project/gafl"
#######################################################################33
# filter the orginal vcf with SNPs only
......@@ -32,7 +32,6 @@ export SINGULARITY_BINDPATH="/work2/project/gafl"
#singularity exec /work2/project/gafl/tools/containers/bcftools_v1.11.sif bgzip -c ${invcf}.vcf >${invcf}.vcf.gz
#singularity exec /work2/project/gafl/tools/containers/bcftools_v1.11.sif tabix -f -p vcf ${invcf}.vcf.gz
outdir="/work2/project/gafl/DADI/ABRIxG/new_v3.1_marouch/persica_v2.0.a1/out"
# genome used for some algo
ref="/work2/project/gafl/Data/Rosaceae/Prunus_armeniaca/DNA/Ref_Genome/Marouch/marouch_v31/marouch_v31_genome.fasta"
......@@ -44,14 +43,14 @@ BCFTOOLS='/work2/project/gafl/tools/containers/bcftools_v1.11.sif'
# vcf input without .vcf.gz
invcf=gatk_all.filtered_snps
MINDP=3
MAXMISSING=0.20
MINDP=5
MAXMISSING=0.15
MAF=0.05
# 1) -------- we filter the genotype with DP <10 --------------------
$GATK VariantFiltration \
--variant ${outdir}/variant/${invcf}.vcf.gz \
--variant out/variant/${invcf}.vcf.gz \
--genotype-filter-expression "DP < $MINDP" \
--genotype-filter-name "LowDP$MINDP" \
--output tmp1a.vcf.gz
......@@ -64,14 +63,14 @@ $GATK SelectVariants \
--exclude-filtered \
--restrict-alleles-to BIALLELIC \
--output tmp2a.vcf.gz
rm -f tmp1a.vcf.gz
rm -f tmp1a.vcf.gz*
$GATK SelectVariants \
--variant tmp2a.vcf.gz \
--set-filtered-gt-to-nocall \
--exclude-filtered \
--output snps_minr${MINDP}.biallelic.vcf.gz
rm -f tmp2a.vcf.gz
rm -f tmp2a.vcf.gz*
# 3) ------------- keep only SNP with max 0.20 missing genotypes --------------
......@@ -101,6 +100,20 @@ $GATK SelectVariants \
--output snps_minr${MINDP}.biallelic.maxmissing${MAXMISSING}.maf${MAF}p.wonovariant.vcf.gz
# general stats
#without extension (.vcf.gz)
vcf=snps_minr${MINDP}.biallelic.maxmissing${MAXMISSING}.maf${MAF}p
$GATK VariantsToTable \
-V ${vcf}.vcf.gz \
-F CHROM -F POS -F REF -F ALT -F QUAL -F HET -F HOM-REF -F HOM-VAR -F NO-CALL -F TYPE -F VAR -F NSAMPLES -F NCALLED -F MULTI-ALLELIC \
-O ${vcf}.VariantsToTable.csv
#all
$BCFTOOLS stats -v -s - ${vcf}.vcf.gz >${vcf}.bcftools.stats.all.txt
# SNPs count
echo
......@@ -118,17 +131,6 @@ zcat snps_minr${MINDP}.biallelic.maxmissing${MAXMISSING}.maf${MAF}p.vcf.gz|grep
echo
# general stats
#without extension (.vcf.gz)
vcf=snps_minr${MINDP}.biallelic.maxmissing${MAXMISSING}.maf${MAF}p
$GATK VariantsToTable \
-V ${vcf}.vcf.gz \
-F CHROM -F POS -F REF -F ALT -F QUAL -F HET -F HOM-REF -F HOM-VAR -F NO-CALL -F TYPE -F VAR -F NSAMPLES -F NCALLED -F MULTI-ALLELIC \
-O ${vcf}.VariantsToTable.csv
#all
$BCFTOOLS stats -v -s - ${vcf}.vcf.gz >${vcf}.bcftools.stats.all.txt
exit 0
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment