Commit 92cf6855 authored by Jacques Lagnel's avatar Jacques Lagnel
Browse files

pipeline correction split chr

parent 40b7f2c0
......@@ -58,7 +58,6 @@ def get_fql2(wildcards):
return ml
####################################################################################
############################ filan rule ############################################
####################################################################################
......@@ -67,7 +66,7 @@ rule final_outs:
input:
#expand("{outdir}/fastp/{sample}_merged.fastq.gz", outdir=config["outdir"], sample=samples['SampleName']),
#expand("{outdir}/mapped/{sample}_sorted.bam", outdir=config["outdir"], sample=samples['SampleName']),
#"{outdir}/multiqc/multiqc_report_fastqc.html".format(outdir=config["outdir"]),
"{outdir}/multiqc/multiqc_report_fastqc.html".format(outdir=config["outdir"]),
"{outdir}/multiqc/multiqc_report_bam.html".format(outdir=config["outdir"]),
#expand("{outdir}/variant/gatk_{mychr}.vcf.gz", outdir=config["outdir"], mychr=CHRS),
#expand("{outdir}/variant/gatk_gvcf/{sample}-{mychr}.g.vcf.gz",outdir=config["outdir"], sample=samples['SampleName'], mychr=CHRS),
......@@ -95,7 +94,6 @@ rule fastp_pe:
R2 = "{outdir}/fastp/{{sample}}_2_trim.fastq.gz".format(outdir=config["outdir"])
message: "Running fastp on files \n"
params:
spl = config["outdir"]+"/fastp/{sample}",
fqdir = config["fq_dir"],
outdir = config["outdir"],
modules = config["MODULES"],
......@@ -106,13 +104,10 @@ rule fastp_pe:
html = config["outdir"]+"/fastp/{sample}_trim.html"
shell:
"""
date
cat {input.R1} >{params.spl}.r1.fq.tmp.gz
date
cat {input.R2} >{params.spl}.r2.fq.tmp.gz
singularity exec {params.bind} {params.fastp_bin} fastp \
-i {params.spl}.r1.fq.tmp.gz \
-I {params.spl}.r2.fq.tmp.gz \
--stdin \
-i <(zcat {input.R1}) \
-I <(zcat {input.R2}) \
-o {output.R1} \
-O {output.R2} \
-h {params.html} \
......@@ -125,8 +120,6 @@ rule fastp_pe:
--low_complexity_filter \
--complexity_threshold 30 \
-w 4
rm -f {params.spl}.r1.fq.tmp.gz {params.spl}.r2.fq.tmp.gz
rm -f {params.html} {params.json}
"""
# 1-2) merge reads for and rev :
......
......@@ -13,6 +13,12 @@ cutadapt_adapters : "cutadapt_adapters.fas"
# remove duplicate
bam_remove_duplicate : "Y" #must be Y or N
# remove reads multi mapped on the ref
bam_remove_multi_mapped : "N" #must be Y or N
# GATK filtering:
# https://gatk.broadinstitute.org/hc/en-us/articles/360035890471-Hard-filtering-germline-short-variants
# Quality QUAL
......
#!/bin/bash
#SBATCH --job-name=filt_10r # job name (-J)
#SBATCH --time="48:00:00" #max run time "hh:mm:ss" or "dd-hh:mm:ss" (-t)
#SBATCH --cpus-per-task=1 # max nb of cores (-c)
#SBATCH --ntasks=1 #nb of tasks
#SBATCH --mem=16G # max memory (-m)
#SBATCH --output=post_filter_min10r.out #stdout (-o)
module load system/singularity-3.5.3
export SINGULARITY_BINDPATH="/work2/project/redd_vat_phyton,/work2/project/gafl"
#######################################################################33
# filter the orginal vcf with SNPs only
# 1) by genotype min depth
# 2) keeep only biallelic sites
# 3) filter out sites with more than max missing genotypes
# 4) keep sites with min MAF 0.05%
# do some stats
########################################################################
# we use first VariantFiltration to apply filters and then SelectVariants to keep only selected snps
# cf:
# GATK VariantFiltration
# https://gatk.broadinstitute.org/hc/en-us/articles/360045800332-VariantFiltration
# GATK SelectVariants
# https://gatk.broadinstitute.org/hc/en-us/articles/360051305531-SelectVariants
# if we need to compress & index a vcf file:
#zcat gatk_all.filtered_snps.vcf.gz|head -n500 >${invcf}.vcf
#singularity exec /work2/project/gafl/tools/containers/bcftools_v1.11.sif bgzip -c ${invcf}.vcf >${invcf}.vcf.gz
#singularity exec /work2/project/gafl/tools/containers/bcftools_v1.11.sif tabix -f -p vcf ${invcf}.vcf.gz
# genome used for some algo
ref="/work2/project/gafl/Data/Rosaceae/Prunus_armeniaca/DNA/Ref_Genome/Marouch/marouch_v31/marouch_v31_genome.fasta"
#GATK='singularity exec /work2/project/gafl/tools/containers/gatk4_v4.1.9.0.sif gatk'
GATK='singularity exec /work2/project/gafl/tools/containers/gatk4_v4.1.4.1.sif gatk'
VCFTOOLS='/work2/project/gafl/tools/containers/vcftools_0.1.16.sif'
PLINK="/work2/project/gafl/tools/containers/plink_v1.90.sif"
BCFTOOLS='/work2/project/gafl/tools/containers/bcftools_v1.11.sif'
# vcf input without .vcf.gz
invcf=gatk_all.filtered_snps
MINDP=5
MAXMISSING=0.15
MAF=0.05
# 1) -------- we filter the genotype with DP <10 --------------------
$GATK VariantFiltration \
--variant out/variant/${invcf}.vcf.gz \
--genotype-filter-expression "DP < $MINDP" \
--genotype-filter-name "LowDP$MINDP" \
--output tmp1a.vcf.gz
# 2) --------- we select variant from the previous filter and keep only biallelic samples ---------------
$GATK SelectVariants \
--variant tmp1a.vcf.gz \
--set-filtered-gt-to-nocall \
--exclude-filtered \
--restrict-alleles-to BIALLELIC \
--output tmp2a.vcf.gz
rm -f tmp1a.vcf.gz*
$GATK SelectVariants \
--variant tmp2a.vcf.gz \
--set-filtered-gt-to-nocall \
--exclude-filtered \
--output snps_minr${MINDP}.biallelic.vcf.gz
rm -f tmp2a.vcf.gz*
# 3) ------------- keep only SNP with max 0.20 missing genotypes --------------
$GATK SelectVariants \
--variant snps_minr${MINDP}.biallelic.vcf.gz \
--set-filtered-gt-to-nocall \
--exclude-filtered \
--max-nocall-fraction ${MAXMISSING} \
--output snps_minr${MINDP}.biallelic.maxmissing${MAXMISSING}.vcf.gz
# 4) ------ filter by MAF>0.05 -------------------------------------
#http://vcftools.sourceforge.net/man_latest.html
$VCFTOOLS --gzvcf snps_minr${MINDP}.biallelic.maxmissing${MAXMISSING}.vcf.gz --maf 0.05 --recode --recode-INFO-all --out tmp3a
# gzip and index
singularity exec $BCFTOOLS bgzip -c tmp3a.recode.vcf >snps_minr${MINDP}.biallelic.maxmissing${MAXMISSING}.maf${MAF}p.vcf.gz
singularity exec $BCFTOOLS tabix -f -p vcf snps_minr${MINDP}.biallelic.maxmissing${MAXMISSING}.maf${MAF}p.vcf.gz
rm -f tmp3a.recode.vcf
#SelectVariants to exclude non variants (variants that becomes 100% homozygote-var with filtering)
$GATK SelectVariants \
--variant snps_minr${MINDP}.biallelic.maxmissing${MAXMISSING}.maf${MAF}p.vcf.gz \
--exclude-non-variants \
--output snps_minr${MINDP}.biallelic.maxmissing${MAXMISSING}.maf${MAF}p.wonovariant.vcf.gz
# general stats
#without extension (.vcf.gz)
vcf=snps_minr${MINDP}.biallelic.maxmissing${MAXMISSING}.maf${MAF}p
$GATK VariantsToTable \
-V ${vcf}.vcf.gz \
-F CHROM -F POS -F REF -F ALT -F QUAL -F HET -F HOM-REF -F HOM-VAR -F NO-CALL -F TYPE -F VAR -F NSAMPLES -F NCALLED -F MULTI-ALLELIC \
-O ${vcf}.VariantsToTable.csv
#all
$BCFTOOLS stats -v -s - ${vcf}.vcf.gz >${vcf}.bcftools.stats.all.txt
# SNPs count
echo
echo -en "1\t${invcf}.vcf.gz\t"
zcat ${outdir}/variant/${invcf}.vcf.gz|grep -cv '^#'
echo -en "2\tsnps_minr${MINDP}.biallelic.vcf.gz\t"
zcat snps_minr${MINDP}.biallelic.vcf.gz|grep -cv '^#'
echo -en "3\tsnps_minr${MINDP}.biallelic.maxmissing${MAXMISSING}.vcf.gz\t"
zcat snps_minr${MINDP}.biallelic.maxmissing${MAXMISSING}.vcf.gz|grep -cv '^#'
echo -en "4\tsnps_minr${MINDP}.biallelic.maxmissing${MAXMISSING}.maf${MAF}p.vcf.gz\t"
zcat snps_minr${MINDP}.biallelic.maxmissing${MAXMISSING}.maf${MAF}p.vcf.gz|grep -cv '^#'
echo
exit 0
############# BIALLELIC in SelectVariants ######################3
#--restrict-alleles-to / NA
#Select only variants of a particular allelicity
#When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a VCF. For example, a multiallelic record such as: 1 100 . A AAA,AAAAA will be excluded if `-restrict-alleles-to BIALLELIC` is used, because there are two alternate alleles, whereas a record such as: 1 100 . A T will be included in that case, but would be excluded if `-restrict-alleles-to MULTIALLELIC` is used. Valid options are ALL (default), MULTIALLELIC or BIALLELIC.
#The --restrict-alleles-to argument is an enumerated type (NumberAlleleRestriction), which can have one of the following values:
#ALL
#BIALLELIC
#MULTIALLELIC
......@@ -56,8 +56,9 @@ rm -fr .snakemake
mkdir -p logs
# Generate the dag files
# With samples
snakemake --configfile $CONFIG -s $RULES --dag | dot -Tpdf > dag.pdf
# With samples, useless if many samples
#snakemake --configfile $CONFIG -s $RULES --dag | dot -Tpdf > dag.pdf
# Only rules
snakemake --configfile $CONFIG -s $RULES --rulegraph | dot -Tpdf > dag_rules.pdf
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment