# WGS pipeline for Paired-end Illumina sequencing
## From raw fastq data to a VCF file.
## Pipeline features:
### 1) read preprocessing
### 2) read QC
### 3) mapping with bwa
### 4) mapping QC
### 5) SNP calling using freebayes
## Snakemake features: fastq from csv file, config, modules, SLURM
### Workflow steps are described in dag_rules.pdf
### Snakemake rules are based on [Snakemake modules](https://forgemia.inra.fr/gafl/snakemake_modules)
### File descriptions:
### 1) Snakefile
- Snakefile.smk: self-contained snakefile (all rules included)
- Snakefile_modules.smk: uses external rules (include directive)
### 2) Configuration file in yaml format: paths, singularity image paths, parameters, ...
- config.yaml
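The exact values depend on your setup; here is a minimal sketch covering the keys the Snakefiles read (all paths and image names below are placeholders to adapt):

```yaml
samplesfile: "samples.csv"        # sample sheet, tab-separated
fq_dir: "/path/to/fastq"          # folder holding the raw fastq files
outdir: "results"                 # output folder
REFPATH: "/path/to/reference"     # folder holding the reference genome
GENOME: "genome.fa"               # reference fasta file name
BIND: "--bind /path/to/workdir"   # singularity bind option(s)
MODULES: ""                       # reserved for module loading, if used
# singularity images for each tool
fastp_bin: "/path/to/fastp.sif"
fastqc_bin: "/path/to/fastqc.sif"
multiqc_bin: "/path/to/multiqc.sif"
bwa_bin: "/path/to/bwa.sif"
samtools_bin: "/path/to/samtools.sif"
freebayes_bin: "/path/to/freebayes.sif"
varscan_bin: "/path/to/varscan.sif"
```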
### 3) an sbatch file to run the pipeline (to be edited):
- run_snakemake_pipeline.slurm
### 4) SLURM directives per rule (cores, memory, ...) in JSON format. Adjust if needed
- cluster.json
### 5) samples file in csv format
Must contain at least 2 columns for SE reads and 3 for PE reads (tab-separated):
SampleName fq1 fq2
- SampleName: your sample ID
- fq1: read 1 fastq file for a given sample
- fq2: read 2 fastq file for paired-end reads
- samples.csv
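For example, a minimal samples.csv for two paired-end samples (columns separated by tabs; sample and file names here are hypothetical, and fq1/fq2 are resolved relative to fq_dir):

```
SampleName	fq1	fq2
sampleA	sampleA_R1.fastq.gz	sampleA_R2.fastq.gz
sampleB	sampleB_R1.fastq.gz	sampleB_R2.fastq.gz
```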
## RUN:
### 1) Optionally, if using external rules/modules, get all modules from the git repository:
`git clone https://forgemia.inra.fr/gafl/snakemake_modules.git smkmodules`
### 2) edit the config.yaml
### 3) set your samples in samples.csv
### 4) adjust the run_snakemake_pipeline.slurm file
### 5) run the pipeline:
`sbatch run_snakemake_pipeline.slurm`
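The wrapper script is site-specific, but as a rough sketch it boils down to a snakemake call that submits one sbatch job per rule, pulling resources from cluster.json (the job name, walltime, and job limit below are assumptions to adapt):

```bash
#!/bin/bash
#SBATCH --job-name=wgs_pipeline
#SBATCH --time=48:00:00

# {cluster.*} fields come from the matching rule entry
# (or __default__) in cluster.json
snakemake -s Snakefile.smk \
    --configfile config.yaml \
    --cluster-config cluster.json \
    --cluster "sbatch -J {cluster.jobname} -t {cluster.time} -c {cluster.cores} --mem={cluster.mem} -o {cluster.out}" \
    --jobs 20
```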
#### Documentation is still being written.
# WGS Snakemake pipeline for Paired-end Illumina sequencing using a Reference genome
# Pipeline features:
# 1) read preprocessing,
# 2) reads QC,
# 3) mapping with bwa,
# 4) mapping QC
# 5) SNP calling using freebayes
#
# Snakemake features: fastq from csv file, config, modules, SLURM
import pandas as pd
from snakemake.utils import min_version
# snakemake built-in report function requires min version 5.1
min_version("5.1.0")
# read the sample file with pandas (sample names + fastq names) and create an index on the sample name
samples = pd.read_csv(config["samplesfile"], sep='\t', dtype=str, comment='#').set_index(["SampleName"], drop=False)
rule final_outs:
input:
"{outdir}/multiqc/multiqc_report_fastqc.html".format(outdir=config["outdir"]),
"{outdir}/multiqc/multiqc_report_bam.html".format(outdir=config["outdir"]),
"{outdir}/gene_count_dicoexpress.csv".format(outdir=config["outdir"])
# function: returns the fastq1 file for a sample, with the fastq dir prepended
def get_fastq1(wildcards):
    return config["fq_dir"]+"/"+samples.loc[(wildcards.sample), ["fq1"]].dropna()
# function: returns the fastq2 file for a sample, with the fastq dir prepended
def get_fastq2(wildcards):
    return config["fq_dir"]+"/"+samples.loc[(wildcards.sample), ["fq2"]].dropna()
# module 4: fastp for paired-end reads
# use with utils.smk
# two alternative preprocessing commands; the unused one is kept commented below
rule fastp_pe:
input:
R1 = get_fastq1,
R2 = get_fastq2
#R1 = lambda wildcards: "../data/"+samples.fq1[wildcards.sample],
#R2 = lambda wildcards: "../data/"+samples.fq2[wildcards.sample]
#R1 = lambda wildcards: config["fq_dir"]+"/"+samples.loc[(wildcards.sample), ["fq1"]].dropna(),
#R2 = lambda wildcards: config["fq_dir"]+"/"+samples.loc[(wildcards.sample), ["fq2"]].dropna()
output:
R1 = "{outdir}/fastp/{{sample}}_1_trim.fastq.gz".format(outdir=config["outdir"]),
R2 = "{outdir}/fastp/{{sample}}_2_trim.fastq.gz".format(outdir=config["outdir"])
message: "Running fastp on files {input.R1} and {input.R2} \n"
params:
fqdir = config["fq_dir"],
outdir = config["outdir"],
modules = config["MODULES"],
fastp_bin = config["fastp_bin"],
bind = config["BIND"],
json = config["outdir"]+"/fastp/{sample}_trim.json",
html = config["outdir"]+"/fastp/{sample}_trim.html"
shell:
"""
singularity exec {params.bind} {params.fastp_bin} fastp \
-i {input.R1} \
-I {input.R2} \
-o {output.R1} \
-O {output.R2} \
-h {params.html} \
-j {params.json} \
--max_len1 350 \
--correction \
--cut_mean_quality 20 \
--cut_window_size 4 \
--low_complexity_filter \
--complexity_threshold 30 \
-w 4
        rm -f {params.html} {params.json}
"""
# singularity exec {params.bind} {params.fastp_bin} fastp \
# -i {input.R1} \
# -I {input.R2} \
# -o {output.R1} \
# -O {output.R2} \
# --json={params.json} \
# --html={params.html} \
# --trim_tail1=1 \
# --cut_front \
# --cut_tail \
# --length_required 35 \
# --average_qual 10 \
# --length_limit 400 \
# --correction \
# --cut_mean_quality 10 \
# --low_complexity_filter \
# --complexity_threshold 20 \
# --thread 4
# #rm -f {params.html} {params.json}
#QC
# fastqc in PE mode on the trimmed reads
rule fastqc_pe:
input:
R1 = "{outdir}/fastp/{{sample}}_1_trim.fastq.gz".format(outdir=config["outdir"]),
R2 = "{outdir}/fastp/{{sample}}_2_trim.fastq.gz".format(outdir=config["outdir"])
output:
        # fastqc derives output file names from its inputs; a flag file avoids guessing them
"{outdir}/fastqc/{{sample}}.OK.done".format(outdir=config["outdir"])
threads:
2
resources:
mem_mb=4000
params:
fqdir = config["fq_dir"],
outdir = config["outdir"],
fastqc_bin = config["fastqc_bin"],
bind = config["BIND"]
shell:
"""
        # no mkdir needed: snakemake automatically creates output folders
singularity exec {params.bind} {params.fastqc_bin} fastqc -o {params.outdir}/fastqc -t {threads} {input.R1} {input.R2} && touch {output}
"""
#QC
# multiQC on fastqc outputs
rule multiqc_fastqc:
input:
expand("{outdir}/fastqc/{sample}.OK.done", outdir=config["outdir"], sample=samples['SampleName'])
output:
#report("{outdir}/multiqc/multiqc_report_fastqc.html".format(outdir=config["outdir"]), caption="report/multiqc.rst", category="Quality control")
"{outdir}/multiqc/multiqc_report_fastqc.html".format(outdir=config["outdir"])
threads:
1
params:
outdir = config["outdir"],
multiqc_bin = config["multiqc_bin"],
bind = config["BIND"]
shell:
"""
        # no mkdir needed: snakemake automatically creates output folders
singularity exec {params.bind} {params.multiqc_bin} multiqc --filename {output} {params.outdir}/fastqc
"""
#QC
#multiqc for bams
rule multiqc_bam:
input:
expand("{outdir}/mapped/{sample}.stats.txt", outdir=config["outdir"], sample=samples['SampleName'])
output:
"{outdir}/multiqc/multiqc_report_bam.html".format(outdir=config["outdir"])
#"{outdir}/multiqc/multiqc_report.html".format(outdir=config["outdir"])
threads:
1
params:
outdir = config["outdir"]+"/mapped/*.stats.txt",
multiqc_bin = config["multiqc_bin"],
bind = config["BIND"]
shell:
"""
        # no mkdir needed: snakemake automatically creates output folders
singularity exec {params.bind} {params.multiqc_bin} multiqc --filename {output} {input}
"""
# mapping with bwa in PE mode
# reference indexing
# examples with and without duplicate marking
# SAM to sorted BAM conversion plus statistics
rule bwa_index:
input:
genome = config["REFPATH"] + "/" + config["GENOME"]
output:
config["REFPATH"] + "/" + config["GENOME"] + ".amb",
config["REFPATH"] + "/" + config["GENOME"] + ".ann",
config["REFPATH"] + "/" + config["GENOME"] + ".bwt",
config["REFPATH"] + "/" + config["GENOME"] + ".pac",
config["REFPATH"] + "/" + config["GENOME"] + ".sa",
config["REFPATH"] + "/" + config["GENOME"] + ".fai"
params:
bind = config["BIND"],
bwa_bin = config["bwa_bin"],
samtools_bin = config["samtools_bin"]
message: "Building BWA index for reference genome {input.genome}\n"
shell:
"""
singularity exec {params.bind} {params.bwa_bin} bwa index -a bwtsw -b 500000000 {input.genome}
singularity exec {params.bind} {params.samtools_bin} samtools faidx {input.genome}
"""
rule bwa_pe :
input:
R1 = "{outdir}/fastp/{{sample}}_1_trim.fastq.gz".format(outdir=config["outdir"]),
R2 = "{outdir}/fastp/{{sample}}_2_trim.fastq.gz".format(outdir=config["outdir"]),
        #fake input used to force index building before alignment if not present
idx = config["REFPATH"] + "/" + config["GENOME"] + ".bwt"
output:
bam = "{outdir}/mapped/{{sample}}_sorted.bam".format(outdir=config["outdir"]),
params:
outdir = config["outdir"],
idxbase = config["REFPATH"] + "/" + config["GENOME"],
bind = config["BIND"],
bwa_bin = config["bwa_bin"],
samtools_bin = config["samtools_bin"],
        freebayes_bin= config["freebayes_bin"], # this image also provides sambamba (used for markdup)
rg = "@RG\\tID:{sample}\\tSM:{sample}"
threads: 10
message: "Mapping reads {input.R1} to {params.idxbase} using BWA.\n"
    # convert to BAM and sort in one piped command; duplicate-removal alternatives are commented below
shell:
"""
        singularity exec {params.bind} {params.bwa_bin} bwa mem \
        -t {threads} \
        -K 100000000 \
        -R "{params.rg}" \
        {params.idxbase} \
        {input.R1} {input.R2} \
        | singularity exec {params.bind} {params.samtools_bin} samtools sort -@2 -m 6G -o {params.outdir}/{wildcards.sample}.bam.raw -
        # without duplicate removal
singularity exec {params.bind} {params.samtools_bin} samtools index -@ 10 {params.outdir}/{wildcards.sample}.bam.raw
mv {params.outdir}/{wildcards.sample}.bam.raw.bai {output.bam}.bai
mv {params.outdir}/{wildcards.sample}.bam.raw {output.bam}
        # with duplicate removal (alternatives)
#singularity exec {params.bind} {params.samtools_bin} samtools rmdup -s {params.outdir}/{wildcards.sample}.bam.raw {output.bam}
#singularity exec {params.bind} {params.samtools_bin} samtools index -@ 10 {output.bam}
#singularity exec {params.bind} {params.freebayes_bin} sambamba markdup -t 10 {params.outdir}/{wildcards.sample}.bam.raw {output.bam}
#singularity exec {params.bind} {params.samtools_bin} samtools index -@ 10 {output.bam}
singularity exec {params.bind} {params.samtools_bin} samtools flagstat -@ 10 {output.bam}
rm -f {params.outdir}/{wildcards.sample}.bam.raw*
"""
rule bam_stats:
input:
bam = "{outdir}/mapped/{{sample}}_sorted.bam".format(outdir=config["outdir"])
output:
stats = "{outdir}/mapped/{{sample}}.stats.txt".format(outdir=config["outdir"])
params:
outdir = config["outdir"],
idxbase = config["REFPATH"] + "/" + config["GENOME"],
bind = config["BIND"],
bwa_bin = config["bwa_bin"],
samtools_bin = config["samtools_bin"]
threads: 2
shell:
"""
singularity exec {params.bind} {params.samtools_bin} samtools stats -@ 2 {input.bam} > {output.stats}
"""
# SNP caller
#freebayes rule
rule freebayes:
input:
bam = "{outdir}/mapped/{{sample}}_sorted_parsed.bam".format(outdir=config["outdir"])
#on merged bams
#bam = "{outdir}/bams_merged_sorted.bam".format(outdir=config["outdir"])
output:
var = "{outdir}/variant/{{sample}}_varscan.vcf".format(outdir=config["outdir"])
#on merged bams
#var = "{outdir}/variant/SNPs4all_freebayes.vcf".format(outdir=config["outdir"])
params:
ref = config["REFPATH"] + "/" + config["GENOME"],
bind = config["BIND"],
samtools_bin = config["samtools_bin"],
varscan_bin = config["varscan_bin"],
freebayes_bin = config["freebayes_bin"]
message: "Looking for SNP in {input.bam} with Varscan \n"
shell:
"""
zero=$(singularity exec {params.bind} {params.samtools_bin} samtools flagstat {input.bam}|head -n1|sed 's/\s.*//g')
if [ "$zero" -lt "1" ]
then
echo "WARNING ALLMINE {output.var} EMPTY"
touch {output.var}
exit 0
fi
singularity exec {params.bind} {params.freebayes_bin} \
freebayes -f {params.ref} \
--min-mapping-quality 30 \
--min-base-quality 20 \
--min-alternate-fraction 0.15 \
--min-coverage 8 \
--use-duplicate-reads \
{input.bam} > {output.var}
"""
# WGS Snakemake pipeline for Paired-end Illumina sequencing using a Reference genome
# Pipeline features:
# 1) read preprocessing,
# 2) reads QC,
# 3) mapping with bwa,
# 4) mapping QC
# 5) SNP calling using freebayes
#
# Snakemake features: fastq from csv file, config, modules, SLURM, report
import os
import pandas as pd
from snakemake.utils import min_version
# snakemake built-in report function requires min version 5.1
min_version("5.1.0")
# read the sample file with pandas (sample names + fastq names) and create an index on the sample name
samples = pd.read_csv(config["samplesfile"], sep='\t', dtype=str, comment='#').set_index(["SampleName"], drop=False)
cwd = os.getcwd() + "/"
# modules loading...
include : cwd + "smkmodules/modules/utils.smk"
include : cwd + "smkmodules/modules/PREPROC_fastp_pe.smk"
include : cwd + "smkmodules/modules/QC_fastqc_pe.smk"
include : cwd + "smkmodules/modules/QC_multiqc_fastqc.smk"
include : cwd + "smkmodules/modules/MAPPING_bwa_pe.smk"
include : cwd + "smkmodules/modules/QC_multiqc_bam.smk"
include : cwd + "smkmodules/modules/SNP_freebayes.smk"
rule final_outs:
input:
"{outdir}/multiqc/multiqc_report_fastqc.html".format(outdir=config["outdir"]),
"{outdir}/multiqc/multiqc_report_bam.html".format(outdir=config["outdir"]),
expand("{outdir}/variant/{sample}_varscan.vcf".format(outdir=config["outdir"])", outdir=config["outdir"], sample=samples['SampleName'])
{
"__default__" :
{
"jobname" : "{rule}",
"time" : "04:00:00",
"cores" : 1,
"mem" : "4g",
"out" : "{rule}.out"
},
"trim_and_merge_pe" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "06:00:00",
"cores" : 4,
"mem" : "16g",
"out" : "{rule}_{wildcards.sample}.out"
},
"merge_bams" :
{
"time" : "12:00:00",
"cores" : 10,
"mem" : "16g"
},
"freebayes" :
{
"time" : "24:00:00",
"cores" : 1,
"mem" : "16g"
},
"freebayes_parallel" :
{
"time" : "12:00:00",
"cores" : 20,
"mem" : "16g"
},
"fastp_pe" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "08:00:00",
"cores" : 4,
"mem" : "16g",
"out" : "{rule}_{wildcards.sample}.out"
},
"fastp_se" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "08:00:00",
"cores" : 2,
"mem" : "8g",
"out" : "{rule}_{wildcards.sample}.out"
},
"trim_and_merge_pe" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "08:00:00",
"cores" : 4,
"mem" : "16g",
"out" : "{rule}_{wildcards.sample}.out"
},
"cutadapt_pe" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "08:00:00",
"cores" : 4,
"mem" : "16g",
"out" : "{rule}_{wildcards.sample}.out"
},
"fastqc_se" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "04:00:00",
"cores" : 2,
"mem" : "4g",
"out" : "{rule}_{wildcards.sample}.out"
},
"fastqc_pe" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "04:00:00",
"cores" : 2,
"mem" : "4g",
"out" : "{rule}_{wildcards.sample}.out"
},
"multiqc_fastqc" :
{
"time" : "06:00:00",
"cores" : 2,
"mem" : "4g"
},
"bowtie2_index" :
{
"time" : "10:00:00",
"cores" : 1,
"mem" : "32g"
},
"bowtie2_se" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 10,
"mem" : "32g",
"out" : "{rule}_{wildcards.sample}.out"
},
"bowtie2_pe" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 10,
"mem" : "32g",
"out" : "{rule}_{wildcards.sample}.out"
},
"bwa_index" :
{
"time" : "10:00:00",
"cores" : 1,
"mem" : "32g"
},
"bwa_se" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 10,
"mem" : "32g",
"out" : "{rule}_{wildcards.sample}.out"
},
"bwa_pe" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 10,
"mem" : "32g",
"out" : "{rule}_{wildcards.sample}.out"
},
"star_index" :
{
"time" : "10:00:00",
"cores" : 20,
"mem" : "32g"
},
"star_index2" :
{
"time" : "10:00:00",
"cores" : 20,
"mem" : "32g"
},
"star_pass1" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 10,
"mem" : "32g",
"out" : "{rule}_{wildcards.sample}.out"
},
"star_pass2" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 10,
"mem" : "32g",
"out" : "{rule}_{wildcards.sample}.out"
},
"parse_bam_with_bed" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 1,
"mem" : "8g",
"out" : "{rule}_{wildcards.sample}.out"
},
"varscan" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 1,
"mem" : "8g",
"out" : "{rule}_{wildcards.sample}.out"
},
"merge_bams" :
{
"time" : "12:00:00",
"cores" : 10,
"mem" : "16g"
},
"annovar" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 1,
"mem" : "8g",
"out" : "{rule}_{wildcards.sample}.out"
},
"alternative_proteins" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 1,
"mem" : "8g",
"out" : "{rule}_{wildcards.sample}.out"
},
"cov_track" :
{
"time" : "12:00:00",
"cores" : 1,
"mem" : "8g"
},
"make_report" :
{
"time" : "2:00:00",
"cores" : 1,
"mem" : "4g"
},
"make_markdown" :
{
"time" : "2:00:00",
"cores" : 1,
"mem" : "4g"
},
"vcf4_to_avinput" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 1,
"mem" : "8g",