Commit 8820be5d authored by Jacques Lagnel's avatar Jacques Lagnel
Browse files

update with parallel GATK

parents
# WGS SNPs calling pipeline for Paired-end Illumina sequencing
## From raw fastq data to vcf file.
## Pipeline features:
### 1) read preprocessing,
### 2) reads QC,
### 3) mapping with bwa,
### 4) mapping QC
### 5) SNPs calling using GATK in parallel mode: reference splitted by sequences: #tasks=#samples X #chrs
## Snakemake features: fastq from csv file, config, modules, SLURM
### Workflow steps are descibed in the dag_rules.pdf
### Snakemake rules are based on [Snakemake modules](https://forgemia.inra.fr/gafl/snakemake_modules)
### Files description:
### 1) Snakefile
- Snakefile_para_gatk_PEmerged.smk
### 2) Configuration file in yaml format:, paths, singularity images paths, parameters, GATK filters ....
- config.yaml
### 3) a sbatch file to run the pipeline: (to be edited)
- run_snakemake_pipeline_gatk.slurm
### 4) A slurm directive (#core, mem,...) in json format. Can be adjusted if needed
- cluster.json
### 5) samples file in csv format
Must contens at least 2 columns for SE reads and 3 for PE reads (tab separator )
SampleName fq1 fq2
SampleName : your sample ID
fq1: fastq file for a given sample
fq2: read 2 for paired-end reads
- samples.csv
## RUN:
### 1) Edit the config.yaml
### 2) Set your samples in the sample.csv
### 3) Adjust the run_snakemake_pipeline_gatk.slurm file
### 3) Run pipelene in dry run mode first:
`sbatch run_snakemake_pipeline_gatk.slurm`
### 4) uncomment the real run line and run the pipeline:
`sbatch run_snakemake_pipeline_gatk.slurm`
#### Documentation being written (still)
This diff is collapsed.
{
"__default__" :
{
"jobname" : "{rule}",
"time" : "06:00:00",
"cores" : 1,
"mem" : "6g",
"out" : "{rule}.out"
},
"trim_and_merge_pe" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "06:00:00",
"cores" : 4,
"mem" : "16g",
"out" : "{rule}_{wildcards.sample}.out"
},
"mergePE" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 10,
"mem" : "22g",
"out" : "{rule}_{wildcards.sample}.out"
},
"merge_bams" :
{
"time" : "12:00:00",
"cores" : 10,
"mem" : "16g"
},
"freebayes" :
{
"jobname" : "{rule}",
"time" : "95:30:00",
"cores" : 1,
"mem" : "24g"
},
"freebayes_parallel" :
{
"time" : "95:30:00",
"cores" : 20,
"mem" : "24g"
},
"fastp_pe" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "08:00:00",
"cores" : 4,
"mem" : "16g",
"out" : "{rule}_{wildcards.sample}.out"
},
"fastp_se" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "08:00:00",
"cores" : 1,
"mem" : "4g",
"out" : "{rule}_{wildcards.sample}.out"
},
"trim_and_merge_pe" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "08:00:00",
"cores" : 4,
"mem" : "16g",
"out" : "{rule}_{wildcards.sample}.out"
},
"cutadapt_pe" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 4,
"mem" : "16g",
"out" : "{rule}_{wildcards.sample}.out"
},
"fastqc_se" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "04:00:00",
"cores" : 1,
"mem" : "4g",
"out" : "{rule}_{wildcards.sample}.out"
},
"fastqc_pe" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "04:00:00",
"cores" : 2,
"mem" : "4g",
"out" : "{rule}_{wildcards.sample}.out"
},
"multiqc_fastqc" :
{
"time" : "06:00:00",
"cores" : 2,
"mem" : "4g"
},
"bowtie2_index" :
{
"time" : "10:00:00",
"cores" : 1,
"mem" : "32g"
},
"bowtie2_se" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 10,
"mem" : "32g",
"out" : "{rule}_{wildcards.sample}.out"
},
"bowtie2_pe" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 10,
"mem" : "32g",
"out" : "{rule}_{wildcards.sample}.out"
},
"bwa_index" :
{
"time" : "20:00:00",
"cores" : 1,
"mem" : "48g"
},
"bwa_se" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 2,
"mem" : "16g",
"out" : "{rule}_{wildcards.sample}.out"
},
"bwa_pe" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "24:00:00",
"cores" : 8,
"mem" : "32g",
"out" : "{rule}_{wildcards.sample}.out"
},
"bam_stats" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "03:00:00",
"cores" : 1,
"mem" : "8g",
"out" : "{rule}_{wildcards.sample}.out"
},
"remove_duplicates" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "24:00:00",
"cores" : 10,
"mem" : "24g",
"out" : "{rule}_{wildcards.sample}.out"
},
"keep_duplicates" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 10,
"mem" : "16g",
"out" : "{rule}_{wildcards.sample}.out"
},
"star_index" :
{
"time" : "10:00:00",
"cores" : 20,
"mem" : "36g"
},
"star_index2" :
{
"time" : "10:00:00",
"cores" : 20,
"mem" : "32g"
},
"star_pass1" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 10,
"mem" : "48g",
"out" : "{rule}_{wildcards.sample}.out"
},
"star_pass2" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 10,
"mem" : "48g",
"out" : "{rule}_{wildcards.sample}.out"
},
"parse_bam_with_bed" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 1,
"mem" : "8g",
"out" : "{rule}_{wildcards.sample}.out"
},
"varscan" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 1,
"mem" : "8g",
"out" : "{rule}_{wildcards.sample}.out"
},
"merge_bams" :
{
"time" : "24:00:00",
"cores" : 20,
"mem" : "48g"
},
"annovar" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 1,
"mem" : "8g",
"out" : "{rule}_{wildcards.sample}.out"
},
"alternative_proteins" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 1,
"mem" : "8g",
"out" : "{rule}_{wildcards.sample}.out"
},
"cov_track" :
{
"time" : "12:00:00",
"cores" : 1,
"mem" : "8g"
},
"make_report" :
{
"time" : "2:00:00",
"cores" : 1,
"mem" : "4g"
},
"make_markdown" :
{
"time" : "2:00:00",
"cores" : 1,
"mem" : "4g"
},
"vcf4_to_avinput" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 1,
"mem" : "8g",
"out" : "{rule}_{wildcards.sample}.out"
},
"whatshap" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 1,
"mem" : "8g",
"out" : "{rule}_{wildcards.sample}.out"
},
"multiqc_bam" :
{
"cores" : 2,
"mem" : "4g"
},
"bams_list" :
{
"time" : "00:05:00",
"cores" : 1,
"mem" : "1g"
},
"gatk4_hc" :
{
"jobname" : "{rule}_{wildcards.sample}-{wildcards.mychr}",
"time" : "72:00:00",
"cores" : 1,
"mem" : "7g",
"out" : "{rule}_{wildcards.sample}-{wildcards.mychr}.out"
},
"gatk4_combine_calls" :
{
"time" : "48:00:00",
"cores" : 10,
"mem" : "10g"
},
"gatk4_gdb" :
{
"jobname" : "{rule}_{wildcards.mychr}",
"time" : "48:00:00",
"cores" : 10,
"mem" : "16g",
"out" : "{rule}_{wildcards.mychr}.out"
},
"gatk4_gc" :
{
"jobname" : "{rule}_{wildcards.mychr}",
"time" : "92:00:00",
"cores" : 2,
"mem" : "10g",
"out" : "{rule}_{wildcards.mychr}.out"
},
"gatk4_gvcf_map_file" :
{
"jobname" : "{rule}_{wildcards.mychr}",
"out" : "{rule}_{wildcards.mychr}.out"
},
"combinevcf" :
{
"time" : "32:00:00",
"cores" : 2,
"mem" : "16g"
},
"gatk4_select_snps_variants" :
{
"time" : "08:00:00",
"cores" : 10,
"mem" : "10g"
},
"gatk4_select_indels_variants" :
{
"time" : "08:00:00",
"cores" : 10,
"mem" : "10g"
},
"gatk4_filterSnps" :
{
"time" : "08:00:00",
"cores" : 2,
"mem" : "10g"
},
"gatk4_filterIndels" :
{
"time" : "08:00:00",
"cores" : 2,
"mem" : "10g"
},
"gatk4_snps_quality_graphe_pdf" :
{
"time" : "08:00:00",
"cores" : 2,
"mem" : "32g"
},
"gatk4_indels_quality_graphe_pdf" :
{
"time" : "08:00:00",
"cores" : 1,
"mem" : "10g"
}
}
# path or URL to sample sheet (TSV format(tab , no quote), columns: SampleName fq1 fq2 ...)
samplesfile : "samples.csv"
fq_dir : "/work2/project/redd_vat_phyton/DurArmel/reads_melo_chinois"
outdir : "/work2/project/redd_vat_phyton/DurArmel/analysis_chrparfull/out"
GENOME : "Harukei3_pseudomol_v1.41.fasta"
GENOME_FAI : "Harukei3_pseudomol_v1.41.fasta.fai"
REFPATH : "/work2/project/redd_vat_phyton/DurArmel/ref_genomes"
# adapter in fasta 4 cutadapt if used
cutadapt_adapters : "cutadapt_adapters.fas"
# remove duplicate
bam_remove_duplicate : "Y" #must be Y or N
# GATK filtering:
# https://gatk.broadinstitute.org/hc/en-us/articles/360035890471-Hard-filtering-germline-short-variants
# Quality QUAL
# QualByDepth (QD): https://gatk.broadinstitute.org/hc/en-us/articles/360036711511-QualByDepth
# FisherStrand (FS): https://gatk.broadinstitute.org/hc/en-us/articles/360037592371-FisherStrand
# StrandOddsRatio (SOR): https://gatk.broadinstitute.org/hc/en-us/articles/360037592151-StrandOddsRatio
# RMSMappingQuality (MQ): https://gatk.broadinstitute.org/hc/en-us/articles/360036711191-RMSMappingQuality
# MappingQualityRankSumTest (MQRankSum): https://gatk.broadinstitute.org/hc/en-us/articles/360037592711-MappingQualityRankSumTest
# ReadPosRankSumTest (ReadPosRankSum): https://gatk.broadinstitute.org/hc/en-us/articles/360036360852-ReadPosRankSumTest
#
# https://gatk.broadinstitute.org/hc/en-us/articles/360035531112?id=2806
# For reference, here are some basic filtering thresholds to improve upon.
# QD < 2.0
# QUAL < 30.0
# SOR > 3.0
# FS > 60.0
# MQ < 40.0
# MQRankSum < -12.5
# ReadPosRankSum < -8.0"
# SNPs
snp_QUAL_filter : "< 30.0"
snp_QD_filter : "< 2.0"
snp_FS_filter : "> 60.0"
snp_MQ_filter : "< 40.0"
snp_SOR_filter: "> 3.0"
snp_MQRankSum_filter : "< -12.5"
snp_ReadPosRankSum_filter : "< -8.0"
# # indels:
indel_QUAL_filter : "< 30.0"
indel_QD_filter : "< 2.0"
indel_FS_filter : "> 200.0"
indel_ReadPosRankSum_filter : "< -20.0"
# ---------------------- singularity config -----------------------------
# singularity images location
fastqc_bin : "/work2/project/gafl/tools/containers/fastqc_V0.11.8.sif"
multiqc_bin : "/work2/project/gafl/tools/containers/multiqc_v1.7.sif"
cutadapt_bin : "/work2/project/gafl/tools/containers/cutadapt_v2.7.sif"
fastp_bin : "/work2/project/gafl/tools/containers/fastp_0.20.0.sif"
samtools_bin : "/work2/project/gafl/tools/containers/samtools_v1.9.sif"
bwa_bin : "/work2/project/gafl/tools/containers/bwa_0.7.17.sif"
sambamba_bin : "/work2/project/gafl/tools/containers/sambamba_v0.7.1.sif"
gatk4_bin : "/work2/project/gafl/tools/containers/gatk4_v4.1.4.1.sif"
R_bin : "/work2/project/gafl/tools/containers/R_base_V3.6.0.sif"
# singularity bind for IFB core the home is in: /shared/mfs/data/home
# BIND : "-B /shared/mfs/data/home/jlagnel"
# if no binding:
# Bind gafl01:
#BIND : "-B /work/project/gafl,/nosave/project/gafl,/work2/project/gafl"
# bind Genotoul:
BIND : "-B /work2/project/redd_vat_phyton,/work2/project/gafl"
# singularity modules only for genotoul
#MODULES : "module load system/singularity-2.5.1"
# if singularity is not as module
MODULES : ""
# ----------------------------------------------------------------------
>truseq
TGCACACGTCTGAACTCCAGTCACNNNNNNNNATCTCGTATGCCGTCTTC
>D5xx
AATGATACGGCGACCACCGAGATCTACACNNNNNNNNACACTCTTTCCCTACACGACGCTCTTC
>D7xx
CAAGCAGAAGACGGCATACGAGATNNNNNNNNGTGACTGGAGTTCAGACGTGT
>IlluminaMultiplexingPCRPrimer2.01
TCTGCTTAGATCGGAAGAGCACACGTCTGAACTCCCTGCTTAGATCGGAAG
>adprimer
AGATCGGAAGAGC
File added
library('gridExtra')
library('ggplot2')
args <- commandArgs(trailingOnly = TRUE)
if (length(args) !=4) {
stop("At least 4 arguments must be supplies:\nvcf count table raw\nvcf count table filtered\nTitle for graphs (SNPs ...)\npdf output file\n", call.=FALSE)
}
#log <- file(snakemake@log[[1]], open="wt")
#sink(log)
#sink(log, type="message")
table1 = args[1] # vcf count table raw SNPs|indels
table2 = args[2] # vcf count table filtered SNPs|indels
title = args[3] # Title for graphs ("SNPs" ...)
outfile = args[4] # pdf output file
data1 <- read.csv(table1, header = T, na.strings=c("","NA"), sep = "\t")
data2 <- read.csv(table2, header = T, na.strings=c("","NA"), sep = "\t")
dim(data1)
dim(data2)
leg1 <- gsub(" ", "_", paste("Raw",title))
leg2 <- gsub(" ", "_", paste("Filtered",title))
VCF <- rbind(data1,data2)
VCF$Variant <- factor(c(rep(leg1, dim(data1)[1]),rep(leg2, dim(data2)[1])))
craw <- '#A9E2E4'
cfilt <- '#F4CCCA'
QUAL<- ggplot(VCF, aes(x=QUAL, fill=Variant)) + geom_density(alpha=0.3) + scale_x_continuous(trans = 'log10')
DP <- ggplot(VCF, aes(x=DP, fill=Variant)) + geom_density(alpha=0.3)
QD <- ggplot(VCF, aes(x=QD, fill=Variant)) + geom_density(alpha=.3) + geom_vline(xintercept=20, size=0.7)
FS <- ggplot(VCF, aes(x=FS, fill=Variant)) + geom_density(alpha=.3)
MQ <- ggplot(VCF, aes(x=MQ, fill=Variant)) + geom_density(alpha=.3) + geom_vline(xintercept=40, size=0.7)
MQRankSum <- ggplot(VCF, aes(x=MQRankSum, fill=Variant)) + geom_density(alpha=.3)
SOR <- ggplot(VCF, aes(x=SOR, fill=Variant)) + geom_density(alpha=.3) #+ geom_vline(xintercept=c(4, 10), size=1, colour = c(craw,cfilt))
ReadPosRankSum <- ggplot(VCF, aes(x=ReadPosRankSum, fill=Variant)) + geom_density(alpha=.3) #+
#geom_vline(xintercept=c(-10,10,-20,20), size=1, colour = c(craw,craw,cfilt,cfilt)) + xlim(-30, 30)
VCF <- rbind(data2)
VCF$Variant <- factor(c(rep(rep(leg2, dim(data2)[1]))))
QUALf<- ggplot(VCF, aes(x=QUAL, fill=Variant)) + geom_density(alpha=0.3) + scale_x_continuous(trans = 'log10')
DPf <- ggplot(VCF, aes(x=DP, fill=Variant)) + geom_density(alpha=0.3)
QDf <- ggplot(VCF, aes(x=QD, fill=Variant)) + geom_density(alpha=.3) + geom_vline(xintercept=20, size=0.7)
FSf <- ggplot(VCF, aes(x=FS, fill=Variant)) + geom_density(alpha=.3)
MQf <- ggplot(VCF, aes(x=MQ, fill=Variant)) + geom_density(alpha=.3) + geom_vline(xintercept=40, size=0.7)
MQRankSumf <- ggplot(VCF, aes(x=MQRankSum, fill=Variant)) + geom_density(alpha=.3)
SORf <- ggplot(VCF, aes(x=SOR, fill=Variant)) + geom_density(alpha=.3) #+ geom_vline(xintercept=c(4, 10), size=1, colour = c(craw,cfilt))
ReadPosRankSumf <- ggplot(VCF, aes(x=ReadPosRankSum, fill=Variant)) + geom_density(alpha=.3) #+
#geom_vline(xintercept=c(-10,10,-20,20), size=1, colour = c(craw,craw,cfilt,cfilt)) + xlim(-30, 30)
pdf(outfile, height=40, width=15)
theme_set(theme_gray(base_size = 16))
grid.arrange(QUAL,QD, DP, FS, MQ, MQRankSum, SOR, ReadPosRankSum,QUALf,QDf, DPf, FSf, MQf, MQRankSumf, SORf, ReadPosRankSumf, nrow=9)
dev.off()