Commit 59c49144 authored by Jacques Lagnel's avatar Jacques Lagnel
Browse files

Initial commit

parents
File added
# GBS SNPs calling pipeline for Single-end Illumina sequencing
## From raw fastq data to vcf file.
## Pipeline features:
### 1) read preprocessing,
### 2) reads QC, fastq report
### 3) mapping with bwa,
### 4) mapping QC report (multiqc)
### 5) SNPs calling using GATK in parallel mode: reference splitted by sequences: #tasks=#samples X #chrs
a) GATK using "Best Practices Workflows" https://gatk.broadinstitute.org/hc/en-us/sections/360007226651-Best-Practices-Workflows
b) Using hard filtering as outlined in GATK docs
https://gatkforums.broadinstitute.org/gatk/discussion/2806/howto-apply-hard-filters-to-a-call-set
https://gatk.broadinstitute.org/hc/en-us/articles/360035890471-Hard-filtering-germline-short-variants
:exclamation: :exclamation: !! For the reference fasta file, he reference definition line must not contain space or special character.
## Snakemake features: fastq from csv file, config, modules, SLURM
### Workflow steps are descibed in the dag_rules.pdf
### Snakemake rules are based on [Snakemake modules](https://forgemia.inra.fr/gafl/snakemake_modules)
### Files description:
### 1) Snakefile
- Snakefile_para_gatk_SE.smk
### 2) Configuration file in yaml format:, paths, singularity images paths, parameters, GATK filters ....
- config.yaml
### 3) a sbatch file to run the pipeline: (to be edited)
- run_snakemake_pipeline_gatk.slurm
### 4) A slurm directive (#core, mem,...) in json format. Can be adjusted if needed
- cluster.json
### 5) samples file in csv format
Must contens at least 2 columns for SE reads and 3 for PE reads (tab separator )
SampleName fq1 fq2
SampleName : your sample ID
fq1: fastq file for a given sample
fq2: read 2 for paired-end reads
(sp1 ra_R1.fastq.gz ra_R2.fastq.gz)
a sample my have multiple fastq files separated by a ','
(sp1 ra_R1.fastq.gz,rb_R1.fastq.gz ra_R2.fastq.gz,rb_R2.fastq.gz)
- samples.csv
## RUN:
### 1) Edit the config.yaml
### 2) Set your samples in the sample.csv
### 3) Adjust the run_snakemake_pipeline_gatk.slurm file
### 3) Run pipelene in dry run mode first:
`sbatch run_snakemake_pipeline_gatk.slurm`
### 4) uncomment the real run line and run the pipeline:
`sbatch run_snakemake_pipeline_gatk.slurm`
### 5) optionaly run post vcf filtering (example use for GWAS)
1) filter by read depth
2) keep only biallelic allels
3) max missing sample for a site
4) filter by MAF
5) keep only polymorph sites
`sbatch run_post_filters_and_stats.sh`
#### Documentation being written (still)
This diff is collapsed.
{
"__default__" :
{
"jobname" : "{rule}",
"time" : "06:00:00",
"cores" : 1,
"mem" : "6g",
"out" : "{rule}.out"
},
"trim_and_merge_pe" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "06:00:00",
"cores" : 4,
"mem" : "16g",
"out" : "{rule}_{wildcards.sample}.out"
},
"mergePE" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 10,
"mem" : "22g",
"out" : "{rule}_{wildcards.sample}.out"
},
"merge_bams" :
{
"time" : "12:00:00",
"cores" : 10,
"mem" : "16g"
},
"freebayes" :
{
"jobname" : "{rule}",
"time" : "95:30:00",
"cores" : 1,
"mem" : "24g"
},
"freebayes_parallel" :
{
"time" : "95:30:00",
"cores" : 20,
"mem" : "24g"
},
"fastp_pe" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "08:00:00",
"cores" : 4,
"mem" : "16g",
"out" : "{rule}_{wildcards.sample}.out"
},
"fastp_se" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "08:00:00",
"cores" : 1,
"mem" : "4g",
"out" : "{rule}_{wildcards.sample}.out"
},
"trim_and_merge_pe" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "08:00:00",
"cores" : 4,
"mem" : "16g",
"out" : "{rule}_{wildcards.sample}.out"
},
"cutadapt_pe" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 4,
"mem" : "16g",
"out" : "{rule}_{wildcards.sample}.out"
},
"fastqc_se" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "04:00:00",
"cores" : 1,
"mem" : "4g",
"out" : "{rule}_{wildcards.sample}.out"
},
"fastqc_pe" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "04:00:00",
"cores" : 2,
"mem" : "4g",
"out" : "{rule}_{wildcards.sample}.out"
},
"multiqc_fastqc" :
{
"time" : "06:00:00",
"cores" : 2,
"mem" : "4g"
},
"bowtie2_index" :
{
"time" : "10:00:00",
"cores" : 1,
"mem" : "32g"
},
"bowtie2_se" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 10,
"mem" : "32g",
"out" : "{rule}_{wildcards.sample}.out"
},
"bowtie2_pe" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 10,
"mem" : "32g",
"out" : "{rule}_{wildcards.sample}.out"
},
"bwa_index" :
{
"time" : "20:00:00",
"cores" : 1,
"mem" : "48g"
},
"bwa_se" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 2,
"mem" : "16g",
"out" : "{rule}_{wildcards.sample}.out"
},
"bwa_pe" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "24:00:00",
"cores" : 8,
"mem" : "32g",
"out" : "{rule}_{wildcards.sample}.out"
},
"bam_stats" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "03:00:00",
"cores" : 1,
"mem" : "8g",
"out" : "{rule}_{wildcards.sample}.out"
},
"remove_duplicates" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "24:00:00",
"cores" : 10,
"mem" : "24g",
"out" : "{rule}_{wildcards.sample}.out"
},
"keep_duplicates" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 10,
"mem" : "16g",
"out" : "{rule}_{wildcards.sample}.out"
},
"star_index" :
{
"time" : "10:00:00",
"cores" : 20,
"mem" : "36g"
},
"star_index2" :
{
"time" : "10:00:00",
"cores" : 20,
"mem" : "32g"
},
"star_pass1" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 10,
"mem" : "48g",
"out" : "{rule}_{wildcards.sample}.out"
},
"star_pass2" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 10,
"mem" : "48g",
"out" : "{rule}_{wildcards.sample}.out"
},
"parse_bam_with_bed" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 1,
"mem" : "8g",
"out" : "{rule}_{wildcards.sample}.out"
},
"varscan" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 1,
"mem" : "8g",
"out" : "{rule}_{wildcards.sample}.out"
},
"merge_bams" :
{
"time" : "24:00:00",
"cores" : 20,
"mem" : "48g"
},
"annovar" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 1,
"mem" : "8g",
"out" : "{rule}_{wildcards.sample}.out"
},
"alternative_proteins" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 1,
"mem" : "8g",
"out" : "{rule}_{wildcards.sample}.out"
},
"cov_track" :
{
"time" : "12:00:00",
"cores" : 1,
"mem" : "8g"
},
"make_report" :
{
"time" : "2:00:00",
"cores" : 1,
"mem" : "4g"
},
"make_markdown" :
{
"time" : "2:00:00",
"cores" : 1,
"mem" : "4g"
},
"vcf4_to_avinput" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 1,
"mem" : "8g",
"out" : "{rule}_{wildcards.sample}.out"
},
"whatshap" :
{
"jobname" : "{rule}_{wildcards.sample}",
"time" : "12:00:00",
"cores" : 1,
"mem" : "8g",
"out" : "{rule}_{wildcards.sample}.out"
},
"multiqc_bam" :
{
"time" : "12:00:00",
"cores" : 2,
"mem" : "6g"
},
"bams_list" :
{
"time" : "00:10:00",
"cores" : 1,
"mem" : "2g"
},
"gatk4_hc" :
{
"jobname" : "{rule}_{wildcards.sample}-{wildcards.mychr}",
"time" : "72:00:00",
"cores" : 1,
"mem" : "7g",
"out" : "{rule}_{wildcards.sample}-{wildcards.mychr}.out"
},
"gatk4_combine_calls" :
{
"time" : "48:00:00",
"cores" : 1,
"mem" : "16g"
},
"gatk4_gdb" :
{
"jobname" : "{rule}_{wildcards.mychr}",
"time" : "48:00:00",
"cores" : 1,
"mem" : "16g",
"out" : "{rule}_{wildcards.mychr}.out"
},
"gatk4_gc" :
{
"jobname" : "{rule}_{wildcards.mychr}",
"time" : "72:00:00",
"cores" : 1,
"mem" : "12g",
"out" : "{rule}_{wildcards.mychr}.out"
},
"gatk4_gvcf_map_file" :
{
"jobname" : "{rule}_{wildcards.mychr}",
"out" : "{rule}_{wildcards.mychr}.out"
},
"combinevcf" :
{
"time" : "32:00:00",
"cores" : 1,
"mem" : "16g"
},
"gatk4_select_snps_variants" :
{
"time" : "12:00:00",
"cores" : 1,
"mem" : "12g"
},
"gatk4_select_indels_variants" :
{
"time" : "12:00:00",
"cores" : 1,
"mem" : "12g"
},
"gatk4_filterSnps" :
{
"time" : "24:00:00",
"cores" : 1,
"mem" : "12g"
},
"gatk4_filterIndels" :
{
"time" : "12:00:00",
"cores" : 1,
"mem" : "12g"
},
"gatk4_snps_quality_graphe_pdf" :
{
"time" : "12:00:00",
"cores" : 1,
"mem" : "32g"
},
"gatk4_indels_quality_graphe_pdf" :
{
"time" : "12:00:00",
"cores" : 1,
"mem" : "16g"
}
}
# path or URL to sample sheet (TSV format(tab , no quote), columns: SampleName fq1 fq2 ...)
samplesfile : "Reads_melo_chinois_epgv.csv"
fq_dir : "/work2/project/redd_vat_phyton/DurArmel/reads_melo_chinois"
outdir : "/work2/project/redd_vat_phyton/DurArmel/analysis_payzawat/out"
GENOME : "GCA_009760825.1_ASM976082v1_genomic_chr0_chr12.fasta"
GENOME_FAI : "GCA_009760825.1_ASM976082v1_genomic_chr0_chr12.fasta.fai"
REFPATH : "/work2/project/redd_vat_phyton/DurArmel/analysis_payzawat/ref_payzawat"
# adapter in fasta 4 cutadapt if used
cutadapt_adapters : "cutadapt_adapters.fas"
# remove reads multi mapped on the ref
bam_remove_multi_mapped : "N" #must be Y or N
# GATK filtering:
# https://gatk.broadinstitute.org/hc/en-us/articles/360035890471-Hard-filtering-germline-short-variants
# Quality QUAL
# QualByDepth (QD): https://gatk.broadinstitute.org/hc/en-us/articles/360036711511-QualByDepth
# FisherStrand (FS): https://gatk.broadinstitute.org/hc/en-us/articles/360037592371-FisherStrand
# StrandOddsRatio (SOR): https://gatk.broadinstitute.org/hc/en-us/articles/360037592151-StrandOddsRatio
# RMSMappingQuality (MQ): https://gatk.broadinstitute.org/hc/en-us/articles/360036711191-RMSMappingQuality
# MappingQualityRankSumTest (MQRankSum): https://gatk.broadinstitute.org/hc/en-us/articles/360037592711-MappingQualityRankSumTest
# ReadPosRankSumTest (ReadPosRankSum): https://gatk.broadinstitute.org/hc/en-us/articles/360036360852-ReadPosRankSumTest
#
# https://gatk.broadinstitute.org/hc/en-us/articles/360035531112?id=2806
# For reference, here are some basic filtering thresholds to improve upon.
# QD < 2.0
# QUAL < 30.0
# SOR > 3.0
# FS > 60.0
# MQ < 40.0
# MQRankSum < -12.5
# ReadPosRankSum < -8.0"
# SNPs
snp_QUAL_filter : "< 30.0"
snp_QD_filter : "< 2.0"
snp_FS_filter : "> 60.0"
snp_MQ_filter : "< 40.0"
snp_SOR_filter: "> 3.0"
snp_MQRankSum_filter : "< -12.5"
snp_ReadPosRankSum_filter : "< -8.0"
# # indels:
indel_QUAL_filter : "< 30.0"
indel_QD_filter : "< 2.0"
indel_FS_filter : "> 200.0"
indel_ReadPosRankSum_filter : "< -20.0"
# ---------------------- singularity config -----------------------------
# singularity images location
fastqc_bin : "/work2/project/gafl/tools/containers/fastqc_V0.11.8.sif"
multiqc_bin : "/work2/project/gafl/tools/containers/multiqc_v1.7.sif"
cutadapt_bin : "/work2/project/gafl/tools/containers/cutadapt_v2.7.sif"
fastp_bin : "/work2/project/gafl/tools/containers/fastp_0.20.0.sif"
samtools_bin : "/work2/project/gafl/tools/containers/samtools_v1.9.sif"
bwa_bin : "/work2/project/gafl/tools/containers/bwa_0.7.17.sif"
sambamba_bin : "/work2/project/gafl/tools/containers/sambamba_v0.7.1.sif"
gatk4_bin : "/work2/project/gafl/tools/containers/gatk4_v4.1.4.1.sif"
R_bin : "/work2/project/gafl/tools/containers/R_base_V3.6.0.sif"
# singularity bind for IFB core the home is in: /shared/mfs/data/home
# BIND : "-B /shared/mfs/data/home/jlagnel"
# if no binding:
# Bind gafl01:
#BIND : "-B /work/project/gafl,/nosave/project/gafl,/work2/project/gafl"
# bind Genotoul:
BIND : "-B /work2/project/redd_vat_phyton,/work2/project/gafl"
# singularity modules only for genotoul
#MODULES : "module load system/singularity-2.5.1"
# if singularity is not as module
MODULES : ""
# ----------------------------------------------------------------------
>truseq
TGCACACGTCTGAACTCCAGTCACNNNNNNNNATCTCGTATGCCGTCTTC
>D5xx
AATGATACGGCGACCACCGAGATCTACACNNNNNNNNACACTCTTTCCCTACACGACGCTCTTC
>D7xx
CAAGCAGAAGACGGCATACGAGATNNNNNNNNGTGACTGGAGTTCAGACGTGT
>IlluminaMultiplexingPCRPrimer2.01
TCTGCTTAGATCGGAAGAGCACACGTCTGAACTCCCTGCTTAGATCGGAAG
>adprimer
AGATCGGAAGAGC
library('gridExtra')
library('ggplot2')
args <- commandArgs(trailingOnly = TRUE)
if (length(args) !=4) {
stop("At least 4 arguments must be supplies:\nvcf count table raw\nvcf count table filtered\nTitle for graphs (SNPs ...)\npdf output file\n", call.=FALSE)
}
#log <- file(snakemake@log[[1]], open="wt")
#sink(log)
#sink(log, type="message")
table1 = args[1] # vcf count table raw SNPs|indels
table2 = args[2] # vcf count table filtered SNPs|indels
title = args[3] # Title for graphs ("SNPs" ...)
outfile = args[4] # pdf output file
data1 <- read.csv(table1, header = T, na.strings=c("","NA"), sep = "\t")
data2 <- read.csv(table2, header = T, na.strings=c("","NA"), sep = "\t")
dim(data1)
dim(data2)
leg1 <- gsub(" ", "_", paste("Raw",title))
leg2 <- gsub(" ", "_", paste("Filtered",title))
VCF <- rbind(data1,data2)
VCF$Variant <- factor(c(rep(leg1, dim(data1)[1]),rep(leg2, dim(data2)[1])))
craw <- '#A9E2E4'
cfilt <- '#F4CCCA'
QUAL<- ggplot(VCF, aes(x=QUAL, fill=Variant)) + geom_density(alpha=0.3) + scale_x_continuous(trans = 'log10')
DP <- ggplot(VCF, aes(x=DP, fill=Variant)) + geom_density(alpha=0.3)
QD <- ggplot(VCF, aes(x=QD, fill=Variant)) + geom_density(alpha=.3) + geom_vline(xintercept=20, size=0.7)
FS <- ggplot(VCF, aes(x=FS, fill=Variant)) + geom_density(alpha=.3)
MQ <- ggplot(VCF, aes(x=MQ, fill=Variant)) + geom_density(alpha=.3) + geom_vline(xintercept=40, size=0.7)
MQRankSum <- ggplot(VCF, aes(x=MQRankSum, fill=Variant)) + geom_density(alpha=.3)
SOR <- ggplot(VCF, aes(x=SOR, fill=Variant)) + geom_density(alpha=.3) #+ geom_vline(xintercept=c(4, 10), size=1, colour = c(craw,cfilt))
ReadPosRankSum <- ggplot(VCF, aes(x=ReadPosRankSum, fill<