Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • laure.heurtevin/workflows
  • cervin.guyomar/workflows
  • bios4biol/workflows
3 results
Show changes
Commits on Source (23)
Showing
with 797 additions and 56 deletions
# coding: utf-8
__author__ = "Kenza BAZI KABBAJ - Sigenae"
__version__ = "1.0.0"
"""
Rules for BamStats. This step will collect statistics and generate a bed file detailing callable, uncallable, poorly mapped and other parts of the genome using GATK CallableLoci. Secondly, the average read coverage is calculated using Gatk DepthOfCoverage.
"""
rule callableLoci:
input:
bam="results/recalibrated_bams/{sample}_dedup_recal.bam",
ref=config["reference_genome"]
output:
summary=protected("results/BamStats/{sample}.CallableLoci.summary.txt"),
CallableLoci=protected("results/BamStats/{sample}.CallableLoci.bed")
log:
stderr="results/BamStats/logs/{sample}_CallableLoci.stderr",
stdout="results/BamStats/logs/{sample}_CallableLoci.stdout"
params:
mem=str(config["callableLoci"]["mem"])
shell:
"java -Xmx{params.mem} -jar {config[gatk]} -T CallableLoci -R {input.ref} -I {input.bam} -summary {output.summary} -o {output.CallableLoci} 2> {log.stderr} > {log.stdout}"
rule depthOfCoverage:
input:
bam="results/recalibrated_bams/{sample}_dedup_recal.bam",
ref=config["reference_genome"]
output:
protected("results/BamStats/{sample}_dedup_recal.coverage.sample_cumulative_coverage_counts"),
protected("results/BamStats/{sample}_dedup_recal.coverage.sample_interval_statistics"),
protected("results/BamStats/{sample}_dedup_recal.coverage.sample_interval_summary"),
protected("results/BamStats/{sample}_dedup_recal.coverage.sample_statistics"),
protected("results/BamStats/{sample}_dedup_recal.coverage.sample_summary"),
protected("results/BamStats/{sample}_dedup_recal.coverage.sample_cumulative_coverage_proportions")
log:
stderr="results/BamStats/logs/{sample}_DepthOfCoverage.stderr",
stdout="results/BamStats/logs/{sample}_DepthOfCoverage.stdout"
params:
mem=str(config["depthOfCoverage"]["mem"])
shell:
"java -Xmx{params.mem} -jar {config[gatk]} -T DepthOfCoverage -R {input.ref} -I {input.bam} --omitDepthOutputAtEachBase --logging_level ERROR --summaryCoverageThreshold 10 --summaryCoverageThreshold 20 --summaryCoverageThreshold 30 --summaryCoverageThreshold 40 --summaryCoverageThreshold 50 --summaryCoverageThreshold 80 --summaryCoverageThreshold 90 --summaryCoverageThreshold 100 --summaryCoverageThreshold 150 --minBaseQuality 15 --minMappingQuality 30 --start 1 --stop 1000 --nBins 999 -dt NONE -o results/BamStats/{wildcards.sample}_dedup_recal.coverage 2> {log.stderr} > {log.stdout}"
# coding: utf-8
__author__ = "Kenza BAZI KABBAJ - Sigenae"
__version__ = "1.0.0"
from snakemake.exceptions import MissingInputException
"""
Rules for mapping trimmed paired reads with bwa mem (forward paired, forward unpaired, reverse paired, reverse unpaired). Bam files are sorted and indexed with samtools.
"""
# Get ID names of each PE sample
UNIT_TO_SAMPLE_PE = {
ID: sample for sample, units in PE_samples.items()
for ID in units}
rule bwa_mem_pe:
input:
config["reference_genome"],
"results/trimmomatic/PE/{unit}_1P.fastq.gz",
"results/trimmomatic/PE/{unit}_2P.fastq.gz"
output:
bam=temp("results/mapping/PE/{unit}-pe.bam")
params:
ID=lambda wildcards: UNIT_TO_SAMPLE_PE[wildcards.unit],
PLATEFORM=lambda wildcards: plateform[UNIT_TO_SAMPLE_PE[wildcards.unit]]
threads: config["bwa_mem_pe"]["cpu"]
log:
"results/mapping/PE/logs/{unit}_mapping_PE.stderr",
"results/mapping/PE/logs/{unit}_samtools_view_PE.stderr"
shell:
"{config[bwa]} mem -M -t {threads} -R '@RG\\tID:{wildcards.unit}_1P\\tPL:{params.PLATEFORM}\\tSM:{params.ID}' {input} 2> {log[0]} | {config[samtools]} view -@ {threads} -bS - > {output.bam} 2> {log[1]} "
rule bwa_pe_sort:
input:
temp("results/mapping/PE/{unit}-pe.bam")
output:
temp("results/mapping/PE/{unit}-pe.sorted.bam")
threads:
config["bwa_pe_sort"]["cpu"]
log:
"results/mapping/PE/logs/{unit}_samtools_sort_PE.stderr"
shell:
"{config[samtools]} sort {input} -@ {threads} -o {output} 2> {log}"
rule bwa_pe_index:
input:
temp("results/mapping/PE/{unit}-pe.sorted.bam")
output:
temp("results/mapping/PE/{unit}-pe.sorted.bam.bai")
threads:
config["bwa_pe_index"]["cpu"]
log:
"results/mapping/PE/logs/{unit}_samtools_index_PE.stderr"
shell:
" {config[samtools]} index {input} -@ {threads} {output} 2> {log}"
rule bwa_mem_se_R1:
input:
config["reference_genome"],
"results/trimmomatic/PE/{unit}_1U.fastq.gz"
output:
bam=temp("results/mapping/PE/{unit}_1U-se.bam")
params:
ID=lambda wildcards: UNIT_TO_SAMPLE_PE[wildcards.unit],
PLATEFORM=lambda wildcards: plateform[UNIT_TO_SAMPLE_PE[wildcards.unit]]
threads: config["bwa_mem_se_R1"]["cpu"]
log:
"results/mapping/PE/logs/{unit}_mapping_R1.stderr",
"results/mapping/PE/logs/{unit}_samtools_view_R1.stderr"
shell:
"{config[bwa]} mem -M -t {threads} -R '@RG\\tID:{wildcards.unit}_1U\\tPL:{params.PLATEFORM}\\tSM:{params.ID}' {input} 2> {log[0]} | {config[samtools]} view -@ {threads} -bS - > {output.bam} 2> {log[1]} "
rule bwa_se_R1_sort:
input:
temp("results/mapping/PE/{unit}_1U-se.bam")
output:
temp("results/mapping/PE/{unit}_1U-se.sorted.bam")
threads:
config["bwa_se_R1_sort"]["cpu"]
log:
"results/mapping/PE/logs/{unit}_samtools_sort_se_R1.stderr"
shell:
"{config[samtools]} sort {input} -@ {threads} -o {output} 2> {log}"
rule bwa_se_R1_index:
input:
temp("results/mapping/PE/{unit}_1U-se.sorted.bam")
output:
temp("results/mapping/PE/{unit}_1U-se.sorted.bam.bai")
threads:
config["bwa_se_R1_index"]["cpu"]
log:
"results/mapping/PE/logs/{unit}_samtools_index_se_R1.stderr"
shell:
" {config[samtools]} index {input} -@ {threads} {output} 2> {log}"
rule bwa_mem_se_R2:
input:
config["reference_genome"],
"results/trimmomatic/PE/{unit}_2U.fastq.gz"
output:
bam=temp("results/mapping/PE/{unit}_2U-se.bam")
params:
ID=lambda wildcards: UNIT_TO_SAMPLE_PE[wildcards.unit],
PLATEFORM=lambda wildcards: plateform[UNIT_TO_SAMPLE_PE[wildcards.unit]]
threads: config["bwa_mem_se_R2"]["cpu"]
log:
"results/mapping/PE/logs/{unit}_mapping_R2.stderr",
"results/mapping/PE/logs/{unit}_samtools_view_R2.stderr"
shell:
"{config[bwa]} mem -M -t {threads} -R '@RG\\tID:{wildcards.unit}_2U\\tPL:{params.PLATEFORM}\\tSM:{params.ID}' {input} 2> {log[0]} | {config[samtools]} view -@ {threads} -bS - > {output.bam} 2> {log[1]}"
rule bwa_se_R2_sort:
input:
temp("results/mapping/PE/{unit}_2U-se.bam")
output:
temp("results/mapping/PE/{unit}_2U-se.sorted.bam")
threads:
config["bwa_se_R2_sort"]["cpu"]
log:
"results/mapping/PE/logs/{unit}_samtools_sort_se_R2.stderr"
shell:
"{config[samtools]} sort {input} -@ {threads} -o {output} 2> {log}"
rule bwa_se_R2_index:
input:
temp("results/mapping/PE/{unit}_2U-se.sorted.bam")
output:
temp("results/mapping/PE/{unit}_2U-se.sorted.bam.bai")
threads:
config["bwa_se_R2_index"]["cpu"]
log:
"results/mapping/PE/logs/{unit}_samtools_index_se_R2.stderr"
shell:
" {config[samtools]} index {input} -@ {threads} {output} 2> {log}"
# coding: utf-8
__author__ = "Kenza BAZI KABBAJ - Sigenae"
__version__ = "1.0.0"
from snakemake.exceptions import MissingInputException
"""
Rules for mapping trimmed single-end reads with bwa mem. Bam file(s) are sorted and indexed with samtools.
"""
# Get ID names of each SE sample
UNIT_TO_SAMPLE_SE = {
ID: sample for sample, units in SE_samples.items()
for ID in units}
rule bwa_mem_se:
input:
config["reference_genome"],
"results/trimmomatic/SE/{unit}.fastq.gz"
output:
bam=temp("results/mapping/SE/{unit}-se.bam")
params:
ID=lambda wildcards: UNIT_TO_SAMPLE_SE[wildcards.unit],
PLATEFORM=lambda wildcards: plateform[UNIT_TO_SAMPLE_SE[wildcards.unit]]
threads: config["bwa_mem_se"]["cpu"]
log:
"results/mapping/SE/logs/{unit}_mapping_SE.stderr",
"results/mapping/SE/logs/{unit}_samtools_view_SE.stderr"
shell:
"{config[bwa]} mem -M -t {threads} -R '@RG\\tID:{wildcards.unit}\\tPL:{params.PLATEFORM}\\tSM:{params.ID}' {input} 2> {log[0]} | {config[samtools]} view -@ {threads} -bS - > {output.bam} 2> {log[1]}"
rule bwa_se_sort:
input:
temp("results/mapping/SE/{unit}-se.bam")
output:
temp("results/mapping/SE/{unit}-se.sorted.bam")
threads:
config["bwa_se_sort"]["cpu"]
log:
"results/mapping/SE/logs/{unit}_samtools_sort_SE.stderr"
shell:
"{config[samtools]} sort {input} -@ {threads} -o {output} 2> {log}"
rule bwa_se_index:
input:
temp("results/mapping/SE/{unit}-se.sorted.bam")
output:
temp("results/mapping/SE/{unit}-se.sorted.bam.bai")
threads:
config["bwa_se_index"]["cpu"]
log:
"results/mapping/SE/logs/{unit}_samtools_index_SE.stderr"
shell:
" {config[samtools]} index {input} -@ {threads} {output} 2> {log}"
# coding: utf-8
import os
__author__ = "Kenza BAZI KABBAJ - Sigenae"
__version__ = "1.0.0"
"""
Rules for analysing fastq files with FastQC.
"""
rule fastqc:
input:
reads="data/reads/{prefix}.fastq.gz"
output:
protected("results/fastqc/{prefix}_fastqc.zip"),
temp("results/fastqc/{prefix}_fastqc.html")
threads: config["fastqc"]["cpu"]
log:
"results/fastqc/logs/{prefix}_fastqc.stderr"
shell:
"{config[Fastqc]} -q -t {threads} {input.reads} -o results/fastqc 2> {log}"
rule md5sum_fastqc:
input:
"results/fastqc/{prefix}_fastqc.zip"
output:
"results/md5sum_codes/{prefix}_fastqc.zip.md5"
log:
"results/md5sum_codes/logs/{prefix}_fastqc.stderr"
shell:
"{config[md5sum]} {input} > {output} 2> {log}"
# coding: utf-8
__author__ = "Kenza BAZI KABBAJ - Sigenae"
__version__ = "1.0.0"
"""
Rule for variant calling with GATK haplotypeCaller tool. The outputs are zipped g.vcf file(s) per sample.
"""
rule haplotypeCaller:
input:
bam="results/recalibrated_bams/{sample}_dedup_recal.bam",
ref=config["reference_genome"]
output:
protected("results/variantcalling/{sample}_dedup_recal.g.vcf.gz")
threads: config["haplotypeCaller"]["cpu"]
log:
stderr="results/variantcalling/logs/{sample}_calling.stderr",
stdout="results/variantcalling/logs/{sample}_calling.stdout"
params:
mem=config["haplotypeCaller"]["mem"]
shell:
"java -Xmx{params.mem} -jar {config[gatk]} -T HaplotypeCaller -nct {threads} -R {input.ref} -I {input.bam} -o {output} -ERC GVCF -variant_index_type LINEAR -variant_index_parameter 128000 2> {log.stderr} > {log.stdout}"
# coding: utf-8
__author__ = "Kenza BAZI KABBAJ - Sigenae"
__version__ = "1.0.0"
"""
Rule for marking PCR and optical duplicates using Picard MarkDuplicates. Note that the option OPTICAL_DUPLICATE_PIXEL_DISTANCE equals 100 for data generated on non-arrayed flowcells or equals 2500 for arrayed flowcell data. The outputs are marked bam files per sample.
"""
rule markduplicates:
"""
merged bam files are marked with markduplicates tool
"""
input:
"results/sample_merge/{sample}.sorted.bam"
output:
temp("results/marked_bams/{sample}_dedup.bai"),
marked_bam=temp("results/marked_bams/{sample}_dedup.bam"),
metrics=protected("results/marked_bams/{sample}_dedup.metrics")
log:
stderr="results/marked_bams/logs/{sample}_duplicates.stderr",
stdout="results/marked_bams/logs/{sample}_duplicates.stdout"
params:
mem=config["markduplicates"]["mem"]
run:
arrayed_flowcells=["GAIIx", "HiSeq1500", "HiSeq2000", "HiSeq2500"]
non_arrayed_flowcells=["HiSeqX", "HiSeq3000", "HiSeq4000", "NovaSeq"]
if Type_flowcell[wildcards.sample] in arrayed_flowcells:
flowcell_param="OPTICAL_DUPLICATE_PIXEL_DISTANCE=100"
elif Type_flowcell[wildcards.sample] in non_arrayed_flowcells:
flowcell_param="OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500"
mem=str(int(params.mem.replace("G","").replace("g",""))-4)+"G"
shell("java -Xmx{mem} -jar {config[picard]} MarkDuplicates I={input} O={output.marked_bam} M={output.metrics} {flowcell_param} CREATE_INDEX=true VALIDATION_STRINGENCY=LENIENT 2> {log.stderr} > {log.stdout} ")
# coding: utf-8
__author__ = "Kenza BAZI KABBAJ - Sigenae"
__version__ = "1.0.0"
"""
generates md5sum code for each output file shared with the consortium
"""
rule md5sum_gvcf:
input:
"results/variantcalling/{sample}_dedup_recal.g.vcf.gz"
output:
"results/md5sum_codes/{sample}_dedup_recal.g.vcf.gz.md5"
log:
"results/md5sum_codes/logs/md5sum.stderr"
shell:
"{config[md5sum]} {input} > {output} 2>> {log}"
rule md5sum_bams:
input:
"results/recalibrated_bams/{sample}_dedup_recal.bam"
output:
"results/md5sum_codes/{sample}_dedup_recal.bam.md5"
log:
"results/md5sum_codes/logs/md5sum.stderr"
shell:
"{config[md5sum]} {input} > {output} 2>> {log}"
rule md5sum_recal_pdf:
input:
"results/recalibrated_bams/{sample}_recal_plots.pdf"
output:
"results/md5sum_codes/{sample}_recal_plots.pdf.md5"
log:
"results/md5sum_codes/logs/md5sum.stderr"
shell:
"{config[md5sum]} {input} > {output} 2>> {log}"
rule md5sum_metrics:
input:
"results/marked_bams/{sample}_dedup.metrics"
output:
"results/md5sum_codes/{sample}_dedup.metrics.md5"
log:
"results/md5sum_codes/logs/md5sum.stderr"
shell:
"{config[md5sum]} {input} > {output} 2>> {log}"
rule md5sum_callableLoci:
input:
"results/BamStats/{sample}.CallableLoci.bed"
output:
"results/md5sum_codes/{sample}.CallableLoci.bed.md5"
log:
"results/md5sum_codes/logs/md5sum.stderr"
shell:
"{config[md5sum]} {input} > {output} 2>> {log}"
# coding: utf-8
import glob
__author__ = "Kenza BAZI KABBAJ - Sigenae"
__version__ = "1.0.0"
"""
generates md5sum code for each output file shared with the consortium
"""
rule md5sum_gvcf:
input:
"results/variantcalling/{sample}_dedup_recal.g.vcf.gz"
output:
protected("results/md5sum_codes/{sample}_dedup_recal.g.vcf.gz.md5")
log:
"results/md5sum_codes/logs/md5sum_{sample}.stderr"
shell:
"{config[md5sum]} {input} > {output} 2>> {log}"
rule md5sum_bams:
input:
"results/recalibrated_bams/{sample}_dedup_recal.bam"
output:
protected("results/md5sum_codes/{sample}_dedup_recal.bam.md5")
log:
"results/md5sum_codes/logs/md5sum_{sample}.stderr"
shell:
"{config[md5sum]} {input} > {output} 2>> {log}"
rule md5sum_recal_pdf:
input:
"results/recalibrated_bams/{sample}_recal_plots.pdf"
output:
protected("results/md5sum_codes/{sample}_recal_plots.pdf.md5")
log:
"results/md5sum_codes/logs/md5sum_{sample}.stderr"
shell:
"{config[md5sum]} {input} > {output} 2>> {log}"
rule md5sum_metrics:
input:
"results/marked_bams/{sample}_dedup.metrics"
output:
protected("results/md5sum_codes/{sample}_dedup.metrics.md5")
log:
"results/md5sum_codes/logs/md5sum_{sample}.stderr"
shell:
"{config[md5sum]} {input} > {output} 2>> {log}"
rule md5sum_callableLoci_bed:
input:
"results/BamStats/{sample}.CallableLoci.bed"
output:
protected("results/md5sum_codes/{sample}.CallableLoci.bed.md5")
log:
"results/md5sum_codes/logs/md5sum_{sample}.stderr"
shell:
"{config[md5sum]} {input} > {output} 2>> {log}"
rule md5sum_callableLoci_summary:
input:
"results/BamStats/{sample}.CallableLoci.summary.txt"
output:
protected("results/md5sum_codes/{sample}.CallableLoci.summary.txt.md5")
log:
"results/md5sum_codes/logs/md5sum_{sample}.stderr"
shell:
"{config[md5sum]} {input} > {output} 2>> {log}"
rule md5sum_coverage_counts:
input:
"results/BamStats/{sample}_dedup_recal.coverage.sample_cumulative_coverage_counts"
output:
protected("results/md5sum_codes/{sample}_dedup_recal.coverage.sample_cumulative_coverage_counts.md5")
log:
"results/md5sum_codes/logs/md5sum_{sample}.stderr"
shell:
"{config[md5sum]} {input} > {output} 2>> {log}"
rule md5sum_coverage_proportions:
input:
"results/BamStats/{sample}_dedup_recal.coverage.sample_cumulative_coverage_proportions"
output:
protected("results/md5sum_codes/{sample}_dedup_recal.coverage.sample_cumulative_coverage_proportions.md5")
log:
"results/md5sum_codes/logs/md5sum_{sample}.stderr"
shell:
"{config[md5sum]} {input} > {output} 2>> {log}"
rule md5sum_coverage_stats1:
input:
"results/BamStats/{sample}_dedup_recal.coverage.sample_interval_statistics"
output:
protected("results/md5sum_codes/{sample}_dedup_recal.coverage.sample_interval_statistics.md5")
log:
"results/md5sum_codes/logs/md5sum_{sample}.stderr"
shell:
"{config[md5sum]} {input} > {output} 2>> {log}"
rule md5sum_coverage_summary1:
input:
"results/BamStats/{sample}_dedup_recal.coverage.sample_interval_summary"
output:
protected("results/md5sum_codes/{sample}_dedup_recal.coverage.sample_interval_summary.md5")
log:
"results/md5sum_codes/logs/md5sum_{sample}.stderr"
shell:
"{config[md5sum]} {input} > {output} 2>> {log}"
rule md5sum_coverage_stats2:
input:
"results/BamStats/{sample}_dedup_recal.coverage.sample_statistics"
output:
protected("results/md5sum_codes/{sample}_dedup_recal.coverage.sample_statistics.md5")
log:
"results/md5sum_codes/logs/md5sum_{sample}.stderr"
shell:
"{config[md5sum]} {input} > {output} 2>> {log}"
rule md5sum_coverage_summary2:
input:
"results/BamStats/{sample}_dedup_recal.coverage.sample_summary"
output:
protected("results/md5sum_codes/{sample}_dedup_recal.coverage.sample_summary.md5")
log:
"results/md5sum_codes/logs/md5sum_{sample}.stderr"
shell:
"{config[md5sum]} {input} > {output} 2>> {log}"
# coding: utf-8
__author__ = "Kenza BAZI KABBAJ - Sigenae"
__version__ = "1.0.0"
"""
Rules for base quality recalibration using GATK BaseRecalibrator and PrintReads tools. It will reprocess quality score for each nucleotids in reads based on a set of known variants. In addition, AnalyzeCovariates tool is used to generates plots in order to visualize the effects of the recalibration process (before and after recalibration). The outputs are recalibrated bam files per sample and pdf statistic report per sample.
"""
rule baserecalibrator_1:
input:
bai="results/marked_bams/{sample}_dedup.bai",
bam="results/marked_bams/{sample}_dedup.bam",
ref=config["reference_genome"],
known_variants=config["known_variants"]
output:
protected("results/recalibrated_bams/{sample}.before.recal.table")
threads: config["baserecalibrator_1"]["cpu"]
log:
stderr="results/recalibrated_bams/logs/{sample}_baserecalibrator_1.stderr",
stdout="results/recalibrated_bams/logs/{sample}_baserecalibrator_1.stdout"
params:
mem= config["baserecalibrator_1"]["mem"]
shell:
"java -Xmx{params.mem} -jar {config[gatk]} -T BaseRecalibrator -nct {threads} -R {input.ref} -I {input.bam} -knownSites:vcf {input.known_variants} -o {output} 2> {log.stderr} > {log.stdout}"
rule printreads:
input:
bai="results/marked_bams/{sample}_dedup.bai",
bam="results/marked_bams/{sample}_dedup.bam",
recal_table="results/recalibrated_bams/{sample}.before.recal.table",
ref=config["reference_genome"]
output:
protected("results/recalibrated_bams/{sample}_dedup_recal.bam")
threads: config["printreads"]["cpu"]
log:
stderr="results/recalibrated_bams/logs/{sample}_printreads.stderr",
stdout="results/recalibrated_bams/logs/{sample}_printreads.stdout"
params:
mem=config["printreads"]["mem"]
shell:
"java -Xmx{params.mem} -jar {config[gatk]} -T PrintReads -nct {threads} -R {input.ref} -I {input.bam} -BQSR {input.recal_table} -o {output} 2> {log.stderr} > {log.stdout}"
rule baserecalibrator_2:
input:
bai="results/marked_bams/{sample}_dedup.bai",
bam="results/marked_bams/{sample}_dedup.bam",
ref=config["reference_genome"],
known_variants=config["known_variants"],
recal_table="results/recalibrated_bams/{sample}.before.recal.table"
output:
protected("results/recalibrated_bams/{sample}.after.recal.table")
threads: config["baserecalibrator_2"]["cpu"]
log:
stderr="results/recalibrated_bams/logs/{sample}_baserecalibrator_2.stderr",
stdout="results/recalibrated_bams/logs/{sample}_baserecalibrator_2.stdout"
params:
mem= config["baserecalibrator_2"]["mem"]
shell:
"java -Xmx{params.mem} -jar {config[gatk]} -T BaseRecalibrator -nct {threads} -R {input.ref} -I {input.bam} -knownSites:vcf {input.known_variants} -BQSR {input.recal_table} -o {output} 2> {log.stderr} > {log.stdout}"
rule analyzeCovariates:
input:
before_recal_table="results/recalibrated_bams/{sample}.before.recal.table",
after_recal_table="results/recalibrated_bams/{sample}.after.recal.table",
ref=config["reference_genome"]
output:
protected("results/recalibrated_bams/{sample}_recal_plots.pdf")
log:
stderr="results/recalibrated_bams/logs/{sample}_analyzeCovariates.stderr",
stdout="results/recalibrated_bams/logs/{sample}_analyzeCovariates.stdout"
params:
mem= config["analyzeCovariates"]["mem"]
shell: "java -Xmx{params.mem} -jar {config[gatk]} -T AnalyzeCovariates -R {input.ref} -before {input.before_recal_table} -after {input.after_recal_table} -plots {output} 2> {log.stderr} > {log.stdout}"
# coding: utf-8
__author__ = "Kenza BAZI KABBAJ - Sigenae"
__version__ = "1.0.1"
"""
Rules for trimming paired reads of adapter and filtering out reads with mean qscore < 20 or length < 35bp using trimmomatic PE tool. Note that for modern Illumina adapter sequences, the fastq files are Phred+33 encoded. So before trimming, this rule will check the quality encoding of each fasqt file (Phred+33 or Phred+64) with fastq_dectect.pl script. if fastq files are Phred+64 encoded, then an option is added to trimmomatic 'TOPHRED33' to convert to Phred+33 encoding. 4 outputs are generated: forward paired, forward unpaired, reverse paired, reverse unpaired in fastq format.
"""
# guess PE fastq encoding (Phred33 or Phred64)
rule guess_encoding_pe:
input:
lambda wildcards: PE_reads[wildcards.unit]
output:
temp("results/trimmomatic/encoding/{unit}_encoding.txt")
shell:
"perl scripts/fastq_detect.pl {input[0]} 1000 > {output}"
rule trimmomatic_pe:
input:
reads=lambda wildcards: PE_reads[wildcards.unit],
adapter=config["adapter_PE"],
encoding="results/trimmomatic/encoding/{unit}_encoding.txt"
output:
temp("results/trimmomatic/PE/{unit}_1P.fastq.gz"),
temp("results/trimmomatic/PE/{unit}_2P.fastq.gz"),
temp("results/trimmomatic/PE/{unit}_1U.fastq.gz"),
temp("results/trimmomatic/PE/{unit}_2U.fastq.gz"),
protected("results/trimmomatic/PE/{unit}.summary")
threads: config["trimmomatic_pe"]["cpu"]
log:
"results/trimmomatic/PE/logs/{unit}_trimming_PE.stderrs"
params:
mem= config["trimmomatic_pe"]["mem"]
run:
#add tophred33 option to trimmomatic if the file is phred+64 encoded.
for i in open(input.encoding, "r"):
if "#" not in i:
if i.strip().split()[0] == "Phred+64" or i.strip().split()[0] == "Solexa+64":
add_option="TOPHRED33"
elif i.strip().split()[0] == "Phred+33":
add_option=""
shell("java -Xmx{params.mem} -jar {config[trimmomatic]} PE -threads {threads} -summary {output[4]} {input.reads} {output[0]} {output[2]} {output[1]} {output[3]} ILLUMINACLIP:{input.adapter}:2:30:3:1:true LEADING:20 TRAILING:20 SLIDINGWINDOW:3:15 AVGQUAL:20 MINLEN:35 "+add_option+" 2> {log}")
rule md5sum_trim_pe:
input:
"results/trimmomatic/PE/{unit}.summary"
output:
protected("results/md5sum_codes/{unit}.summary_PE.md5")
log:
"results/md5sum_codes/logs/{unit}.summary_PE.stderr"
shell:
"{config[md5sum]} {input} > {output} 2> {log}"
# coding: utf-8
__author__ = "Kenza BAZI KABBAJ - Sigenae"
__version__ = "1.0.1"
"""
Rules for trimming single-end reads of adapter and filtering out reads with mean qscore < 20 or length < 35bp using trimmomatic PE tool. Note that for modern Illumina adapter sequences, the fastq files are Phred+33 encoded. So before trimming, this rule will check the quality encoding of each fasqt file (Phred+33 or Phred+64) with fastq_dectect.pl script. if fastq files are Phred+64 encoded, then an option is added to trimmomatic 'TOPHRED33' to convert to Phred+33 encoding. One output file is produced in fastq format.
"""
# guess PE fastq encoding (Phred33 or Phred64)
rule guess_encoding_se:
input:
lambda wildcards: SE_reads[wildcards.unit]
output:
temp("results/trimmomatic/encoding/{unit}_encoding.txt")
shell:
"perl scripts/fastq_detect.pl {input[0]} 1000 > {output}"
rule trimmomatic_se:
input:
reads=lambda wildcards: SE_reads[wildcards.unit],
adapter=config["adapter_SE"],
encoding="results/trimmomatic/encoding/{unit}_encoding.txt"
output:
temp("results/trimmomatic/SE/{unit}.fastq.gz"),
protected("results/trimmomatic/SE/{unit}.summary")
threads: config["trimmomatic_se"]["cpu"]
log:
"results/trimmomatic/SE/logs/{unit}_trimming_SE.stderrs"
params:
mem= config["trimmomatic_se"]["mem"]
run:
#add tophred33 option to trimmomatic if the file is phred+64 encoded.
for i in open(input.encoding, "r"):
if "#" not in i:
if i.strip().split()[0] == "Phred+64" or i.strip().split()[0] == "Solexa+64":
add_option="TOPHRED33"
else:
add_option=""
shell("java -Xmx{params.mem} -jar {config[trimmomatic]} SE -threads {threads} -summary {output[1]} {input.reads} {output[0]} ILLUMINACLIP:{input.adapter}:2:30:3:1:true LEADING:20 TRAILING:20 SLIDINGWINDOW:3:15 AVGQUAL:20 MINLEN:35 "+add_option+" 2> {log}")
rule md5sum_trim_se:
input:
"results/trimmomatic/SE/{unit}.summary"
output:
protected("results/md5sum_codes/{unit}.summary_SE.md5")
log:
"results/md5sum_codes/logs/{unit}.summary_SE.stderr"
shell:
"{config[md5sum]} {input} > {output} 2> {log}"
# Allel Specific Expression Analysis workflow
authors__: Maria Bernard (INRAE - SIGENAE) / Kevin Muret - Frederic Jehl - Sandrine Lagarrigue (INRAE - PEGASE)
authors__: Maria Bernard (INRAE - SIGENAE) / Kevin Muret - Frederic Jehl - Sandrine Lagarrigue ( AGROCAMPUS OUEST / INRAE - PEGASE)
**<u>!!! STATUT : DEVELOPMENT !!!</u>**
......@@ -18,9 +18,9 @@ This pipeline will perform variant filtering, reads alignment on masked genome a
The user must use the [config_calling.yaml](config_calling.yaml.example) file to provide all necessary inputs for the pipeline:
- A masked reference fasta file (fasta_ref option), indexed with samtools faidx and GATK4 CreateSequenceDictionary
- A masked reference fasta file (fasta_ref option), indexed with samtools faidx and GATK4 CreateSequenceDictionary. We recommend to mask genome with variant filtered bi-allelic and on GATK metrics FS < 30, QD > 2 and AF < 1. In our used case we mask genome on variant construct at the population level, and then we launch this pipeline at the tissue level.
- A genome reference GTF file (gtf_ref option)
- An genome annotation reference GTF file (gtf_ref option)
- A set of variant to include in the ASE analysis
......@@ -47,6 +47,8 @@ The user must use the [config_calling.yaml](config_calling.yaml.example) file to
First step is to filter and trim raw reads before alignment. This step will convert phred 64 scale fastq files into phred 33 scale fastq files and then trim reads with [trim_galore](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/). It will trim adapter and filtered out trimmed reads that are smaller than the third of the read length.
By default the trimming quality threshold is set to 15.
You will find in Result/Summary directory, a Log_TrimGalore_summary.tsv that detail the trim_galor statistics.
......@@ -77,7 +79,7 @@ Finally reads that contain Ns in their cigar string will be splittd, and alignme
Variants will be filtered before submitting to ASE.
Only SNP with FS < or equal to 30.0 and QD < or equal to 2.0 will be used in ASE.
Only bi-allelic SNP with FS < or equal to 30.0 and QD < or equal to 2.0 will be used in ASE.
Filtering on SNP type is an option, see SNP_filter in the configuration file. By default it is set to False as the calling pipeline generate output VCF file filtered on SNP type.
......@@ -92,19 +94,28 @@ The pipeline will also add annotation in the INFO field in the final VCF file (s
Additionally, the pipeline will generate 3 tables that extract samples GT, DP and AD_REF and AD_ALT genotypes annotations. These tables will include only SNP that have a genotype call rate greater than 50% and a genotype with at least 5 reads coverage call rate greater than 20%.
### Compute Allele Specific Expression
Alleles Specific Expression will be obtained using phASER (https://github.com/secastel/phaser) and ASE Read Counter (https://gatk.broadinstitute.org/hc/en-us/articles/360036714351-ASEReadCounter), on bi-allelic variants filtered on FS < 30, and QD > 2.
ASEReadCounter and phASER returnby sample, allele expression on each SNP.
phASER will also return gene allele specific espression by constructing gene haplotypes by sample, and finally an expression matrix which agglomerate all the samples.
## 3. Dependencies
The workflow depends on:
* cutadapt
* trimgalore (version 0.4.5)
* STAR (version 2.5.2b)
* picard.jar (version 2.1.1)
* samtools (version 1.3.1 )
* rsem-prepare-reference ( RSEM version 1.3.0)
* rsem-calculate-expression ( RSEM version 1.3.0)
* GenomeAnalysisTK.jar (version 3.7)
* cutadapt (version 2.1)
* trimgalore (version 0.6.5)
* STAR (version 2.6.0c)
* samtools (version 1.9 )
* bcftools (version 1.9)
* bedtools( version 2.29.0)
* tabix (version 0.2.5)
* GenomeAnalysisTK.jar (version 3.7) which include Picard tools and ASE Read Counter
* java (version 8)
* phASER (cloned the 29th of May 2020)
Theses softwares need to be available in you PATH or in a bin directory precised in the config.yaml file with the bin_dir option.
......
......@@ -159,9 +159,8 @@ for sample in table["sample_name"].unique():
final_outputs.append('Results_ASE/phASER/' + sample + '_phASER.allelic_counts.txt')
final_outputs.append('Results_ASE/phASER_gene_ae/' + sample + '_phASER.gene_ae.txt')
# TO FINISH AND TEST phaser_expr_matrix
# final_outputs.append('Results_ASE/phASER_pop/' + os.path.basename(config['vcf']).replace('.vcf.gz','_phASER.bed'))
# final_outputs.append('Results_ASE/phASER_pop/' + os.path.basename(config['vcf']).replace('.vcf.gz','_phASER.gw_phased.bed'))
final_outputs.append('Results_ASE/phASER_pop/' + os.path.basename(config['vcf']).replace('.vcf.gz','_phASER.bed.gz'))
final_outputs.append('Results_ASE/phASER_pop/' + os.path.basename(config['vcf']).replace('.vcf.gz','_phASER.gw_phased.bed.gz'))
# print(final_outputs)
# ['Results_ASE/STAR_Aln_2/samplePE_rg_genomic_pp_rmdup_uniq_split.bam',
......
Snakemake/1000RNASeq_chicken/ASE/img/full_dryrun.png

231 KiB

......@@ -49,7 +49,8 @@ SplitNCigarReads:
phASER:
cpu : 8
mem : "130G"
mem : "180G"
time : "8:00:00"
phASER_expr_matrix:
cpu : 8
......
......@@ -71,7 +71,7 @@ rule phASER:
idSeparator = '_' if config['id_separator'] == '' else config['id_separator']
shell:
"""
python2.7 {config[bin_dir]}/phaser.py --vcf {input.vcf} --bam {input.bam} --sample {wildcards.sample} --threads {threads} --o Results_ASE/phASER/{wildcards.sample}_phASER --paired_end {params.paired_option} --baseq {params.baseQuality} --mapq {params.mappingQuality} --id_separator {params.idSeparator}
python2.7 {config[bin_dir]}/phaser.py --vcf {input.vcf} --bam {input.bam} --sample {wildcards.sample} --threads {threads} --o Results_ASE/phASER/{wildcards.sample}_phASER --paired_end {params.paired_option} --unphased_vars 1 --gw_phase_vcf 1 --baseq {params.baseQuality} --mapq {params.mappingQuality} --id_separator {params.idSeparator}
"""
......@@ -97,20 +97,19 @@ rule phASER_gene_ae:
python2.7 {config[bin_dir]}/phaser_gene_ae.py --haplotypic_counts {input.hap_count} --features {input.bed} --o {output} --id_separator {params.idSeparator}
"""
# TO FINISH AND TEST
# rule phASER_expr_matrix:
# input:
# expand("Results_ASE/phASER_gene_ae/{sample}_phASER.gene_ae.txt", sample=table["sample_name"].unique()),
# bed = 'Results_ASE/ref_bed/' + os.path.basename(config['gtf_ref']).replace('.gtf', '.bed')
# output:
# 'Results_ASE/phASER_pop/' + os.path.basename(config['vcf']).replace('.vcf.gz','_phASER.bed'),
# 'Results_ASE/phASER_pop/' + os.path.basename(config['vcf']).replace('.vcf.gz','_phASER.gw_phased.bed')
# params:
# gene_ae_dir = "Results_ASE/phASER_gene_ae/",
# out_dir = 'Results_ASE/phASER_pop/'
# threads: config["phASER_expr_matrix"]["cpu"]
# shell:
# """
# python2.7 {config[bin_dir]}/phaser_expr_matrix.py --t {threads} --gene_ae_dir {params.gene_ae_dir} --feature {input.bed} --o {param
# """
rule phASER_expr_matrix:
input:
expand("Results_ASE/phASER_gene_ae/{sample}_phASER.gene_ae.txt", sample=table["sample_name"].unique()),
bed = 'Results_ASE/ref_bed/' + os.path.basename(config['gtf_ref']).replace('.gtf', '.bed')
output:
bed = 'Results_ASE/phASER_pop/' + os.path.basename(config['vcf']).replace('.vcf.gz','_phASER.bed.gz'),
phased_bed = 'Results_ASE/phASER_pop/' + os.path.basename(config['vcf']).replace('.vcf.gz','_phASER.gw_phased.bed.gz')
params:
gene_ae_dir = "Results_ASE/phASER_gene_ae/",
out_prefix = 'Results_ASE/phASER_pop/' + os.path.basename(config['vcf']).replace('.vcf.gz','_phASER')
threads: config["phASER_expr_matrix"]["cpu"]
shell:
"""
python2.7 {config[bin_dir]}/phaser_expr_matrix.py --t {threads} --gene_ae_dir {params.gene_ae_dir} --feature {input.bed} --o {params.out_prefix}
"""
......@@ -11,7 +11,9 @@ import multiprocessing
# FUNCTION
def select_on_chrom(in_vcf, accepted_chrom, excluded_chrom, selected_vcf):
FH_in = open(in_vcf, 'rt')
#~ FH_in = open(in_vcf, encoding='utf8', 'rt')
# pour FLLL je ne comprends pas pourquoi mais utf8 ne fonctionne pas ...
FH_in = open(in_vcf, encoding='iso-8859-1', mode='rt')
FH_out = open(selected_vcf,'wt')
for line in FH_in:
......
# RNASeq Analysis workflow : quantification and SNP calling
__authors__: Maria Bernard (INRA - SIGENAE) / Kevin Muret - Frederic Jehl - Sandrine Lagarrigue (INRA - PEGASE)
__authors__: Maria Bernard (INRAE - SIGENAE) / Kevin Muret - Frederic Jehl - Sandrine Lagarrigue (AGROCAMPUS OUEST / INRAE - PEGASE)
- [RNASeq Analysis workflow : quantification and SNP calling](#rnaseq-analysis-workflow---quantification-and-snp-calling)
* [What it does ?](#what-it-does--)
......
......@@ -620,7 +620,7 @@ rule individual_gVCF_HaplotypeCaller:
jar=find_jar("GenomeAnalysisTK.jar")
shell:
"""
java -Xmx{params.mem} -jar {params.jar} -T HaplotypeCaller -I {input.bam} -R {input.fasta} -stand_call_conf 20.0 --min_base_quality_score 10 --min_mapping_quality_score 20 -ERC GVCF -dontUseSoftClippedBases -o {output.gvcf} 2> {output.log}
java -Xmx{params.mem} -jar {params.jar} -T HaplotypeCaller -I {input.bam} -R {input.fasta} --min_base_quality_score 10 --min_mapping_quality_score 20 -ERC GVCF -dontUseSoftClippedBases -o {output.gvcf} 2> {output.log}
"""
......@@ -643,7 +643,7 @@ rule genotypeGVCF:
run:
bams = gatk_multi_arg("--variant", input.gvcf)
shell( "java -Xmx{params.mem} -jar {params.jar} -T GenotypeGVCFs {bams} -R {input.fasta} -o {output.vcf} 2> {output.log}")
shell( "java -Xmx{params.mem} -jar {params.jar} -T GenotypeGVCFs {bams} -R {input.fasta} -stand_call_conf 20.0 -o {output.vcf} 2> {output.log}")
# /GATK
#######
......@@ -663,22 +663,5 @@ rule selectVariant:
java -Xmx{params.mem} -jar {params.jar} -T SelectVariants -R {input.ref} -V {input.vcf} -selectType INDEL -o {output.indel}
"""
# ajouter l'option --excludeIntervals pour exclure certains intervalles
rule filter_variant:
input:
ref = "Results/Fai_Dict_Index/" + os.path.basename(config["fasta_ref"]),
snp = "Results/GATK/"+POP+"_SNP.vcf.gz"
output:
snp = "Results/GATK/"+POP+"_SNP_flag.vcf.gz"
params:
mem=config["selectVariant"]["mem"],
jar=find_jar("GenomeAnalysisTK.jar")
shell:
"""
java -Xmx{params.mem} -jar {params.jar} -T VariantFiltration -R {input.ref} -V {input.snp} -window 35 -cluster 3 -filterName FS -filter "FS > 30.0" -filterName QD -filter "QD < 2.0" -o {output.snp}
"""
# ADD rule for merging flagstat
# ADD rule for merging individual HC stat log
# ADD rule split SNP/INDEL
# ADD rule Hard filtering
# ADD rule filtering on junction
......@@ -21,13 +21,13 @@ star_index_pass1 :
mem : "18G"
star_aln_pass1 :
mem : "12G"
mem : "5G"
star_index_pass2 :
mem : "18G"
star_aln_pass2 :
mem : "12G"
mem : "5G"
log_merge_STAR:
cpu : 1
......