From 92cdf48f0032aa0255fb3d4eb3e00e1d74b9fd70 Mon Sep 17 00:00:00 2001 From: Floreal Cabanettes <floreal.cabanettes@inra.fr> Date: Tue, 17 Apr 2018 13:49:17 +0200 Subject: [PATCH] Add snakefile for align fastq files on reference --- snakecnv/Snakefile | 7 +- snakecnv/align.snk | 93 +++++++++++++++++++++++++++ snakecnv/functions.py | 38 ++++++++++- snakecnv/lib/align_snakemake_utils.py | 12 ++++ snakecnv/lib/svsnakemake_utils.py | 1 - 5 files changed, 144 insertions(+), 7 deletions(-) create mode 100644 snakecnv/align.snk create mode 100644 snakecnv/lib/align_snakemake_utils.py diff --git a/snakecnv/Snakefile b/snakecnv/Snakefile index 299dd0e..ce3e7b1 100644 --- a/snakecnv/Snakefile +++ b/snakecnv/Snakefile @@ -3,7 +3,7 @@ import sys import re import os -from functions import getsamples +from functions import get_samples from os.path import join, isfile @@ -26,8 +26,7 @@ def message(mes): sys.stderr.write("|--- " + mes + "\n") -def getChromosomes(reference_file, - validchromreg='((C|c)hr|Scaffold)?(\d+|Z|W|X|Y)$'): +def getChromosomes(reference_file, validchromreg='((C|c)hr|Scaffold)?(\d+|Z|W|X|Y)$'): """ From the reference genome fasta file extract the valid chromosomes @@ -62,7 +61,7 @@ WDIR = config['wdir'] workdir: WDIR # list of samples -samples = getsamples(config['sample_file'], size_batches) +samples = get_samples(config['sample_file'], size_batches) # list of tools tools = config['tools'].split(',') diff --git a/snakecnv/align.snk b/snakecnv/align.snk new file mode 100644 index 0000000..4c1d018 --- /dev/null +++ b/snakecnv/align.snk @@ -0,0 +1,93 @@ +import sys +import re +import os +import yaml +from functions import get_samples_align + +ROOTPATH = os.path.dirname(workflow.snakefile) +sys.path.insert(0, os.path.join(ROOTPATH, "lib")) +os.environ["PYTHONPATH"] = os.path.join(ROOTPATH, "lib") + +from align_snakemake_utils import SnakemakeUtils + + +# Get from config: +samples = get_samples_align(config['sample_file']) +reference = config['reference'] +wdir = config['wdir'] + +workdir: wdir + +snakemake_utils = SnakemakeUtils(samples) + +include: "tools/threads.snk" + + +rule all: + input: + expand("bams/{sample}.bam", sample=samples.keys()), + expand("bams/{sample}.bam.bai", sample=samples.keys()) + + + +rule bwaindex: + input: reference + output: reference + ".bwt" + log: + stdout = "logs/index.o", + stderr = "logs/index.e" + shell: + "bwa index {input} 1> {log.stdout} 2> {log.stderr}" + +rule bwamap: + input: + unpack(snakemake_utils.get_inputs_fastq), + reference = reference, + ref_index = reference + ".bwt" + output: + bam = "bwa/{sample}.bam" + threads: + get_threads("bwamap", 4) + log: + stdout = "logs/map.o", + stderr = "logs/map.e" + shell: + "bwa mem -t {threads} -M -R '@RG\\tID:{wildcards.sample}\\tPL:ILLUMINA\\tPU:-\\tSM:{wildcards.sample}\\tLB:{wildcards.sample}' " + "{input.reference} {input.fastq} 2> {log.stderr} | samtools view -bS - 2> {log.stderr} | samtools " + "sort -o {output.bam} 1> {log.stdout} 2> {log.stderr}" + +rule bwamapindex: + input: + "bwa/{sample}.bam" + output: + bai = "bwa/{sample}.bam.bai", + summary = "bwa/{sample}.bam.summary" + threads: + 1 + log: + stdout = "logs/mapindex.o", + stderr = "logs/mapindex.o" + shell: + "samtools index {input}; " + "samtools flagstat {input} > {output.summary}" + +rule markduplicate: + input: + bam = "bwa/{sample}.bam", + bai = "bwa/{sample}.bam.bai" + output: + bam = "bams/{sample}.bam", + bai = "bams/{sample}.bam.bai", + metrics = "bams/{sample}.bam.metrics", + summary = "bams/{sample}.bam.summary" + threads: + 1 + log: + stdout = "logs/markduplicates.o", + stderr = "logs/markduplicates.o" + shell: + """ + picard MarkDuplicates I={input.bam} O={output.bam} M={output.metrics} REMOVE_DUPLICATES=true 1> {log.stdout} \ + 2> {log.stderr}; + samtools flagstat {output.bam} > {output.summary} 2>> {log.stderr} + """ \ No newline at end of file diff --git a/snakecnv/functions.py b/snakecnv/functions.py index f0cc1e5..3e18c60 100644 --- a/snakecnv/functions.py +++ b/snakecnv/functions.py @@ -1,7 +1,10 @@ +import os +import yaml +import traceback from collections import OrderedDict -def getsamples(samplefile, size_batches): +def get_samples(samplefile, size_batches): samples = OrderedDict() samples_batch = [] n = 1 @@ -14,4 +17,35 @@ def getsamples(samplefile, size_batches): n += 1 if len(samples_batch) > 0: samples["batch%03d" % n] = samples_batch - return samples \ No newline at end of file + return samples + + +def get_samples_align(samples_file): + samples = {} + with open(samples_file) as smples: + try: + samples = yaml.load(smples) + except yaml.YAMLError: + traceback.print_exc() + raise Exception("Error: sample file is not a valid YAML file") + + if not isinstance(samples, dict): + raise Exception("Error: sample file is not valid") + + for sample, files in samples.items(): + if not isinstance(sample, str): + samples[str(sample)] = samples[sample] + del samples[sample] + if not isinstance(files, dict): + raise Exception("Error: sample file is not valid") + if "reads1" not in files and "reads" not in files: + raise Exception("Invalid sample file: each sample must have a reads1 or a reads key") + if "reads" in files and "reads1" in files: + raise Exception("Invalid sample file: a sample can't have both reads and reads1 key") + if "reads" in files: + files["reads1"] = files["reads"] + del files["reads"] + for file in files: + files[file] = os.path.abspath(files[file]) + + return samples diff --git a/snakecnv/lib/align_snakemake_utils.py b/snakecnv/lib/align_snakemake_utils.py new file mode 100644 index 0000000..4b8d0af --- /dev/null +++ b/snakecnv/lib/align_snakemake_utils.py @@ -0,0 +1,12 @@ +class SnakemakeUtils: + + def __init__(self, samples): + self.samples = samples + + def get_inputs_fastq(self, wildcards): + fastq = [] + files = self.samples[wildcards.sample] + fastq.append(files["reads1"]) + if "reads2" in files: + fastq.append(files["reads2"]) + return {"fastq": fastq} diff --git a/snakecnv/lib/svsnakemake_utils.py b/snakecnv/lib/svsnakemake_utils.py index a4e1cad..1c69fca 100644 --- a/snakecnv/lib/svsnakemake_utils.py +++ b/snakecnv/lib/svsnakemake_utils.py @@ -1,6 +1,5 @@ import os -import sys class SnakemakeUtils: -- GitLab