Skip to content
Snippets Groups Projects
Commit 92cdf48f authored by Floreal Cabanettes's avatar Floreal Cabanettes
Browse files

Add snakefile for align fastq files on reference

parent d02182a9
No related branches found
No related tags found
1 merge request!6Add align workflow
......@@ -3,7 +3,7 @@
import sys
import re
import os
from functions import getsamples
from functions import get_samples
from os.path import join, isfile
......@@ -26,8 +26,7 @@ def message(mes):
sys.stderr.write("|--- " + mes + "\n")
def getChromosomes(reference_file,
validchromreg='((C|c)hr|Scaffold)?(\d+|Z|W|X|Y)$'):
def getChromosomes(reference_file, validchromreg='((C|c)hr|Scaffold)?(\d+|Z|W|X|Y)$'):
"""
From the reference genome fasta file extract the
valid chromosomes
......@@ -62,7 +61,7 @@ WDIR = config['wdir']
workdir: WDIR
# list of samples
samples = getsamples(config['sample_file'], size_batches)
samples = get_samples(config['sample_file'], size_batches)
# list of tools
tools = config['tools'].split(',')
......
import sys
import re
import os
import yaml
from functions import get_samples_align
ROOTPATH = os.path.dirname(workflow.snakefile)
sys.path.insert(0, os.path.join(ROOTPATH, "lib"))
os.environ["PYTHONPATH"] = os.path.join(ROOTPATH, "lib")
from align_snakemake_utils import SnakemakeUtils
# Get from config:
samples = get_samples_align(config['sample_file'])
reference = config['reference']
wdir = config['wdir']
workdir: wdir
snakemake_utils = SnakemakeUtils(samples)
include: "tools/threads.snk"
rule all:
input:
expand("bams/{sample}.bam", sample=samples.keys()),
expand("bams/{sample}.bam.bai", sample=samples.keys())
rule bwaindex:
input: reference
output: reference + ".bwt"
log:
stdout = "logs/index.o",
stderr = "logs/index.e"
shell:
"bwa index {input} 1> {log.stdout} 2> {log.stderr}"
rule bwamap:
input:
unpack(snakemake_utils.get_inputs_fastq),
reference = reference,
ref_index = reference + ".bwt"
output:
bam = "bwa/{sample}.bam"
threads:
get_threads("bwamap", 4)
log:
stdout = "logs/map.o",
stderr = "logs/map.e"
shell:
"bwa mem -t {threads} -M -R '@RG\\tID:{wildcards.sample}\\tPL:ILLUMINA\\tPU:-\\tSM:{wildcards.sample}\\tLB:{wildcards.sample}' "
"{input.reference} {input.fastq} 2> {log.stderr} | samtools view -bS - 2> {log.stderr} | samtools "
"sort -o {output.bam} 1> {log.stdout} 2> {log.stderr}"
rule bwamapindex:
input:
"bwa/{sample}.bam"
output:
bai = "bwa/{sample}.bam.bai",
summary = "bwa/{sample}.bam.summary"
threads:
1
log:
stdout = "logs/mapindex.o",
stderr = "logs/mapindex.o"
shell:
"samtools index {input}; "
"samtools flagstat {input} > {output.summary}"
rule markduplicate:
input:
bam = "bwa/{sample}.bam",
bai = "bwa/{sample}.bam.bai"
output:
bam = "bams/{sample}.bam",
bai = "bams/{sample}.bam.bai",
metrics = "bams/{sample}.bam.metrics",
summary = "bams/{sample}.bam.summary"
threads:
1
log:
stdout = "logs/markduplicates.o",
stderr = "logs/markduplicates.o"
shell:
"""
picard MarkDuplicates I={input.bam} O={output.bam} M={output.metrics} REMOVE_DUPLICATES=true 1> {log.stdout} \
2> {log.stderr};
samtools flagstat {output.bam} > {output.summary} 2>> {log.stderr}
"""
\ No newline at end of file
import os
import yaml
import traceback
from collections import OrderedDict
def getsamples(samplefile, size_batches):
def get_samples(samplefile, size_batches):
samples = OrderedDict()
samples_batch = []
n = 1
......@@ -14,4 +17,35 @@ def getsamples(samplefile, size_batches):
n += 1
if len(samples_batch) > 0:
samples["batch%03d" % n] = samples_batch
return samples
\ No newline at end of file
return samples
def get_samples_align(samples_file):
samples = {}
with open(samples_file) as smples:
try:
samples = yaml.load(smples)
except yaml.YAMLError:
traceback.print_exc()
raise Exception("Error: sample file is not a valid YAML file")
if not isinstance(samples, dict):
raise Exception("Error: sample file is not valid")
for sample, files in samples.items():
if not isinstance(sample, str):
samples[str(sample)] = samples[sample]
del samples[sample]
if not isinstance(files, dict):
raise Exception("Error: sample file is not valid")
if "reads1" not in files and "reads" not in files:
raise Exception("Invalid sample file: each sample must have a reads1 or a reads key")
if "reads" in files and "reads1" in files:
raise Exception("Invalid sample file: a sample can't have both reads and reads1 key")
if "reads" in files:
files["reads1"] = files["reads"]
del files["reads"]
for file in files:
files[file] = os.path.abspath(files[file])
return samples
class SnakemakeUtils:
def __init__(self, samples):
self.samples = samples
def get_inputs_fastq(self, wildcards):
fastq = []
files = self.samples[wildcards.sample]
fastq.append(files["reads1"])
if "reads2" in files:
fastq.append(files["reads2"])
return {"fastq": fastq}
import os
import sys
class SnakemakeUtils:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment