Commit 18d23597 authored by Jerome Mariette's avatar Jerome Mariette
Browse files

add a quickstart and remove alignment

parent a5ec44b1
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
from subprocess import Popen, PIPE
from jflow.component import Component
from jflow.abstraction import MultiMap
from jflow.utils import get_argument_pattern
from weaver.function import ShellFunction
from weaver.abstraction import Map
class BWA (Component):
def define_parameters(self, reference_genome, read1, read2=None, group_prefix=None, algorithm="aln"):
"""
@param reference_genome : [str] Path to the reference genome (it must be indexed).
@param read1 : [list] Paths to reads 1.
@param read2 : [list] Paths to reads 2.
@param group_prefix : [list] The component produces one bam by prefix. Each bam is a merge of all files with the correct prefix.
@param algorithm : [str] Algorithm for the alignment (aln or bwasw or mem).
"""
# Parameters
self.add_parameter_list( "group_prefix", "The component produces one bam by prefix. Each bam is a merge of all files with the correct prefix.", default=group_prefix )
self.add_parameter( "algorithm", "Algorithm for the alignment : aln or bwasw or mem.", default=algorithm, choices=["aln", "bwasw", "mem"] )
# Files
self.add_output_file("stderr", "The BWA stderr file", filename='bwa.stderr')
self.add_input_file("reference_genome", "Which reference file should be used", default=reference_genome, required=True)
self.add_input_file_list( "read1", "Which read1 files should be used.", default=read1, required=True )
self.add_input_file_list( "read2", "Which read2 files should be used.", default=read2 )
self.sai1 = None
self.sai2 = None
if algorithm == "aln":
self.add_output_file_list("sai1", "The BWA read1 sai files.", pattern='{basename_woext}.sai', items=self.read1)
if read2 != None:
self.add_output_file_list("sai2", "The BWA read2 sai files.", pattern='{basename_woext}.sai', items=self.read2)
if group_prefix == None:
self.add_output_file_list("bam_files", "The BWA bam files.", pattern='{basename_woext}.bam', items=self.read1, file_format="bam" )
else:
self.add_output_file_list("bam_files", "The BWA bam files.", pattern='{basename_woext}.bam', items=group_prefix, file_format="bam" )
def get_version(self):
cmd = [self.get_exec_path("bwa")]
p = Popen(cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
return stderr.split()[7]
def _get_filepath_by_prefix( self, path_list, prefixes ):
"""
Gather files path with same prefix. Ex :
list [/home/sample1_L002.fastq, /home/sample2_L002.fastq, /home/sample1_L003.fastq]
with prefixes [sample1, sample2]
return {'sample1':[/home/sample1_L002.fastq, /home/sample1_L003.fastq], 'sample2':[/home/sample2_L002.fastq]}
@param path_list : the list of files
@param prefixes : prefix to gather
"""
path_groups = {}
for current_prefix in sorted(prefixes)[::-1]:
path_groups[current_prefix] = []
for file_path in path_list:
if os.path.basename(file_path).startswith(current_prefix):
path_groups[current_prefix].append(file_path)
return path_groups
def process(self):
unmerged_bam = []
if self.group_prefix == None:
unmerged_bam = self.bam_files
else:
unmerged_bam = self.get_outputs( '{basename_woext}.bam', self.read1 )
# Algorithm bwasw or mem
if self.algorithm == "bwasw" or self.algorithm == "mem":
# Paired-end
if self.read2:
bwa = ShellFunction(self.get_exec_path("bwa") + " " + self.algorithm + " " + self.reference_genome + \
" $1 $2 2>> " + self.stderr + " | " + self.get_exec_path("samtools") + " view -bS - | " + \
self.get_exec_path("samtools") + " sort - $3 2>> " + self.stderr + "; mv $3.bam $3;", cmd_format='{EXE} {IN} {OUT}')
bwasw = MultiMap(bwa, inputs=[self.read1, self.read2], outputs=unmerged_bam, includes=self.reference_genome)
# Single-end
else:
bwa = ShellFunction(self.get_exec_path("bwa") + " " + self.algorithm + " " + self.reference_genome + \
" $1 2>> " + self.stderr + " | " + self.get_exec_path("samtools") + " view -bS - | " + \
self.get_exec_path("samtools") + " sort - $2 2>> " + self.stderr + "; mv $2.bam $2;", cmd_format='{EXE} {IN} {OUT}')
bwasw = Map(bwa, self.read1, unmerged_bam, includes=self.reference_genome)
# Algorithm aln
else:
reads, sais = [], []
reads.extend(self.read1)
sais.extend(self.sai1)
bwa = ShellFunction(self.get_exec_path("bwa") + " " + self.algorithm + " " + self.reference_genome + \
" $1 > $2 2>> " + self.stderr, cmd_format='{EXE} {IN} {OUT}')
# Paired-end
if self.read2:
reads.extend(self.read2)
sais.extend(self.sai2)
bwa_aln = Map(bwa, inputs=reads, outputs=sais, includes=self.reference_genome)
bwasampe = ShellFunction(self.get_exec_path("bwa") + " sampe " + self.reference_genome + \
" $1 $2 $3 $4 2>> " + self.stderr + " | " + self.get_exec_path("samtools") + " view -bS - | " + \
self.get_exec_path("samtools") + " sort - $5 2>> " + self.stderr + "; mv $5.bam $5;",
cmd_format='{EXE} {IN} {OUT}')
bwasampe = MultiMap(bwasampe, inputs=[self.sai1, self.sai2, self.read1, self.read2], outputs=unmerged_bam, includes=self.reference_genome)
# Single-end
else:
bwa_aln = Map(bwa, inputs=reads, outputs=sais, includes=self.reference_genome)
bwasamse = ShellFunction(self.get_exec_path("bwa") + " samse " + self.reference_genome + \
" $1 $2 2>> " + self.stderr + " | " + self.get_exec_path("samtools") + " view -bS - | " + \
self.get_exec_path("samtools") + " sort - $3 2>> " + self.stderr + "; mv $3.bam $3;",
cmd_format='{EXE} {IN} {OUT}')
bwasamse = MultiMap(bwasamse, inputs=[self.sai1, self.read1], outputs=unmerged_bam, includes=self.reference_genome)
if self.group_prefix != None:
# Create dictionary : key = prefix and value = list of files to merge
groups_path = self._get_filepath_by_prefix( unmerged_bam, self.group_prefix)
# Create dictionary : key = prefix and value = the output bam
outputs_path = self._get_filepath_by_prefix( self.bam_files, self.group_prefix)
# Merges bam or rename
for prefix in self.group_prefix:
if len(groups_path[prefix]) > 1:
[cmd_inputs_pattern, next_arg_number] = get_argument_pattern(groups_path[prefix], 1)
samtoolsmerge = ShellFunction( self.get_exec_path("samtools") + ' merge ${' + str(next_arg_number) + '} ' + cmd_inputs_pattern + " 2>> " + self.stderr, cmd_format='{EXE} {IN} {OUT}')
samtoolsmerge(inputs=groups_path[prefix], outputs=outputs_path[prefix])
elif groups_path[prefix] != outputs_path[prefix]:
link = ShellFunction( "ln -fs ${1} ${2}", cmd_format='{EXE} {IN} {OUT}')
link(inputs=groups_path[prefix], outputs=outputs_path[prefix])
\ No newline at end of file
......@@ -16,22 +16,20 @@
#
from jflow.workflow import Workflow
from jflow.parameter import InputFileList, InputFile
class Alignment (Workflow):
class QuickStart (Workflow):
def get_description(self):
return "Align reads against a reference genome"
def define_parameters(self, function="process"):
self.add_input_file_list("read_1", "Which read1 files should be used", file_format="fastq", required=True)
self.add_input_file_list("read_2", "Which read2 files should be used (if single end, leave empty)", file_format="fastq")
self.add_input_file("reference_genome", "Which genome should the read being align on", required=True, file_format="fasta")
self.add_input_file_list("reads", "Which read files should be used", file_format="fastq", required=True)
self.add_input_file("reference_genome", "Which genome should the read being align on", file_format="fasta", required=True)
def process(self):
def process(self):
# index the reference genome
bwaindex = self.add_component("BWAIndex", [self.reference_genome])
# align reads against indexed genome
bwa = self.add_component("BWA", [bwaindex.databank, self.read_1, self.read_2])
\ No newline at end of file
# align reads against the indexed genome
bwamem = self.add_component("BWAmem", [bwaindex.databank, self.reads])
\ No newline at end of file
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
\ No newline at end of file
......@@ -16,44 +16,21 @@
#
import os
from subprocess import Popen, PIPE
from jflow.component import Component
from weaver.function import PythonFunction, ShellFunction
def bwa_index(exec_path, algorithm, input_fasta, databank, stdout_path, stderr_path):
from subprocess import Popen, PIPE
# first make the symbolic link
os.symlink(input_fasta, databank)
# then execute bwa index
cmd = [exec_path, "index", "-a", algorithm, databank]
p = Popen(cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
# write down the stdout
stdoh = open(stdout_path, "w")
stdoh.write(stdout)
stdoh.close()
# write down the stderr
stdeh = open(stderr_path, "w")
stdeh.write(stderr)
stdeh.close()
class BWAIndex (Component):
def define_parameters(self, input_fasta, algorithm="bwtsw"):
self.add_input_file("input_fasta", "Which fasta file should be indexed", file_format="fasta", default=input_fasta, required=True)
self.add_parameter("algorithm", "Which algorithm should be used to index the fasta file", default=algorithm, choices=["bwtsw", "div", "is"])
self.add_output_file("databank", "The indexed databank to use with BWA", filename=os.path.basename(input_fasta))
self.add_output_file("databank", "The indexed databank", filename=os.path.basename(input_fasta))
self.add_output_file("stdout", "The BWAIndex stdout file", filename="bwaindex.stdout")
self.add_output_file("stderr", "The BWAIndex stderr file", filename="bwaindex.stderr")
def process(self):
bwaindex = PythonFunction(bwa_index, cmd_format="{EXE} {ARG} {IN} {OUT}")
bwaindex(inputs=self.input_fasta, outputs=[self.databank, self.stdout, self.stderr], arguments=[self.get_exec_path("bwa"), self.algorithm])
def get_version(self):
cmd = [self.get_exec_path("bwa")]
p = Popen(cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
return stderr.split()[7]
bwaindex = ShellFunction("ln -s $1 $2; " + self.get_exec_path("bwa") + " index -a " + self.algorithm + " $2 > $3 2>> $4", cmd_format="{EXE} {IN} {OUT}")
bwaindex(inputs=self.input_fasta, outputs=[self.databank, self.stdout, self.stderr])
\ No newline at end of file
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from jflow.component import Component
from jflow.abstraction import MultiMap
from weaver.function import ShellFunction
class BWAmem (Component):
def define_parameters(self, reference_genome, reads):
self.add_input_file_list( "reads", "Which reads files should be used.", default=reads, required=True )
self.add_input_file("reference_genome", "Which reference file should be used", default=reference_genome, required=True)
self.add_output_file_list("stderr", "The BWA stderr file", pattern='{basename_woext}.stderr', items=self.reads)
def process(self):
bwamem = ShellFunction(self.get_exec_path("bwa") + " mem " + self.reference_genome + " $1 2>> $2", cmd_format='{EXE} {IN} {OUT}')
bwamem = MultiMap(bwamem, inputs=[self.reads], outputs=[self.stderr])
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment