Commit fd41415b authored by Penom Nom's avatar Penom Nom
Browse files

Adapt illumina matepair pipeline to jflow

parent fcba0eef
......@@ -69,6 +69,7 @@ tmp_directory = <path>/tmp
#
# Global softwares
#
#cutadapt = /usr/bin/cutadapt
#blastall = /usr/bin/blastall
#formatdb = /usr/bin/formatdb
#blastn = /usr/bin/blastn
......
......@@ -18,7 +18,6 @@
import os
from jflow.component import Component
from jflow.iotypes import OutputFileList, Formats
from jflow.abstraction import MultiMap
from weaver.function import ShellFunction
......@@ -26,13 +25,15 @@ from weaver.function import ShellFunction
class FastxReverseComplement (Component):
def define_parameters(self, input_files, qual_offset=33):
self.input_files = input_files
self.add_input_file_list( "input_files", "input_files", default=input_files, required=True)
if( self.input_files[0].endswith("fastq.gz") or self.input_files[0].endswith(".fastq") ):
self.output_files = OutputFileList(self.get_outputs('{basename}', self.input_files), Formats.FASTQ)
self.add_output_file_list( "output_files", "output_files", pattern='{basename}', items=self.input_files, file_format='fastq')
else:
self.output_files = OutputFileList(self.get_outputs('{basename}', self.input_files))
self.stderr = OutputFileList(self.get_outputs('{basename_woext}.stderr', self.input_files))
self.options = " -Q " + str(qual_offset)
self.add_output_file_list( "output_files", "output_files", pattern='{basename}', items=self.input_files)
self.add_output_file_list( "stderr", "stderr", pattern='{basename_woext}.stderr', items=self.input_files)
self.add_parameter("qual_offset", "qual_offset", default=qual_offset, type="int")
self.options = " -Q " + str(self.qual_offset)
def process(self):
if self.input_files[0].endswith(".gz"):
......
......@@ -22,7 +22,6 @@ from jflow.abstraction import MultiMap
from jflow.utils import get_argument_pattern
from weaver.function import ShellFunction
from weaver.abstraction import Map
from ng6.analysis import Analysis
from ng6.utils import Utils
......@@ -43,7 +42,6 @@ class BWA (Analysis):
self.add_parameter( "algorithm", "Algorithm for the alignment : aln or bwasw or mem.", default=algorithm, choices=["aln", "bwasw", "mem"] )
self.add_parameter("keep_bam", "keep_bam", default=keep_bam, type=bool)
# Files
self.add_output_file("stderr", "The BWA stderr file", filename='bwa.stderr')
self.add_input_file("reference_genome", "Which reference file should be used", default=reference_genome, required=True)
self.add_input_file_list( "read1", "Which read1 files should be used.", default=read1, required=True )
self.add_input_file_list( "read2", "Which read2 files should be used.", default=read2 )
......@@ -57,7 +55,15 @@ class BWA (Analysis):
self.add_output_file_list("bam_files", "The BWA bam files.", pattern='{basename_woext}.bam', items=self.read1, file_format="bam")
else:
self.add_output_file_list("bam_files", "The BWA bam files.", pattern='{basename_woext}.bam', items=group_prefix, file_format="bam")
self.add_output_file_list("stderrs", "BWA stderrs files", pattern='{basename_woext}.bwa.stderr', items=self.read1)
if self.algorithm == 'aln' :
if self.read2 :
self.add_output_file_list("stderrs_aln", "BWA stderrs files", pattern='{basename_woext}.bwa_aln.stderr', items=self.read1 + self.read2)
else :
self.add_output_file_list("stderrs_aln", "BWA stderrs files", pattern='{basename_woext}.bwa_aln.stderr', items=self.read1)
def get_version(self):
cmd = [self.get_exec_path("bwa")]
p = Popen(cmd, stdout=PIPE, stderr=PIPE)
......@@ -76,61 +82,53 @@ class BWA (Analysis):
source.close()
self.options += " " + reference_desc
def post_process(self):
if self.keep_bam:
# Finally create and add the archive to the analysis
self._save_files(self.bam_files)
def process(self):
unmerged_bam = []
if self.group_prefix == None:
unmerged_bam = self.bam_files
else:
unmerged_bam = self.get_outputs( '{basename_woext}.bam', self.read1 )
unmerged_bam = self.get_outputs( '{basename_woext}.bam', self.read1 )
# Algorithm bwasw or mem
if self.algorithm == "bwasw" or self.algorithm == "mem":
# Paired-end
if self.read2:
stderrs = [self.stderr for i in range(len(self.read1))]
bwa = ShellFunction(self.get_exec_path("bwa") + " " + self.algorithm + " " + self.reference_genome +
" $1 $2 2>> $4 | " + self.get_exec_path("samtools") + " view -bS - | " +
self.get_exec_path("samtools") + " sort - $3 2>> $4; mv $3.bam $3;", cmd_format='{EXE} {IN} {OUT}')
bwasw = MultiMap(bwa, inputs=[self.read1, self.read2], outputs=[unmerged_bam, stderrs], includes=self.reference_genome)
bwasw = MultiMap(bwa, inputs=[self.read1, self.read2], outputs=[unmerged_bam, self.stderrs], includes=self.reference_genome)
# Single-end
else:
bwa = ShellFunction(self.get_exec_path("bwa") + " " + self.algorithm + " " + self.reference_genome +
" $1 2>> $3 | " + self.get_exec_path("samtools") + " view -bS - | " +
self.get_exec_path("samtools") + " sort - $2 2>> $3; mv $2.bam $2;", cmd_format='{EXE} {IN} {OUT}')
bwasw = Map(bwa, inputs=self.read1, outputs=[unmerged_bam, self.stderr], includes=self.reference_genome)
self.get_exec_path("samtools") + " sort - $2 2>> $3 ; mv $2.bam $2;", cmd_format='{EXE} {IN} {OUT}')
bwasw = MultiMap(bwa, inputs=[self.read1], outputs=[unmerged_bam, self.stderrs], includes=self.reference_genome)
# Algorithm aln
else:
stderrs = [self.stderr for i in range(len(self.read1))]
reads, sais = [], []
reads.extend(self.read1)
sais.extend(self.sai1)
bwa = ShellFunction(self.get_exec_path("bwa") + " " + self.algorithm + " " + self.reference_genome +
" $1 > $2 2>> " + self.stderr , cmd_format='{EXE} {IN} {OUT}')
" $1 > $2 2>> $3 " , cmd_format='{EXE} {IN} {OUT}')
# Paired-end
if self.read2:
reads.extend(self.read2)
sais.extend(self.sai2)
bwa_aln = Map(bwa, inputs=reads, outputs=sais, includes=self.reference_genome)
bwa_aln = MultiMap(bwa, inputs=[reads], outputs=[sais, self.stderrs_aln], includes=self.reference_genome)
bwasampe = ShellFunction(self.get_exec_path("bwa") + " sampe " + self.reference_genome +
" $1 $2 $3 $4 2>> $6 | " + self.get_exec_path("samtools") + " view -bS - | " +
self.get_exec_path("samtools") + " sort - $5 2>> $6; mv $5.bam $5;",
cmd_format='{EXE} {IN} {OUT}')
bwasampe = MultiMap(bwasampe, inputs=[self.sai1, self.sai2, self.read1, self.read2], outputs=[unmerged_bam, stderrs], includes=self.reference_genome)
bwasampe = MultiMap(bwasampe, inputs=[self.sai1, self.sai2, self.read1, self.read2], outputs=[unmerged_bam, self.stderrs], includes=self.reference_genome)
# Single-end
else:
bwa_aln = Map(bwa, inputs=reads, outputs=sais, includes=self.reference_genome)
bwa_aln = MultiMap(bwa, inputs=[reads], outputs=[sais, self.stderrs_aln], includes=self.reference_genome)
bwasamse = ShellFunction(self.get_exec_path("bwa") + " samse " + self.reference_genome +
" $1 $2 2>> $4 | " + self.get_exec_path("samtools") + " view -bS - | " +
self.get_exec_path("samtools") + " sort - $3 2>> $4; mv $3.bam $3;",
cmd_format='{EXE} {IN} {OUT}')
bwasamse = MultiMap(bwasamse, inputs=[self.sai1, self.read1], outputs=[unmerged_bam, stderrs], includes=self.reference_genome)
bwasamse = MultiMap(bwasamse, inputs=[self.sai1, self.read1], outputs=[unmerged_bam, self.stderrs], includes=self.reference_genome)
if self.group_prefix != None:
# Create dictionary : key = prefix and value = list of files to merge
......@@ -143,7 +141,7 @@ class BWA (Analysis):
for prefix in self.group_prefix:
if len(groups_path[prefix]) > 1:
[cmd_inputs_pattern, next_arg_number] = get_argument_pattern(groups_path[prefix], 1)
samtoolsmerge = ShellFunction( self.get_exec_path("samtools") + ' merge ${' + str(next_arg_number) + '} ' + cmd_inputs_pattern + " 2>> " + self.stderr, cmd_format='{EXE} {IN} {OUT}')
samtoolsmerge = ShellFunction( self.get_exec_path("samtools") + ' merge ${' + str(next_arg_number) + '} ' + cmd_inputs_pattern , cmd_format='{EXE} {IN} {OUT}')
samtoolsmerge(inputs=groups_path[prefix], outputs=outputs_path[prefix])
elif groups_path[prefix] != outputs_path[prefix]:
link = ShellFunction( "ln -fs ${1} ${2}", cmd_format='{EXE} {IN} {OUT}')
......
......@@ -15,21 +15,16 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os,re
import os
import re
from subprocess import Popen, PIPE
from jflow.component import Component
from jflow.iotypes import OutputFileList, InputFileList, OutputFile, Formats
from jflow.abstraction import MultiMap
import jflow.seqio as seqio
from ng6.analysis import Analysis
from ng6.utils import Utils
from weaver.function import PythonFunction
from weaver.function import ShellFunction
from weaver.abstraction import Map
from re import sub
class CutAdapt (Analysis):
def _get_length_table(self, input_file):
......@@ -48,23 +43,24 @@ class CutAdapt (Analysis):
sizes[len(seq)] = 1
return [nb_seq, sizes]
def define_parameters(self, input_files_R1, input_files_R2, adapt_FWD, adapt_REV, is_paired=False, error_rate=0.1, times=None, min_length=None, max_length=None, overlap_length=None, archive_name=None):
self.input_files_R1=InputFileList(input_files_R1, Formats.FASTQ)
self.input_files_R2=InputFileList(input_files_R2, Formats.FASTQ)
def define_parameters(self, input_files_R1, input_files_R2, adapt_FWD, adapt_REV, is_paired=False,
error_rate=0.1, times=None, min_length=None, max_length=None, overlap_length=None, archive_name=None):
self.add_input_file_list( "input_files_R1", "input files read1", default=input_files_R1, required=True, file_format = 'fastq')
self.add_input_file_list( "input_files_R2", "input files read2", default=input_files_R2, required=True, file_format = 'fastq')
self.add_output_file_list( "output_files_R1", "output files read1", pattern='{basename_woext}_cut.fastq', items=self.input_files_R1)
self.add_output_file_list( "output_files_R2", "output files read2", pattern='{basename_woext}_cut.fastq', items=self.input_files_R2)
self.add_output_file_list( "log_files_R1", "log files read1", pattern='{basename_woext}.stderr', items=self.input_files_R1)
self.add_output_file_list( "log_files_R2", "log files read2", pattern='{basename_woext}.stderr', items=self.input_files_R2)
self.adapt_FWD=adapt_FWD
self.adapt_REV=adapt_REV
self.output_files_R1= OutputFileList(self.get_outputs('{basename_woext}_cut.fastq', self.input_files_R1))
self.output_files_R2= OutputFileList(self.get_outputs('{basename_woext}_cut.fastq', self.input_files_R2))
self.log_files_R1 = OutputFileList(self.get_outputs('{basename_woext}.stderr', self.input_files_R1))
self.log_files_R2 = OutputFileList(self.get_outputs('{basename_woext}.stderr', self.input_files_R2))
self.is_paired=is_paired
self.error_rate=error_rate
self.min_length=min_length
self.max_length=max_length
self.overlap_length=overlap_length
self.times=times
self.archive_name=archive_name
self.add_parameter("is_paired", "is_paired", default=is_paired, type="bool")
self.add_parameter("error_rate", "error_rate", default=error_rate, type='float')
self.add_parameter("min_length", "min_length", default=min_length, type='int')
self.add_parameter("max_length", "max_length", default=max_length, type='int')
self.add_parameter("overlap_length", "overlap_length", default=overlap_length, type='int')
self.add_parameter("times", "times", default=times, type='int')
self.add_parameter("archive_name", "Archive name", default=archive_name)
def define_analysis(self):
self.name = "CutAdapt"
......@@ -200,7 +196,6 @@ class CutAdapt (Analysis):
cutAdapt = ShellFunction(self.get_exec_path("cutadapt") + self.options_FWD + self.parameters + " --paired-output $3 -o $4 $1 $2 > $5", cmd_format = '{EXE} {IN} {OUT} ')
cutAdapt = MultiMap(cutAdapt, inputs = [self.tmp_files_R2, self.tmp_files_R1], outputs = [self.output_files_R2, self.output_files_R1 ,self.log_files_R2])
print "list R1 output : "+str(self.output_files_R1)
else :
cutAdapt = ShellFunction(self.get_exec_path("cutadapt")+ self.options_FWD + self.parameters + " $1 -o $2 > $3", cmd_format='{EXE} {IN} {OUT}')
......
......@@ -26,12 +26,12 @@ from ng6.utils import Utils
class IlluminaMatePair (CasavaNG6Workflow):
def get_name(self):
return 'illumina_matepair'
return 'illumina_matepair'
def get_description(self):
return "illumina quality check pipeline for matepair analyse"
return "illumina quality check pipeline for matepair analyse"
def define_parameters(self, function="process"):
def define_parameters(self, function="process"):
self.add_parameter("compression", "How should the data be compressed once archived", choices= [ "none", "gz", "bz2"], default = "none")
self.add_input_file("reference_genome", "Which genome should the read being align on")
self.add_input_file_list("databank", "Which databank should be used to seek contamination (as to be phiX databank indexed for bwa)")
......@@ -43,7 +43,7 @@ class IlluminaMatePair (CasavaNG6Workflow):
" fewer than this percentage of overall reads", type=float, default = 0.01, group="INSERTSIZE section")
self.add_parameter("no_group", "Disables grouping of bases for reads >50bp", type=bool, default = True)
def process(self):
def process(self):
if len(self.undetermined_reads1) > 0 :
demultiplex_stats = self.add_component("DemultiplexStats", [self.get_all_reads("read1"), self.undetermined_reads1])
......@@ -77,7 +77,7 @@ class IlluminaMatePair (CasavaNG6Workflow):
addrawfiles = self.add_component("AddRawFiles", [self.runobj, saved_files, self.args["compression"]])
# make some statistics on raw file
fastqc = self.add_component("FastQC", [filtered_read1_files+filtered_read2_files, (self.group_prefix is not None), True, run_name+"_fastqc.tar.gz"], parent = fastqilluminafilter)
fastqc = self.add_component("FastQC", [filtered_read1_files+filtered_read2_files, (self.group_prefix is not None), True, "_fastqc.tar.gz"], parent = fastqilluminafilter)
# contamination_search
try: self.args["databank"].extend([self.get_resource("phix_bwa"), self.get_resource("ecoli_bwa"), self.get_resource("yeast_bwa")])
......@@ -86,12 +86,14 @@ class IlluminaMatePair (CasavaNG6Workflow):
contamination_search = self.add_component("ContaminationSearch", [filtered_read1_files+filtered_read2_files, self.args["databank"], reads_prefixes], parent = fastqilluminafilter)
# mate_pair analyse
concatenate1 = self.add_component("ConcatenateFilesGroups", [filtered_read1_files, (Utils.get_group_basenames(self.read1_files, "read")).keys()],component_prefix="read1")
concatenate2 = self.add_component("ConcatenateFilesGroups", [filtered_read2_files, (Utils.get_group_basenames(self.read2_files, "read")).keys()],component_prefix="read2")
concatenate1 = self.add_component("ConcatenateFilesGroups", [filtered_read1_files, (Utils.get_group_basenames(self.get_all_reads('read1'), "read")).keys()],component_prefix="read1")
concatenate2 = self.add_component("ConcatenateFilesGroups", [filtered_read2_files, (Utils.get_group_basenames(self.get_all_reads('read2'), "read")).keys()],component_prefix="read2")
concatenate1.concat_files = sorted(concatenate1.concat_files)
concatenate2.concat_files = sorted(concatenate2.concat_files)
cutadapt = self.add_component("CutAdapt",[concatenate1.concat_files, concatenate2.concat_files,{"g":["CTGTCTCTT","ATACACATCT","AGATCTAT","AAGAGACAG"]},{"g":["CTGTCTCTT","ATACACATCT","AGATCTAT","AAGAGACAG"]},self.is_paired_end,0.1,4,20 ],parent= fastqilluminafilter)
cutadapt = self.add_component("CutAdapt",[concatenate1.concat_files, concatenate2.concat_files,
{"g":["CTGTCTCTT","ATACACATCT","AGATCTAT","AAGAGACAG"]},{"g":["CTGTCTCTT","ATACACATCT","AGATCTAT","AAGAGACAG"]},self.is_paired_end,0.1,4,20 ],
parent= fastqilluminafilter)
#reverse_complement
......@@ -101,7 +103,7 @@ class IlluminaMatePair (CasavaNG6Workflow):
revcom2.output_files = sorted(revcom2.output_files)
# make some statistics on filtered file
fastqc = self.add_component("FastQC", [revcom1.output_files+revcom2.output_files, (group_prefix is not None), True, run_name+"_fastqc.tar.gz"], parent = cutadapt, component_prefix="Trimmed_read")
fastqc = self.add_component("FastQC", [revcom1.output_files+revcom2.output_files, (self.group_prefix is not None), True, "_fastqc.tar.gz"], parent = cutadapt, component_prefix="Trimmed_read")
if self.reference_genome:
......@@ -113,7 +115,7 @@ class IlluminaMatePair (CasavaNG6Workflow):
# align reads against indexed genome
sample_lane_prefixes = None
if group_prefix is not None :
if self.group_prefix is not None :
sample_lane_prefixes = (Utils.get_group_basenames(revcom1.output_files+revcom2.output_files, "lane")).keys()
#bwa = self.add_component("BWA", [indexed_ref, gunzip.fastq_R1 , gunzip.fastq_R2, sample_lane_prefixes], parent = cutadapt)
bwa = self.add_component("BWA", [indexed_ref, revcom1.output_files , revcom2.output_files, sample_lane_prefixes, 'aln', not self.delete_bam], parent = cutadapt)
......
......@@ -2,6 +2,6 @@
# Please do not edit.
l4_BARCODES:=CGATGT Undetermined CAGATC
l4_SAMPLEIDS:=A lane4 B
l4_SUBDIRS:=Project_test-dev/Sample_A Undetermined_indices/Sample_lane4 Project_test-dev/Sample_B
l4_BARCODES:=CGATGT CAGATC
l4_SAMPLEIDS:=A B
l4_SUBDIRS:=Project_Demo_project/Sample_A Project_Demo_project/Sample_B
......@@ -30,12 +30,11 @@ bovine
--description
Insert : pb for Sample Demo. [C] 5,5 pM
--delete-bam
--casava-description
casava-directory=workflows/illumina_matepair/data/casava_directory_test/
lane-number=1
--project-id
266
1
--casava
casava-directory=workflows/illumina_matepair/data/casava_directory_test/
lane=4
--reference-genome
workflows/illumina_matepair/data/genome/Chr1.fasta
--admin-login
CTD
workflows/illumina_matepair/data/genome/chr1.fasta
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
[global]
name = illumina_matepair
description = illumina matepair quality check pipeline
#
# Parameter section
# param.name: the parameter display name
# .flag: the command line flag to use the argument
# .help: a brief description of what the parameter does
# .default [None]: the value produced if the parameter is not provided
# .type [str]: the parameter type that should be tested (str|int|date|file|bool)
# .choices [None]: a container of the allowable values for the parameter
# .required [False]: whether or not the command-line option may be omitted
# .action [store]: the basic type of action to be taken (store|append)
#
[parameters]
#Casava directory description
sample_description.name = description
sample_description.flag = --casava-description
sample_description.help = where are stored casava results and the lane processed
sample_description.type = multiple
sample_description.required = True
sample_description.exclude = files_read
sample_description.casava_directory.name = casava directory
sample_description.casava_directory.flag = casava-directory
sample_description.casava_directory.help = Where are stored casava results (see also lane-number)
sample_description.casava_directory.type = casava_dir
sample_description.lane_number.name = lane number
sample_description.lane_number.flag = lane-number
sample_description.lane_number.help = Which lane should be processed (mandatory with casava-directory)
#Files read description
files_read.name = reads
files_read.flag = --reads
files_read.help = Define which read1 files and/or read2 should be used
files_read.type = multiple
files_read.action = append
files_read.exclude = sample_description
files_read.read_1.name = read 1
files_read.read_1.flag = R1
files_read.read_1.help = Which read1 files should be used
files_read.read_1.required = True
files_read.read_1.type = localfile
files_read.read_2.name = read 2
files_read.read_2.flag = R2
files_read.read_2.help = Which read2 files should be used (if single end, leave empty)
files_read.read_2.type = localfile
compression.name = compression
compression.flag = --compression
compression.help = How should data be compressed once archived (none|gz|bz2)
compression.default = none
compression.choices = none|gz|bz2
reference_genome.name = reference_genome
reference_genome.flag = --reference-genome
reference_genome.help = Which genome should the read being align on
reference_genome.type = localfile
databank.name = databank
databank.flag = --databank
databank.help = Which databank should be used to seek contamination (as to be phiX databank indexed for bwa)
databank.action = append
databank.type = localfile
keep_reads.name = keep_reads
keep_reads.flag = --keep
keep_reads.help = Keep reads which pass the Illumina filters or keep reads which not pass the Illumina filters (pass_illumina_filters|not_pass_illumina_filters|all). With other values that "all" the headers of reads must be '@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos> <read>:<is filtered>:<control number>:<index sequence>'
keep_reads.default = pass_illumina_filters
keep_reads.choices = pass_illumina_filters|not_pass_illumina_filters|all
#Insert size section
histogram_width.group = INSERTSIZE section
min_pct.group = INSERTSIZE section
histogram_width.name = histogram_width
histogram_width.flag = --histogram_width
histogram_width.help = Explicitly sets the histogram width, overriding automatic truncation of histogram tail.
histogram_width.default = 10000
min_pct.name = min_pct
min_pct.flag = --min_pct
min_pct.help = When generating the histogram, discard any data categories (out of FR, TANDEM, RF) that have fewer than this percentage of overall reads.
min_pct.default = 0.01
delete_bam.name = delete bam
delete_bam.flag = --delete-bam
delete_bam.help = The BAM are not stored
delete_bam.type = bool
delete_bam.default = False
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment