Commit cc36cebc authored by Claire Kuchly's avatar Claire Kuchly
Browse files

add adaptor reverse and forward

add pair end data 
parent db9b4f96
......@@ -22,9 +22,11 @@ from subprocess import Popen, PIPE
from jflow.component import Component
from jflow.iotypes import OutputFileList, InputFileList, OutputFile, Formats
from jflow.abstraction import MultiMap
from ng6.analysis import Analysis
from weaver.function import PythonFunction
from ng6.utils import Utils
from weaver.function import PythonFunction
from weaver.function import ShellFunction
from weaver.abstraction import Map
from re import sub
......@@ -45,18 +47,17 @@ class CutAdapt (Analysis):
else:
sizes[len(seq)] = 1
return [nb_seq, sizes]
def define_parameters(self, input_files_R1, input_files_R2, adapt_FWD, adapt_REV, is_paired=False, error_rate=0.1, min_length=None, max_length=None, overlap_length=None, archive_name=None):
self.input_files_R1=InputFileList(input_files_R1, Formats.FASTQ)
self.input_files_R2=InputFileList(input_files_R2, Formats.FASTQ)
self.adapt_FWD=adapt_FWD
self.adapt_REV=adapt_REV
self.output_files_R1=OutputFileList(self.get_outputs('{basename_woext}_cut.fastq', self.input_files_R1))
self.output_files_R2=OutputFileList(self.get_outputs('{basename_woext}_cut.fastq', self.input_files_R2))
self.tmp_files_R1 = self.get_outputs('{basename_woext}.tmp',self.input_files_R1)
self.tmp_files_R2 = self.get_outputs('{basename_woext}.tmp',self.input_files_R2)
self.output_files_R1= OutputFileList(self.get_outputs('{basename_woext}_cut.fastq', self.input_files_R1))
self.output_files_R2= OutputFileList(self.get_outputs('{basename_woext}_cut.fastq', self.input_files_R2))
self.log_files_R1 = OutputFileList(self.get_outputs('{basename_woext}.stderr', self.input_files_R1))
self.log_files_R2 = OutputFileList(self.get_outputs('{basename_woext}.stderr', self.input_files_R2))
self.log_files_R2 = OutputFileList(self.get_outputs('{basename_woext}.stderr', self.input_files_R2))
self.is_paired=is_paired
self.error_rate=error_rate
self.min_length=min_length
......@@ -68,32 +69,49 @@ class CutAdapt (Analysis):
self.name = "CutAdapt"
self.description = "Remove adapters"
self.software = "cutadapt"
self.options = ""
self.options_FWD = ""
self.options_REV = ""
self.parameters = ""
self.options = "MAIN "
if self.error_rate:
self.options += " -e " + str(self.error_rate)
self.parameters += " -e " + str(self.error_rate)
if self.min_length:
if not self.is_paired :
self.options += " -m " + str(self.min_length)
self.options += " -m " + str(self.min_length)
self.parameters += " -m " + str(self.min_length)
if self.max_length:
self.options += " -M " + str(self.max_length)
self.parameters += " -M " + str(self.max_length)
if self.overlap_length:
self.options += " -O " + str(self.overlap_length)
self.parameters += " -O " + str(self.overlap_length)
for cle, valeur in self.adapt_FWD.items() :
if cle == 'a' :
for adapt in valeur :
self.options += " -a " + adapt
self.options_FWD += " -a " + adapt
if cle == 'g' :
for adapt in valeur :
self.options_FWD += " -g " + adapt
if cle == 'b' :
for adapt in valeur :
self.options_FWD += " -b " + adapt
for cle, valeur in self.adapt_REV.items() :
if cle == 'a' :
for adapt in valeur :
self.options_REV += " -a " + adapt
if cle == 'g' :
for adapt in valeur :
self.options += " -g " + adapt
self.options_REV += " -g " + adapt
if cle == 'b' :
for adapt in valeur :
self.options += " -b " + adapt
self.options_REV += " -b " + adapt
self.options += " ; FWD "+self.options_FWD+" ; REV "+self.options_REV
def post_process(self):
for output_file_R1 in self.log_files_R1:
sample = os.path.splitext(os.path.basename(output_file_R1))[0]
# Parse results
metrics = self.parse_metrics_file(output_file_R1)
self._add_result_element(sample, "processedread", str(metrics["processedread"]))
......@@ -121,8 +139,8 @@ class CutAdapt (Analysis):
if not samples.has_key(sample_name):
samples[sample_name] = {}
samples[sample_name]["final_read"] = str(nb_seq)
samples[sample_name]["size_extended"] = str(",".join([str(v) for v in x]))
samples[sample_name]["nb_size_extended"] = str(",".join([str(v) for v in y]))
samples[sample_name]["size"] = str(",".join([str(v) for v in x]))
samples[sample_name]["nb_size"] = str(",".join([str(v) for v in y]))
if self.is_paired :
......@@ -130,12 +148,12 @@ class CutAdapt (Analysis):
sample = os.path.splitext(os.path.basename(output_file_R2))[0]
# Parse results
metrics = self.parse_metrics_file(output_file_R2)
self._add_result_element(sample, "processedread", str(metrics["processedread"]), "processedread")
self._add_result_element(sample, "processedbase", str(metrics["processedbase"]), "processedbase")
self._add_result_element(sample, "trimmedread", str(metrics["trimmedread"]),"trimmedread")
self._add_result_element(sample, "trimmedbase", str(metrics["trimmedbase"]), "trimmedbase")
self._add_result_element(sample, "shortread", str(metrics["shortread"]),"shortread")
self._add_result_element(sample, "longread", str(metrics["longread"]), "longread")
self._add_result_element(sample, "processedread", str(metrics["processedread"]))
self._add_result_element(sample, "processedbase", str(metrics["processedbase"]))
self._add_result_element(sample, "trimmedread", str(metrics["trimmedread"]))
self._add_result_element(sample, "trimmedbase", str(metrics["trimmedbase"]))
self._add_result_element(sample, "shortread", str(metrics["shortread"]))
self._add_result_element(sample, "longread", str(metrics["longread"]))
# Process metrics from the final output reads 2
for filepath in self.output_files_R2:
......@@ -151,17 +169,15 @@ class CutAdapt (Analysis):
if not samples.has_key(sample_name):
samples[sample_name] = {}
samples[sample_name]["final_read"] = str(nb_seq)
samples[sample_name]["size_extended"] = str(",".join([str(v) for v in x]))
samples[sample_name]["nb_size_extended"] = str(",".join([str(v) for v in y]))
samples[sample_name]["size"] = str(",".join([str(v) for v in x]))
samples[sample_name]["nb_size"] = str(",".join([str(v) for v in y]))
for sample in samples:
self._add_result_element(sample, "final_read", samples[sample]["final_read"])
self._add_result_element(sample, "size_extended", samples[sample]["size_extended"])
self._add_result_element(sample, "nb_size_extended", samples[sample]["nb_size_extended"])
self._add_result_element(sample, "size", samples[sample]["size"])
self._add_result_element(sample, "nb_size", samples[sample]["nb_size"])
# Finaly create and add the archive to the analyse
self._create_and_archive([self.log_files_R1,self.log_files_R2], self.archive_name)
# Finaly create and add the archive to the analyse
def get_version(self):
cmd = [self.get_exec_path("cutadapt"),"--version"]
......@@ -170,31 +186,27 @@ class CutAdapt (Analysis):
return stdout.split()[0]
def process(self):
self.input_files_R1 = '/home/plage/raw_data/Test_MP_R1.fastq'
self.tmp_files_R1 = '/home/plage/scratch/work/illumina_matepair/wf000169/CutAdapt_default/Test_MP_R1.tmp'
#self.log_files_R1 = '/home/plage/scratch/work/illumina_matepair/wf000169/CutAdapt_default/Test_MP_R1.stderr'
self.get_exec_path("cutadapt")
print self.get_exec_path("cutadapt")
if self.is_paired :
print "is paired"
print self.log_files_R1
cutAdapt = ShellFunction(self.get_exec_path("cutadapt")+ self.options + " $1 -o $2 2> $3 ", cmd_format='{EXE} {IN} {OUT}')
cutAdapt = Map(cutAdapt, inputs = [self.input_files_R1], outputs = ["~/test.tmp", "~/test.log"])
# cutAdapt = MultiMap(cutAdapt, inputs = [self.input_files_R2], outputs = [self.tmp_files_R2, self.log_files_R2])
#
# for i in range(len(self.tmp_files_R1)):
# reducePair = PythonFunction(self.reduce_pair, cmd_format='{EXE} {IN} {OUT} {ARG}')
# reducePair(inputs = [self.tmp_files_R1, self.tmp_files_R2], outputs = [self.output_files_R1, self.output_files_R2], arguments = [20, None, None])
#
# else :
# cutAdapt = ShellFunction(self.get_exec_path("cutadapt")+ self.options + " $1 -o $2 2> $3", cmd_format='{EXE} {IN} {OUT}')
# cutAdapt = MultiMap(cutAdapt, inputs = self.input_files_R1, outputs = [self.output_files_R1, self.log_files_R1])
# o = open(self.output_files_R2, 'w')
# l = open(self.output_log_files_R2, 'w')
# o.close()
# l.close()
self.tmp_files_R1 = self.get_outputs('{basename_woext}_tmp.fastq', self.input_files_R1)
self.tmp_files_R2 = self.get_outputs('{basename_woext}_tmp.fastq', self.input_files_R2)
#cutAdapt = PythonFunction(cutadaptateur, cmd_format='{EXE} {IN} {OUT} {ARG}')
cutAdapt = ShellFunction(self.get_exec_path("cutadapt") + self.options_REV + self.parameters + " --paired-output $3 -o $4 $1 $2 > $5", cmd_format = '{EXE} {IN} {OUT}')
#cutAdaptR1 = MultiMap(cutAdaptR1, inputs = [self.input_files_R1, self.input_files_R2], outputs =[self.tmp_files_R1, self.tmp_files_R2, self.log_files_R1])
cutAdapt(inputs = [self.input_files_R1, self.input_files_R2], outputs =[self.tmp_files_R1, self.tmp_files_R2, self.log_files_R1])
cutAdapt = ShellFunction(self.get_exec_path("cutadapt") + self.options_FWD + self.parameters + " --paired-output $3 -o $4 $1 $2 > $5", cmd_format = '{EXE} {IN} {OUT} ')
#cutAdaptR2 = MultiMap(cutAdaptR2, inputs = [self.tmp_files_R2, self.tmp_files_R1], outputs = [self.output_files_R2, self.output_files_R1 ,self.log_files_R2])
cutAdapt(inputs = [self.tmp_files_R2, self.tmp_files_R1], outputs = [self.output_files_R2, self.output_files_R1 ,self.log_files_R2])
else :
cutAdapt = ShellFunction(self.get_exec_path("cutadapt")+ self.options_FWD + self.parameters + " $1 -o $2 > $3", cmd_format='{EXE} {IN} {OUT}')
cutAdapt = MultiMap(cutAdapt, inputs = self.input_files_R1, outputs = [self.output_files_R1, self.log_files_R1])
o = open(self.output_files_R2, 'w')
l = open(self.output_log_files_R2, 'w')
o.close()
l.close()
def parse_metrics_file(self, input_file):
"""
......@@ -202,6 +214,7 @@ class CutAdapt (Analysis):
@return : string with inserts sizes class, metrics dictionnary
"""
stats = {}
print input_file
for line in open(input_file, 'r').readlines():
if re.search("Processed reads",line):
stats["processedread"] = line.strip().split()[2]
......@@ -212,49 +225,9 @@ class CutAdapt (Analysis):
if re.search("Trimmed bases",line):
stats["trimmedbase"] = line.strip().split()[2]
if re.search("Too short reads",line):
stats["shortread"] = line.strip().split()[2]
stats["shortread"] = line.strip().split()[3]
if re.search("Too long reads",line):
stats["longread"] = line.strip().split()[2]
stats["longread"] = line.strip().split()[3]
return stats
def reduce_pair(self, input_file_R1, input_file_R2, output_file_R1, output_file_R2, min_length, trim_R1toR2, trim_R2toR1):
"""
@param input_file_R1: the output R1 file of cutadapt
@param in
@return: string with the number of reads trimmed
"""
import jflow.seqio as seqio
R1_fh = seqio.SequenceReader(input_file_R1)
R2_fh = seqio.SequenceReader(input_file_R2)
out_R1_fh = open(output_file_R1,'w')
out_R2_fh = open(output_file_R2,'w')
for id, desc, seq, qual in R1_fh :
id2, desc2, seq2, qual2 = R2_fh.__iter__()
if id2 :
if id == id2 :
R1_length = len(seq)
R2_length = len(seq2)
if (R1_length >= min_length and R2_length >= min_length) :
if trim_R2toR1 :
if R1_length < R2_length :
seq2 = seq2[:R1_length]
qual2 = qual2[:R1_length]
seqio.writefastq(out_R1_fh,[id,desc,seq,qual] )
seqio.writefastq(out_R2_fh,[id2,desc2,seq2,qual2])
else :
raise ValueError, "Read1 id isn't identical of read2 id"
else :
raise IOError, "input_file_R1 and input_file_R2 haven't the same number of sequences.\n"
out_R1_fh.close()
out_R2_fh.close()
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment