Commit ff7b8a92 authored by Claire Kuchly's avatar Claire Kuchly
Browse files

Add the Process_radtag analysis component (work in progress)

parent 53e311d7
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
from subprocess import Popen, PIPE
from jflow.component import Component
from jflow.iotypes import OutputFileList, InputFileList, Formats
from jflow.abstraction import MultiMap
from weaver.function import ShellFunction
from ng6.analysis import Analysis
from ng6.utils import Utils
import jflow.seqio as seqio
class Process_radtag (Analysis):
    """
    Clean paired RAD-seq fastq files with the Stacks "process_radtags" tool.
    """

    def define_parameters(self, read1_files, read2_files, enzyme_name=None, limit_score=10, quality_encode='phred33',
                          max_length=None, rescue_radtag=False, discard_low_qual=False, discard_read_files=False,
                          archive_name=False):
        """
        @param read1_files : paths to reads 1
        @param read2_files : paths to reads 2
        @param enzyme_name : the restriction enzyme used (cut site occurs on the single-end read).
                             Currently supported enzymes include:
                             'apeKI', 'bamHI', 'claI', 'dpnII', 'eaeI', 'ecoRI',
                             'ecoT22I', 'hindIII', 'mluCI', 'mseI', 'mspI', 'ndeI',
                             'nheI', 'nlaIII', 'notI', 'nsiI', 'pstI', 'sau3AI',
                             'sbfI', 'sexAI', 'sgrAI', 'sphI', 'taqI', or 'xbaI'.
        @param limit_score : score limit; if the average score within the sliding window
                             drops below this value, the read is discarded (default 10).
        @param quality_encode : how quality scores are encoded, 'phred33' (Illumina 1.8+,
                                Sanger, default) or 'phred64' (Illumina 1.3 - 1.5).
        @param max_length : truncate final read length to this value (default None).
        @param rescue_radtag : rescue barcodes and RAD-Tags.
        @param discard_low_qual : discard reads with low quality scores.
        @param discard_read_files : capture discarded reads to a file.
        @param archive_name : name for the output archive.
        """
        # Fail fast: reads 1 and reads 2 must pair up one-to-one, so validate
        # before any input/output list is declared.
        if len(read1_files) != len(read2_files):
            raise Exception("[ERROR] : the number of files is not correct! (the number of files in read1_files and in read2_files must be the same)")
        self.read1_files = InputFileList(read1_files)  # Formats.FASTQ check left disabled, as in the original
        self.read2_files = InputFileList(read2_files)
        self.enzyme_name = enzyme_name
        self.limit_score = limit_score
        self.quality_encode = quality_encode
        self.rescue_radtag = rescue_radtag
        self.discard_low_qual = discard_low_qual
        self.discard_read_files = discard_read_files
        self.max_length = max_length
        self.archive_name = archive_name
        # One prefix per input pair; drives the per-pair stderr files below.
        self.prefixes = self.get_outputs('{basename_woext}', [read1_files, read2_files])
        # Kept reads (gzipped) and discarded reads, one output per input file.
        self.output_read_1 = OutputFileList(self.get_outputs('{basename_woext}.fq.gz', self.read1_files), Formats.FASTQ)
        self.output_read_2 = OutputFileList(self.get_outputs('{basename_woext}.fq.gz', self.read2_files), Formats.FASTQ)
        self.discard_read_1 = OutputFileList(self.get_outputs('{basename_woext}.fq.discard.gz', self.read1_files), Formats.FASTQ)
        self.discard_read_2 = OutputFileList(self.get_outputs('{basename_woext}.fq.discard.gz', self.read2_files), Formats.FASTQ)
        self.stderrs = OutputFileList(self.get_outputs('{basename_woext}.stderr', self.prefixes))
def define_analysis(self):
self.name = "Process radtag"
self.description = "Correct fastq radtag"
self.software = "process_radtags"
self.options = ""
if self.enzyme_name:
self.options += " -e " + str(self.enzyme_name)
if self.limit_score:
self.options += " -s " + str(self.limit_score)
if self.quality_encode:
self.options += " -E " + str(self.quality_encode)
if self.rescue_radtag:
self.options += " -r "
if self.discard_low_qual:
self.options += " -q "
if self.discard_read_files:
self.options += " -D "
if self.max_length:
self.options += " -t " + str(self.max_length)
if self.phred_offset:
self.options += " -p " + str(self.phred_offset)
def post_process(self):
samples = {}
# Save files
for filepath in self.extended_frags:
self._save_file(filepath)
# Process metrics from the extended fragments
for filepath in self.extended_frags:
[nb_seq, sizes] = self._get_length_table(filepath)
x = []
y = []
for val in sizes.keys():
x.append(val)
x = sorted(x)
for i in x:
y.append(sizes[i])
sample_name = os.path.basename(filepath).split(".extendedFrags")[0]
if not samples.has_key(sample_name):
samples[sample_name] = {}
samples[sample_name]["nb_extended"] = str(nb_seq)
samples[sample_name]["size_extended"] = str(",".join([str(v) for v in x]))
samples[sample_name]["nb_size_extended"] = str(",".join([str(v) for v in y]))
# Process metrics from the not combined reads 1
for filepath in self.not_combined_read_1:
[nb_seq, sizes] = self._get_length_table(filepath)
sample_name = os.path.basename(filepath).split(".notCombined_1")[0]
if not samples.has_key(sample_name):
samples[sample_name] = {}
samples[sample_name]["nb_notcombined1"] = str(nb_seq)
# Process metrics from the not combined reads 2
for filepath in self.not_combined_read_2:
[nb_seq, sizes] = self._get_length_table(filepath)
sample_name = os.path.basename(filepath).split(".notCombined_2")[0]
if not samples.has_key(sample_name):
samples[sample_name] = {}
samples[sample_name]["nb_notcombined2"] = str(nb_seq)
# Save metrics
for sample in samples:
self._add_result_element(sample, "nb_extended", samples[sample]["nb_extended"])
self._add_result_element(sample, "size_extended", samples[sample]["size_extended"])
self._add_result_element(sample, "nb_size_extended", samples[sample]["nb_size_extended"])
self._add_result_element(sample, "nb_notcombined1", samples[sample]["nb_notcombined1"])
self._add_result_element(sample, "nb_notcombined2", samples[sample]["nb_notcombined2"])
def get_version(self):
cmd = [self.get_exec_path("process_radtags"), "--version"]
p = Popen(cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
return stdout.split()[1]
# NOTE(review): this method is incomplete work-in-progress and cannot run as
# written — see the inline notes on each defect. Code is left untouched.
def process(self):
# Tmp output
# Creates list for temporary uncompressed files
# NOTE(review): the four tmp_* lists below are computed but never used — the
# compress step at the bottom refers to tmp_extended_frags /
# tmp_not_combined_read_* instead, which are undefined (NameError).
tmp_output_read_1 = self.get_outputs('{basename_woext}.fq', self.read1_files)
tmp_output_read_2 = self.get_outputs('{basename_woext}.fq', self.read2_files)
tmp_discard_read_1 = self.get_outputs('{basename_woext}.notCombined_1.fastq', self.prefixes)
tmp_discard_read_2 = self.get_outputs('{basename_woext}.notCombined_2.fastq', self.prefixes)
# Process radtags read1 files
# One process_radtags invocation per read-1 file; stderr captured per pair.
for i in range(0, len(self.prefixes)):
process_radtag = ShellFunction(self.get_exec_path("process_radtags") + " -f $1 " + self.options + " -o " + self.output_directory + " 2> $2 ", cmd_format='{EXE} {IN} {OUT}')
process_radtag(inputs = [self.read1_files[i]], outputs = [self.stderrs[i]])
# Recover_mate and recover_discard
# NOTE(review): PythonFunction and Map are not imported at the top of this
# file (only MultiMap is), and recover_mate_ok / recover_mate_discard are not
# defined anywhere in view — all four names would raise NameError.
# NOTE(review): self.input_read_2 and self.stderr are never defined by this
# component — presumably self.read2_files and self.stderrs were intended.
recover_mate = PythonFunction(recover_mate_ok, cmd_format="{EXE} {IN} {OUT}")
recover_mate = Map(recover_mate, inputs = [self.output_read_1, self.input_read_2], outputs=[self.output_read_2, self.stderr])
recover_discard = PythonFunction(recover_mate_discard, cmd_format="{EXE} {IN} {OUT}")
# NOTE(review): "recover_discardn" is a typo for "recover_discard" (NameError).
recover_discard = Map(recover_discardn, inputs = [self.discard_read_1, self.input_read_2], outputs=[self.discard_read_2, self.stderr])
# Compress
# NOTE(review): tmp_extended_frags / tmp_not_combined_read_1 / 2 and
# self.extended_frags / self.not_combined_read_1 / 2 are leftovers from a
# FLASH component and are undefined here — the tmp_* lists built above and
# self.output_read_* / self.discard_read_* look like the intended operands.
compress = ShellFunction("gzip $1 $2 $3", cmd_format='{EXE} {IN} {OUT}')
compress = MultiMap(compress, inputs = [tmp_extended_frags, tmp_not_combined_read_1, tmp_not_combined_read_2], outputs = [self.extended_frags, self.not_combined_read_1, self.not_combined_read_2])
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment