Commit 29fc8eca authored by Penom Nom's avatar Penom Nom
Browse files

Add split to increase speed.

parent 405df757
......@@ -24,16 +24,19 @@ from ng6.utils import Utils
class GeneDiversity (NG6Workflow):
def process(self):
# Add raw files
addrawfiles = self.add_component( "AddRawFiles", [self.runobj, self.args["read_1"]+self.args["read_2"], "none"] )
# Trim sequences
trim_R1 = self.add_component("Trimmer", [self.args['read_1'], 1, self.args["trim"]["read_1"]], component_prefix="R1")
trim_R2 = self.add_component("Trimmer", [self.args['read_2'], 1, self.args["trim"]["read_2"]], component_prefix="R2")
# Make some statistics on raw file
fastqc = self.add_component("FastQC", [trim_R1.output_files + trim_R2.output_files])
fastqc = self.add_component("FastQC", [trim_R1.output_files + trim_R2.output_files, False, True])
# Merge overlapping pair
join_pairs = self.add_component("Flash", [trim_R1.output_files, trim_R2.output_files, self.args["join_pair"]["mismatch_ratio"], self.args["join_pair"]["min_overlap"], self.args["join_pair"]["max_overlap"]])
# Fastq to fasta
fastq2fasta = self.add_component("Fastq2fasta", [join_pairs.extended_frags])
......@@ -44,7 +47,8 @@ class GeneDiversity (NG6Workflow):
chimera = self.add_component("UsearchChimera", [dereplicate.output_files])
# Sequence translation
framebot = self.add_component("Framebot", [chimera.nonchimeras, self.args["database"], False])
split = self.add_component("SplitSeq", [chimera.nonchimeras, 10000])
framebot = self.add_component("Framebot", [split.output_files, self.args["database"], False])
# Rename the pre-clusters to provide traceback after merge and cd-hit
rename_clusters = self.add_component("RenameClusters", [framebot.corrected_proteins])
......
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
import jflow.seqio as seqio
from jflow.component import Component
from jflow.iotypes import OutputFileEndsWith, OutputFileList, InputFileList, Formats
from weaver.function import PythonFunction
from weaver.abstraction import Map
def split_seq (sequences_file, outdir, nb_seq_by_file):
    """
    @summary : Split a sequences file in sequences files with at most 'nb_seq_by_file' sequences per file.
               Output files are written in 'outdir' and named '<name>_<index>.<extensions>', where <name> is
               the input basename up to its first dot, <extensions> is everything after that dot and <index>
               starts at 1. An input without any sequence produces no output file.
    @param sequences_file : path of the file before cut. If the path ends with '.gz' the outputs are opened
                            with seqio.xopen instead of the builtin open.
    @param outdir: output directory.
    @param nb_seq_by_file: maximum number of sequences per file (an int or a string convertible to int).
    """
    # The imports are deliberately kept inside the function, as in the original
    # code: the function is handed to PythonFunction and presumably executed
    # outside of this module (TODO confirm against weaver).
    import os
    import jflow.seqio as seqio

    # "reads.fastq.gz" -> ("reads", "fastq.gz"); "reads" -> ("reads", "")
    basename_woext, _, extensions = os.path.basename(sequences_file).partition(".")
    # Do not leave a dangling '.' at the end of the name when there is no extension
    suffix = "." + extensions if extensions else ""
    is_gzipped = sequences_file.endswith(".gz")
    # Converted once here instead of once per sequence
    max_seq_by_file = int(nb_seq_by_file)

    reader = seqio.SequenceReader( sequences_file )
    # The writer only depends on the reader type, so it is selected once.
    # As before, an unrecognised reader type results in nothing being written.
    reader_type = reader.__class__.__name__
    if reader_type == "FastqReader":
        write_records = seqio.writefastq
    elif reader_type == "FastaReader":
        write_records = seqio.writefasta
    else:
        write_records = None

    current_nb_seq_on_file = 0
    current_file_id = 0
    out_fh = None
    try:
        # For each sequence
        for seq_id, desc, seq, qual in reader:
            # If there is no current file yet or the current file is complete
            if out_fh is None or current_nb_seq_on_file >= max_seq_by_file:
                # Close current file
                if out_fh is not None:
                    out_fh.close()
                    # Reset so that the 'finally' clause does not close it a
                    # second time if opening the next file fails
                    out_fh = None
                # Next output file
                current_nb_seq_on_file = 0
                current_file_id += 1
                current_file_path = os.path.join(outdir, basename_woext + '_' + str(current_file_id) + suffix)
                if is_gzipped:
                    out_fh = seqio.xopen( current_file_path, "w" )
                else:
                    out_fh = open( current_file_path, "w" )
            if write_records is not None:
                write_records(out_fh, [[seq_id, desc, seq, qual]])
            current_nb_seq_on_file += 1
    finally:
        # out_fh is still None when the input holds no sequence; the guard
        # also ensures the last file is closed if reading or writing fails
        if out_fh is not None:
            out_fh.close()
class SplitSeq (Component):
    """
    @summary : Component that cuts each input sequences file into several files of 'nb_seq_by_file' sequences.
    """

    def define_parameters(self, sequences_files, nb_seq_by_file=200):
        """
        @param sequences_files : [list] files to split. The object must expose a 'format' attribute,
                                 which is forwarded to the declaration of the output files.
        @param nb_seq_by_file : number of sequences per file after the cut (converted to int).
        """
        self.input_files = InputFileList(sequences_files)
        self.nb_seq_by_file = int(nb_seq_by_file)
        # One '<basename_woext>.stderr' file is declared from the input files
        stderr_paths = self.get_outputs('{basename_woext}.stderr', self.input_files)
        self.stderr = OutputFileList( stderr_paths )
        # NOTE(review): presumably selects the files of the output directory that
        # do NOT end with '.stderr' (last argument False) - confirm in jflow.iotypes
        self.output_files = OutputFileEndsWith( self.output_directory, ".stderr", sequences_files.format, False )

    def process(self):
        """
        @summary : Runs split_seq on every input file, redirecting its stderr to the matching stderr file.
        """
        # Command line: executable, input file, output directory, chunk size, stderr redirection
        command_parts = ['{EXE} {IN}', self.output_directory, str(self.nb_seq_by_file), '2> {OUT}']
        splitter = PythonFunction( split_seq, cmd_format=' '.join(command_parts) )
        Map(splitter, inputs=self.input_files, outputs=self.stderr)
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment