Commit 029671e5 authored by Penom Nom
Browse files

The component UsearchChimera becomes an analysis.

parent 29fc8eca
......@@ -44,7 +44,7 @@ class GeneDiversity (NG6Workflow):
dereplicate = self.add_component("UsearchDereplication", [fastq2fasta.output_files, self.args["dereplication"]["method"], self.args["dereplication"]["strand"], self.args["dereplication"]["min_unique_size"]])
# Remove chimeric sequences
chimera = self.add_component("UsearchChimera", [dereplicate.output_files])
chimera = self.add_component("UsearchChimera", [dereplicate.output_files], parent=join_pairs)
# Sequence traduction
split = self.add_component("SplitSeq", [chimera.nonchimeras, 10000])
......@@ -66,4 +66,4 @@ class GeneDiversity (NG6Workflow):
blast = self.add_component("Blast", [cdhit.output_files, [{'file':blast_index.databank, 'max_hit':1}], 6, "blastp"])
# Stat on OTU
stat = self.add_component("GeneOTUStat", [[rename_clusters.merged_logs], cdhit.cluster_files, blast.outputs[os.path.basename(blast_index.databank)], [self.args["taxonomy"]], cdhit.output_files], parent=join_pairs)
\ No newline at end of file
stat = self.add_component("GeneOTUStat", [[rename_clusters.merged_logs], cdhit.cluster_files, blast.outputs[os.path.basename(blast_index.databank)], [self.args["taxonomy"]], cdhit.output_files], parent=chimera)
\ No newline at end of file
......@@ -15,7 +15,11 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
import os, re
from subprocess import Popen, PIPE
from ng6.analysis import Analysis
from jflow.component import Component
from jflow.iotypes import OutputFileList, InputFileList, OutputFile, Formats
......@@ -24,7 +28,7 @@ from jflow.abstraction import MultiMap
from weaver.function import ShellFunction
class UsearchChimera (Component):
class UsearchChimera (Analysis):
"""
@summary: Remove chimeric sequences.
@see : http://drive5.com/usearch/manual/uchime_algo.html
......@@ -33,7 +37,7 @@ class UsearchChimera (Component):
def define_parameters(self, fasta_list):
"""
@param files_list : path of files that will be processed.
@param files_list : [list] path of files that will be processed.
"""
# Files
self.input_fasta = InputFileList(fasta_list, Formats.FASTA)
......@@ -42,7 +46,58 @@ class UsearchChimera (Component):
self.chimeras = OutputFileList( self.get_outputs('{basename_woext}_chimeras.fasta', self.input_fasta), Formats.FASTA )
self.nonchimeras = OutputFileList( self.get_outputs('{basename_woext}_nonchimeras.fasta', self.input_fasta), Formats.FASTA )
self.stderr = OutputFileList( self.get_outputs('{basename_woext}.stderr', self.input_fasta) )
def define_analysis(self):
    """
    @summary : Set the descriptive fields of the analysis (name, description, software and options).
    """
    # Applied in this fixed order: name, description, software, options.
    metadata = (
        ("name", "RemoveChimera"),
        ("description", "Remove chimeric sequences."),
        ("software", "usearch"),
        ("options", "-uchime_denovo"),
    )
    for attribute, value in metadata:
        setattr(self, attribute, value)
def _parse_stderr(self, file_path):
"""
@summmary : Return the number of chimeras and the number of non-chimeras from the usearch uchime_denovo stderr.
@param file_path : [string] the stderr filepath.
@note : stderr example
00:00 2.2Mb Reading /work/ng6/jflow/gene_diversity/wf000908/UsearchDereplication_default/dsrB-ADNc-3-FM3-C0-Jour_CAGTAT_L001_R.extendedFrags_nr.fasta, 11Mb
00:00 13Mb 24362 (24.4k) seqs, min 251, avg 377, max 430nt
01:46 131Mb 100.0% Search 3226/24362 chimeras found (13.2%)
01:47 131Mb 100.0% Writing alignments
01:47 131Mb 100.0% Writing hits
01:47 131Mb 100.0% Writing 3226 chimeras
01:47 131Mb 100.0% Writing 21136 non-chimeras
"""
nb_chimeras = 0
nb_non_chimeras = 0
stderr_fh = open(file_path)
for line in stderr_fh:
line = line.strip()
matches = re.search("(\d+) chimeras$", line)
if matches is not None:
nb_chimeras = matches.group(0)
else:
matches = re.search("\d+ non-chimeras", line)
if matches is not None:
nb_non_chimeras = matches.group(0)
stderr_fh.close()
return nb_chimeras, nb_non_chimeras
def post_process(self):
    """
    @summary : Archive the non-chimera and log files, then record, for each stderr file,
               the chimera and non-chimera counts parsed from it.
    """
    archived_files = self.nonchimeras + self.log
    self._create_and_archive(archived_files, "usearch_chimera.gz")
    for stderr_path in self.stderr:
        # The sample name is the part of the file name before its first dot.
        sample = os.path.basename(stderr_path).partition(".")[0]
        counts = self._parse_stderr(stderr_path)
        nb_chimeras, nb_non_chimeras = counts
        results = (("nb_chimeras", nb_chimeras), ("nb_non_chimeras", nb_non_chimeras))
        for key, value in results:
            self._add_result_element(sample, key, str(value))
def get_version(self):
    """
    @summary : Run the usearch executable with "--version" and return the first
               whitespace-separated token written on its standard output.
    @return : the first token of the captured stdout.
    """
    usearch_path = self.get_exec_path("usearch")
    version_process = Popen([usearch_path, "--version"], stdout=PIPE, stderr=PIPE)
    # stderr is captured as well but deliberately ignored.
    captured_stdout = version_process.communicate()[0]
    return captured_stdout.split()[0]
def process(self):
    """
    @summary : Build the usearch uchime_denovo shell command and apply it to the input
               fasta files through a MultiMap.
    @note : The command takes one input ($1) and five outputs ($2 to $6); presumably the
            outputs are bound in the order of the list given to MultiMap (stdout, log,
            chimeras, nonchimeras, stderr) - to be confirmed against the MultiMap abstraction.
    """
    command = self.get_exec_path("usearch") + " -uchime_denovo $1 -uchimeout $2 -uchimealns $3 -chimeras $4 -nonchimeras $5 2> $6"
    uchime = ShellFunction(command, cmd_format='{EXE} {IN} {OUT}')
    output_lists = [self.stdout, self.log, self.chimeras, self.nonchimeras, self.stderr]
    MultiMap(uchime, inputs=self.input_fasta, outputs=output_lists)
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment