Commit 12ebd2f5 authored by Penom Nom's avatar Penom Nom
Browse files

Manage pre-clustering.

parent 816b1886
......@@ -25,9 +25,48 @@ from jflow.component import Component
from jflow.iotypes import OutputFileList, InputFileList, OutputFile, Formats
from jflow.abstraction import MultiMap
from weaver.function import ShellFunction
from weaver.function import ShellFunction, PythonFunction
def filter_count( size_separator, chimeras_file, non_chimeras_file, output_file ):
"""
@summary : Writes file with the non-chimeras and chimeras count.
@param size_separator : [str] The number of sequences represented by each sequence/pre-cluster must be stored at
the end of each sequence ID. The separator must be size_separator.
Example : size_sep=';' where sequence ID = 'seq10001;83'
@param chimeras_file : [str] path to the fasta file off chimeras.
@param non_chimeras_file : [str] path to the fasta file off non-chimeras.
@param output_file : [str] path to the output file.
"""
import jflow.seqio as seqio
chimeras_count = 0
non_chimeras_count = 0
# Chimeras
reader = seqio.SequenceReader( chimeras_file )
for id, desc, seq, qual in reader :
count = id.split( size_separator )[-1]
# Patch for Usearch dereplication
if count.endswith( ';' ):
count = count[0:-1]
chimeras_count += int(count)
# Non chimeras
reader = seqio.SequenceReader( non_chimeras_file )
for id, desc, seq, qual in reader :
count = id.split( size_separator )[-1]
# Patch for Usearch dereplication
if count.endswith( ';' ):
count = count[0:-1]
non_chimeras_count += int(count)
# Write
out_fh = open( output_file, "w" )
out_fh.write( str(chimeras_count) + " chimeras" + "\n" )
out_fh.write( str(non_chimeras_count) + " non-chimeras" + "\n" )
out_fh.close()
class UsearchChimera (Analysis):
"""
@summary: Remove chimeric sequences.
......@@ -35,13 +74,18 @@ class UsearchChimera (Analysis):
@see : http://drive5.com/usearch/manual/chimera_formation.html
"""
def define_parameters(self, fasta_list, min_diffs=3):
def define_parameters(self, fasta_list, min_diffs=3, cluster_size_sep=None):
"""
@param fasta_list : [list] path of files that will be processed.
@param min_diffs : [int] minimum number of diffs in a segment. Must be > 0.
@param cluster_size_sep : [str] Each sequence in the input files can represent several sequences (by example after dereplication).
The number of sequences represented by each sequence/pre-cluster is located at the end of the sequence
ID separated by cluster_size_sep.
Example : cluster_size_sep=';' where sequence ID = 'seq10001;83'
"""
# Parameters
self.min_diffs = min_diffs
self.cluster_size_sep = cluster_size_sep
# Files
self.input_fasta = InputFileList(fasta_list, Formats.FASTA)
......@@ -49,7 +93,10 @@ class UsearchChimera (Analysis):
self.log = OutputFileList( self.get_outputs('{basename_woext}.log', self.input_fasta) )
self.chimeras = OutputFileList( self.get_outputs('{basename_woext}_chimeras.fasta', self.input_fasta), Formats.FASTA )
self.nonchimeras = OutputFileList( self.get_outputs('{basename_woext}_nonchimeras.fasta', self.input_fasta), Formats.FASTA )
self.stderr = OutputFileList( self.get_outputs('{basename_woext}.stderr', self.input_fasta) )
self.stderr = OutputFileList( self.get_outputs('{basename_woext}.stderr', self.input_fasta) )
self.stat = self.stderr
if self.cluster_size_sep is not None:
self.stat = OutputFileList( self.get_outputs('{basename_woext}.stat', self.input_fasta) )
def define_analysis(self):
self.name = "RemoveChimera"
......@@ -57,10 +104,10 @@ class UsearchChimera (Analysis):
self.software = "usearch"
self.options = "-mindiffs " + str(self.min_diffs) + " -uchime_denovo"
def _parse_stderr(self, file_path):
def _parse_stat(self, file_path):
"""
@summmary : Return the number of chimeras and the number of non-chimeras from the usearch uchime_denovo stderr.
@param file_path : [string] the stderr filepath.
@summmary : Return the number of chimeras and the number of non-chimeras from the usearch uchime_denovo stderr or stat file.
@param file_path : [string] the stderr or stat filepath.
@note : stderr example
00:00 2.2Mb Reading /work/ng6/jflow/gene_diversity/wf000908/UsearchDereplication_default/dsrB-ADNc-3-FM3-C0-Jour_CAGTAT_L001_R.extendedFrags_nr.fasta, 11Mb
00:00 13Mb 24362 (24.4k) seqs, min 251, avg 377, max 430nt
......@@ -90,9 +137,9 @@ class UsearchChimera (Analysis):
def post_process(self):
self._create_and_archive(self.nonchimeras + self.log, "usearch_chimera.gz")
for filepath in self.stderr:
for filepath in self.stat:
sample = os.path.basename(filepath).split(".")[0]
nb_chimeras, nb_non_chimeras = self._parse_stderr( filepath )
nb_chimeras, nb_non_chimeras = self._parse_stat( filepath )
self._add_result_element( sample, "nb_chimeras", str(nb_chimeras) )
self._add_result_element( sample, "nb_non_chimeras", str(nb_non_chimeras) )
......@@ -103,5 +150,11 @@ class UsearchChimera (Analysis):
return stdout.split()[0]
def process(self):
# Process
chimera = ShellFunction( self.get_exec_path("usearch") + " -mindiffs " + str(self.min_diffs) + " -uchime_denovo $1 -uchimeout $2 -uchimealns $3 -chimeras $4 -nonchimeras $5 2> $6", cmd_format='{EXE} {IN} {OUT}' )
chimera = MultiMap(chimera, inputs=self.input_fasta, outputs=[self.stdout, self.log, self.chimeras, self.nonchimeras, self.stderr])
\ No newline at end of file
chimera = MultiMap(chimera, inputs=self.input_fasta, outputs=[self.stdout, self.log, self.chimeras, self.nonchimeras, self.stderr])
# Statistics
if self.cluster_size_sep is not None:
stat = PythonFunction( filter_count, cmd_format="{EXE} '" + self.cluster_size_sep + "' {IN} {OUT}" )
stat = MultiMap(stat, inputs=[self.chimeras, self.nonchimeras], outputs=[self.stat])
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment