Maintenance - Mise à jour mensuelle Lundi 6 Avril 2020 entre 7h00 et 9h00

Commit 9f2a7f0c authored by Penom Nom's avatar Penom Nom

fix gene_diversity workflow

parent 87d3e9fe
......@@ -71,6 +71,7 @@
<li><a href="#storage" >Storage</a></li>
<li><a href="#softwares" >Softwares</a></li>
<li><a href="#components" >Components</a></li>
<li><a href="#workflows" >Workflows</a></li>
<li><a href="#resources" >Resources</a></li>
</ul>
</nav>
......@@ -343,6 +344,16 @@ cutadapt = /path/to/cutadapt</code></pre>
BWAIndex.batch_options = -l mem=1G -l h_vmem=1G -q myflowq
BWAmem.batch_options = -l mem=10G -l h_vmem=10G -q myflow2q</code></pre>
</section>
<section id="workflows">
<h1 class="page-header">Workflows <small>section</small></h1>
<p>Each workflow has a default group. In this section you can set a group name for each of your workflows,
so that, when using the <code>availablewf()</code> plugin, you can specify which group of workflows you want to get.
You can set the group of a workflow by adding a line of the following form: [workflow_classname].group = [group_name].</p>
<pre class="pre-hl "><code class="ini">[workflows]
QuickStart.group = tutorial</code></pre>
</section>
<section id="resources">
<h1 class="page-header">Resources <small>section</small></h1>
......
......@@ -164,7 +164,7 @@ class GeneDiversity (NG6Workflow):
dereplicate = self.add_component("UsearchDereplication", [fastq2fasta.output_files])
# Remove chimeric sequences
chimera = self.add_component("UsearchChimera", [dereplicate.output_files, 6, ';size='], parent=filter)
chimera = self.add_component("UsearchChimera", [dereplicate.output_files, 6], parent=filter)
# Sequence translation
split = self.add_component("SplitSeq", [chimera.nonchimeras, 6000])
......@@ -174,14 +174,14 @@ class GeneDiversity (NG6Workflow):
framebot = self.add_component("Framebot", [split_outputs, self.database, self.protein_min_length, False])
# Rename the pre-clusters to provide traceback after merge and cd-hit
rename_clusters = self.add_component("AddSamplesNames", [framebot.corrected_proteins, '|', ';size=', new_samples_names])
rename_clusters = self.add_component("AddSamplesNames", [framebot.corrected_proteins, new_samples_names])
# Merge sequences
merge = self.add_component("ConcatenateFiles", [rename_clusters.output_files, "all_trad_sequences.fasta"])
# Create OTU
cdhit = self.add_component("Cdhit", [merge.output_file, self.otu_identity_threshold, self.otu_length_diff_cutoff,
self.otu_cluster_most_similar, 5, 'euclidean', 'average', ';size=', '|'], parent=chimera)
self.otu_cluster_most_similar, 5, 'euclidean', 'average'], parent=chimera)
# Sampling
groups = merge_groups if merge_groups else None
......
......@@ -18,26 +18,21 @@
import os
from jflow.component import Component
from jflow.abstraction import MultiMap
from weaver.function import ShellFunction, PythonFunction
from weaver.function import PythonFunction
def add_sample_name( sample_name, sequence_name_sep, sequence_count_sep, input_file, output_file ):
def add_sample_name( sample_name, input_file, output_file ):
"""
@summary : Adds the sample name to each sequence ID.
@param sample_name : [str] the sample name to add.
@param sequence_name_sep : [char] the separator between the sequence ID and the name of the sample.
@param sequence_count_sep : [char] the separator between the sequence ID and the number of copies of this sequence or
between the name of the sample and the number of copies of this sequence.
Example : sequence ID = 'seq10001;83'.
OR
sequence ID = 'seq10001|lake_spot_1;83'.
@param input_file : [str] path to the fasta processed.
@param output_file : [str] path to the output.
"""
import jflow.seqio as seqio
sequence_name_sep = '|'
sequence_count_sep = ';size='
reader = seqio.SequenceReader(input_file)
out_fh = open(output_file, "w")
for id, desc, seq, qual in reader :
......@@ -56,25 +51,15 @@ class AddSamplesNames (Component):
@summary : Adds the sample name to each sequence ID.
"""
def define_parameters(self, input_fasta, sequence_name_sep='|', sequence_count_sep=None, samples_names=None):
def define_parameters(self, input_fasta, samples_names=None):
"""
@param input_fasta : [list] fasta processed.
@param sequence_name_sep : [char] the separator between the sequence ID and the name of the sample.
@param sequence_count_sep : [char] the separator between the sequence ID and the number of copies of this sequence or
between the name of the sample and the number of copies of this sequence.
Example : sequence ID = 'seq10001;83'.
OR
sequence ID = 'seq10001|lake_spot_1;83'.
@param samples_names : [list] the sample name for each input fasta.
"""
self.add_input_file_list( "input_fasta", "fasta processed.", default=input_fasta, required=True, file_format='fasta' )
self.add_output_file_list("output_files", "The BWA bam files.", pattern='{basename}', items=self.input_fasta, file_format="fasta")
self.add_output_file("stderr", "stderr", filename='addSample.stderr')
# Parameters
self.add_parameter("sequence_name_sep", "he separator between the sequence ID and the name of the sample.", default=sequence_name_sep)
self.add_parameter("sequence_count_sep", "sequence_count_sep.", default=('none',sequence_count_sep)[sequence_count_sep!=None] )
if samples_names == None:
samples_names = list()
for current_input in self.input_fasta:
......@@ -85,5 +70,5 @@ class AddSamplesNames (Component):
def process(self):
# Rename files
for file_idx in range( len(self.input_fasta) ):
rename = PythonFunction( add_sample_name, cmd_format='{EXE} ' + self.samples_names[file_idx] + ' "' + str(self.sequence_name_sep) + '" "' + str(self.sequence_count_sep) + '" {IN} {OUT} 2>> ' + self.stderr )
rename = PythonFunction( add_sample_name, cmd_format='{EXE} ' + self.samples_names[file_idx] + ' {IN} {OUT} 2>> ' + self.stderr )
rename( inputs=self.input_fasta[file_idx], outputs=self.output_files[file_idx] )
\ No newline at end of file
......@@ -28,23 +28,15 @@ from weaver.function import ShellFunction, PythonFunction
from weaver.abstraction import Map
def to_biom( output_biom, clusters_file, precluster_sample_sep, precluster_size_sep ):
def to_biom( output_biom, clusters_file):
"""
@summary : Write a biom file from cdhit results.
@param output_biom : [str] path to the output file.
@param clusters_file : [str] path to the '.clstr' file.
@param precluster_sample_sep : [str] used if sequences provided to Cdhit come from different samples ("none" otherwise). The sample name is stored in each
sequence id. It is separated by the character precluster_sample_sep and it is placed before the pre-cluster size information.
Example : sequence ID = 'seq10001|lake_spot_1;83'.
OR
sequence ID = 'seq10001|lake_spot_1'.
@param precluster_size_sep : [str] used if sequences provided to Cdhit are pre-clusters (ex : dereplication step before cd-hit). The number of sequences
represented by each pre-cluster can be added to the end of its ID. In this place the number is separated by the character
precluster_size_sep.
Example : precluster_size_sep=';' where sequence ID = 'seq10001;83'.
"""
from jflow.featureio import Biom, BiomIO
precluster_size_sep = ';size='
precluster_sample_sep = '|'
samples_seen = dict()
biom = Biom( generated_by='cdhit', matrix_type="sparse" )
......@@ -130,7 +122,7 @@ class Cdhit (Analysis):
"""
def define_parameters(self, input_fasta, identity_threshold=0.95, length_diff_cutoff=0.8, cluster_most_similar=True, word_length=5,
distance_method='euclidean', linkage_method='average', precluster_size_sep=None, precluster_sample_sep=None):
distance_method='euclidean', linkage_method='average'):
"""
@param input_fasta : [str] fasta list to process
@param identity_threshold : [float] sequence identity threshold. Calculated as : number of identical amino acids in alignment divided by the full length of the shorter sequence.
......@@ -141,16 +133,6 @@ class Cdhit (Analysis):
@param word_length : [int] word length.
@param distance_method : [str] distance method for the hierarchical clustering. Accepted values @see biomstat.samples_hclassification.
@param linkage_method : [str] linkage method for the hierarchical clustering. Accepted values @see biomstat.samples_hclassification.
@param precluster_size_sep : [str] used if sequences provided to Cdhit are pre-clusters (ex : dereplication step before cd-hit).
The number of sequences represented by each pre-cluster can be added to the end of its ID.
In this place the number is separated by the character precluster_size_sep.
Example : precluster_size_sep=';' where sequence ID = 'seq10001;83'.
@param precluster_sample_sep : [str] used if sequences provided to Cdhit come from different samples. The sample name is stored in
each sequence id. It is separated by the character precluster_sample_sep and it is placed before the
pre-cluster size information.
Example : sequence ID = 'seq10001|lake_spot_1;83'.
OR
sequence ID = 'seq10001|lake_spot_1'.
"""
self.add_parameter("identity_threshold", "sequence identity threshold. Calculated as : number of "+
......@@ -167,20 +149,6 @@ class Cdhit (Analysis):
self.add_parameter("word_length", "word length.", default=word_length, type='int')
self.add_parameter("distance_method", "distance method for the hierarchical clustering. Accepted values @see biomstat.samples_hclassification.", default=distance_method)
self.add_parameter("linkage_method", "linkage method for the hierarchical clustering. Accepted values @see biomstat.samples_hclassification.", default=linkage_method)
self.add_parameter("precluster_size_sep", """
used if sequences provided to Cdhit are pre-clusters (ex : dereplication step before cd-hit).
The number of sequences represented by each pre-cluster can be added to the end of its ID.
In this place the number is separated by the character precluster_size_sep.
Example : precluster_size_sep=';' where sequence ID = 'seq10001;83'.
""", default=('none',precluster_size_sep)[precluster_size_sep!=None] )
self.add_parameter("precluster_sample_sep", """
used if sequences provided to Cdhit come from differents samples. The sample name is stored in
each sequence id. It is separated by the character precluster_sample_sep and it is placed before the
pre-cluster size information.
Example : sequence ID = 'seq10001|lake_spot_1;83'.
OR
sequence ID = 'seq10001|lake_spot_1'.
""", default=('none',precluster_sample_sep)[precluster_sample_sep!=None] )
# Parameters
self.cdhit_options = ""
......@@ -249,7 +217,7 @@ class Cdhit (Analysis):
rename = MultiMap( rename, inputs=[tmp_fasta_files, self.cluster_files], outputs=self.output_files )
# Build biom
biom = PythonFunction( to_biom, cmd_format='{EXE} {OUT} {IN} "' + str(self.precluster_sample_sep) + '" "' + str(self.precluster_size_sep) + '" 2>> ' + self.stderr )
biom = PythonFunction( to_biom, cmd_format='{EXE} {OUT} {IN} 2>> ' + self.stderr )
biom = Map( biom, inputs=self.cluster_files, outputs=self.biom_files )
# Depths stats
......
......@@ -26,7 +26,7 @@ from jflow.abstraction import MultiMap
from weaver.function import ShellFunction, PythonFunction
def filter_count( size_separator, chimeras_file, non_chimeras_file, output_file ):
def filter_count( chimeras_file, non_chimeras_file, output_file ):
"""
@summary : Writes file with the non-chimeras and chimeras count.
@param size_separator : [str] The number of sequences represented by each sequence/pre-cluster must be stored at
......@@ -37,7 +37,7 @@ def filter_count( size_separator, chimeras_file, non_chimeras_file, output_file
@param output_file : [str] path to the output file.
"""
import jflow.seqio as seqio
size_separator = ';size='
chimeras_count = 0
non_chimeras_count = 0
......@@ -72,31 +72,19 @@ class UsearchChimera (Analysis):
@see : http://drive5.com/usearch/manual/chimera_formation.html
"""
def define_parameters(self, fasta_list, min_diffs=3, cluster_size_sep=None):
def define_parameters(self, fasta_list, min_diffs=3):
"""
@param fasta_list : [list] path of files that will be processed.
@param min_diffs : [int] minimum number of diffs in a segment. Must be > 0.
@param cluster_size_sep : [str] Each sequence in the input files can represent several sequences (for example after dereplication).
The number of sequences represented by each sequence/pre-cluster is located at the end of the sequence
ID separated by cluster_size_sep.
Example : cluster_size_sep=';' where sequence ID = 'seq10001;83'
"""
self.add_parameter("min_diffs", "inimum number of diffs in a segment. Must be > 0.", default=min_diffs, type='int')
self.add_parameter("cluster_size_sep", """Each sequence in the input files can represent several sequences (by example after dereplication).
The number of sequences represented by each sequence/pre-cluster is located at the end of the sequence
ID separated by cluster_size_sep.
Example : cluster_size_sep=';' where sequence ID = 'seq10001;83'""", default=cluster_size_sep)
self.add_input_file_list( "input_fasta", "path of files that will be processed.", default=fasta_list, file_format='fasta' )
self.add_output_file_list("stdout", "stdout", pattern='{basename_woext}.stdout', items=self.input_fasta)
self.add_output_file_list("stderr", "stderr", pattern='{basename_woext}.stderr', items=self.input_fasta)
self.add_output_file_list("log", "log", pattern='{basename_woext}.log', items=self.input_fasta)
self.add_output_file_list("chimeras", "chimeras", pattern='{basename_woext}_chimeras.fasta', items=self.input_fasta, file_format='fasta')
self.add_output_file_list("nonchimeras", "nonchimeras", pattern='{basename_woext}_nonchimeras.fasta', items=self.input_fasta, file_format='fasta')
self.stat = self.stderr
if self.cluster_size_sep != None:
self.add_output_file_list("stat", "stat", pattern='{basename_woext}.stat', items=self.input_fasta)
self.add_output_file_list("stat", "stat", pattern='{basename_woext}.stat', items=self.input_fasta)
def define_analysis(self):
self.name = "RemoveChimera"
......@@ -155,6 +143,6 @@ class UsearchChimera (Analysis):
chimera = MultiMap(chimera, inputs=self.input_fasta, outputs=[self.stdout, self.log, self.chimeras, self.nonchimeras, self.stderr])
# Statistics
if self.cluster_size_sep is not None:
stat = PythonFunction( filter_count, cmd_format='{EXE} "' + self.cluster_size_sep + '" {IN} {OUT}' )
stat = MultiMap(stat, inputs=[self.chimeras, self.nonchimeras], outputs=[self.stat])
\ No newline at end of file
stat = PythonFunction( filter_count, cmd_format='{EXE} {IN} {OUT}' )
stat = MultiMap(stat, inputs=[self.chimeras, self.nonchimeras], outputs=[self.stat])
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment