Commit a71ccf7e authored by Penom Nom

Gene diversity

- Use sample tag in sequence id.
- Reduce memory usage.
- Fix bug in view.
parent 8dc50c04
@@ -102,12 +102,17 @@ $(function () {
$(":checked[id^=chk_sample_]").each(function(){
var index = $(this).attr("id").split("_")[2] ;
var counts = $("#counts_"+index).val().split(",") ;
var depths = $("#depths_"+index).val().split(",") ;
var counts_at_scale = new Array();
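// Index counts by depth : counts_at_scale is padded with zeros so that
// counts_at_scale[d-1] holds the observation count at depth d (pointStart is 1).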
for (var i=0 ; i<counts.length ; i++) {
counts[i] = parseInt( counts[i] );
while( counts_at_scale.length < parseInt(depths[i]) ) {
counts_at_scale.push(0) ;
}
counts_at_scale[counts_at_scale.length -1] = parseInt( counts[i] );
}
counts_data.push(
{ name: $("#sample_id_"+index).text(),
data: counts,
data: counts_at_scale,
pointStart: 1 }
);
sum_data[$("#sample_id_"+index).text()] = parseFloat( $("#count_"+index).text() );
@@ -57,6 +57,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
<td id="uq_{$i}">{$sample_results["default"].upper_quartile}</td>
<td id="max_{$i}">{$max}</td>
</tr>
{$i = $i + 1}
{/foreach}
</tbody>
<tfoot>
@@ -60,8 +60,8 @@ class Flash (Analysis):
@param nb_thread : set the number of worker threads
@param archive_name : name for the output archive
"""
self.read1_files = InputFileList(read1_files, Formats.FASTQ)
self.read2_files = InputFileList(read2_files, Formats.FASTQ)
self.read1_files = InputFileList(read1_files)#, Formats.FASTQ)
self.read2_files = InputFileList(read2_files)#, Formats.FASTQ)
if len(read1_files) != len(read2_files):
raise Exception("[ERROR] : the number of files is not correct! (the number of files in read1_files and in read2_files must be the same)")
self.nb_thread = nb_thread
@@ -47,23 +47,22 @@ class GeneDiversity (NG6Workflow):
chimera = self.add_component("UsearchChimera", [dereplicate.output_files, 8], parent=join_pairs)
# Sequence traduction
split = self.add_component("SplitSeq", [chimera.nonchimeras, 10000])
split = self.add_component("SplitSeq", [chimera.nonchimeras, 7000])
framebot = self.add_component("Framebot", [split.output_files, self.args["database"], False])
# Rename the pre-clusters to provide traceback after merge and cd-hit
rename_clusters = self.add_component("RenameClusters", [framebot.corrected_proteins])
rename_clusters = self.add_component("AddSamplesNames", [framebot.corrected_proteins, '|', ';size='])
# Merge sequences
merge = self.add_component("ConcatenateFiles", [rename_clusters.output_files, "all_trad_sequences.fasta"])
# Create OTU
cdhit = self.add_component("Cdhit", [merge.output_file, self.args["otu_identity_threshold"], self.args["otu_length_diff_cutoff"], self.args["otu_cluster_most_similar"], 5, ';', [rename_clusters.merged_logs]], parent=chimera)
cdhit = self.add_component("Cdhit", [merge.output_file, self.args["otu_identity_threshold"], self.args["otu_length_diff_cutoff"],
self.args["otu_cluster_most_similar"], 5, 'euclidean', 'average', ';size=', '|'], parent=chimera)
# Stat on OTU
blast_index = self.add_component("BlastIndex", [self.args["database"], "prot"])
blast = self.add_component("Blast", [cdhit.output_files, [{'file':blast_index.databank, 'max_hit':1}], 6, "blastp"])
#import_tax_on_biom
otu_stat = self.add_component("GeneOTUStat", [[rename_clusters.merged_logs], cdhit.cluster_files, blast.outputs[os.path.basename(blast_index.databank)], [self.args["taxonomy"]], cdhit.output_files], parent=cdhit)
otu_classify = self.add_component("GeneOTUClassify", [cdhit.biom_files, [merge.output_file], self.args["taxonomy"], blast_index.databank], parent=cdhit)
# Normalisation
normalisation = self.add_component("BiomNormalisation", [cdhit.biom_files, 1000, 3000, 100, 1], parent=cdhit)
\ No newline at end of file
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
from jflow.component import Component
from jflow.iotypes import OutputFileList, InputFileList, OutputFile, Formats
from jflow.abstraction import MultiMap
from weaver.function import ShellFunction, PythonFunction
def add_sample_name( sample_name, sequence_name_sep, sequence_count_sep, input_file, output_file ):
"""
@summary : Adds the sample name to each sequence ID.
@param sample_name : [str] the sample name to add.
@param sequence_name_sep : [char] the separator between the sequence ID and the name of the sample.
@param sequence_count_sep : [char] the separator between the sequence ID and the number of copies of this sequence or
between the name of the sample and the number of copies of this sequence.
Example : sequence ID = 'seq10001;83'.
OR
sequence ID = 'seq10001|lake_spot_1;83'.
@param input_file : [str] path to the fasta file to process.
@param output_file : [str] path to the output.
"""
import jflow.seqio as seqio
reader = seqio.SequenceReader(input_file)
out_fh = open(output_file, "w")
for id, desc, seq, qual in reader :
new_id = id + sequence_name_sep + sample_name
if sequence_count_sep != "none":
split_id = id.split( sequence_count_sep )
new_id = sequence_count_sep.join(split_id[0:-1]) + sequence_name_sep + sample_name + sequence_count_sep + split_id[-1]
# patch : remove the trailing ';' (ex : usearch size annotations like 'seq1;size=83;')
if new_id.endswith(';'):
new_id = new_id[:-1]
seqio.writefasta( out_fh, [[new_id, desc, seq, qual]] )
out_fh.close()
class AddSamplesNames (Component):
"""
@summary : Adds the sample name to each sequence ID.
"""
def define_parameters(self, input_fasta, sequence_name_sep='|', sequence_count_sep=None, samples_names=None):
"""
@param input_fasta : [list] the fasta files to process.
@param sequence_name_sep : [char] the separator between the sequence ID and the name of the sample.
@param sequence_count_sep : [char] the separator between the sequence ID and the number of copies of this sequence or
between the name of the sample and the number of copies of this sequence.
Example : sequence ID = 'seq10001;83'.
OR
sequence ID = 'seq10001|lake_spot_1;83'.
@param samples_names : [list] the sample name for each input fasta.
"""
# Parameters
self.sequence_name_sep = sequence_name_sep
self.sequence_count_sep = "none" if sequence_count_sep is None else sequence_count_sep
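# by default each sample name is the fasta basename without its extension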
if samples_names is None:
samples_names = list()
for current_input in input_fasta:
basename = os.path.basename(current_input)
samples_names.append( ".".join(basename.split('.')[:-1]) )
self.samples_names = samples_names
# Files
self.input_fasta = InputFileList( input_fasta, Formats.FASTA )
self.output_files = OutputFileList( self.get_outputs('{basename}', self.input_fasta), Formats.FASTA )
self.stderr = OutputFile( os.path.join(self.output_directory, 'addSample.stderr') )
def process(self):
# Rename files
for file_idx in range( len(self.input_fasta) ):
rename = PythonFunction( add_sample_name, cmd_format="{EXE} " + self.samples_names[file_idx] + " '" + str(self.sequence_name_sep) + "' '" + str(self.sequence_count_sep) + "' {IN} {OUT} 2>> " + self.stderr )
rename( inputs=self.input_fasta[file_idx], outputs=self.output_files[file_idx] )
\ No newline at end of file
@@ -70,9 +70,9 @@ class BiomNormalisation (Analysis):
# Files
self.biom = InputFileList( biom )
self.output_files = OutputFileList( self.get_outputs('{basename_woext}.biom', self.biom) )
self.depth_files = OutputFileList( self.get_outputs('{basename_woext}_depth.tsv', self.biom) )
self.hclust_files = OutputFileList( self.get_outputs('{basename_woext}_hclust.json', self.biom) )
self.output_files = OutputFileList( self.get_outputs('{basename_woext}_norm.biom', self.biom) )
self.depth_files = OutputFileList( self.get_outputs('{basename_woext}_norm_depth.tsv', self.biom) )
self.hclust_files = OutputFileList( self.get_outputs('{basename_woext}_norm_hclust.json', self.biom) )
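# the '_norm' suffix distinguishes the normalised outputs from the input biom files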
self.stderr = os.path.join( self.output_directory, 'normalisation.stderr' )
def get_template(self):
@@ -93,7 +93,7 @@ class BiomNormalisation (Analysis):
# Parse depths
for filepath in self.depth_files:
[depths, counts, sum, upper_quartile, median, lower_quartile] = observations_depth_to_stat(filepath)
sample_name = os.path.basename(filepath).split("_depth.tsv")[0]
sample_name = os.path.basename(filepath).split("_norm_depth.tsv")[0]
self._add_result_element( sample_name, "depths", ",".join(map(str, depths)) )
self._add_result_element( sample_name, "counts", ",".join(map(str, counts)) )
self._add_result_element( sample_name, "sum", str(sum) )
@@ -105,7 +105,7 @@ class BiomNormalisation (Analysis):
json_fh = open( filepath )
hierarchical_clustering = ' '.join( line.strip() for line in json_fh )
json_fh.close()
sample_name = os.path.basename(filepath).split("_hclust.json")[0]
sample_name = os.path.basename(filepath).split("_norm_hclust.json")[0]
self._add_result_element( sample_name, "linkage", hierarchical_clustering )
def process(self):
@@ -16,7 +16,6 @@
#
import os
from subprocess import Popen, PIPE
from workflows.gene_diversity.lib.biomstat import *
@@ -29,64 +28,67 @@ from jflow.abstraction import MultiMap
from weaver.function import ShellFunction, PythonFunction
from weaver.abstraction import Map
def to_biom( output_biom, clusters_file, precluster_size_sep, trace_file=None ):
def to_biom( output_biom, clusters_file, precluster_sample_sep, precluster_size_sep ):
"""
@summary : Write a biom file from cdhit results.
@param output_biom : [str] path to the output file.
@param clusters_file : [str] path to the '.clstr' file.
@param precluster_size_sep : [char] if sequences provided to Cdhit are pre-clusters (ex : dereplication step before cd-hit), the number of sequences
@param precluster_size_sep : [char] used if sequences provided to Cdhit are pre-clusters (ex : dereplication step before cd-hit). The number of sequences
represented by each pre-cluster can be appended to its ID, separated by the character
precluster_size_sep.
Example : precluster_size_sep=';' where cluster ID = 'seq10001;83'.
@param trace_file : [str] ********************************************
Example : precluster_size_sep=';' where sequence ID = 'seq10001;83'.
@param precluster_sample_sep : [char] used if sequences provided to Cdhit come from different samples ("none" otherwise). The sample name is stored in each
sequence ID. It is separated by the character precluster_sample_sep and it is placed before the pre-cluster size information.
Example : sequence ID = 'seq10001|lake_spot_1;83'.
OR
sequence ID = 'seq10001|lake_spot_1'.
"""
from workflows.gene_diversity.lib.Biom import Biom, BiomIO
samples_seen = dict()
biom = Biom( generated_by='cdhit', matrix_type="sparse" )
# Sample by cluster ID
cluster_samples = dict()
if trace_file is not None:
samples = dict()
for line in open(trace_file).readlines():
parts = line.strip().split() # Trace line example 'OLD_ID<tab>FILE_PATH<tab>NEW_ID'
cluster_new_id = parts[2]
sample = os.path.basename( parts[1] ).split('_')[0]
cluster_samples[cluster_new_id] = sample
if not samples.has_key( sample ):
samples[sample] = 1
biom.add_sample( sample )
else:
biom.add_sample( "sample" )
# Process count
cluster_name = None
for line in open(clusters_file).readlines():
# New cluster
if line.startswith(">"): # Line example : '>Cluster 0' => 'Cluster_0'
if cluster_name is not None:
# Add previous cluster on biom
biom.add_observation( cluster_name )
for sample_name in cluster_count:
biom.add_observation( cluster_name )
if not samples_seen.has_key( sample_name ):
biom.add_sample( sample_name )
samples_seen[sample_name] = 1
biom.add_count( cluster_name, sample_name, cluster_count[sample_name] )
# Start new cluster
cluster_name = line.strip()[1:].replace(" ", "_")
cluster_count = dict()
# Additional information on current cluster
else: # Line example : '60 130aa, >c6104.0;1... at 99.23%'
# Add the cluster count to the right sample
pre_cluster_id = line.strip().split()[2][1:-3] # c6104.0;1
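# ( [1:-3] strips the leading '>' and the trailing '...' )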
# Count
pre_cluster_count = 1
if precluster_size_sep != "None":
pre_cluster_count = int(pre_cluster_id.split(precluster_size_sep)[-1])
pre_cluster_sample = "sample"
if trace_file is not None:
pre_cluster_sample = cluster_samples[pre_cluster_id]
if precluster_size_sep != "none":
pre_cluster_count = int( pre_cluster_id.split(precluster_size_sep)[-1] )
pre_cluster_id = precluster_size_sep.join( pre_cluster_id.split(precluster_size_sep)[0:-1] ) # trim cluster size from ID
# Sample
pre_cluster_sample = "all"
if precluster_sample_sep != "none":
pre_cluster_sample = pre_cluster_id.split(precluster_sample_sep)[-1]
# Store
if cluster_count.has_key(pre_cluster_sample):
cluster_count[pre_cluster_sample] += pre_cluster_count
else:
cluster_count[pre_cluster_sample] = pre_cluster_count
if cluster_name is not None:
# Add last cluster on biom
biom.add_observation( cluster_name )
for sample_name in cluster_count:
biom.add_observation( cluster_name )
if not samples_seen.has_key( sample_name ):
biom.add_sample( sample_name )
samples_seen[sample_name] = 1
biom.add_count( cluster_name, sample_name, cluster_count[sample_name] )
# Write
BiomIO.write( output_biom, biom )
@@ -129,17 +131,27 @@ class Cdhit (Analysis):
"""
def define_parameters(self, input_fasta, identity_threshold=0.95, length_diff_cutoff=0.8, cluster_most_similar=True, word_length=5,
distance_method='euclidean', linkage_method='average', precluster_size_sep=None, trace_files=None):
distance_method='euclidean', linkage_method='average', precluster_size_sep=None, precluster_sample_sep=None):
"""
@param input_fasta : [str] fasta list to process
@param identity_threshold : [float] sequence identity threshold. Calculated as : number of identical amino acids in alignment divided by the full length of the shorter sequence.
@param length_diff_cutoff : [float] Maximum length difference between the shorter sequence and the representative sequence of the cluster. If set to 0.9, the shorter sequences need to be at least 90% of the length of the representative of the cluster.
@param word_length : [int] word length.
@param distance_method : [str] distance method for the hierarchical clustering. Accepted values @see biomstat.samples_hclassification.
@param linkage_method : [str] linkage method for the hierarchical clustering. Accepted values @see biomstat.samples_hclassification.
@param cluster_most_similar : [bool] False => each sequence is clustered into the first cluster that meets the threshold (fast mode).
True => the program clusters it into the most similar cluster that meets the threshold (accurate but slow mode).
Neither mode changes the representatives of the final clusters.
@param word_length : [int] word length.
@param distance_method : [str] distance method for the hierarchical clustering. Accepted values @see biomstat.samples_hclassification.
@param linkage_method : [str] linkage method for the hierarchical clustering. Accepted values @see biomstat.samples_hclassification.
@param precluster_size_sep : [char] used if sequences provided to Cdhit are pre-clusters (ex : dereplication step before cd-hit).
The number of sequences represented by each pre-cluster can be appended to its ID,
separated by the character precluster_size_sep.
Example : precluster_size_sep=';' where sequence ID = 'seq10001;83'.
@param precluster_sample_sep : [char] used if sequences provided to Cdhit come from different samples. The sample name is stored in
each sequence ID. It is separated by the character precluster_sample_sep and it is placed before the
pre-cluster size information.
Example : sequence ID = 'seq10001|lake_spot_1;83'.
OR
sequence ID = 'seq10001|lake_spot_1'.
"""
# Parameters
self.cdhit_options = ""
@@ -152,13 +164,11 @@ class Cdhit (Analysis):
self.cdhit_options += " -n " + str(word_length)
self.distance_method = distance_method
self.linkage_method = linkage_method
self.precluster_sample_sep = "none" if precluster_sample_sep is None else precluster_sample_sep
self.precluster_size_sep = "none" if precluster_size_sep is None else precluster_size_sep
# Files
self.input_fasta = InputFileList( input_fasta )
self.trace_files = None
if trace_files is not None:
self.trace_files = InputFileList( trace_files )
self.output_files = OutputFileList( self.get_outputs('{basename_woext}.fasta', self.input_fasta), Formats.FASTA )
self.cluster_files = OutputFileList( self.get_outputs('{basename_woext}.clstr', self.input_fasta) )
self.biom_files = OutputFileList( self.get_outputs('{basename_woext}.biom', self.input_fasta) )
@@ -182,7 +192,7 @@ class Cdhit (Analysis):
return stdout.split()[3]
def post_process(self):
self._save_files( self.depth_files + self.hclust_files )
self._save_files( self.biom_files, self.depth_files + self.hclust_files )
# Parse depths
for filepath in self.depth_files:
[depths, counts, sum, upper_quartile, median, lower_quartile] = observations_depth_to_stat(filepath)
@@ -213,13 +223,8 @@ class Cdhit (Analysis):
rename = MultiMap( rename, inputs=[tmp_fasta_files, self.cluster_files], outputs=self.output_files )
# Build biom
if self.trace_files is None:
biom = PythonFunction( to_biom, cmd_format='{EXE} {OUT} {IN} ' + self.precluster_size_sep + ' 2>> ' + self.stderr )
biom = Map( biom, inputs=self.cluster_files, outputs=self.biom_files )
else:
for i in range(len(self.cluster_files)):
biom = PythonFunction( to_biom, cmd_format="{EXE} {OUT} {IN} '" + self.precluster_size_sep + "' " + self.trace_files[i] + ' 2>> ' + self.stderr )
biom( inputs=self.cluster_files[i], outputs=self.biom_files[i], includes=self.trace_files[i] )
biom = PythonFunction( to_biom, cmd_format="{EXE} {OUT} {IN} '" + str(self.precluster_sample_sep) + "' '" + str(self.precluster_size_sep) + "' 2>> " + self.stderr )
biom = Map( biom, inputs=self.cluster_files, outputs=self.biom_files )
# Depths stats
depth = PythonFunction( observations_depth, cmd_format='{EXE} {IN} {OUT} 2>> ' + self.stderr )
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
from jflow.iotypes import InputFile, InputFileList, OutputFileList, Formats
from jflow.abstraction import MultiMap
from weaver.function import ShellFunction, PythonFunction
from weaver.abstraction import Map
from ng6.analysis import Analysis
def biom_to_krona( exec_path, biom_file, krona_data_file, krona_view_file ):
"""
@summary : Create a krona visualisation for the taxonomy data of the biom file.
@param exec_path : [str] path to the software ktImportText.
@param biom_file : [str] path to the biom to process.
@param krona_data_file : [str] path to the output krona data file (tsv with count).
@param krona_view_file : [str] path to the output krona visualisation file.
"""
from workflows.gene_diversity.lib.Biom import Biom, BiomIO
from subprocess import Popen, PIPE
import sys
# Format data
biom = BiomIO.from_json( biom_file )
BiomIO.write_krona_table( krona_data_file, biom )
# Build krona
cmd = [exec_path, "-n", "root", "-o", krona_view_file, krona_data_file]
p = Popen(cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
# relay ktImportText stdout/stderr to this process' streams
# (captured by the caller's redirections)
sys.stdout.write(stdout)
sys.stderr.write(stderr)
def add_tax_metadata( biom_file, blast_file, taxonomy_file, output_file ):
"""
@summary : Add taxonomy metadata to a biom file from a blast result.
@param biom_file : [string] the path to the Biom file to process.
@param blast_file : [string] the path to the blast result in tabular format (outfmt 6 with NCBI Blast+).
@param taxonomy_file : [string] the path to the taxonomy file.
@param output_file : [string] the path to the output file.
"""
import os
from workflows.gene_diversity.lib.Biom import Biom, BiomIO
def init_cluster_count(sample_names):
samples_count = {}
for sample in sample_names:
samples_count[sample] = 0
return samples_count
# Build a hash with the taxonomy for each gene (key=gene_id ; value=gene_taxonomy)
tax = {}
for line in open(taxonomy_file).readlines():
parts = line.strip().split("\t") # Line example : 'ADC62191 Bacteria; Proteobacteria; Gammaproteobacteria; Chromatiales; Chromatiaceae; Allochromatium.; Allochromatium vinosum DSM 180'
tax[parts[0]] = parts[1]
# Retrieve clusters annotations
cluster_annot = dict()
for line in open(blast_file).readlines():
parts = line.strip().split()
query_id = parts[0].split(";")[0]
cluster_annot[query_id] = dict()
subject_id = parts[1].split("#")[0] # Subject field : <ID>#<PARTIAL_DESC>
cluster_annot[query_id]['tax'] = tax[subject_id]
cluster_annot[query_id]['evalue'] = parts[10]
cluster_annot[query_id]['identity'] = parts[2]
cluster_annot[query_id]['aln_length'] = parts[3]
# Add metadata to biom
biom = BiomIO.from_json( biom_file )
for cluster_id in biom.rows:
biom.add_metadata( cluster_id, "taxonomy", cluster_annot[cluster_id]['tax'], "observation")
biom.add_metadata( cluster_id, "evalue", cluster_annot[cluster_id]['evalue'], "observation")
biom.add_metadata( cluster_id, "identity", cluster_annot[cluster_id]['identity'], "observation")
biom.add_metadata( cluster_id, "aln_length", cluster_annot[cluster_id]['aln_length'], "observation")
BiomIO.write( output_file, biom )
class GeneOTUClassify( Analysis ):
def define_parameters(self, biom_files, input_fasta, taxonomy_file, databank, blast_used="blastp", evalue="1e-5"):
"""
@param biom_files : [list] the Biom files with OTU to process.
@param input_fasta : [list] the fasta files containing the OTU sequences. Each biom file corresponds to one input fasta.
@param taxonomy_file : [str] path to the taxonomy file.
@param databank : [str] path to the databank used to retrieve the taxonomy by best hit search.
@param blast_used : [str] the type of blast used ("blastx", "blastp", ...).
@param evalue : [str] the maximum e-value to keep the best hit in the taxonomy step.
"""
# Parameters
self.blast_used = blast_used
self.databank = databank
self.blast_options = " -max_target_seqs 1 -evalue " + str(evalue) + " -outfmt 6"
# Files
self.biom_files = InputFileList( biom_files )
self.input_fasta = InputFileList( input_fasta, Formats.FASTA )
self.taxonomy_file = InputFile( taxonomy_file )
self.output_files = OutputFileList( self.get_outputs('{basename}', self.biom_files) )
self.krona_files = OutputFileList( self.get_outputs('{basename_woext}_krona.html', self.biom_files) )
self.stderr = os.path.join(self.output_directory, 'geneOTUclassify.stderr')
def define_analysis(self):
self.name = "GeneOTUClassify"
self.description = "Operational Taxon Unit analysis."
self.software = "NCBI Blast+"
self.options = self.blast_options
def get_version(self):
return "-"
def post_process(self):
self._save_files( self.output_files, self.krona_files )
def process(self):
blast_files = self.get_outputs( '{basename_woext}_blast.tsv', self.biom_files )
# Blast
blast = ShellFunction( self.get_exec_path(self.blast_used) + " " + self.blast_options + " -query $1 -out $2 -db " + self.databank + " 2>> " + self.stderr, cmd_format='{EXE} {IN} {OUT}' )
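# {IN} and {OUT} are passed to the shell snippet as the positional arguments $1 and $2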
Map( blast, inputs=self.input_fasta, outputs=blast_files, includes=self.databank )
# Process metadata
metadata = PythonFunction( add_tax_metadata, cmd_format='{EXE} {IN} ' + self.taxonomy_file + ' {OUT} 2>> ' + self.stderr )
MultiMap( metadata, inputs=[self.biom_files, blast_files], outputs=self.output_files, includes=self.taxonomy_file )
# Biom to krona
krona_data_files = self.get_outputs( '{basename_woext}_krona.tsv', self.krona_files )
krona = PythonFunction( biom_to_krona, cmd_format='{EXE} ' + self.get_exec_path("ktImportText") + ' {IN} {OUT} 2>> ' + self.stderr )
MultiMap( krona, inputs=self.output_files, outputs=[krona_data_files, self.krona_files] )
\ No newline at end of file
@@ -651,6 +651,18 @@ class BiomIO:
for line in biom.to_count_table():
out_fh.write( "\t".join(map(str, line)) + "\n" )
out_fh.close()
@staticmethod
def write_krona_table( path, biom ):
"""
@todo test
"""
out_fh = open( path, "w" )
for idx in range(len(biom.rows)):
count = biom.data.get_row_sum( idx ) # TODO : wrapping
tax = biom.rows[idx]["metadata"]["taxonomy"]
out_fh.write( str(count) + "\t" + "\t".join(map(str, tax)) + "\n" )
out_fh.close()
@staticmethod
def load_metadata( biom, metadata_file, subject_type="sample", types={}, list_sep={} ):
@@ -17,6 +17,15 @@
def observations_depth( input_biom, output_depth ):
"""
@summary : Write the observation depths to a file.
@param input_biom : [str] path to the biom file to process.
@param output_depth : [str] path to the output file.
@note : Example of one output file
#Depth<TAB>Nb_Observ_concerned<TAB>Prct_Observ_concerned
1<TAB>65<TAB>65.000
2<TAB>30<TAB>30.000
3<TAB>0<TAB>0.000
4<TAB>5<TAB>5.000
"""
from workflows.gene_diversity.lib.Biom import BiomIO
@@ -28,7 +37,8 @@ def observations_depth( input_biom, output_depth ):
while len(obs_depth) <= observation_count:
obs_depth.append(0)
obs_depth[observation_count] += 1
nb_observ += 1
if observation_count != 0:
nb_observ += 1
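# observations with a null count are excluded from nb_observ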
del biom
# Write output
out_fh = open( output_depth, 'w' )
@@ -40,6 +50,7 @@ def observations_depth( input_biom, output_depth ):
def samples_hclassification( input_biom, output_dendrogram, distance_method, linkage_method ):
"""
@todo : test
"""
import matplotlib, json
matplotlib.use('Agg') # Force matplotlib to not use any Xwindows backend
@@ -116,7 +127,7 @@ def samples_hclassification( input_biom, output_dendrogram, distance_method, linkage_method ):
def observations_depth_to_stat( file ):
"""
@todo : test