Commit 6d09d08d authored by Penom Nom

- Add krona by sample.

- Add merge samples.
- Add filter fasta.
parent 7d71398b
@@ -15,20 +15,42 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
import os, pickle
from workflows.gene_diversity.lib.biomstat import *
from ng6.analysis import Analysis
from jflow.iotypes import InputFileList, OutputFileList, Formats
from jflow.abstraction import MultiMap
from weaver.function import PythonFunction
from weaver.abstraction import Map
def merge_samples( input_biom, merge_dump, output_biom ):
"""
@summary : Adds counts and merges metadata of a list of samples.
@param input_biom : [str] path to the biom file processed.
@param merge_dump : [str] path to the dump file with the merge rules (a pickled dict
                    mapping each merged sample name to the list of samples to merge).
@param output_biom : [str] path to the output biom file.
"""
import pickle
from workflows.gene_diversity.lib.Biom import BiomIO
# Retrieve merge rules
merge = open(merge_dump, "rb")
merge_groups = pickle.load(merge)
merge.close()
# Merge
biom = BiomIO.from_json( input_biom )
for merged_sample in merge_groups.keys():
biom.merge_samples( merge_groups[merged_sample], merged_sample )
BiomIO.write( output_biom, biom )
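For illustration, a minimal sketch of how the rules dump consumed by `merge_samples` might be produced; the file and sample names are hypothetical, following the `merge_groups` example documented further down.

```python
import pickle

# Hypothetical merge rules: each key names a merged sample, each value
# lists the source samples whose counts are added together.
merge_groups = {
    "sample_a": ["sample_a_1", "sample_a_2"],
    "sample_b": ["sample_b_1", "sample_b_2", "sample_b_3"]
}

# Serialise the rules; merge_samples unpickles this file before merging.
with open("merge_rules.dump", "wb") as dump_fh:
    pickle.dump(merge_groups, dump_fh)

merge_samples("input.biom", "merge_rules.dump", "merged.biom")
```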
def filter_and_bootstrap( input_biom, output_biom, observation_threshold, nb_deleted, nb_selected, nb_round ):
"""
@summary :
"""
from workflows.gene_diversity.lib.Biom import BiomIO
@@ -43,14 +65,45 @@ def filter_and_bootstrap( input_biom, output_biom, observation_threshold, nb_del
# Write
BiomIO.write( output_biom, biom )
def filter_seq( input_fasta, input_biom, output_fasta ):
"""
@summary : Writes a new fasta with only the sequences corresponding to the observations.
@param input_fasta : [str] path to the file to filter (fasta file).
@param input_biom : [str] path to the file with the IDs of kept sequences (biom file).
@param output_fasta : [str] path to the filtered fasta.
"""
from workflows.gene_diversity.lib.Biom import BiomIO
import jflow.seqio as seqio
biom = BiomIO.from_json( input_biom )
# Find the ids for kept sequences
kept_sequences = dict()
for observation_id in biom.get_observations_names():
kept_sequences[observation_id] = 1
# Filter sequences
reader = seqio.SequenceReader( input_fasta )
out_fh = open( output_fasta, "w" )
for id, desc, seq, qual in reader:
    if id in kept_sequences:
        seqio.writefasta( out_fh, [[id, desc, seq, qual]] )
out_fh.close()
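A hedged usage sketch (file names hypothetical). Note the design choice above: the dict is used as a set, so membership tests stay O(1) while the fasta is streamed.

```python
# Keep only the sequences whose IDs remain as observations in the
# sampled biom (hypothetical file names).
filter_seq("clusters.fasta", "clusters_norm.biom", "clusters_norm.fasta")
```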
class BiomNormalisation (Analysis):
class BiomSampling (Analysis):
"""
@summary : Merge, filter and normalise the count of clusters in Biom file.
The normalisation is based on multiple random samplings with replacement.
@summary : Samples the counts of clusters in a Biom file.
    [1- Merge the specified samples]
    2- Filter the observations on a minimum depth
    3- Sample. For each round:
        a- Discard several sequences (restored at the end of the current round)
        b- Select several sequences with replacement
        c- Add the selected sequences to the sampling
    4- Delete the observations without sequences.
"""
def define_parameters(self, biom, nb_deleted, nb_selected, nb_round, observation_threshold=0, distance_method='euclidean', linkage_method='average', merge_groups=None):
def define_parameters(self, biom, nb_deleted, nb_selected, nb_round, observation_threshold=0, fasta_files=None, distance_method='euclidean', linkage_method='average', merge_groups=None):
"""
@param biom : [str] path to the biom file processed.
@param nb_deleted : [int] the number of elements randomly removed at each round from the initial file.
@@ -59,7 +112,13 @@ class BiomNormalisation (Analysis):
@param observation_threshold : [int] the minimum depth required to keep an observation.
@param distance_method : [str] distance method for the hierarchical clustering. Accepted values @see biomstat.samples_hclassification.
@param linkage_method : [str] linkage method for the hierarchical clustering. Accepted values @see biomstat.samples_hclassification.
@param merge_groups : [dict] *****************************
@param merge_groups : [list] the merge rules, one dict per biom file; each dict maps
       a merged sample name to the list of samples to merge.
       Example : [
           { "sample_a" : ["sample_a_1", "sample_a_2"],                # Biom_A
             "sample_b" : ["sample_b_1", "sample_b_2", "sample_b_3"] },
           { "sample_a" : ["sample_a_1", "sample_a_2"],                # Biom_B
             "sample_b" : ["sample_b_1", "sample_b_2", "sample_b_3"] }
       ]
"""
# Parameters
self.nb_deleted = nb_deleted
@@ -71,27 +130,31 @@
self.merge_groups = merge_groups
# Files
self.biom = InputFileList( biom )
self.output_files = OutputFileList( self.get_outputs('{basename_woext}_norm.biom', self.biom) )
self.depth_files = OutputFileList( self.get_outputs('{basename_woext}_norm_depth.tsv', self.biom) )
self.hclust_files = OutputFileList( self.get_outputs('{basename_woext}_norm_hclust.json', self.biom) )
self.input_bioms = InputFileList( biom )
self.output_fasta = list()
if fasta_files is not None:
self.input_fasta = InputFileList( fasta_files )
self.output_fasta = OutputFileList( self.get_outputs('{basename}', fasta_files) )
self.output_files = OutputFileList( self.get_outputs('{basename_woext}_norm.biom', self.input_bioms) )
self.depth_files = OutputFileList( self.get_outputs('{basename_woext}_norm_depth.tsv', self.input_bioms) )
self.hclust_files = OutputFileList( self.get_outputs('{basename_woext}_norm_hclust.json', self.input_bioms) )
self.stderr = os.path.join( self.output_directory, 'normalisation.stderr' )
def get_template(self):
return "ClustersStats"
def define_analysis(self):
self.name = "Normalisation"
self.description = "Filter and normalisation."
self.name = "Sampling"
self.description = "Filter and random sampling."
self.software = "-"
self.options = "normalisation deleted=" + str(self.nb_deleted) + " selected=" + str(self.nb_deleted) + " round=" + str(self.nb_round) + \
self.options = "sampling deleted=" + str(self.nb_deleted) + " selected=" + str(self.nb_deleted) + " round=" + str(self.nb_round) + " obs_min=" + str(self.observation_threshold) +\
";hierarchical_clustering distance=" + self.distance_method + " linkage=" + self.linkage_method
def get_version(self):
return "-"
def post_process(self):
self._save_files( self.output_files + self.depth_files + self.hclust_files )
self._save_files( self.output_files + self.depth_files + self.hclust_files + self.output_fasta )
# Parse depths
for filepath in self.depth_files:
[depths, counts, sum, upper_quartile, median, lower_quartile] = observations_depth_to_stat(filepath)
@@ -112,11 +175,26 @@
def process(self):
# Merge samples
# TO DO *********************************************************
tmp_files = self.input_bioms
if self.merge_groups is not None:
tmp_files = self.get_outputs( '{basename_woext}_tmp.biom', self.input_bioms )
merge_dumps = self.get_outputs( '{basename_woext}.dump', self.input_bioms )
for idx in range(len(self.merge_groups)):
merge_dump_path = merge_dumps[idx]
merge_dump = open( merge_dump_path, "wb" )
pickle.dump( self.merge_groups[idx], merge_dump )
merge_dump.close()
merge = PythonFunction( merge_samples, cmd_format='{EXE} {IN} {OUT} 2>> ' + self.stderr )
MultiMap( merge, inputs=[self.input_bioms, merge_dumps], outputs=tmp_files )
# Process filter and normalisation
normalisation = PythonFunction( filter_and_bootstrap, cmd_format='{EXE} {IN} {OUT} ' + str(self.observation_threshold) + ' ' + str(self.nb_deleted) + ' ' + str(self.nb_selected) + ' ' + str(self.nb_round) + ' 2>> ' + self.stderr )
Map( normalisation, inputs=self.biom, outputs=self.output_files )
Map( normalisation, inputs=tmp_files, outputs=self.output_files )
# Update fasta
if len(self.output_fasta) > 0:
filter_fn = PythonFunction( filter_seq, cmd_format='{EXE} {IN} {OUT} 2>> ' + self.stderr )
MultiMap( filter_fn, inputs=[self.input_fasta, self.output_files], outputs=self.output_fasta )
# Depths stats
depth = PythonFunction( observations_depth, cmd_format='{EXE} {IN} {OUT} 2>> ' + self.stderr )
@@ -26,13 +26,14 @@ from weaver.abstraction import Map
from ng6.analysis import Analysis
def biom_to_krona( exec_path, biom_file, krona_data_file, krona_view_file ):
def biom_to_krona( exec_path, stem_path_krona, biom_file, log_file ):
"""
@summary : Create a krona visualisation for the taxonomy data of the biom file.
@summary : Creates the krona charts for the taxonomic data of each sample in the biom file.
@param exec_path : [str] path to the software ktImportText.
@param stem_path_krona : [str] stem of the path for all krona charts.
                         The function creates one file per sample at this path : <stem_path_krona><sample_name>.html
@param biom_file : [str] path to the biom to process.
@param krona_data_file : [str] path to the output krona data file (tsv with count).
@param krona_view_file : [str] path to the output krona visualisation file.
@param log_file : [str] path to the log file.
"""
from workflows.gene_diversity.lib.Biom import Biom, BiomIO
from subprocess import Popen, PIPE
@@ -41,14 +42,20 @@ def biom_to_krona( exec_path, biom_file, krona_data_file, krona_view_file ):
# Format data
biom = BiomIO.from_json( biom_file )
BiomIO.write_krona_table( krona_data_file, biom )
# Build krona
cmd = [exec_path, "-n", "root", "-o", krona_view_file, krona_data_file]
p = Popen(cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
# write down the stdout
sys.stdout.write(stdout)
# write down the stderr
sys.stderr.write(stderr)
log_fh = open( log_file, "w" )
for sample in biom.get_samples_names():
krona_text_file = stem_path_krona + sample.replace( " ", "_") + ".text"
krona_html_file = stem_path_krona + sample.replace( " ", "_") + ".html"
BiomIO.write_krona_table_by_sample( krona_text_file, biom, sample )
# Build krona
cmd = [exec_path, "-n", "root", "-o", krona_html_file, krona_text_file]
p = Popen(cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
log_fh.write(stdout)
# Write down the stderr
sys.stderr.write(stderr)
log_fh.close()
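A hedged usage sketch (paths hypothetical): with the stem `out/clusters_`, a sample named "sample a" yields `out/clusters_sample_a.text` and `out/clusters_sample_a.html`.

```python
# Hypothetical invocation; ktImportText comes from the KronaTools package
# and the biom file must carry taxonomy metadata.
biom_to_krona("/usr/local/bin/ktImportText", "out/clusters_",
              "out/clusters.biom", "out/krona.log")
```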
def add_tax_metadata( biom_file, blast_file, taxonomy_file, output_file ):
"""
@@ -80,7 +87,7 @@ def add_tax_metadata( biom_file, blast_file, taxonomy_file, output_file ):
query_id = parts[0].split(";")[0]
cluster_annot[query_id] = dict()
subject_id = parts[1].split("#")[0] # Subject field : <ID>#<PARTIAL_DESC>
cluster_annot[query_id]['tax'] = tax[subject_id]
cluster_annot[query_id]['tax'] = map(str.strip, tax[subject_id].split(";"))
cluster_annot[query_id]['evalue'] = parts[10]
cluster_annot[query_id]['identity'] = parts[2]
cluster_annot[query_id]['aln_length'] = parts[3]
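The indices used above follow BLAST's default tabular output (the `-outfmt 6` set in `blast_options` below); for reference, the field order:

```python
# Default BLAST -outfmt 6 columns, in order (0-based, as indexed above):
OUTFMT6_COLUMNS = [
    "qseqid",    # 0  query ID (split on ';' above)
    "sseqid",    # 1  subject ID (split on '#' above)
    "pident",    # 2  percentage of identical matches -> 'identity'
    "length",    # 3  alignment length                -> 'aln_length'
    "mismatch",  # 4
    "gapopen",   # 5
    "qstart",    # 6
    "qend",      # 7
    "sstart",    # 8
    "send",      # 9
    "evalue",    # 10 expect value                    -> 'evalue'
    "bitscore",  # 11
]
```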
@@ -98,7 +105,7 @@ def add_tax_metadata( biom_file, blast_file, taxonomy_file, output_file ):
class GeneOTUClassify( Analysis ):
def define_parameters(self, biom_files, input_fasta, taxonomy_file, databank, blast_used="blastp", evalue="1e-5", word_size=None):
def define_parameters(self, biom_files, input_fasta, taxonomy_file, databank, blast_used="blastp", evalue="1e-5"):
"""
@param biom_files : [list] the Biom files with OTU to process.
@param input_fasta : [list] the fasta files containing the sequences of the OTUs. Each biom file corresponds to one input fasta.
@@ -111,17 +118,16 @@ class GeneOTUClassify( Analysis ):
self.blast_used = blast_used
self.databank = databank
self.blast_options = " -max_target_seqs 1 -evalue " + str(evalue) + " -outfmt 6"
if word_size is not None:
self.blast_options += " -word_size " + str(word_size)
self.biom_basename_woext = self.get_outputs('{basename_woext}_', biom_files)
self.stem_path_krona = self.get_outputs('{basename_woext}_', biom_files) # <output_dir>/<biom>_<sample>.html
# Files
self.biom_files = InputFileList( biom_files )
self.input_fasta = InputFileList( input_fasta, Formats.FASTA )
self.taxonomy_file = InputFile( taxonomy_file )
self.output_files = OutputFileList( self.get_outputs('{basename}', self.biom_files) )
self.krona_files = OutputFileList( self.get_outputs('{basename_woext}_krona.html', self.biom_files) )
self.stderr = os.path.join(self.output_directory, 'geneOTUclassify.stderr')
def define_analysis(self):
self.name = "GeneOTUClassify"
self.description = "Operational Taxon Unit analysis."
@@ -132,7 +138,14 @@
return "-"
def post_process(self):
self._save_files( self.output_files, self.krona_files )
self._save_files( self.output_files )
# Add the krona files
for krona_file in os.listdir( self.output_directory ):
    for stem_path in self.stem_path_krona:
        stem = os.path.basename( stem_path )
        if krona_file.startswith( stem ) and krona_file.endswith( ".html" ):
            sample = krona_file[len(stem):-len(".html")]
            self._add_result_element( sample, "html", self._save_file(os.path.join(self.output_directory, krona_file)), stem )
def process(self):
blast_files = self.get_outputs( '{basename_woext}_blast.tsv', self.biom_files )
@@ -146,6 +159,8 @@
MultiMap( metadata, inputs=[self.biom_files, blast_files], outputs=self.output_files, includes=self.taxonomy_file )
# Biom to krona
krona_data_files = self.get_outputs( '{basename_woext}_krona.tsv', self.krona_files )
krona = PythonFunction( biom_to_krona, cmd_format='{EXE} ' + self.get_exec_path("ktImportText") + ' {IN} {OUT} 2>> ' + self.stderr )
MultiMap( krona, inputs=self.output_files, outputs=[krona_data_files, self.krona_files] )
\ No newline at end of file
krona_log_files = self.get_outputs( '{basename_woext}_krona.log', self.biom_files )
for idx in range(len(self.biom_files)):
krona = PythonFunction( biom_to_krona, cmd_format='{EXE} ' + self.get_exec_path("ktImportText") + ' ' + self.stem_path_krona[idx] + ' {IN} {OUT} 2>> ' + self.stderr )
krona( inputs=self.output_files[idx], outputs=krona_log_files[idx] )
\ No newline at end of file