Commit 6f86913c authored by Penom Nom's avatar Penom Nom
Browse files

Add filter by replicates evidence.

Remove pyplot dependency.
parent 916a2db8
......@@ -209,8 +209,8 @@ class SparseData( dict ):
def nb_at( self, row_idx, col_idx ):
"""
@return : [int] The count for the column col_idx in row row_idx.
@param row_idx : [int] the index of the row.
@param col_idx : [int] the index of the column.
@param row_idx : [int] The index of the row.
@param col_idx : [int] The index of the column.
"""
nb = 0
if self.has_key(row_idx) and self[row_idx].has_key(col_idx):
......@@ -219,7 +219,8 @@ class SparseData( dict ):
def get_col_sum( self, col_idx ):
"""
@todo test
@return : [int] The sum of count for the column col_idx.
@param col_idx : [int] The index of the column.
"""
total = 0
for row_idx in self.keys():
......@@ -229,7 +230,8 @@ class SparseData( dict ):
def get_row_sum( self, row_idx ):
"""
@todo test
@return : [int] The sum of count for the row row_idx.
@param row_idx : [int] The index of the row.
"""
total = 0
if self.has_key( row_idx ):
......@@ -239,7 +241,10 @@ class SparseData( dict ):
def row_to_array( self, row_idx, nb_col ):
"""
@todo test
@return : [list] The count for the row for each column.
Example : '[0, 2, 0, 0]' only the second column has this observation.
@param row_idx : [int] The index of the row.
@param nb_col : [int] The expected number of columns.
"""
array = [0 for current in range(nb_col)]
if self.has_key( row_idx ):
......@@ -249,13 +254,32 @@ class SparseData( dict ):
def add( self, row_idx, col_idx, value ):
    """
    @summary : Add 'value' to the count stored for the column col_idx in row row_idx.
               Missing cells are treated as 0 before the addition.
    @param row_idx : [int] The index of the row.
    @param col_idx : [int] The index of the column.
    @param value : [int] The value to add.
    """
    # NOTE: 'dict.has_key' was removed in Python 3; the 'in' operator is
    # equivalent and works on both Python 2 and Python 3.
    if row_idx not in self:
        self[row_idx] = { col_idx : 0 }
    elif col_idx not in self[row_idx]:
        self[row_idx][col_idx] = 0
    self[row_idx][col_idx] += value
def change( self, row_idx, col_idx, value ):
    """
    @summary : Set the count for the column col_idx in row row_idx to 'value'.
               Setting a cell to 0 removes it, so the matrix stays sparse.
    @param row_idx : [int] The index of the row.
    @param col_idx : [int] The index of the column.
    @param value : [int] The new value.
    """
    # NOTE: 'dict.has_key' was removed in Python 3; the 'in' operator is
    # equivalent and works on both Python 2 and Python 3.
    if value != 0:
        if row_idx not in self:
            self[row_idx] = { col_idx : value }
        else:
            self[row_idx][col_idx] = value
    else:
        # Storing an explicit 0 would break sparsity: delete the cell instead.
        if row_idx in self and col_idx in self[row_idx]:
            del self[row_idx][col_idx]
def add_row( self ):
    """
    @summary : Add a new empty row. An empty row has no stored counts in this
               sparse representation, so nothing needs to be created here.
    """
    pass # Nothing to do
......@@ -294,43 +318,73 @@ class Biom:
def __repr__(self):
    """
    @return : [str] A debug dump of all the instance attributes.
    """
    return str( self.__dict__ )
def OTU_count(self):
def observations_counts(self):
"""
@summary : Return the list of OTU count.
@return : [list] the OTU ID and the OTU count for each OTU.
@summary : Return the list of the observations counts.
@return : [list] the observation ID and the observation count for each observation.
Example :
[
["OTU_1", 128],
["OTU_2", 8]
]
"""
for OTU_idx in range(len(self.rows)):
yield self.rows[OTU_idx]['id'], self.data.get_row_sum(OTU_idx)
for observation_idx in range(len(self.rows)):
yield self.rows[observation_idx]['id'], self.data.get_row_sum(observation_idx)
def remove_OTUs( self, OTU_names ):
def remove_observations( self, observations_names ):
"""
@summary : Removes the specified OTUs.
@param OTU_names : [list] The IDs of the OTUs to remove.
@summary : Removes the specified observations.
@param observations_names : [list] The IDs of the observations to remove.
"""
for current_OTU in OTU_names :
OTU_idx = self.find_idx( self.rows, current_OTU )
for current_observation in observations_names :
observation_idx = self.find_idx( self.rows, current_observation )
# Remove OTU from the self.rows
del self.rows[OTU_idx]
del self.rows[observation_idx]
# Remove OTU from the self.data
self.data.remove_row( OTU_idx )
self.data.remove_row( observation_idx )
def reset_count_by_replicates_evidence( self, samples_names, min_evidence_nb=2 ):
    """
    @summary : Puts to 0 the counts of an observation for all the samples of a
               replicate group when fewer than 'min_evidence_nb' samples of the
               group contain the observation.
               Example with min_evidence_nb = 2 and two triplicates (A and B) :
                 Before process
                          sample_A_1 sample_A_2 sample_A_3 sample_B_1 sample_B_2 sample_B_3
                   obs         1          1          0          0          0          2
                 After process
                          sample_A_1 sample_A_2 sample_A_3 sample_B_1 sample_B_2 sample_B_3
                   obs         1          1          0          0          0          0
    @param samples_names : [list] The names of the replicates.
    @param min_evidence_nb : [int] The minimum number of replicates with the observation.
    """
    replicate_cols = [self.find_idx(self.columns, sample) for sample in samples_names]
    for obs_idx in range( len(self.rows) ):
        # Number of replicate samples where this observation is present
        nb_with_obs = sum( 1 for col in replicate_cols if self.data.nb_at(obs_idx, col) > 0 )
        # Present in the group but with insufficient evidence : wipe the group
        if 0 < nb_with_obs < min_evidence_nb:
            for col in replicate_cols:
                self.data.change( obs_idx, col, 0 )
def filter_OTU_by_count( self, min_nb, max_nb=None ):
"""
@summary : Filter OTUs on count value.
@param min_nb : [int] OTU with a count inferior to 'min_nb' is removed.
@param max_nb : [int] OTU with a count superior to 'min_nb' is removed.
@summary : Filter observations on count value.
@param min_nb : [int] The observations with a count inferior to 'min_nb' is removed.
@param max_nb : [int] The observations with a count superior to 'min_nb' is removed.
"""
OTU_count, nb_total_elts = self._count_by_OTU()
removed_OTU_names = list()
for current_OTU in OTU_count:
if current_OTU['nb'] < min_nb or (max_nb is not None and current_OTU['nb'] > max_nb):
removed_OTU_names.append( current_OTU["id"] )
self.remove_OTUs( removed_OTU_names )
self.remove_observations( removed_OTU_names )
def merge_samples( self, samples, merged_sample_id=None ):
"""
......
......@@ -33,7 +33,7 @@ def observations_depth( input_biom, output_depth ):
nb_observ = 0
# Process depth calculation
biom = BiomIO.from_json( input_biom )
for observation_id, observation_count in biom.OTU_count():
for observation_id, observation_count in biom.observations_counts():
while len(obs_depth) <= observation_count:
obs_depth.append(0)
obs_depth[observation_count] += 1
......@@ -52,9 +52,7 @@ def samples_hclassification( input_biom, output_dendrogram, distance_method, lin
"""
@todo : test
"""
import matplotlib, json
matplotlib.use('Agg') # Force matplotlib to not use any Xwindows backend
import matplotlib.pyplot as plt
import json
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram
import scipy.cluster.hierarchy
......@@ -106,20 +104,13 @@ def samples_hclassification( input_biom, output_dendrogram, distance_method, lin
del biom
if len(samples_names) == 1 :
# Write JSON
out_tree = dict( children=[], name=samples_names[0], dist="0.000" )
json.dump( out_tree, open(output_dendrogram, "w"), sort_keys=True, indent=4 )
else:
# Computing the distance and linkage
data_dist = pdist( data_array, distance_method )
data_link = linkage( data_dist, linkage_method )
# Write graph
dendrogram( data_link, labels=samples_names )
plt.xlabel('Samples')
plt.ylabel('Distance')
plt.xticks(rotation=30, ha='right')
plt.tight_layout()
plt.savefig( output_dendrogram + ".png" )
# Write JSON
scipy_hc_tree = scipy.cluster.hierarchy.to_tree( data_link , rd=False )
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment