Commit 8cb6acb4 authored by Penom Nom's avatar Penom Nom
Browse files

Switch to the new version of SparseData.

parent 02d39bb8
......@@ -130,8 +130,143 @@ class DenseData( list ):
"""
self = DenseData()
def _to_list( self ):
    """
    @summary : Returns the data in list form. The object is returned as is.
    @return : The current object itself (not a copy).
    @todo test
    """
    return self
class SparseData( dict ):
    """
    @summary : Sparse count matrix stored as a dict of dicts : self[row_idx][col_idx] = value.
               A cell that is not stored counts as 0.
    """
    def __init__( self, list=None ):
        """
        @summary : Builds the matrix from a list of cells.
        @param list : [list] The initial cells. Each cell is [row_idx, col_idx, value].
                      When a cell is provided several times the last value is kept.
        """
        # None replaces the former mutable default argument.
        if list is not None:
            for data in list:
                self.setdefault( data[0], dict() )[data[1]] = data[2]

    def _to_list( self ):
        """
        @summary : Returns the stored cells ordered by row index then by column index.
        @return : [list] The list of cells. Each cell is [row_idx, col_idx, value].
        @todo test
        """
        sparse = []
        for row_idx in sorted( self ):
            row = self[row_idx]
            for col_idx in sorted( row ):
                sparse.append( [row_idx, col_idx, row[col_idx]] )
        return sparse

    def get_matrix_type( self ):
        """
        @summary : Returns the type of matrix.
        @return : [str] The matrix type.
        """
        return "sparse"

    def remove_col( self, remove_idx ):
        """
        @summary : Remove all the count for the column provided. The columns after
                   the removed one are shifted by -1.
        @param remove_idx : [int] The real index of the column to remove.
        """
        for row in self.values():
            # Remove data
            row.pop( remove_idx, None )
            # Change index. Ascending order guarantees that the target index
            # (col_idx - 1) has already been freed and is never overwritten.
            for col_idx in sorted( row ):
                if col_idx > remove_idx:
                    row[col_idx - 1] = row.pop( col_idx )

    def remove_row( self, remove_idx ):
        """
        @summary : Remove all the count for the row provided. The rows after the
                   removed one are shifted by -1.
        @param remove_idx : [int] The real index of the row to remove.
        """
        # Remove data. A row without any count is not stored, so its absence is
        # not an error (same tolerance as remove_col).
        self.pop( remove_idx, None )
        # Change indexes in ascending order so that no row is overwritten.
        for row_idx in sorted( self ):
            if row_idx > remove_idx:
                self[row_idx - 1] = self.pop( row_idx )

    def merge_col( self, sum_idx, added_idx ):
        """
        @summary : Merge two columns. The count of each row of the first column (sum_idx) becomes the sum of the values of the two columns ; the second column is deleted.
        @param sum_idx : [int] The index of the first column to merge. This column is replaced by the new merged column.
        @param added_idx : [int] The index of the second column to merge. This column is deleted after the process.
        @note : The deletion goes through remove_col, so every column after added_idx (sum_idx included, if it is greater) is shifted by -1.
        """
        # Merge counts. Only the inner dicts change, so iterating self is safe.
        for row_idx in self:
            row = self[row_idx]
            if added_idx in row:
                self.add( row_idx, sum_idx, row[added_idx] )
        # Remove column
        self.remove_col( added_idx )

    def clear( self ):
        """
        @summary : Clear data.
        """
        # Empty the object in place. Rebinding the local name 'self' would leave
        # the matrix untouched.
        dict.clear( self )

    def nb_at( self, row_idx, col_idx ):
        """
        @summary : Returns the count stored for one cell.
        @param row_idx : [int] The index of the row.
        @param col_idx : [int] The index of the column.
        @return : The stored value or 0 if the cell is not stored.
        @todo test
        """
        return self.get( row_idx, {} ).get( col_idx, 0 )

    def get_col_sum( self, col_idx ):
        """
        @summary : Returns the sum of the values stored in one column.
        @param col_idx : [int] The index of the column.
        @return : The sum (0 if nothing is stored for the column).
        @todo test
        """
        return sum( row.get(col_idx, 0) for row in self.values() )

    def get_row_sum( self, row_idx ):
        """
        @summary : Returns the sum of the values stored in one row.
        @param row_idx : [int] The index of the row.
        @return : The sum (0 if nothing is stored for the row).
        @todo test
        """
        return sum( self.get(row_idx, {}).values() )

    def row_to_array( self, row_idx, nb_col ):
        """
        @summary : Returns one row in dense form.
        @param row_idx : [int] The index of the row.
        @param nb_col : [int] The number of columns of the returned array.
        @return : [list] The values of the row. Cells that are not stored are set to 0.
        @todo test
        """
        array = [0] * nb_col
        for col_idx, value in self.get( row_idx, {} ).items():
            array[col_idx] = value
        return array

    def add( self, row_idx, col_idx, value ):
        """
        @summary : Add a value to one cell. The cell is created if it is not stored.
        @param row_idx : [int] The index of the row.
        @param col_idx : [int] The index of the column.
        @param value : The value to add.
        @todo test
        """
        row = self.setdefault( row_idx, dict() )
        row[col_idx] = row.get( col_idx, 0 ) + value

    def add_row( self ):
        """
        @summary : Does nothing.
        """
        pass # Nothing to do
class SparseData( list ):
def add_column( self ):
    """
    @summary : Does nothing. The method exists so that add_column() can be called on this container.
    """
    pass # Nothing to do
class SparseDataOld( list ):
def get_matrix_type( self ):
"""
@summary : Returns the type of matrix.
......@@ -155,7 +290,7 @@ class SparseData( list ):
for clean_data in self:
if clean_data[1] >= remove_idx:
clean_data[1] -= 1
def remove_row( self, remove_idx ):
"""
@summary : Remove all the count for the row provided.
......@@ -172,7 +307,7 @@ class SparseData( list ):
for clean_data in self:
if clean_data[0] >= remove_idx:
clean_data[0] -= 1
def merge_col( self, sum_idx, added_idx ):
"""
@summary : Merge two columns. The count of each row of the first column (sum_idx) becomes the sum of the values of the two columns ; the second column is deleted.
......@@ -268,6 +403,7 @@ class SparseData( list ):
def add_column( self ):
    """
    @summary : Does nothing. The method exists so that add_column() can be called on this container.
    """
    pass # Nothing to do
class Biom:
"""
@summary : Store biological sample by observation contingency tables.
......@@ -416,8 +552,8 @@ class Biom:
def add_metadata( self, subject_name, metadata_name, metadata_value, subject_type="sample"):
"""
@summary : Add a metadata on subject (a sample or an OTU).
@param subject_name : [str] Metadata is added to the sample/OTU with this name.
@summary : Add a metadata on subject (a sample or an observation).
@param subject_name : [str] Metadata is added to the sample/observation with this name.
@param metadata_name : [str] The metadata category (ex : 'taxonomy').
@param metadata_value : [str] The value of metadata (ex : 'Bacteria').
@param subject_type : [str] The type of subject : "sample" or "observation".
......@@ -453,36 +589,18 @@ class Biom:
len(self.columns)
]
self.matrix_type = self.data.get_matrix_type()
save_data = self.data
self.data = save_data._to_list()
json_str = json.dumps( self, default=lambda o: o.__dict__, sort_keys=False, indent=4 )
self.data = save_data
del self.shape
del self.matrix_type
return json_str
def to_count_table( self ):
    """
    @summary : Iterates on the count table : first a title line, then one line per row.
    @return : [generator] The title is "#OTU" followed by the id of each column. Each
              following line is the id of the row followed by the values yielded
              by to_count() for this row.
    @todo test
    """
    # Title line
    header = ["#OTU"]
    header.extend( sample['id'] for sample in self.columns )
    yield header
    # Count lines : the n-th element of to_count() belongs to the n-th row
    for position, counts in enumerate( self.to_count() ):
        yield [self.rows[position]['id']] + counts
def to_count( self ):
    """
    @summary : Iterates on the rows of the count matrix in dense form.
    @return : [generator] For each row index, the result of self.data.row_to_array()
              called with this index and the number of columns.
    @todo test
    """
    row_indexes = range( len(self.rows) )
    sample_count = len( self.columns )
    for current_idx in row_indexes:
        yield self.data.row_to_array( current_idx, sample_count )
def remove_samples( self, samples_names ):
"""
@todo test
@summary : Removes sample(s) from biom.
@param samples_names : [list] The names of the samples to remove.
"""
for current_sample in samples_names :
sample_idx = self.find_idx( self.columns, current_sample )
......@@ -492,23 +610,60 @@ class Biom:
self.data.remove_col( sample_idx )
def add_count( self, observation_name, sample_name, value ):
    """
    @summary : Increases the count of one observation in one sample.
    @param observation_name : [str] The name of the observation (searched in self.rows).
    @param sample_name : [str] The name of the sample (searched in self.columns).
    @param value : [int] The value added to the current count.
    """
    # The names are resolved to matrix indexes, row first, then the count is updated.
    self.data.add(
        self.find_idx( self.rows, observation_name ),
        self.find_idx( self.columns, sample_name ),
        value
    )
def add_observation( self, observation_name, metadata=None ):
    """
    @summary : Add one observation in biom.
    @param observation_name : [str] The observation name.
    @param metadata : [dict] The metadata (keys : metadata names ; values : metadata values).
    @raise ValueError : If an observation with this name already exists.
    """
    try:
        # find_idx is expected to raise ValueError when the name is unknown.
        self.find_idx( self.rows, observation_name )
    # Observation doesn't exist
    except ValueError:
        self.rows.append( {'id':observation_name, 'metadata':None } )
        self.data.add_row()
        # None replaces the former mutable default argument.
        if metadata is not None:
            for metadata_name in metadata:
                self.add_metadata( observation_name, metadata_name, metadata[metadata_name], "observation" )
    # Observation already exists
    else:
        raise ValueError( "The observation '" + observation_name + "' already exists." )
def add_sample( self, sample_name, metadata=None ):
    """
    @summary : Add one sample in biom.
    @param sample_name : [str] The sample name.
    @param metadata : [dict] The metadata (keys : metadata names ; values : metadata values).
    @raise ValueError : If a sample with this name already exists.
    """
    try:
        # find_idx is expected to raise ValueError when the name is unknown.
        self.find_idx( self.columns, sample_name )
    # Sample doesn't exist
    except ValueError:
        self.columns.append( {'id':sample_name, 'metadata':None } )
        self.data.add_column()
        # None replaces the former mutable default argument.
        if metadata is not None:
            for metadata_name in metadata:
                self.add_metadata( sample_name, metadata_name, metadata[metadata_name], "sample" )
    # Sample already exists
    else:
        raise ValueError( "The sample '" + sample_name + "' already exists." )
def get_samples_names( self ):
    """
    @summary : Iterates on the names of the samples, in the order of self.columns.
    @return : [generator] Yields the "id" of each element of self.columns.
    """
    for current_sample in self.columns:
        sample_name = current_sample["id"]
        yield sample_name
########################################################################
def _hash_OTU_by_sample( self ):
"""
@summary : Count the number of elements by OTU.
......@@ -531,6 +686,9 @@ class Biom:
return nb_OTU_by_sample, sum_by_sample
def bootstrap_by_sample( self, nb_selected_elts, nb_removed_elts, nb_selection_round=1000 ):
"""
@todo test
"""
nb_OTU_by_sample, sum_by_sample = self._hash_OTU_by_sample()
self.data.clear()
self.generated_by += " | normalisation[delete : " + str(nb_removed_elts) + "; select : " + str(nb_selected_elts) + "; round : " + str(nb_selection_round) + "]"
......@@ -546,9 +704,39 @@ class Biom:
OTU_idx = self.find_idx( self.rows, OTU_id )
sample_idx = self.find_idx( self.columns, current_sample['id'] )
self.data.add( OTU_idx, sample_idx, selected[OTU_id] )
self.data.sort()
########################################################################
##################################################
# self.data.sort()
##################################################
def to_count( self ):
    """
    @summary : Iterates on the observations and gives, for each one, its count in every sample.
    @return : [generator] One list per observation ; the n-th value of the list is the count
              for the n-th sample.
              Example : [1, 0] # Iteration 1 : observation_1 is counted once in sample_1 and never in sample_2
                        [1, 8] # Iteration 2 : observation_2 is counted once in sample_1 and eight times in sample_2
    """
    observation_indexes = range( len(self.rows) )
    sample_count = len( self.columns )
    for observation_idx in observation_indexes:
        yield self.data.row_to_array( observation_idx, sample_count )
def to_count_table( self ):
    """
    @summary : Returns the count of observations by sample with titles.
    @return : [generator] The generator to iterate on observations. First line is a title.
              Example : ['#Observation', 'Sample1', 'Sample2'] # Iteration 1 : title
                        ['GG_OTU_1', 1, 0] # Iteration 2 : Sample1 has one GG_OTU_1, Sample2 has zero GG_OTU_1
                        ['GG_OTU_2', 1, 8] # Iteration 3 : Sample1 has one GG_OTU_2, Sample2 has eight GG_OTU_2
    """
    # Return Title
    yield ["#Observation"] + [col['id'] for col in self.columns]
    # Return lines : the n-th element of to_count() belongs to the n-th observation
    for row_idx, row in enumerate( self.to_count() ):
        yield [self.rows[row_idx]['id']] + row
class BiomIO:
......@@ -645,7 +833,6 @@ class BiomIO:
@summary : Write count table from an object 'Biom'.
@param path : [str] The path of the biom file.
@param biom : [Biom] The Biom object to write.
@todo test
"""
out_fh = open( path, "w" )
for line in biom.to_count_table():
......@@ -659,7 +846,7 @@ class BiomIO:
"""
out_fh = open( path, "w" )
for idx in range(len(biom.rows)):
count = biom.data.get_row_sum( idx )######################## TO DO wrapping
count = biom.data.get_row_sum( idx )
tax = biom.rows[idx]["metadata"]["taxonomy"]
if isinstance(tax, list) or isinstance(tax, tuple):
tax = "\t".join( map(str, tax) )
......@@ -668,6 +855,25 @@ class BiomIO:
tax = "\t".join( map(str.strip, tax.split(";")) ) # Replace space separator between ranks by tabulation
out_fh.write( str(count) + "\t" + tax + "\n" )
out_fh.close()
@staticmethod
def write_krona_table_by_sample( path, biom, sample ):
"""
@todo test
"""
out_fh = open( path, "w" )
col_idx = biom.find_idx( biom.columns, sample )
for row_idx in range(len(biom.rows)):
count = biom.data.nb_at( row_idx, col_idx )
if count > 0:
tax = biom.rows[idx]["metadata"]["taxonomy"]
if isinstance(tax, list) or isinstance(tax, tuple):
tax = "\t".join( map(str, tax) )
else:
tax = str( tax )
tax = "\t".join( map(str.strip, tax.split(";")) ) # Replace space separator between ranks by tabulation
out_fh.write( str(count) + "\t" + tax + "\n" )
out_fh.close()
@staticmethod
def load_metadata( biom, metadata_file, subject_type="sample", types={}, list_sep={} ):
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment