Commit 071951bf authored by Penom Nom's avatar Penom Nom
Browse files

- Fix bugs with gene otu analysis.

- Add visualisation for Gene_diversity.
parent 38765a46
{*
Copyright (C) 2009 INRA
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*}
{extends file='AnalysisTemplate.tpl'}
\ No newline at end of file
......@@ -45,9 +45,12 @@ class GeneDiversity (NG6Workflow):
# Sequence traduction
framebot = self.add_component("Framebot", [chimera.nonchimeras, self.args["database"], False])
# Rename the pre-clusters to provide traceback after merge and cd-hit
rename_clusters = self.add_component("RenameClusters", [framebot.corrected_proteins])
# Merge sequences
merge = self.add_component("ConcatenateFiles", [framebot.corrected_proteins, "all_trad_sequences.fasta"])
merge = self.add_component("ConcatenateFiles", [rename_clusters.output_files, "all_trad_sequences.fasta"])
# Create OTU
cdhit = self.add_component("Cdhit", [merge.output_file, self.args["otu"]["identity_threshold"], self.args["otu"]["length_diff_cutoff"], self.args["otu"]["cluster_most_similar"]])
......@@ -56,7 +59,7 @@ class GeneDiversity (NG6Workflow):
blast_index = self.add_component("BlastIndex", [self.args["database"], "prot"])
# Align OTU
blast = self.add_component("Blast", [cdhit.output_files, [{'file':blast_index.databank}], 6, "blastp"])
blast = self.add_component("Blast", [cdhit.output_files, [{'file':blast_index.databank, 'max_hit':1}], 6, "blastp"])
# Stat on OTU
stat = self.add_component("GeneOTUStat", [cdhit.cluster_files, blast.outputs[os.path.basename(blast_index.databank)], self.args["taxonomy"]])
stat = self.add_component("GeneOTUStat", [[rename_clusters.merged_logs], cdhit.cluster_files, blast.outputs[os.path.basename(blast_index.databank)], [self.args["taxonomy"]]])
\ No newline at end of file
......@@ -17,14 +17,15 @@
import os
from jflow.component import Component
from jflow.iotypes import InputFileList, OutputFileList, Formats
from jflow.abstraction import MultiMap
from weaver.function import PythonFunction
from ng6.analysis import Analysis
def gene_OTU_write( cdhit_file, blast_file, taxonomy_file, stat_file ):
def gene_OTU_write( trace_file, cdhit_file, blast_file, taxonomy_file, stat_file ):
"""
@param cdhit_file : [string] the list of '.clstr' files produced by cdhit.
@param blast_file : [string] the blastp result in tabular format (outfmt 6 with NCBI Blast+).
......@@ -39,68 +40,79 @@ def gene_OTU_write( cdhit_file, blast_file, taxonomy_file, stat_file ):
samples_count[sample] = 0
return samples_count
out_fh = open(stat_file, "w")
# first gathers seq/tax values in a hash
# Build an hash with the taxonomy for each gene (key=gene_id ; value=gene_taxonomy)
tax = {}
for line in open(taxonomy_file).readlines():
parts = line.strip().split("\t")
parts = line.strip().split("\t") # Line example : 'ADC62191 Bacteria; Proteobacteria; Gammaproteobacteria; Chromatiales; Chromatiaceae; Allochromatium.; Allochromatium vinosum DSM 180'
tax[parts[0]] = parts[1]
# then merge info from with the blast
sample_tax = {}
sample_evalue = {}
sample_identity = {}
# Retrieve clusters annotations
cluster_annot = dict()
for line in open(blast_file).readlines():
parts = line.strip().split()
sample_tax[parts[0].split(";")[0]] = tax[parts[1]]
sample_evalue[parts[0].split(";")[0]] = parts[10]
sample_identity[parts[0].split(";")[0]] = parts[2]
sample_names = []
# first gathers all sample
for line in open(cdhit_file).readlines():
parts = line.strip().split()
# if this is a sample line
if len(parts) == 5:
sample = "_".join(parts[2][1:-3].split("_")[:-1])
if sample not in sample_names: sample_names.append(sample)
query_id = parts[0].split(";")[0]
cluster_annot[query_id] = dict()
cluster_annot[query_id]['tax'] = tax[parts[1]]
cluster_annot[query_id]['evalue'] = parts[10]
cluster_annot[query_id]['identity'] = parts[2]
# List all samples name
sample_names_dict = dict()
cluster_samples = dict()
for line in open(trace_file).readlines():
parts = line.strip().split() # Trace line example 'OLD_ID<tab>FILE_PATH<tab>NEW_ID'
cluster_new_id = parts[2]
sample = os.path.basename( parts[1] ).split('_')[0]
cluster_samples[cluster_new_id] = sample
if not sample_names_dict.has_key( sample ):
sample_names_dict[sample] = 1
sample_names = sorted(sample_names_dict.keys())
del sample_names_dict
# Process stat
out_fh = open(stat_file, "w")
out_fh.write( "#ClusterID\t" + "\t".join(sample_names) + "\tTaxonomy\tEvalue\tIdentity\n" )
new_cluster, cluster_name, cluster_count, cluster_taxa, taxa_evalue, taxa_identity = False, None, init_cluster_count(sample_names), None, None, None
cluster_count = init_cluster_count(sample_names)
cluster_name, cluster_taxa, taxa_evalue, taxa_identity = None, None, None, None
for line in open(cdhit_file).readlines():
if line.startswith(">") and cluster_name == None:
cluster_name = line.strip()[1:].replace(" ", "_")
elif line.startswith(">"):
to_print = cluster_name
for sample in sample_names:
to_print += "\t" + str(cluster_count[sample])
out_fh.write( to_print + "\t" + str(cluster_taxa) + "\t" + str(taxa_evalue) + "\t" + str(taxa_identity) + "\n" )
cluster_count = init_cluster_count(sample_names)
cluster_name = line.strip()[1:].replace(" ", "_")
else:
csample = "_".join(line.strip().split()[2][1:-3].split("_")[:-1])
ccount = int(line.strip().split()[2][1:-3].split(";")[1].split("=")[1])
cseq = line.strip().split()[2][1:-3].split(";")[0]
if sample_tax.has_key(cseq):
cluster_taxa = sample_tax[cseq]
taxa_evalue = sample_evalue[cseq]
taxa_identity = sample_identity[cseq]
cluster_count[csample] += ccount
if line.startswith(">"):
if cluster_name is not None:
to_print = cluster_name
for sample in sample_names:
to_print += "\t" + str(cluster_count[sample])
out_fh.write( to_print + "\t" + str(cluster_taxa) + "\t" + str(taxa_evalue) + "\t" + str(taxa_identity) + "\n" )
cluster_count = init_cluster_count(sample_names)
cluster_name = line.strip()[1:].replace(" ", "_") #Example : '>Cluster 0' => 'Cluster_0'
else:
pre_cluster_sample = cluster_samples[ line.strip().split()[2][1:-3] ] # Sample line example : '0 126aa, >Cluster33;size=20... at 99.21%'
pre_cluster_count = int(line.strip().split()[2][1:-3].split(";")[-1]) # Sample line example : '0 126aa, >Cluster33;20... at 99.21%'
pre_cluster_name = line.strip().split()[2][1:-3].split(";")[0] # Sample line example : '0 126aa, >Cluster33;size=20... at 99.21%'
# if current pre-cluster is the representative of final cluster
if cluster_annot.has_key(pre_cluster_name):
cluster_taxa = cluster_annot[pre_cluster_name]['tax']
taxa_evalue = cluster_annot[pre_cluster_name]['evalue']
taxa_identity = cluster_annot[pre_cluster_name]['identity']
cluster_count[pre_cluster_sample] += pre_cluster_count
# Last cluster
to_print = cluster_name
for sample in sample_names:
to_print += "\t" + str(cluster_count[sample])
out_fh.write( to_print + "\t" + str(cluster_taxa) + "\t" + str(taxa_evalue) + "\t" + str(taxa_identity) + "\n" )
class GeneOTUStat (Analysis):
def define_parameters(self, cdhit_cluster_file, blast_file, taxonomy_file):
def define_parameters(self, cluster_trace_file, cdhit_cluster_file, blast_file, taxonomy_file):
"""
@param cdhit_cluster_file : [list] the list of '.clstr' files produced by cdhit.
@param blast_file : [list] the blastp results in tabular format (outfmt 6 with NCBI Blast+).
@param taxonomy_file : [list] the taxonomy files.
"""
self.trace_file = cluster_trace_file
self.cdhit_file = InputFileList( cdhit_cluster_file )
self.blast_file = InputFileList( blast_file )
self.taxonomy_file = InputFileList( blast_file )
self.taxonomy_file = InputFileList( taxonomy_file )
self.output_files = OutputFileList( self.get_outputs('{basename_woext}.stat', self.cdhit_file) )
def define_analysis(self):
......@@ -108,10 +120,13 @@ class GeneOTUStat (Analysis):
self.description = "Organizational Taxon Unit analysis."
self.software = "-"
self.options = "-"
def get_version(self):
return "-"
def post_process(self):
self._save_files(self.output_files)
def process(self):
stat = PythonFunction( gene_OTU_write, cmd_format='{EXE} {IN} {OUT}' )
MultiMap(stat, inputs=[self.cdhit_file, self.blast_file, self.taxonomy_file], outputs=self.output_files )
\ No newline at end of file
MultiMap(stat, inputs=[self.trace_file, self.cdhit_file, self.blast_file, self.taxonomy_file], outputs=self.output_files )
\ No newline at end of file
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
from jflow.component import Component
from jflow.iotypes import OutputFileList, InputFileList, OutputFile, Formats
from jflow.abstraction import MultiMap
from weaver.function import ShellFunction, PythonFunction
def rename_clusters( file_idx, input_file, output_file, log_file ):
    """
    Rename every sequence of a fasta file with a short, file-unique cluster ID and log the mapping.

    @param file_idx : [int] the unique id for the input file. It is used to differentiate the ID of clusters in same position of different files.
    @param input_file : [string] the fasta file to process.
    @param output_file : [string] the fasta file with the new IDs.
    @param log_file : [string] each line contains 'OLD_ID<tab>FILE_PATH<tab>NEW_ID'.
    @raise Exception : if a generated ID exceeds 19 characters (longer IDs cannot be traced back through cd-hit).
    """
    import jflow.seqio as seqio
    reader = seqio.SequenceReader(input_file)
    out_fh = open(output_file, "w")
    log_fh = open(log_file, "w")
    try:
        seq_idx = 0
        for id, desc, seq, qual in reader :
            id_fields = id.split(';')
            cluster_size_tag = id_fields[-2] # Id example : 'HWI-M00185:22:000000000-A53C9:1:1101:19665:4362;size=557;' => 'size=557'
            tag, cluster_size = cluster_size_tag.split('=')
            new_id = 'c' + str(seq_idx) + '.' + str(file_idx) + ';' + str(cluster_size)
            # Raising a plain string ('raise "..."') is a TypeError on Python >= 2.6 : raise a real exception instead
            if len( new_id ) > 19:
                raise Exception( "Cluster name '" + new_id + "' is too long to traceback on cd-hit." )
            seqio.writefasta( out_fh, [[new_id, desc, seq, qual]] )
            log_fh.write( id + "\t" + input_file + "\t" + new_id + "\n" )
            seq_idx += 1
    finally:
        # Make sure both files are flushed and closed even if a sequence is malformed
        out_fh.close()
        log_fh.close()
class RenameClusters (Component):
    """
    @summary: Change ID of each sequence and keep the traceback.
              The new ID is 'c<IDX_SEQ_IN_FILE>.<IDX_OF_FILE_IN_LIST>;<CLUSTER_SIZE>' (Maximum length : 19 characters).
              The traceback is a log file with for each line 'OLD_ID<tab>FILE_PATH<tab>NEW_ID'
    @precondition : The IDs before rename step must have this syntax : '<UNIQ_ID>;size=<CLUSTER_SIZE>;' (@see usearch dreplication).
    """

    def define_parameters(self, input_fasta):
        """
        @param input_fasta : [list] fasta list to process
        """
        self.input_fasta = InputFileList( input_fasta, Formats.FASTA )
        # One renamed fasta and one traceback log per input file, both named after the input basename
        renamed_fasta = []
        rename_logs = []
        for fasta_path in input_fasta:
            prefix = os.path.basename( fasta_path ).split(".")[0]
            rename_logs.append( os.path.join(self.output_directory, prefix + '_renameLog.tsv') )
            renamed_fasta.append( os.path.join(self.output_directory, prefix + '_rename.fasta') )
        self.output_files = OutputFileList( renamed_fasta, Formats.FASTA )
        self.logs = OutputFileList( rename_logs )
        self.merged_logs = OutputFile( os.path.join(self.output_directory, 'all_rename_log.tsv') )
        self.stderr = OutputFile( os.path.join(self.output_directory, 'rename.stderr') )

    def process(self):
        # Rename the sequences of each fasta file, writing one traceback log per file
        for file_idx, fasta_path in enumerate(self.input_fasta):
            rename = PythonFunction( rename_clusters, cmd_format='{EXE} ' + str(file_idx) + ' {IN} {OUT} 2>> ' + self.stderr )
            rename( inputs=fasta_path, outputs=[self.output_files[file_idx], self.logs[file_idx]] )
        # Concatenate the per-file logs into a single global traceback file
        merge_logs = ShellFunction( "cat " + " ".join(self.logs) + " > $1 2>> " + self.stderr, cmd_format='{EXE} {OUT}' )
        merge_logs( includes=self.logs, outputs=self.merged_logs )
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment