Commit 405df757 authored by Penom Nom's avatar Penom Nom
Browse files

Add renamed seq.

parent 5d7e4a76
......@@ -62,4 +62,4 @@ class GeneDiversity (NG6Workflow):
blast = self.add_component("Blast", [cdhit.output_files, [{'file':blast_index.databank, 'max_hit':1}], 6, "blastp"])
# Stat on OTU
stat = self.add_component("GeneOTUStat", [[rename_clusters.merged_logs], cdhit.cluster_files, blast.outputs[os.path.basename(blast_index.databank)], [self.args["taxonomy"]]])
\ No newline at end of file
stat = self.add_component("GeneOTUStat", [[rename_clusters.merged_logs], cdhit.cluster_files, blast.outputs[os.path.basename(blast_index.databank)], [self.args["taxonomy"]], cdhit.output_files], parent=join_pairs)
\ No newline at end of file
......@@ -25,6 +25,32 @@ from weaver.function import PythonFunction
from ng6.analysis import Analysis
def rename_seq( rename_rules_file, input_fasta, renamed_fasta ):
    """
    @summary: Write a renamed_fasta where the sequences ID will be replaced by the new names in rename_rules_file.
    @param rename_rules_file : [string] file with rules to rename sequences. Each line has this syntax 'NEW_NAME<tab>[ELT2<tab>...<tab>]OLD_NAME'.
    @param input_fasta : [string] fasta to process.
    @param renamed_fasta : [string] fasta after process.
    """
    import jflow.seqio as seqio
    # Retrieve new names (OLD_NAME -> NEW_NAME); lines starting with '#' are comments
    new_names = dict()
    rules_fh = open( rename_rules_file )
    try:
        for line in rules_fh:
            if not line.startswith('#'):
                parts = line.strip().split("\t")
                new_names[parts[-1]] = parts[0]
    finally:
        rules_fh.close()
    # Rename sequences
    reader = seqio.SequenceReader( input_fasta )
    out_fh = open( renamed_fasta, "w" )
    try:
        for seq_id, desc, seq, qual in reader:
            # IDs may carry ';'-separated annotations; only the part before the
            # first ';' is used as the lookup key in the rename rules
            real_id = seq_id.split(';')[0]
            seqio.writefasta( out_fh, [[new_names[real_id], desc, seq, qual]] )
    finally:
        # BUGFIX: the original never closed out_fh, so buffered fasta output
        # could stay unflushed when a downstream component reads the file
        out_fh.close()
def gene_OTU_write( trace_file, cdhit_file, blast_file, taxonomy_file, stat_file ):
"""
@param trace_file : [string] each line of this file is a link between cluster ID and sample file ('OLD_ID<tab>FILE_PATH<tab>CLUSTER_ID').
......@@ -53,9 +79,11 @@ def gene_OTU_write( trace_file, cdhit_file, blast_file, taxonomy_file, stat_file
parts = line.strip().split()
query_id = parts[0].split(";")[0]
cluster_annot[query_id] = dict()
cluster_annot[query_id]['tax'] = tax[parts[1]]
subject_id = parts[1].split("#")[0] # Subject filed : <ID>#<PARTIAL_DESC>
cluster_annot[query_id]['tax'] = tax[subject_id]
cluster_annot[query_id]['evalue'] = parts[10]
cluster_annot[query_id]['identity'] = parts[2]
cluster_annot[query_id]['aln_length'] = parts[3]
# List all samples name
sample_names_dict = dict()
......@@ -72,17 +100,17 @@ def gene_OTU_write( trace_file, cdhit_file, blast_file, taxonomy_file, stat_file
# Process stat
out_fh = open(stat_file, "w")
out_fh.write( "#Cluster_ID\t" + "\t".join(sample_names) + "\tTaxonomy\tTaxonomy_Evalue\tTaxonomy_Identity\n" )
out_fh.write( "#Cluster_ID" + "\t" + "\t".join(sample_names) + "\t" + "Taxonomy" + "\t" + "Taxonomy_evalue" + "\t" + "Taxonomy_identity" + "\t" + "Cluster_alignment_length" + "\t" + "Representative_seq" + "\n" )
cluster_count = init_cluster_count(sample_names)
cluster_name, cluster_taxa, taxa_evalue, taxa_identity = None, None, None, None
cluster_name, cluster_taxa, taxa_evalue, taxa_identity, cluster_representative = None, None, None, None, None
for line in open(cdhit_file).readlines():
if line.startswith(">"): # Line example : '>Cluster 0' => 'Cluster_0'
if cluster_name is not None:
to_print = cluster_name
for sample in sample_names:
to_print += "\t" + str(cluster_count[sample])
out_fh.write( to_print + "\t" + str(cluster_taxa) + "\t" + str(taxa_evalue) + "\t" + str(taxa_identity) + "\n" )
out_fh.write( to_print + "\t" + str(cluster_taxa) + "\t" + str(taxa_evalue) + "\t" + str(taxa_identity) + "\t" + aln_length + "\t" + cluster_representative + "\n" )
cluster_count = init_cluster_count(sample_names)
cluster_name = line.strip()[1:].replace(" ", "_")
else: # Line example : '60 130aa, >c6104.0;1... at 99.23%'
......@@ -91,31 +119,36 @@ def gene_OTU_write( trace_file, cdhit_file, blast_file, taxonomy_file, stat_file
pre_cluster_name = line.strip().split()[2][1:-3].split(";")[0]
# if current pre-cluster is the representative of final cluster
if cluster_annot.has_key(pre_cluster_name):
cluster_representative = pre_cluster_name
cluster_taxa = cluster_annot[pre_cluster_name]['tax']
taxa_evalue = cluster_annot[pre_cluster_name]['evalue']
taxa_identity = cluster_annot[pre_cluster_name]['identity']
aln_length = cluster_annot[pre_cluster_name]['aln_length']
cluster_count[pre_cluster_sample] += pre_cluster_count
# Last cluster
to_print = cluster_name
for sample in sample_names:
to_print += "\t" + str(cluster_count[sample])
out_fh.write( to_print + "\t" + str(cluster_taxa) + "\t" + str(taxa_evalue) + "\t" + str(taxa_identity) + "\n" )
out_fh.write( to_print + "\t" + str(cluster_taxa) + "\t" + str(taxa_evalue) + "\t" + str(taxa_identity) + "\t" + aln_length + "\t" + cluster_representative + "\n" )
class GeneOTUStat (Analysis):
def define_parameters(self, cluster_trace_file, cdhit_cluster_file, blast_file, taxonomy_file):
def define_parameters(self, cluster_trace_file, cdhit_cluster_file, blast_file, taxonomy_file, fasta_file):
"""
@param trace_file : [list] each line of this file is a link between cluster ID and sample file ('OLD_ID<tab>FILE_PATH<tab>CLUSTER_ID').
@param cdhit_cluster_file : [list] the list of '.clstr' files produced by cdhit.
@param blast_file : [list] the blastp results in tabular format (outfmt 6 with NCBI Blast+).
@param taxonomy_file : [list] the taxonomy files.
@param taxonomy_file : [list] the taxonomy files.
@param fasta_file : [list] sequences of the clusters.
"""
self.trace_file = cluster_trace_file
self.trace_file = InputFileList( cluster_trace_file )
self.cdhit_file = InputFileList( cdhit_cluster_file )
self.blast_file = InputFileList( blast_file )
self.taxonomy_file = InputFileList( taxonomy_file )
self.output_files = OutputFileList( self.get_outputs('{basename_woext}.stat', self.cdhit_file) )
self.fasta_file = InputFileList( fasta_file, Formats.FASTA )
self.stats = OutputFileList( self.get_outputs('{basename_woext}.stat', self.cdhit_file) )
self.fasta_renamed = OutputFileList( self.get_outputs('{basename_woext}.fasta', self.fasta_file) )
def define_analysis(self):
self.name = "GeneOTUAnalysis"
......@@ -127,8 +160,13 @@ class GeneOTUStat (Analysis):
return "-"
def post_process(self):
self._save_files(self.output_files)
self._save_files(self.stats + self.fasta_renamed)
def process(self):
# Process stats
stat = PythonFunction( gene_OTU_write, cmd_format='{EXE} {IN} {OUT}' )
MultiMap(stat, inputs=[self.trace_file, self.cdhit_file, self.blast_file, self.taxonomy_file], outputs=self.output_files )
\ No newline at end of file
MultiMap(stat, inputs=[self.trace_file, self.cdhit_file, self.blast_file, self.taxonomy_file], outputs=self.stats )
# Rename sequences
rename = PythonFunction( rename_seq, cmd_format='{EXE} {IN} {OUT}' )
MultiMap(rename, inputs=[self.stats, self.fasta_file], outputs=self.fasta_renamed )
\ No newline at end of file
......@@ -25,10 +25,11 @@ from weaver.function import ShellFunction, PythonFunction
def rename_clusters( file_idx, input_file, output_file, log_file ):
"""
@param file_idx : [int] the unique id for the input file. It is used to differentiate the ID of clusters in same position of different files.
@param input_fasta : [string] fasta list to process
@param output_file : [string] the fasta file with the news IDs
@param log_file : [string] each line contains 'OLD_ID<tab>FILE_PATH<tab>NEW_ID'
@summary: Change ID of each sequence and keep the traceback.
@param file_idx : [int] the unique id for the input file. It is used to differentiate the ID of clusters in same position of different files.
@param input_fasta : [string] fasta list to process
@param output_file : [string] the fasta file with the news IDs
@param log_file : [string] each line contains 'OLD_ID<tab>FILE_PATH<tab>NEW_ID'
"""
import jflow.seqio as seqio
......
......@@ -17,7 +17,7 @@
[global]
name = gene_diversity
description = DEV
description = Analyse the composition and function of a microbial community from a functional gene.
#
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment