Commit ec1f497f authored by Penom Nom's avatar Penom Nom
Browse files

Add GeneOTUStat.

parent 59550112
......@@ -28,7 +28,7 @@ class GeneDiversity (NG6Workflow):
fastqc = self.add_component("FastQC", [self.args['read_1'] + self.args['read_2']])
# Merge overlapping pair
join_pairs = self.add_component("Flash", [self.args['read_1'], self.args['read_2'], self.args["mismatch_ratio"], self.args["min_overlap"], self.args["max_overlap"]])
join_pairs = self.add_component("Flash", [self.args['read_1'], self.args['read_2'], self.args["join_pair"]["mismatch_ratio"], self.args["join_pair"]["min_overlap"], self.args["join_pair"]["max_overlap"]])
# Fastq to fasta
fastq2fasta = self.add_component("Fastq2fasta", [join_pairs.extended_frags])
......@@ -40,7 +40,7 @@ class GeneDiversity (NG6Workflow):
chimera = self.add_component("UsearchChimera", [dereplicate.output_files])
# Index the reference proteins for framebot
framebot_index = self.add_component("FramebotIndex", [self.args["taxonomy"]])
framebot_index = self.add_component("FramebotIndex", [self.args["database"]])
# Sequence traduction
framebot = self.add_component("Framebot", [chimera.nonchimeras, framebot_index.index])
......@@ -52,9 +52,10 @@ class GeneDiversity (NG6Workflow):
cdhit = self.add_component("Cdhit", [merge.output_file, self.args["otu"]["identity_threshold"], self.args["otu"]["length_diff_cutoff"], self.args["otu"]["cluster_most_similar"]])
# Index the reference proteins for blast
blast_index = self.add_component("BlastIndex", [self.args["taxonomy"], "prot"])
blast_index = self.add_component("BlastIndex", [self.args["database"], "prot"])
# Align OTU
blast = self.add_component("Blast", [cdhit.output_files, [{'file':blast_index.databank}], 8, "blastp"])
blast = self.add_component("Blast", [cdhit.output_files, [{'file':blast_index.databank}], 6, "blastp"])
# Stat on OTU
stat = self.add_component("GeneOTUStat", [cdhit.cluster_files, blast.outputs[os.path.basename(blast_index.databank)], self.args["taxonomy"]])
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
from jflow.component import Component
from jflow.iotypes import InputFile, OutputFile, Formats
from jflow.abstraction import MultiMap
from weaver.function import PythonFunction
def gene_OTU_write( cdhit_file, blast_file, taxonomy_file, stat_file ):
"""
@param cdhit_file : the list of '.clstr' files produced by cdhit.
@param blast_file : the blastp result in tabular format (outfmt 6 with NCBI Blast+).
@param taxonomy_file : the taxonomy file.
@param stat_file : the ouput file.
"""
import os
def init_cluster_count(sample_names):
samples_count = {}
for sample in sample_names:
samples_count[sample] = 0
return samples_count
out_fh = open(stat_file, "w")
# first gathers seq/tax values in a hash
tax = {}
for line in open(taxonomy_file).readlines():
parts = line.strip().split("\t")
tax[parts[0]] = parts[1]
# then merge info from with the blast
sample_tax = {}
sample_evalue = {}
sample_identity = {}
for line in open(blast_file).readlines():
parts = line.strip().split()
sample_tax[parts[0].split(";")[0]] = tax[parts[1]]
sample_evalue[parts[0].split(";")[0]] = parts[10]
sample_identity[parts[0].split(";")[0]] = parts[2]
sample_names = []
# first gathers all sample
for line in open(cdhit_file).readlines():
parts = line.strip().split()
# if this is a sample line
if len(parts) == 5:
sample = "_".join(parts[2][1:-3].split("_")[:-1])
if sample not in sample_names: sample_names.append(sample)
out_fh.write( "#ClusterID\t" + "\t".join(sample_names) + "\tTaxonomy\tEvalue\tIdentity\n" )
new_cluster, cluster_name, cluster_count, cluster_taxa, taxa_evalue, taxa_identity = False, None, init_cluster_count(sample_names), None, None, None
for line in open(cdhit_file).readlines():
if line.startswith(">") and cluster_name == None:
cluster_name = line.strip()[1:].replace(" ", "_")
elif line.startswith(">"):
to_print = cluster_name
for sample in sample_names:
to_print += "\t" + str(cluster_count[sample])
out_fh.write( to_print + "\t" + str(cluster_taxa) + "\t" + str(taxa_evalue) + "\t" + str(taxa_identity) + "\n" )
cluster_count = init_cluster_count(sample_names)
cluster_name = line.strip()[1:].replace(" ", "_")
else:
csample = "_".join(line.strip().split()[2][1:-3].split("_")[:-1])
ccount = int(line.strip().split()[2][1:-3].split(";")[1].split("=")[1])
cseq = line.strip().split()[2][1:-3].split(";")[0]
if sample_tax.has_key(cseq):
cluster_taxa = sample_tax[cseq]
taxa_evalue = sample_evalue[cseq]
taxa_identity = sample_identity[cseq]
cluster_count[csample] += ccount
class GeneOTUStat (Component):
def define_parameters(self, cdhit_cluster_file, blast_file, taxonomy_file):
"""
@param cdhit_cluster_file : the list of '.clstr' files produced by cdhit.
@param blast_file : the blastp result in tabular format (outfmt 6 with NCBI Blast+).
@param taxonomy_file : the taxonomy file.
"""
self.cdhit_file = InpuFileList( cdhit_cluster_file )
self.blast_file = InputFileList( blast_file )
self.taxonomy_file = InputFileList( blast_file )
self.output_files = OutputFileList( self.get_outputs('{basename_woext}.stat', self.cdhit_file) )
def process(self):
stat = PythonFunction( gene_OTU_write, cmd_format='{EXE} {IN} {OUT}' )
MultiMap(stat, inputs=[self.cdhit_file, self.blast_file, self.taxonomy_file], outputs=self.output_files )
\ No newline at end of file
......@@ -46,32 +46,45 @@ read_2.help = Read2
read_2.action = append
read_2.required = True
database.name = Gene database
database.flag = --database
database.help = The reference set contains protein representative sequences of the gene target and should be compiled to have a good coverage of diversity of the gene family.
database.type = localfile
database.required = True
taxonomy.name = Gene taxonomy
taxonomy.flag = --taxonomy
taxonomy.help = The reference set contains protein representative sequences of the gene target and should be compiled to have a good coverage of diversity of the gene family.
taxonomy.help = The gene taxonomy. Format : 'GENE_ID<tab>TAX; TAX; TAX;'.
taxonomy.type = localfile
taxonomy.required = True
# Flash
mismatch_ratio.name = Mismatch ratio
mismatch_ratio.flag = --mismatch-ratio
mismatch_ratio.help = Maximum allowed ratio between the number of mismatched base pairs and the overlap length.
mismatch_ratio.type = float
mismatch_ratio.default = 0.1
min_overlap.name = Minimum overlap
min_overlap.flag = --min-overlap
min_overlap.help = The minimum required overlap length between two reads to provide a confident overlap.
min_overlap.type = int
min_overlap.default = 20
max_overlap.name = Maximum overlap
max_overlap.flag = --max-overlap
max_overlap.help = Maximum overlap length expected in approximately 90 percent of read pairs.
max_overlap.type = int
max_overlap.default = 55
# Join pairs
join_pair.name = Join pairs
join_pair.flag = --join
join_pair.help = Options for join the overlapping pairs
join_pair.type = multiple
join_pair.group = JOIN section
# Parameter mismatch_ratio
join_pair.mismatch_ratio.name = Mismatch ratio
join_pair.mismatch_ratio.flag = mismatch-ratio
join_pair.mismatch_ratio.help = Maximum allowed ratio between the number of mismatched base pairs and the overlap length.
join_pair.mismatch_ratio.type = float
join_pair.mismatch_ratio.default = 0.1
# Parameter min_overlap
join_pair.min_overlap.name = Minimum overlap
join_pair.min_overlap.flag = min-overlap
join_pair.min_overlap.help = The minimum required overlap length between two reads to provide a confident overlap.
join_pair.min_overlap.type = int
join_pair.min_overlap.default = 20
# Parameter max_overlap
join_pair.max_overlap.name = Maximum overlap
join_pair.max_overlap.flag = max-overlap
join_pair.max_overlap.help = Maximum overlap length expected in approximately 90 percent of read pairs.
join_pair.max_overlap.type = int
join_pair.max_overlap.default = 55
# Dereplication
dereplication.name = dereplication
dereplication.name = Dereplication
dereplication.flag = --dereplication
dereplication.help = Options for the dereplication
dereplication.type = multiple
......@@ -95,7 +108,7 @@ dereplication.min_unique_size.help = Minimum size for a cluster.
dereplication.min_unique_size.type = int
# OTU
otu.name = dereplication
otu.name = Operational taxonomic unit
otu.flag = --OTU
otu.help = Options for the OTU
otu.type = multiple
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment