Commit f8e5e398 authored by Penom Nom's avatar Penom Nom
Browse files

Add some statistics on clustering.

parent 14d56a1f
//Global data shared between the click handlers below.
//Depths histogram: total cluster count per sample name, filled by the
//".depth-hist-view-btn" handler and read back by the chart tooltip formatter.
var sum_data = new Array();
//Linkage view state, (re)initialised by the ".tree-view-btn" handler and
//used by update_linkage_nodes()/toggle().
var linkage_data = null ;  // parsed hierarchical-clustering JSON (tree root)
var tree = null ;          // d3 tree layout
var diagonal = null ;      // d3 diagonal path generator
var svg_layout = null ;    // svg <g> element the nodes/links are drawn into
var i = 0 ;                // lazily-assigned unique node id counter
$(function () {
/**
 * "Dispersion" button: opens the modal with a boxplot of the depth
 * statistics (min, lower quartile, median, upper quartile, max depth)
 * of every checked sample. Values are read from the result table cells.
 */
$(".depth-stat-view-btn").click(function() {
    // Set dialog window content
    $("#modal-label-tmpl").html("NG6 <small> " + $("#analyse_name").val() + "</small>");
    $("#modal-body-tmpl").html('<div id="highcharts_container"></div>');
    $("#modal-foot-tmpl").html('<button class="btn" data-dismiss="modal" aria-hidden="true"><i class="icon-remove"></i> Close</button>');
    $("#highcharts_container").css('max-width', '700px');
    $("#ng6modal").css('width', 'auto');
    // Retrieve the five boxplot values of each checked sample
    var clustering_names = new Array();
    var clustering_data = new Array();
    $(":checked[id^=chk_sample_]").each(function(){
        var index = $(this).attr("id").split("_")[2];
        var min = parseFloat( $("#min_"+index).text() );
        var lq = parseFloat( $("#lq_"+index).text() );
        var median = parseFloat( $("#median_"+index).text() );
        var uq = parseFloat( $("#uq_"+index).text() );
        var max = parseFloat( $("#max_"+index).text() );
        clustering_data.push( new Array(min, lq, median, uq, max) );
        clustering_names.push( $("#sample_id_"+index).text() );
    });
    // Build chart: Highcharts renders on construction, so the unused 'chart'
    // variable is dropped; trailing commas removed from the object literals
    // (they are syntax errors in legacy IE).
    new Highcharts.Chart({
        chart: {
            type: 'boxplot',
            renderTo: 'highcharts_container'
        },
        title: {
            text: 'Clustering depths'
        },
        legend: {
            enabled: false
        },
        xAxis: {
            categories: clustering_names
        },
        yAxis: {
            title: {
                text: 'Depths'
            }
        },
        series: [{
            name: 'Depths',
            data: clustering_data
        }],
        credits: {
            enabled: false
        }
    });
    // Display
    $("#ng6modal").modal();
});
/**
 * "Depths" button: opens the modal with a column chart of the clusters
 * count by depth for every checked sample. The per-sample totals are
 * stashed in the global sum_data so the shared tooltip can show each
 * count as a percentage of the sample's total.
 */
$(".depth-hist-view-btn").click(function() {
    // Set dialog window content
    $("#modal-label-tmpl").html("NG6 <small> " + $("#analyse_name").val() + "</small>");
    $("#modal-body-tmpl").html('<div id="highcharts_container"></div>');
    $("#modal-foot-tmpl").html('<button class="btn" data-dismiss="modal" aria-hidden="true"><i class="icon-remove"></i> Close</button>');
    $("#highcharts_container").css('width', '845px');
    $("#ng6modal").css('width', 'auto');
    $("#ng6modal").css('margin-left', '-435px');
    // Retrieve the counts-by-depth series and total count of each checked sample
    sum_data = new Array();
    var counts_data = new Array();
    $(":checked[id^=chk_sample_]").each(function(){
        var index = $(this).attr("id").split("_")[2];
        var counts = $("#counts_"+index).val().split(",");
        for (var j=0 ; j<counts.length ; j++) {
            counts[j] = parseInt( counts[j], 10 );  // explicit radix: never octal-parse
        }
        counts_data.push(
            { name: $("#sample_id_"+index).text(),
              data: counts,
              pointStart: 1 }
        );
        sum_data[$("#sample_id_"+index).text()] = parseFloat( $("#count_"+index).text() );
    });
    // Build chart (rendering happens on construction; the unused 'chart'
    // variable and the old-IE-breaking trailing commas are removed)
    new Highcharts.Chart({
        chart: {
            type: 'column',
            renderTo: 'highcharts_container'
        },
        title: {
            text: 'Depths by cluster count'
        },
        xAxis: {
            title: {
                text: 'Depths'
            },
            tickInterval: 20
        },
        yAxis: {
            title: {
                text: 'Clusters count'
            },
            type: 'logarithmic',
            minorTickInterval: 0.1
        },
        tooltip: {
            // Shared tooltip: one row per series with the clusters count and
            // its percentage of the sample's total (from the global sum_data).
            formatter: function() {
                // 'var' added: these were implicit globals in the original
                var tooltip_head = '<b>Depths ' + this.x + '</b>';
                var tooltip_body = '';
                for( var i=0 ; i<this.points.length ; i++) {
                    var pcnt = (this.points[i].point.y / sum_data[this.points[i].series.name]) * 100;
                    tooltip_body += '<tr>' +
                        '<td style="color:' + this.points[i].series.color +'">' + this.points[i].series.name + ' : </td>' +
                        '<td> ' + this.points[i].point.y + ' </td>' +
                        '<td> (' + Highcharts.numberFormat(pcnt) + '%)</td>' +
                        '<td> clusters</td>' +
                        '</tr>';
                }
                return tooltip_head + '<table>' + tooltip_body + '</table>';
            },
            shared: true,
            useHTML: true
        },
        plotOptions: {
            column: {
                pointPadding: 0.1,
                borderWidth: 0
            }
        },
        series: counts_data,
        credits: {
            enabled: false
        }
    });
    // Display
    $("#ng6modal").modal();
});
/**
 * "Linkages" button: opens the modal with a collapsible d3 dendrogram of
 * the hierarchical clustering of the checked sample. This is a
 * single-selection button; if several samples are checked, only the last
 * one iterated is displayed.
 */
$(".tree-view-btn").click(function() {
    // Set dialog window content
    $("#modal-label-tmpl").html("NG6 <small> " + $("#analyse_name").val() + "</small>");
    $("#modal-body-tmpl").html("");
    $("#modal-foot-tmpl").html('<button class="btn" data-dismiss="modal" aria-hidden="true"><i class="icon-remove"></i> Close</button>');
    $("#ng6modal").css('width', 'auto');
    $("#ng6modal").css('margin-left', '-435px');
    // Retrieve the linkage JSON of the checked sample
    linkage_data = null;
    $(":checked[id^=chk_sample_]").each(function(){
        var index = $(this).attr("id").split("_")[2];
        linkage_data = $.parseJSON( $("#linkage_"+index).val() );
    });
    // Build chart. Numbers instead of the original string literals so the
    // "height / 2" below is an honest numeric division; the redundant
    // null-resets of tree/diagonal/svg_layout (immediately reassigned) are gone.
    var width = 800;
    var height = 550;
    i = 0;  // reset the node id counter used by update_linkage_nodes()
    tree = d3.layout.tree()
        .size([height, width]);
    diagonal = d3.svg.diagonal()
        .projection(function(d) { return [d.y, d.x]; });
    svg_layout = d3.select("#modal-body-tmpl").append("svg:svg")
        .attr("width", width)
        .attr("height", height)
        .append("svg:g")
        .attr("transform", "translate(20,20)");
    // Initial position of the root: middle of the left edge
    linkage_data.x0 = height / 2;
    linkage_data.y0 = 0;
    update_linkage_nodes(linkage_data);
    // Display
    $("#ng6modal").modal();
});
/**
 * Draw/update linkage tree.
 * Standard d3 (v3) enter/update/exit pattern: new nodes/links appear at the
 * previous position of 'source' and transition to their computed position;
 * removed ones collapse back onto 'source'.
 * @param source the node that triggered the redraw (tree root on the first
 *        call, the clicked node on a toggle); its stashed x0/y0 coordinates
 *        are the start point of the enter transitions.
 */
function update_linkage_nodes(source) {
// Slow-motion transition (5s) when Alt is held during the click, else 0.5s.
var duration = d3.event && d3.event.altKey ? 5000 : 500;
// Compute the new tree layout.
var nodes = tree.nodes(linkage_data).reverse();
// Normalize for fixed-depth: 180px horizontally per tree level.
nodes.forEach(function(d) { d.y = d.depth * 180; });
// Update the nodes, keyed by a lazily-assigned unique id (global counter i).
var node = svg_layout.selectAll("g.d3-node")
.data(nodes, function(d) { return d.id || (d.id = ++i); });
// Enter any new nodes at the parent's previous position.
var nodeEnter = node.enter().append("svg:g")
.attr("class", "d3-node")
.attr("transform", function(d) { return "translate(" + source.y0 + "," + source.x0 + ")"; })
.on("click", function(d) { toggle(d); });
// Dot starts invisible (r ~ 0) and is styled by collapsed state.
nodeEnter.append("svg:circle")
.attr("r", 1e-6)
.attr("class", function(d) { return d._children ? "d3-collapsed-node" : "d3-node-dot"; });
// Label: left of internal nodes, right of leaves; fades in via fill-opacity.
nodeEnter.append("svg:text")
.attr("x", function(d) { return d.children || d._children ? -10 : 10; })
.attr("dy", ".35em")
.attr("text-anchor", function(d) { return d.children || d._children ? "end" : "start"; })
.text(function(d) { return d.name; })
.style("fill-opacity", 1e-6);
// Transition nodes to their new position.
var nodeUpdate = node.transition()
.duration(duration)
.attr("transform", function(d) { return "translate(" + d.y + "," + d.x + ")"; });
nodeUpdate.select("circle")
.attr("r", 4.5)
.attr("class", function(d) { return d._children ? "d3-collapsed-node" : "d3-node-dot"; });
nodeUpdate.select("text")
.style("fill-opacity", 1);
// Transition exiting nodes to the parent's new position, then remove them.
var nodeExit = node.exit().transition()
.duration(duration)
.attr("transform", function(d) { return "translate(" + source.y + "," + source.x + ")"; })
.remove();
nodeExit.select("circle")
.attr("r", 1e-6);
nodeExit.select("text")
.style("fill-opacity", 1e-6);
// Update the links, keyed by target node id.
var link = svg_layout.selectAll("path.d3-link")
.data( tree.links(nodes), function(d) { return d.target.id; } );
// Enter any new links at the parent's previous position (degenerate path),
// then expand them to the real diagonal.
link.enter().insert("svg:path", "g")
.attr("class", "d3-link")
.attr("d", function(d) {
var o = {x: source.x0, y: source.y0};
return diagonal({source: o, target: o});
})
.transition()
.duration(duration)
.attr("d", diagonal);
// Transition links to their new position.
link.transition()
.duration(duration)
.attr("d", diagonal);
// Transition exiting links back onto the parent's new position, then remove.
link.exit().transition()
.duration(duration)
.attr("d", function(d) {
var o = {x: source.x, y: source.y};
return diagonal({source: o, target: o});
})
.remove();
// Stash the old positions so the next update can transition from them.
nodes.forEach(function(d) {
d.x0 = d.x;
d.y0 = d.y;
});
}
/**
 * Hide/Unhide children's node when toggle on node in linkage view.
 * Collapsed children are stashed in d._children — the convention
 * update_linkage_nodes() uses to style collapsed nodes.
 * @param d the clicked d3 tree node.
 */
function toggle( d ) {
//Hide/Unhide attributes: swap children <-> _children
if (d.children) {
d._children = d.children;
d.children = null;
} else {
d.children = d._children;
d._children = null;
}
//Update node and her children in SVG, transitioning from the clicked node
update_linkage_nodes(d);
}
});
\ No newline at end of file
{*
Copyright (C) 2009 INRA
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*}
{* Clustering results view: one table row per sample with its depth
   statistics, plus hidden inputs (depths_N / counts_N / linkage_N) that
   the clustering JavaScript reads to build the modal charts. *}
{extends file='AnalysisTemplate.tpl'}
{block name=results_title} Clustering results {/block}
{block name=results}
<table class="table table-striped table-bordered dataTable analysis-result-table">
<thead>
<tr>
<th><center><input type="checkbox" id="chk_all_sample"></center></th>
<th class="string-sort">Samples ({$analyse_results|@count})</th>
<th class="numeric-sort">Count</th>
<th class="numeric-sort">Min depth</th>
<th class="numeric-sort">Lower quartile depth</th>
<th class="numeric-sort">Median depth</th>
<th class="numeric-sort">Upper quartile depth</th>
<th class="numeric-sort">Max depth</th>
</tr>
</thead>
<tbody>
{* $i numbers the rows so the element ids match what the JS expects. *}
{assign var="i" value=0}
{* NOTE(review): $totale, $total1 and $prct_extend_total are assigned but
   never used in this block — confirm before removing. *}
{assign var="totale" value=0}
{assign var="total1" value=0}
{assign var="prct_extend_total" value=0}
{assign var="analyse_results_sorted" value=$analyse_results|@ksort}
{foreach from=$analyse_results_sorted key=sample item=sample_results}
<tr>
<td><center>
<input type="checkbox" id="chk_sample_{$i}" value="sample">
<input type="hidden" id="depths_{$i}" value="{$sample_results["default"].depths}"/>
<input type="hidden" id="counts_{$i}" value="{$sample_results["default"].counts}"/>
<input type="hidden" id="linkage_{$i}" value='{$sample_results["default"].linkage}'/>
</center></td>
<td id="sample_id_{$i}">{$sample|get_description:$descriptions}</td>
{* depths is a comma-separated sorted list: first element is the min depth,
   last element the max depth. *}
{assign var="array_depths" value=","|explode:$sample_results["default"].depths}
{assign var="min" value=$array_depths[0]}
{assign var="max" value=$array_depths[count($array_depths)-1]}
<td id="count_{$i}">{$sample_results["default"].sum}</td>
<td id="min_{$i}">{$min}</td>
<td id="lq_{$i}">{$sample_results["default"].lower_quartile}</td>
<td id="median_{$i}">{$sample_results["default"].median}</td>
<td id="uq_{$i}">{$sample_results["default"].upper_quartile}</td>
<td id="max_{$i}">{$max}</td>
</tr>
{/foreach}
</tbody>
<tfoot>
<tr>
<th align="left" colspan="8">
With selection :
{* multiple-selection-btn / single-selection-btn classes drive how many
   checked samples each button accepts. *}
<button type="button" class="btn multiple-selection-btn depth-stat-view-btn"><i class=" icon-signal"></i> Dispersion</button>
<button type="button" class="btn multiple-selection-btn depth-hist-view-btn"><i class=" icon-signal"></i> Depths</button>
<button type="button" class="btn single-selection-btn tree-view-btn"><i class=" icon-signal"></i> Linkages</button>
</th>
</tr>
</tfoot>
</table>
{/block}
\ No newline at end of file
......@@ -25,15 +25,15 @@ class GeneDiversity (NG6Workflow):
def process(self):
# Add raw files
addrawfiles = self.add_component( "AddRawFiles", [self.runobj, self.args["read_1"]+self.args["read_2"], "none"] )
addrawfiles = self.add_component( "AddRawFiles", [self.runobj, self.args['read_1'] + self.args['read_2'], "none"] )
# Trim sequences
trim_R1 = self.add_component("Trimmer", [self.args['read_1'], 1, self.args["trim_read_1"]], component_prefix="R1")
trim_R2 = self.add_component("Trimmer", [self.args['read_2'], 1, self.args["trim_read_2"]], component_prefix="R2")
trim_R1 = self.add_component("Trimmer", [self.args['read_1'], 1, self.args['trim_read_1']], component_prefix="R1")
trim_R2 = self.add_component("Trimmer", [self.args['read_2'], 1, self.args['trim_read_2']], component_prefix="R2")
# Make some statistics on raw file
fastqc = self.add_component("FastQC", [trim_R1.output_files + trim_R2.output_files, False, True])
# Merge overlapping pair
join_pairs = self.add_component("Flash", [trim_R1.output_files, trim_R2.output_files, self.args["mismatch_ratio"], self.args["min_overlap"], self.args["max_overlap"]])
......@@ -42,9 +42,9 @@ class GeneDiversity (NG6Workflow):
# Dereplicates sequences
dereplicate = self.add_component("UsearchDereplication", [fastq2fasta.output_files])
# Remove chimeric sequences
chimera = self.add_component("UsearchChimera", [dereplicate.output_files], parent=join_pairs)
chimera = self.add_component("UsearchChimera", [dereplicate.output_files, 8], parent=join_pairs)
# Sequence traduction
split = self.add_component("SplitSeq", [chimera.nonchimeras, 10000])
......@@ -57,13 +57,13 @@ class GeneDiversity (NG6Workflow):
merge = self.add_component("ConcatenateFiles", [rename_clusters.output_files, "all_trad_sequences.fasta"])
# Create OTU
cdhit = self.add_component("Cdhit", [merge.output_file, self.args["otu_identity_threshold"], self.args["otu_length_diff_cutoff"], self.args["otu_cluster_most_similar"]])
# Index the reference proteins for blast
blast_index = self.add_component("BlastIndex", [self.args["database"], "prot"])
cdhit = self.add_component("Cdhit", [merge.output_file, self.args["otu_identity_threshold"], self.args["otu_length_diff_cutoff"], self.args["otu_cluster_most_similar"], 5, ';', [rename_clusters.merged_logs]], parent=chimera)
# Align OTU
# Stat on OTU
blast_index = self.add_component("BlastIndex", [self.args["database"], "prot"])
blast = self.add_component("Blast", [cdhit.output_files, [{'file':blast_index.databank, 'max_hit':1}], 6, "blastp"])
#import_tax_on_biom
otu_stat = self.add_component("GeneOTUStat", [[rename_clusters.merged_logs], cdhit.cluster_files, blast.outputs[os.path.basename(blast_index.databank)], [self.args["taxonomy"]], cdhit.output_files], parent=cdhit)
# Stat on OTU
stat = self.add_component("GeneOTUStat", [[rename_clusters.merged_logs], cdhit.cluster_files, blast.outputs[os.path.basename(blast_index.databank)], [self.args["taxonomy"]], cdhit.output_files], parent=chimera)
\ No newline at end of file
# Normalisation
normalisation = self.add_component("BiomNormalisation", [cdhit.biom_files, 1000, 3000, 100, 1], parent=cdhit)
\ No newline at end of file
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
from workflows.gene_diversity.lib.biomstat import *
from ng6.analysis import Analysis
from jflow.iotypes import InputFileList, OutputFileList, Formats
from weaver.function import PythonFunction
from weaver.abstraction import Map
def filter_and_bootstrap( input_biom, output_biom, observation_threshold, nb_deleted, nb_selected, nb_round ):
    """
    Filter low-count OTU from a biom file, then normalise it by bootstrap
    (multiple random samplings with replacement) and write the result.

    @param input_biom : [str] path to the biom file to process.
    @param output_biom : [str] path to the filtered and normalised biom file written.
    @param observation_threshold : [int] minimum count for an OTU to be kept (0 disables the filter).
    @param nb_deleted : [int] number of elements randomly removed at each round.
    @param nb_selected : [int] number of elements randomly selected at each round.
    @param nb_round : [int] number of sampling rounds.

    Note: this function is launched through a command line (PythonFunction),
    so every numeric parameter may arrive as a string — hence the int() casts.
    """
    from workflows.gene_diversity.lib.Biom import BiomIO
    biom = BiomIO.from_json( input_biom )
    # Filter: cast BEFORE comparing — the original compared the raw (string)
    # value to 0, which does not test the threshold as intended.
    if int(observation_threshold) > 0:
        biom.filter_OTU_by_count( int(observation_threshold) )
    # Normalisation
    biom.bootstrap_by_sample( int(nb_selected), int(nb_deleted), int(nb_round) )
    # Write
    BiomIO.write( output_biom, biom )
class BiomNormalisation (Analysis):
    """
    @summary : Merge, filter and normalise the count of clusters in Biom file.
               The normalisation is based on multiple random samplings with replacement.
    """

    def define_parameters(self, biom, nb_deleted, nb_selected, nb_round, observation_threshold=0, distance_method='euclidean', linkage_method='average', merge_groups=None):
        """
        @param biom : [str] path to the biom file processed.
        @param nb_deleted : [int] the number of elements randomly removed at each round from the initial file.
        @param nb_selected : [int] the number of elements randomly selected at each round from the file without the deleted elements.
        @param nb_round : [int] the number of execution of the process "initial biom -> delete X elements -> select Y elements -> add elements to final biom".
        @param observation_threshold : [int] minimum number of observations for an OTU to be kept (0 disables the filter).
        @param distance_method : [str] distance method for the hierarchical clustering. Accepted values @see biomstat.samples_hclassification.
        @param linkage_method : [str] linkage method for the hierarchical clustering. Accepted values @see biomstat.samples_hclassification.
        @param merge_groups : [dict] TODO: document once the merge step in process() is implemented.
        """
        # Parameters
        self.nb_deleted = nb_deleted
        self.nb_selected = nb_selected
        self.nb_round = nb_round
        self.observation_threshold = observation_threshold
        self.distance_method = distance_method
        self.linkage_method = linkage_method
        self.merge_groups = merge_groups
        # Files: one normalised biom, one depth table and one hclust JSON per input biom
        self.biom = InputFileList( biom )
        self.output_files = OutputFileList( self.get_outputs('{basename_woext}.biom', self.biom) )
        self.depth_files = OutputFileList( self.get_outputs('{basename_woext}_depth.tsv', self.biom) )
        self.hclust_files = OutputFileList( self.get_outputs('{basename_woext}_hclust.json', self.biom) )
        self.stderr = os.path.join( self.output_directory, 'normalisation.stderr' )

    def define_analysis(self):
        self.name = "Normalisation"
        self.description = "Filter and normalisation."
        self.software = "-"
        # Bug fix: 'selected=' now reports nb_selected (it erroneously
        # repeated nb_deleted in the original).
        self.options = "normalisation deleted=" + str(self.nb_deleted) + " selected=" + str(self.nb_selected) + " round=" + str(self.nb_round) + \
                       ";hierarchical_clustering distance=" + self.distance_method + " linkage=" + self.linkage_method

    def get_version(self):
        return "-"

    def post_process(self):
        self._save_files( self.depth_files + self.hclust_files )
        # Parse the depth statistics produced for each normalised biom
        for filepath in self.depth_files:
            # 'total' instead of the original 'sum', which shadowed the builtin
            [depths, counts, total, upper_quartile, median, lower_quartile] = observations_depth_to_stat(filepath)
            sample_name = os.path.basename(filepath).split("_depth.tsv")[0]
            self._add_result_element( sample_name, "depths", ",".join(map(str, depths)) )
            self._add_result_element( sample_name, "counts", ",".join(map(str, counts)) )
            self._add_result_element( sample_name, "sum", str(total) )
            self._add_result_element( sample_name, "upper_quartile", str(upper_quartile) )
            self._add_result_element( sample_name, "median", str(median) )
            self._add_result_element( sample_name, "lower_quartile", str(lower_quartile) )
        # Parse the hierarchical-clustering JSON, collapsed to a single line
        for filepath in self.hclust_files:
            # context manager: the handle is closed even if reading fails
            with open( filepath ) as json_fh:
                hierarchical_clustering = ' '.join( line.strip() for line in json_fh )
            sample_name = os.path.basename(filepath).split("_hclust.json")[0]
            self._add_result_element( sample_name, "linkage", hierarchical_clustering )

    def process(self):
        # Merge samples
        # TO DO *********************************************************
        # Process filter and normalisation
        normalisation = PythonFunction( filter_and_bootstrap, cmd_format='{EXE} {IN} {OUT} ' + str(self.observation_threshold) + ' ' + str(self.nb_deleted) + ' ' + str(self.nb_selected) + ' ' + str(self.nb_round) + ' 2>> ' + self.stderr )
        Map( normalisation, inputs=self.biom, outputs=self.output_files )
        # Depths stats
        depth = PythonFunction( observations_depth, cmd_format='{EXE} {IN} {OUT} 2>> ' + self.stderr )
        Map( depth, inputs=self.output_files, outputs=self.depth_files )
        # Linkage stats
        dendrogram = PythonFunction( samples_hclassification, cmd_format='{EXE} {IN} {OUT} ' + self.distance_method + ' ' + self.linkage_method + ' 2>> ' + self.stderr )
        Map( dendrogram, inputs=self.output_files, outputs=self.hclust_files )
\ No newline at end of file
......@@ -17,12 +17,79 @@
import os
from jflow.component import Component
from subprocess import Popen, PIPE
from workflows.gene_diversity.lib.biomstat import *
from ng6.analysis import Analysis
from jflow.iotypes import OutputFileList, InputFileList, OutputFile, Formats
from jflow.abstraction import MultiMap
from weaver.function import ShellFunction, PythonFunction
from weaver.abstraction import Map
def to_biom( output_biom, clusters_file, precluster_size_sep, trace_file=None ):
"""
@summary : Write a biom file from cdhit results.
@param output_biom : [str] path to the output file.
@param clusters_file : [str] path to the '.clstr' file.
@param precluster_size_sep : [char] if sequences provided to Cdhit are pre-clusters (ex : dereplication step before cd-hit), the number of sequences
represented by each pre-cluster can be added to the end of its ID. In this place the number is separated by the character