Commit 8755d3ca authored by Penom Nom's avatar Penom Nom
Browse files

Add demultiplex_stat analysis.

parent 3f7ebbb4
......@@ -339,6 +339,8 @@ class Utils(object):
"""
read1_files = []
read2_files = []
undetermined_read1_files = []
undetermined_read2_files = []
mids_desc_array = {}
# Parse the sample sheet
......@@ -374,7 +376,7 @@ class Utils(object):
# Filter on project name
aux_samples = []
for current_sample in samples:
if re.match("Project_" + project_name + "/Sample_.+", current_sample['subdir']) is not None:
if (re.match("Project_" + project_name + "/Sample_.+", current_sample['subdir']) is not None) or (current_sample['subdir'].startswith("Undetermined_indices")):
aux_samples.append(current_sample)
samples = aux_samples
......@@ -382,18 +384,25 @@ class Utils(object):
raise ValueError, "The project " + project_name + " in lane " + lane_number + " doesn't exist in CASAVA directory " + casava_directory + "."
# Create files lists
print samples
for current_sample in samples:
if not current_sample['subdir'].startswith("Undetermined_indices"): # Skip the folder with the incorrect indexes
print current_sample
# Write line in the index description
if current_sample['barcode'] != "NoIndex":
if (current_sample['barcode'] != "NoIndex") and (not current_sample['subdir'].startswith("Undetermined_indices")) :
mids_desc_array[current_sample['barcode']] = current_sample['sample_id']
# Write files lists
for file in os.listdir(casava_directory + "/" + current_sample['subdir']):
if file.endswith(".fastq.gz") and re.search(".*_L00" + lane_number + "_.*", file):
if re.search(".*_R1_.*", file):
if not current_sample['subdir'].startswith("Undetermined_indices"):
read1_files.append(casava_directory + "/" + current_sample['subdir'] + "/" + file)
else:
undetermined_read1_files.append(casava_directory + "/" + current_sample['subdir'] + "/" + file)
if re.search(".*_R2_.*", file):
if not current_sample['subdir'].startswith("Undetermined_indices"):
read2_files.append(casava_directory + "/" + current_sample['subdir'] + "/" + file)
else:
undetermined_read2_files.append(casava_directory + "/" + current_sample['subdir'] + "/" + file)
return mids_desc_array, read1_files, read2_files
\ No newline at end of file
return mids_desc_array, read1_files, read2_files, undetermined_read1_files, undetermined_read2_files
\ No newline at end of file
{*
Copyright (C) 2009 INRA
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*}
{extends file='AnalysisTemplate.tpl'}
{block name=params_title} Parameters {/block}
{block name=params_content}
{assign var="params" value=" "|explode:$analyse.params}
<ul>
<li class="parameter">Unknown indices with a number of fragments < {$params[0]*100}% of the number of fragments in the sample with the littlest population are merged in "All others".</li>
</ul>
{/block}
{block name=results_title} Illumina metrics {/block}
{block name=results}
<table class="table table-striped table-bordered dataTable analysis-result-table">
<thead>
<tr>
<th class="string-sort">Sample ({$analyse_results|@count})</th>
<th class="string-sort">Type</th>
<th class="string-sort">Index</th>
<th class="numeric-sort">% of total fragments</th>
<th class="numeric-sort">Number of fragments</th>
<th class="numeric-sort">% of passing filter</th>
</tr>
</thead>
{assign var="analyse_results_sorted" value=$analyse_results|@ksort}
{assign var="total" value=0}
{assign var="undetermined" value=0}
{assign var="determined" value=0}
{foreach from=$analyse_results_sorted key=sample item=sample_results}
{if array_key_exists("undetermined", $sample_results)}
{$total = $total+$sample_results["undetermined"].number}
{$undetermined = $undetermined+$sample_results["undetermined"].number}
{else}
{$total = $total+$sample_results["determined"].number}
{$determined = $determined+$sample_results["determined"].number}
{/if}
{/foreach}
<tbody>
{foreach from=$analyse_results_sorted key=sample item=sample_results}
<tr>
{if array_key_exists("undetermined", $sample_results)}
<td>-</td>
<td>Undetermined</td>
<td>{$sample}</td>
<td>{((($sample_results["undetermined"].number)/$total)*100)|number_format:2:'.':' '}</td>
<td>{$sample_results["undetermined"].number|number_format:0:' ':' '}</td>
<td>{($sample_results["undetermined"].passing_filter*100/$sample_results["undetermined"].number)|number_format:2:'.':' '}</td>
{else}
<td>{$sample|get_description:$descriptions}</td>
<td>Sample</td>
<td>{$sample}</td>
<td>{((($sample_results["determined"].number)/$total)*100)|number_format:2:'.':' '}</td>
<td>{$sample_results["determined"].number|number_format:0:' ':' '}</td>
<td>{($sample_results["determined"].passing_filter*100/$sample_results["determined"].number)|number_format:2:'.':' '}</td>
{/if}
</tr>
{/foreach}
</tbody>
{if $analyse_results|@count > 1 }
<tfoot>
<tr>
<th>Total Samples</th>
<th></th>
<th></th>
<th>{(($determined/$total)*100)|number_format:2:'.':' '}</th>
<th>{$determined|number_format:0:' ':' '}</th>
<th></th>
</tr>
<tr>
<th>Total Undetermined</th>
<th></th>
<th></th>
<th>{(($undetermined/$total)*100)|number_format:2:'.':' '}</th>
<th>{$undetermined|number_format:0:' ':' '}</th>
<th></th>
</tr>
</tfoot>
{/if}
</table>
{/block}
{block name=download}{/block}
\ No newline at end of file
......@@ -37,9 +37,12 @@ class CasavaQualityCheck (NG6Workflow):
if self.args['casava_directory'] is not None :
if self.args['lane_number'] is None :
raise ValueError, "lane-number must be specified with casava-directory."
mids_desc_array, self.read1_files, self.read2_files = Utils.filesFromCasava( self.args['casava_directory'], self.project.get_name(), self.args['lane_number'] )
mids_desc_array, self.read1_files, self.read2_files, undetermined_read1_files, undetermined_read2_files = Utils.filesFromCasava( self.args['casava_directory'], self.project.get_name(), self.args['lane_number'] )
group_prefix = (Utils.get_group_basenames(self.read1_files+self.read2_files, "read")).keys()
self.runobj.add_mids_description(mids_desc_array)
# statistics about demultiplexing
if len(undetermined_read1_files) > 0 :
demultiplex_stats = self.add_component("DemultiplexStats", [self.read1_files, undetermined_read1_files])
elif (self.args['read_1'] is not None) and (len(self.args['read_1']) > 0) :
self.read1_files = []
self.read2_files = []
......
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os, glob, re
from subprocess import Popen, PIPE
from xml.dom.minidom import parse
from jflow.component import Component
from jflow.iotypes import OutputFile, OutputFileList, InputFile, InputFileList, Formats
from jflow.abstraction import MultiMap
from ng6.analysis import Analysis
import ng6.seqio as seqio
class DemultiplexStats (Analysis):
def define_parameters(self, determined_R1_files, undetermined_R1_files, index_count_threshold=0.8):
self.determined_R1_files = determined_R1_files
self.undetermined_R1_files = undetermined_R1_files
self.index_count_threshold = index_count_threshold #Unknown indices with a number of fragments < index_count_threshold*number_of_fragments_in_sample_with_littlest_population are merged in "All others".
def define_analysis(self):
self.name = "DemultiplexStats"
self.description = "Statistics about demultiplexing"
self.software = "-"
self.options = str(self.index_count_threshold)
def _count_indices(self, files):
indices_stat = {}
for current_file in files:
reader = seqio.SequenceReader(current_file)
for id, desc, seq, qualities in reader:
match = re.search("^\d:[Y|N]:\d+:([ATGCN]+)", desc)
if match:
index_seq = match.group(1)
if not indices_stat.has_key(index_seq) :
indices_stat[index_seq] = {}
indices_stat[index_seq]["number"] = 0
indices_stat[index_seq]["passing_filter"] = 0
indices_stat[index_seq]["number"] += 1
if re.match("^\d:N:\d+:[ATGCN]+", desc) :
indices_stat[index_seq]["passing_filter"] += 1
else:
raise ValueError, "The description '" + desc + "' of the sequence " + id + " is in an invalid format."
return indices_stat
def post_process(self):
# Process samples
min_determined = -1
indices_stat = self._count_indices(self.determined_R1_files)
for index_seq in indices_stat.keys():
if min_determined == -1 or min > indices_stat[index_seq]["number"] :
min_determined = indices_stat[index_seq]["number"]
self._add_result_element(index_seq, "number", str(indices_stat[index_seq]["number"]), "determined")
self._add_result_element(index_seq, "passing_filter", str(indices_stat[index_seq]["passing_filter"]), "determined")
# Process unknown indices
other = {"number":0, "passing_filter":0}
indices_stat = self._count_indices(self.undetermined_R1_files)
for index_seq in indices_stat.keys():
if indices_stat[index_seq]["number"] >= self.index_count_threshold*min_determined:
self._add_result_element(index_seq, "number", str(indices_stat[index_seq]["number"]), "undetermined")
self._add_result_element(index_seq, "passing_filter", str(indices_stat[index_seq]["passing_filter"]), "undetermined")
else:
other["number"] += indices_stat[index_seq]["number"]
other["passing_filter"] += indices_stat[index_seq]["passing_filter"]
self._add_result_element("All others", "number", str(other["number"]), "undetermined")
self._add_result_element("All others", "passing_filter", str(other["passing_filter"]), "undetermined")
def get_version(self):
return "-"
def process(self):
pass
\ No newline at end of file
......@@ -37,9 +37,12 @@ class IlluminaDiversityQC (NG6Workflow):
if self.args['casava_directory'] is not None :
if self.args['lane_number'] is None :
raise ValueError, "lane-number must be specified with casava-directory."
mids_desc_array, self.read1_files, self.read2_files = Utils.filesFromCasava( self.args['casava_directory'], self.project.get_name(), self.args['lane_number'] )
mids_desc_array, self.read1_files, self.read2_files, undetermined_read1_files, undetermined_read2_files = Utils.filesFromCasava( self.args['casava_directory'], self.project.get_name(), self.args['lane_number'] )
group_prefix = (Utils.get_group_basenames(self.read1_files+self.read2_files, "read")).keys()
self.runobj.add_mids_description(mids_desc_array)
# statistics about demultiplexing
if len(undetermined_read1_files) > 0 :
demultiplex_stats = self.add_component("DemultiplexStats", [self.read1_files, undetermined_read1_files])
elif (self.args['read_1'] is not None) and (len(self.args['read_1']) > 0) :
self.read1_files = []
self.read2_files = []
......
......@@ -37,9 +37,12 @@ class IlluminaQualityCheck (NG6Workflow):
if self.args['casava_directory'] is not None :
if self.args['lane_number'] is None :
raise ValueError, "lane-number must be specified with casava-directory."
mids_desc_array, self.read1_files, self.read2_files = Utils.filesFromCasava( self.args['casava_directory'], self.project.get_name(), self.args['lane_number'] )
mids_desc_array, self.read1_files, self.read2_files, undetermined_read1_files, undetermined_read2_files = Utils.filesFromCasava( self.args['casava_directory'], self.project.get_name(), self.args['lane_number'] )
group_prefix = (Utils.get_group_basenames(self.read1_files+self.read2_files, "read")).keys()
self.runobj.add_mids_description(mids_desc_array)
# statistics about demultiplexing
if len(undetermined_read1_files) > 0 :
demultiplex_stats = self.add_component("DemultiplexStats", [self.read1_files, undetermined_read1_files])
elif (self.args['read_1'] is not None) and (len(self.args['read_1']) > 0) :
self.read1_files = []
self.read2_files = []
......
......@@ -35,9 +35,12 @@ class RnaSeqQualityCheck (NG6Workflow):
if self.args['casava_directory'] is not None :
if self.args['lane_number'] is None :
raise ValueError, "lane-number must be specified with casava-directory."
mids_desc_array, self.read1_files, self.read2_files = Utils.filesFromCasava( self.args['casava_directory'], self.project.get_name(), self.args['lane_number'] )
mids_desc_array, self.read1_files, self.read2_files, undetermined_read1_files, undetermined_read2_files = Utils.filesFromCasava( self.args['casava_directory'], self.project.get_name(), self.args['lane_number'] )
group_prefix = (Utils.get_group_basenames(self.read1_files+self.read2_files, "read")).keys()
self.runobj.add_mids_description(mids_desc_array)
# statistics about demultiplexing
if len(undetermined_read1_files) > 0 :
demultiplex_stats = self.add_component("DemultiplexStats", [self.read1_files, undetermined_read1_files])
elif (self.args['read_1'] is not None) and (len(self.args['read_1']) > 0) :
self.read1_files = []
self.read2_files = []
......
......@@ -45,9 +45,12 @@ class Methylseq (NG6Workflow):
if self.args['lane_number'] is None :
raise ValueError, "lane-number must be specified with casava-directory."
logging.getLogger("Methylseq").debug("self.args['lane_number'] = " + str(self.args['lane_number']))
mids_desc_array, self.read1_files, self.read2_files = Utils.filesFromCasava( self.args['casava_directory'], self.project.get_name(), self.args['lane_number'] )
mids_desc_array, self.read1_files, self.read2_files, undetermined_read1_files, undetermined_read2_files = Utils.filesFromCasava( self.args['casava_directory'], self.project.get_name(), self.args['lane_number'] )
group_prefix = (Utils.get_group_basenames(self.read1_files+self.read2_files, "read")).keys()
self.runobj.add_mids_description(mids_desc_array)
# statistics about demultiplexing
if len(undetermined_read1_files) > 0 :
demultiplex_stats = self.add_component("DemultiplexStats", [self.read1_files, undetermined_read1_files])
elif (self.args['read_1'] is not None) and (len(self.args['read_1']) > 0) :
logging.getLogger("Methylseq").debug("self.args['read_1'] = " + str(self.args['read_1']))
self.read1_files = []
......
......@@ -34,9 +34,12 @@ class MiSeqDiversity (NG6Workflow):
if self.args['casava_directory'] is not None :
if self.args['lane_number'] is None :
raise ValueError, "lane-number must be specified with casava-directory."
mids_desc_array, self.read1_files, self.read2_files = Utils.filesFromCasava( self.args['casava_directory'], self.project.get_name(), self.args['lane_number'] )
mids_desc_array, self.read1_files, self.read2_files, undetermined_read1_files, undetermined_read2_files = Utils.filesFromCasava( self.args['casava_directory'], self.project.get_name(), self.args['lane_number'] )
group_prefix = (Utils.get_group_basenames(self.read1_files+self.read2_files, "read")).keys()
self.runobj.add_mids_description(mids_desc_array)
# statistics about demultiplexing
if len(undetermined_read1_files) > 0 :
demultiplex_stats = self.add_component("DemultiplexStats", [self.read1_files, undetermined_read1_files])
elif (self.args['read_1'] is not None) and (len(self.args['read_1']) > 0) :
self.read1_files = []
self.read2_files = []
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment