Commit 0f7ce5f2 authored by ckuchly

Add new analysis/component "duplicateRate" for duplicate statistical analysis

parent 9b9e0e65
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import logging

from weaver.function import PythonFunction
from ng6.analysis import Analysis

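# process() below registers a module-level function named write_indices_stats
# with weaver's PythonFunction, but the function itself is not part of this
# diff. The sketch below only illustrates the expected contract, inferred from
# _merged_indices_stats (which parses "index;number;passing_filter" lines) and
# from a Casava 1.8+ FASTQ header layout
# ("@instrument:run:flowcell:lane:tile:x:y read:is_filtered:control:index");
# the real helper may differ.
def write_indices_stats(input_fastq, output_file, category, expected_index=None):
    # category ("determined"/"undetermined") is accepted for parity with the
    # calls in process(); this sketch does not use it
    counts = {}
    fh = open(input_fastq, 'r')
    for line_number, line in enumerate(fh):
        if line_number % 4 != 0:
            # only FASTQ header lines (every 4th line) carry index information
            continue
        description = line.rstrip().split(' ')[-1].split(':')
        # last field is the index sequence, second field the chastity filter flag
        index = description[-1] if len(description) >= 4 else (expected_index or "unknown")
        passed = len(description) >= 2 and description[1] == 'N'
        if index not in counts:
            counts[index] = {"number": 0, "passing_filter": 0}
        counts[index]["number"] += 1
        if passed:
            counts[index]["passing_filter"] += 1
    fh.close()
    out = open(output_file, 'w')
    for index in counts:
        out.write("%s;%d;%d\n" % (index, counts[index]["number"], counts[index]["passing_filter"]))
    out.close()
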
class DuplicateStats (Analysis):

    def define_parameters(self, determined_R1_files, subset_R1_files, determined_R2_files=None, subset_R2_files=None, percent_trim_sides=10):
        # input file lists
        self.add_input_file_list("determined_R1_files", "determined_R1_files", default=determined_R1_files, required=True, file_format='fastq')
        self.add_input_file_list("subset_R1_files", "subset_R1_files", default=subset_R1_files, required=True, file_format='fastq')
        self.add_input_file_list("determined_R2_files", "determined_R2_files", default=determined_R2_files, file_format='fastq')
        self.add_input_file_list("subset_R2_files", "subset_R2_files", default=subset_R2_files, file_format='fastq')
        # parameter
        self.add_parameter("percent_trim_sides", "percent_trim_sides", default=percent_trim_sides, type='int')
        # output file list
        self.add_output_file_list("log_file", "stat_log_files", pattern='{basename_woext}.stdout', items=self.determined_R1_files)
    def get_version(self):
        return "Genoscope based FASTX 0.0.13.1"
    def define_analysis(self):
        self.name = "DuplicateStats"
        self.description = "Statistics about duplicate rate"
        self.software = "fastx_estimate_duplicatedReads"
        # percent_trim_sides is an int, so it must be converted before concatenation
        self.options = " -c " + str(self.percent_trim_sides)
    def post_process(self):
        # NOTE: determined_idx_count_files, undetermined_idx_count_files and
        # index_count_threshold are used below but are not declared in
        # define_parameters above; they are presumably provided elsewhere.
        # Process samples
        min_determined = -1
        indices_stat = self._merged_indices_stats(self.determined_idx_count_files)
        for index_seq in indices_stat.keys():
            if min_determined == -1 or min_determined > indices_stat[index_seq]["number"]:
                min_determined = indices_stat[index_seq]["number"]
            self._add_result_element(index_seq, "number", str(indices_stat[index_seq]["number"]), "determined")
            self._add_result_element(index_seq, "passing_filter", str(indices_stat[index_seq]["passing_filter"]), "determined")
        # Process unknown indices
        other = {"number": 0, "passing_filter": 0}
        indices_stat = self._merged_indices_stats(self.undetermined_idx_count_files)
        # Count undetermined indices at least as frequent as the rarest determined one
        overmin = 0
        for data in indices_stat.values():
            if data["number"] >= min_determined:
                overmin += 1
        # Maximum number of over-represented undetermined indices reported as new indices, capped at 100
        max_nbindexsaved = min(len(indices_stat), 100)
        # Sort undetermined indices by decreasing number of sequences
        indices_stat_sorted = sorted(indices_stat, key=lambda x: indices_stat[x]['number'], reverse=True)
        nbindexsaved = 0
        for index_seq in indices_stat_sorted:
            if indices_stat[index_seq]["number"] >= self.index_count_threshold * min_determined and nbindexsaved <= max_nbindexsaved:
                self._add_result_element(index_seq, "number", str(indices_stat[index_seq]["number"]), "undetermined")
                self._add_result_element(index_seq, "passing_filter", str(indices_stat[index_seq]["passing_filter"]), "undetermined")
                nbindexsaved += 1
            else:
                other["number"] += indices_stat[index_seq]["number"]
                other["passing_filter"] += indices_stat[index_seq]["passing_filter"]
        self._add_result_element("All others", "number", str(other["number"]), "undetermined")
        self._add_result_element("All others", "passing_filter", str(other["passing_filter"]), "undetermined")
    def _merged_indices_stats(self, files):
        """
        Merge per-file index counts. Each input line is expected to look like
        "ACGTACGT;12345;11876", i.e. "index;number;passing_filter".
        """
        indices_stat = {}
        for current_file in files:
            with open(current_file, 'r') as fh_current_file:
                for line in fh_current_file:
                    line = line.rstrip()
                    index, number, passing_filter = line.split(';')
                    if index not in indices_stat:
                        indices_stat[index] = {"number": 0, "passing_filter": 0}
                    indices_stat[index]["number"] += int(number)
                    indices_stat[index]["passing_filter"] += int(passing_filter)
        return indices_stat
    def process(self):
        # Wrap the module-level write_indices_stats helper (sketched above) as a weaver PythonFunction
        demultiplex_stats = PythonFunction(write_indices_stats, cmd_format="{EXE} {IN} {OUT} {ARG}")
        # determined reads
        for idx, infile in enumerate(self.determined_R1_files):
            if self.expected_indexes:
                demultiplex_stats(inputs=infile, outputs=self.determined_idx_count_files[idx], arguments=['determined', self.expected_indexes[idx]])
            else:
                demultiplex_stats(inputs=infile, outputs=self.determined_idx_count_files[idx], arguments='determined')
        # undetermined reads
        for idx, infile in enumerate(self.undetermined_R1_files):
            demultiplex_stats(inputs=infile, outputs=self.undetermined_idx_count_files[idx], arguments='undetermined')
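
# Hypothetical wiring from an ng6 workflow's process() method; the
# add_component call follows the usual jflow convention (positional arguments
# matching define_parameters), but this exact usage is an assumption, not part
# of this commit:
#
#     duplicate_stats = self.add_component("DuplicateStats",
#                                          [determined_R1, subset_R1,
#                                           determined_R2, subset_R2,
#                                           15])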