Maintenance - Mise à jour mensuelle Lundi 6 Avril 2020 entre 7h00 et 9h00

Commit b6c46384 authored by ckuchly's avatar ckuchly

update

parent 0f7ce5f2
...@@ -25,6 +25,13 @@ from operator import __getitem__ ...@@ -25,6 +25,13 @@ from operator import __getitem__
class DuplicateStats (Analysis): class DuplicateStats (Analysis):
def define_parameters(self, determined_R1_files, subset_R1_files, determined_R2_files=None, subset_R2_files=None, percent_trim_sides = 10 ): def define_parameters(self, determined_R1_files, subset_R1_files, determined_R2_files=None, subset_R2_files=None, percent_trim_sides = 10 ):
"""
@param determined_R1_files : [list] Paths to reads 1 files
@param determined_R2_files : [list] Paths to reads 2 files.
@param subset_R1_files : [list] Paths to subset of reads 1 files.
@param subset_R2_files : [list] Paths to subset of reads 2 files.
@param percent_trim_sides : [int] Trim sides of reads by a specified percentage (default: 10%)
"""
#input file list #input file list
self.add_input_file_list( "determined_R1_files", "determined_R1_files", default=determined_R1_files, required=True, file_format = 'fastq') self.add_input_file_list( "determined_R1_files", "determined_R1_files", default=determined_R1_files, required=True, file_format = 'fastq')
...@@ -48,66 +55,15 @@ class DuplicateStats (Analysis): ...@@ -48,66 +55,15 @@ class DuplicateStats (Analysis):
self.options = " -c " + self.percent_trim_sides self.options = " -c " + self.percent_trim_sides
def post_process(self): def post_process(self):
# Process samples return "ok"
min_determined = -1
indices_stat = self._merged_indices_stats(self.determined_idx_count_files)
for index_seq in indices_stat.keys():
if min_determined == -1 or min_determined > indices_stat[index_seq]["number"] :
min_determined = indices_stat[index_seq]["number"]
self._add_result_element(index_seq, "number", str(indices_stat[index_seq]["number"]), "determined")
self._add_result_element(index_seq, "passing_filter", str(indices_stat[index_seq]["passing_filter"]), "determined")
# Process unknown indices
other = {"number":0, "passing_filter":0}
indices_stat = self._merged_indices_stats(self.undetermined_idx_count_files)
# check undetermined
overmin = 0
for data in indices_stat.values():
if data["number"] >= min_determined :
overmin += 1
# determine the maximum number of undetermined index (with too much sequences) that have to be saved like new indexs
max_nbindexsaved = float(len(list(indices_stat.values()))) # maximum number of undetermined index saved as new indexs
if max_nbindexsaved > 100 :
max_nbindexsaved = 100
# Sort undetermined index on number of sequences
indices_stat_sorted = sorted(indices_stat, key=lambda x: indices_stat[x]['number'], reverse=True)
nbindexsaved = 0
for index_seq in indices_stat_sorted:
if indices_stat[index_seq]["number"] >= self.index_count_threshold*min_determined and nbindexsaved <= max_nbindexsaved :
self._add_result_element(index_seq, "number", str(indices_stat[index_seq]["number"]), "undetermined")
self._add_result_element(index_seq, "passing_filter", str(indices_stat[index_seq]["passing_filter"]), "undetermined")
nbindexsaved = nbindexsaved + 1
else:
other["number"] += indices_stat[index_seq]["number"]
other["passing_filter"] += indices_stat[index_seq]["passing_filter"]
self._add_result_element("All others", "number", str(other["number"]), "undetermined")
self._add_result_element("All others", "passing_filter", str(other["passing_filter"]), "undetermined")
def _merged_indices_stats(self, files):
    """
    Merge the per-index counts read from several index-count files.

    Every non-empty line is split on ';' into three fields:
    index;number;passing_filter. The two counts are converted to int and
    summed per index across all lines and all files.

    @param files : [list] Paths to the index-count files to merge.
    @return : [dict] index -> {"number": int, "passing_filter": int}
    @raise ValueError : if a non-empty line does not hold exactly three
                        ';'-separated fields or a count is not an integer.
    """
    indices_stat = {}
    for current_file in files:
        # 'with' guarantees the handle is closed even when a line fails to parse.
        with open(current_file, 'r') as fh_current_file:
            for line in fh_current_file:
                line = line.rstrip()
                if not line:
                    # Blank lines (e.g. a trailing empty line) carry no data.
                    continue
                index, number, passing_filter = line.split(';')
                # First sighting of an index starts both counters at zero.
                stat = indices_stat.setdefault(index, {"number": 0, "passing_filter": 0})
                stat["number"] += int(number)
                stat["passing_filter"] += int(passing_filter)
    return indices_stat
def process(self): def process(self):
demultiplex_stats = PythonFunction(write_indices_stats, cmd_format="{EXE} {IN} {OUT} {ARG}")
if self.determined_R2_files:
self.add_shell_execution(self.get_exec_path("fastx_estimate_duplicatedReads") + " " + self.options + " -p" + " -Q33" +
" -i $1 -j $2 -s $3 -t $4 > $5",
cmd_format='{EXE} {IN} {OUT}' , map=True,
inputs=[self.determined_R1_files, self.determined_R2_files, self.subset_R1_files, self.subset_R2_files], outputs=[unmerged_bam, self.stderrs])
# determined # determined
for idx, infile in enumerate(self.determined_R1_files) : for idx, infile in enumerate(self.determined_R1_files) :
if self.expected_indexes : if self.expected_indexes :
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment