Commit 9a66f129 authored by Romain Therville's avatar Romain Therville 🐭

Merge branch 'ng6-Slurm-10X' into 'master'

Ng6 slurm 10 x

See merge request !50
parents dd0f10c2 a3e88383
......@@ -188,6 +188,20 @@ class NG6ConfigReader(object):
except :
raise Exception("Failed when parsing the config file get_454_mids !")
def get_10X_indexs(self):
"""
Return the 10X indexes
@return : hash table mapping 10X barcode names to index sequences
"""
try:
barcode_array = {}
barcodes = self.reader.items("10X_barcodes")
for barcode in barcodes:
barcode_array[barcode[0].upper()] = barcode[1].upper()
return barcode_array
except :
raise Exception("Failed when parsin the config file for 10X barcodes !")
def get_workflow_filters(self):
"""
Return a list of workflow class names
......
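A minimal usage sketch for the new get_10X_indexs() helper, assuming the ng6 config file carries a [10X_barcodes] section as read above; the barcode name and index sequence below are illustrative placeholders, not values from this merge request:

    # [10X_barcodes]
    # SI-GA-A1 = GGTTTACT
    from ng6.config_reader import NG6ConfigReader

    cfg_reader = NG6ConfigReader()
    indexes = cfg_reader.get_10X_indexs()               # e.g. {"SI-GA-A1": "GGTTTACT"}
    index_seq = indexes.get("SI-GA-A1", "Undetermined")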
......@@ -33,7 +33,7 @@ from ng6.project import Project
from ng6.run import Run
from ng6.sample import Sample
from ng6.utils import Utils
from ng6.config_reader import NG6ConfigReader
class BasicNG6Workflow (Workflow):
......@@ -112,6 +112,7 @@ class BasicNG6Workflow (Workflow):
raise ValueError("Component " + cmpt_object.__class__.__name__ + " with prefix " +
cmpt_object.prefix + " already exist in this pipeline!")
self.component_nameids[cmpt_object.get_nameid()] = None
return cmpt_object
else:
raise ImportError(component_name + " component cannot be loaded, available components are: {0}".format(
......@@ -145,6 +146,7 @@ class NG6Workflow (BasicNG6Workflow):
self.samples = []
self.reads1 = []
self.reads2 = []
self.index = []
self.samples_names = []
self.reads1_indexes = []
self.reads2_indexes = []
......@@ -171,10 +173,11 @@ class NG6Workflow (BasicNG6Workflow):
self.add_parameter_list("metadata", "Add metadata to the sample", type='samplemetadata' ,add_to = "input_sample")
self.add_input_file_list("read1", "Read 1 data file path", required = True, add_to = "input_sample")
self.add_input_file_list("read2", "Read 2 data file path", add_to = "input_sample")
self.add_input_file_list("index", "Index data file path", add_to = "input_sample")
def __create_samples__(self):
for sd in self.input_sample :
sp_object = Sample( sd['sample_id'], sd['read1'], sd['read2'], name = sd['sample_name'], description = sd['sample_description'], type = sd['type'],
sp_object = Sample( sd['sample_id'], sd['read1'], sd['read2'], sd['index'], name = sd['sample_name'], description = sd['sample_description'], type = sd['type'],
insert_size = sd['insert_size'], species = sd['species'] )
for metadata in sd['metadata'] :
......@@ -209,6 +212,9 @@ class NG6Workflow (BasicNG6Workflow):
for rfile in sample.reads2 :
self.reads2_indexes.append(sample.sample_id)
self.reads2.append(rfile)
for rfile in sample.index :
self.index.append(rfile)
if len(self.samples_names) != 0 :
if len(self.samples_names) != len (self.samples) :
......@@ -219,8 +225,9 @@ class NG6Workflow (BasicNG6Workflow):
return self.reads1
elif type == 'read2' :
return self.reads2
return self.reads1 + self.reads2
elif type == 'index' :
return self.index
return self.reads1 + self.reads2 + self.index
def get_files_index(self, type = None):
if type == 'read1' :
......@@ -229,7 +236,6 @@ class NG6Workflow (BasicNG6Workflow):
return self.reads2_indexes
return self.reads1_indexes + self.reads2_indexes
def is_paired_end(self):
return len(self.reads2) > 0
......@@ -292,6 +298,25 @@ def get_files_from_casava(casava_directory, project_name, lane_number):
if file.endswith(".fastq.gz") and re.search(".*_L00" + str(lane) + "_.*", file):
files.append(filepath);
return files
def bcl2fastq_10X(directory, pname, lane):
"""longranger"""
files = []
with open(os.path.join(directory, "SampleSheet_10X.mk")) as fh :
subdirs_list = []
for line in fh :
if line.startswith("l" + str(lane) + "_SUBDIRS"):
parts = line.strip().split(":=")
subdirs_list = parts[1].split(" ")
# parse samples
for subdir in subdirs_list:
# filter on project name
if re.match("Project_" + pname + "/Sample_.+", subdir) or subdir.startswith("Undetermined_indices"):
for file in os.listdir(directory + "/" + subdir):
filepath = directory + "/" + subdir + "/" + file
if file.endswith(".fastq.gz") and re.search(".*_L00" + str(lane) + "_.*", file):
files.append(filepath);
return files
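# Illustrative sketch of the SampleSheet_10X.mk lines parsed here and in _process_casava_10X
# below (the key names l<lane>_BARCODES / l<lane>_SAMPLEIDS / l<lane>_SUBDIRS and the ":="
# separator come from the parsing code; sample ids, barcodes and sub-directories are
# made-up placeholders):
#
#   l1_BARCODES := SI-GA-A1 SI-GA-B1 Undetermined
#   l1_SAMPLEIDS := sampleA sampleB lane1
#   l1_SUBDIRS := Project_myproj/Sample_sampleA Project_myproj/Sample_sampleB Undetermined_indices/Sample_lane1
#
# bcl2fastq_10X() only needs the l<lane>_SUBDIRS entry, then keeps the *_L00<lane>_*.fastq.gz
# files found in each sub-directory matching the project name or Undetermined_indices.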
def bcl2fastq_216(directory, pname, lane):
"""bcl2fastq >= 1.9"""
......@@ -319,6 +344,8 @@ def get_files_from_casava(casava_directory, project_name, lane_number):
return bcl2fastq_18(casava_directory, project_name, lane_number)
elif os.path.exists(os.path.join( casava_directory, 'Stats', 'DemultiplexingStats.xml')) :
return bcl2fastq_216(casava_directory, project_name, lane_number)
elif os.path.exists(os.path.join(casava_directory, "SampleSheet_10X.mk")) :
return bcl2fastq_10X(casava_directory, project_name, lane_number)
......@@ -330,8 +357,10 @@ class CasavaNG6Workflow(NG6Workflow):
self.group_prefix = None
self.undetermined_reads1 = []
self.undetermined_reads2 = []
self.undetermined_index = []
self.log_files = []
self.is_casava = False
self.is_10Xcasava = False
def __add_sample_parameters__(self):
self.add_multiple_parameter('casava', 'Provide the options to retrieve samples from a CASAVA directory', group="Sample description")
......@@ -387,11 +416,18 @@ class CasavaNG6Workflow(NG6Workflow):
logging.getLogger("ng6").debug("CasavaNG6Workflow.__create_samples__ before self._process_casava_18")
all_samples, all_samples_id = self._process_casava_18(casava_directory, project_name, lane_number, input_files)
logging.getLogger("ng6").debug("CasavaNG6Workflow.__create_samples__ before self._process_casava_18")
elif os.path.exists(os.path.join( casava_directory, 'Stats', 'DemultiplexingStats.xml')) :
all_samples, all_samples_id = self._process_casava_216(casava_directory, project_name, lane_number, input_files)
elif os.path.exists(os.path.join( casava_directory, "SampleSheet_10X.mk")):
logging.getLogger("ng6").debug("CasavaNG6Workflow.__create_samples__ before self._process_casava_10X")
all_samples, all_samples_id = self._process_casava_10X(casava_directory, project_name, lane_number, input_files)
self.is_10Xcasava = True
logging.getLogger("ng6").debug("CasavaNG6Workflow.__create_samples__ after self._process_casava_10X")
selected_samples = self.casava['select_sample_id']
logging.getLogger("CasavaNG6Workflow").debug("__create_samples__. all_samples_id = a"+", ".join(all_samples_id)+"a")
logging.getLogger("CasavaNG6Workflow").debug("__create_samples__. all_samples_id = "+", ".join(all_samples_id))
if selected_samples :
for sid in selected_samples :
assert sid in all_samples_id , "The sample id %s is not in the SampleSheet.mk" % sid
......@@ -407,6 +443,7 @@ class CasavaNG6Workflow(NG6Workflow):
def __preprocess_samples__(self):
NG6Workflow.__preprocess_samples__(self)
if self.is_casava:
self.group_prefix = list((Utils.get_group_basenames(self.get_all_reads(), "read")).keys())
......@@ -454,7 +491,6 @@ class CasavaNG6Workflow(NG6Workflow):
# filter on project name
if re.match("Project_" + project_name + "/Sample_.+", sample['subdir']) or sample['subdir'].startswith("Undetermined_indices"):
for file in os.listdir(casava_directory + "/" + sample['subdir']):
filepath = casava_directory + "/" + sample['subdir'] + "/" + file
if file.endswith(".fastq.gz") and re.search(".*_L00" + str(lane_number) + "_.*", file):
......@@ -481,7 +517,6 @@ class CasavaNG6Workflow(NG6Workflow):
all_samples.append(sp_object)
all_samples_id.append(sample['sample_id'])
for file in os.listdir(casava_directory):
filepath = casava_directory + "/" + file
if file.endswith(".log"):
......@@ -489,9 +524,111 @@ class CasavaNG6Workflow(NG6Workflow):
logging.getLogger("ng6").debug("CasavaNG6Workflow._process_casava_18 self.log_files = " + ",".join(self.log_files))
logging.getLogger("ng6").debug("CasavaNG6Workflow._process_casava_18 all_samples_id = " + ",".join(all_samples_id))
logging.getLogger("ng6").debug("CasavaNG6Workflow._process_casava_18 exiting")
return all_samples, all_samples_id
def _process_casava_10X(self,casava_directory, project_name, lane_number, input_files):
logging.getLogger("ng6").debug("CasavaNG6Workflow._process_casava_10X enter")
logging.getLogger("ng6").debug("CasavaNG6Workflow._process_casava_10X casava_directory = " + casava_directory + ", project_name = " + str(project_name))
"""
Creates samples from casavadir from longranger demultiplexing
@param casava_directory:
@param project_name:
@param lane_number:
@param input_files:
"""
all_samples = []
all_samples_id = []
# open casava samplesheet again to associate our files with a sample
with open(os.path.join(casava_directory, "SampleSheet_10X.mk")) as fh :
barcodes_list = []
sample_ids_list = []
subdirs_list = []
for line in fh :
if line.startswith("l" + str(lane_number) + "_BARCODES"):
parts = line.strip().split(":=")
barcodes_list = [ re.sub( r"[_\s]+", "", x) for x in parts[1].split() ]
elif line.startswith("l" + str(lane_number) + "_SAMPLEIDS" ):
parts = line.strip().split(":=")
sample_ids_list = parts[1].split(" ")
elif line.startswith("l" + str(lane_number) + "_SUBDIRS"):
parts = line.strip().split(":=")
subdirs_list = parts[1].split(" ")
assert len(barcodes_list) == len(sample_ids_list) == len(subdirs_list), "Invalid lane {0} in SampleSheet_10X.mk".format(lane_number)
cfg_reader = NG6ConfigReader()
indexs = cfg_reader.get_10X_indexs()
# parse samples
for i in range(len(barcodes_list)):
if barcodes_list[i] == 'Undetermined' :
barcode = 'Undetermined'
else :
barcode = indexs[barcodes_list[i]]
sample = {
'barcode' : barcode,
'sample_id' : sample_ids_list[i],
'subdir' : subdirs_list[i],
'reads1' : [],
'reads2' : [],
'index' : []
}
# filter on project name
if re.match("Project_" + project_name + "/Sample_.+", sample['subdir']) or sample['subdir'].startswith("Undetermined_indices"):
for file in os.listdir(casava_directory + "/" + sample['subdir']):
filepath = casava_directory + "/" + sample['subdir'] + "/" + file
if file.endswith(".fastq.gz") and re.search(".*_L00" + str(lane_number) + "_.*", file):
for idx, iofile in enumerate(input_files) :
if iofile == filepath :
if re.search(".*_R1_.*", file):
if not sample['subdir'].startswith("Undetermined_indices"):
sample['reads1'].append(iofile)
else:
self.undetermined_reads1.append(iofile)
if re.search(".*_R2_.*", file):
if not sample['subdir'].startswith("Undetermined_indices"):
sample['reads2'].append(iofile)
else:
self.undetermined_reads2.append(iofile)
if re.search(".*_I1_.*", file):
if not sample['subdir'].startswith("Undetermined_indices"):
sample['index'].append(iofile)
else:
self.undetermined_index.append(iofile)
input_files.pop(idx)
break
if not sample['subdir'].startswith("Undetermined_indices") :
sp_object = Sample(sample['barcode'], sample['reads1'], reads2 = sample['reads2'], index = sample['index'], name=sample['sample_id'])
sp_object.add_metadata('barcode', sample['barcode'])
sp_object.add_metadata('is_casava', True)
all_samples.append(sp_object)
all_samples_id.append(sample['sample_id'])
for file in os.listdir(casava_directory):
filepath = casava_directory + "/" + file
if file.endswith(".log"):
self.log_files.append(filepath)
logging.getLogger("ng6").debug("CasavaNG6Workflow._process_casava_10X all_samples_id = " + ",".join(all_samples_id))
logging.getLogger("ng6").debug("CasavaNG6Workflow._process_casava_10X exiting")
return all_samples, all_samples_id
def _process_casava_216(self,casava_directory, project_name, lane_number, input_files):
"""
Creates samples from a CASAVA directory (bcl2fastq >= 1.9) using input files
......@@ -513,20 +650,30 @@ class CasavaNG6Workflow(NG6Workflow):
logging.getLogger("ng6").debug("illumina_process self.is_casava")
if len(self.log_files) > 0 :
add_log = self.add_component("BasicAnalysis", [self.log_files,"Log Files","Log files generated during primary analysis","-","-","-","gz", "","log.gz"])
if len(self.undetermined_reads1) > 0 :
if self.casava['mismatch_index'] :
demultiplex_stats = self.add_component("DemultiplexStats", [self.get_all_reads("read1"), self.undetermined_reads1, self.get_files_index('read1')])
elif self.is_10Xcasava :
logging.getLogger("ng6").debug("illumina_process self.is_10Xcasava = ")
logging.getLogger("ng6").debug(self.get_all_reads("read1"))
logging.getLogger("ng6").debug("illumina_process undetermined reads = " )
logging.getLogger("ng6").debug(self.undetermined_reads1)
logging.getLogger("ng6").debug("illumina_process file index =")
logging.getLogger("ng6").debug(self.get_files_index("read1"))
demultiplex_stats = self.add_component("Demultiplex10XStats", [self.get_all_reads("read1"), self.undetermined_reads1, self.get_files_index("read1")])
else :
demultiplex_stats = self.add_component("DemultiplexStats", [self.get_all_reads("read1"), self.undetermined_reads1])
demultiplex_stats = self.add_component("DemultiplexStats", [self.get_all_reads("read1"), self.undetermined_reads1])
if self.keep_reads != "all" :
logging.getLogger("ng6").debug("illumina_process self.keep_reads != all")
logging.getLogger("ng6").debug("illumina_process self.get_all_reads() = " + ",".join(self.get_all_reads()))
logging.getLogger("ng6").debug("illumina_process BEFORE FASTQILLUMINAFILTER self.get_all_reads() = " + ",".join(self.get_all_reads()))
logging.getLogger("ng6").debug("illumina_process self.group_prefix = " + ",".join(self.group_prefix))
# fastq illumina filter
fastqilluminafilter = self.add_component("FastqIlluminaFilter", [self.runobj,self.get_all_reads(), self.keep_reads, self.group_prefix])
# list filtered files
if self.is_paired_end() :
# split read 1 and read 2 from filtered files list
......@@ -543,7 +690,7 @@ class CasavaNG6Workflow(NG6Workflow):
# archive the files
#TODO : if self.group_prefix == None, the create the output of fastqilluminafilter in the run.get_work_directory()
saved_files = filtered_read1_files + filtered_read2_files
saved_files = filtered_read1_files + filtered_read2_files + self.get_all_reads("index")
logging.getLogger("CasavaNG6Workflow").debug("illumina_process saved_files = " + ",".join(saved_files))
reads_prefixes = None
if self.group_prefix != None :
......@@ -574,7 +721,7 @@ class CasavaNG6Workflow(NG6Workflow):
# contamination_search
if contam :
if self.contamination_databank: contam.extend(self.contamination_databank)
contamination_search = self.add_component("ContaminationSearch", [filtered_read1_files, contam, list((Utils.get_group_basenames(filtered_read1_files, "read")).keys())], parent = fastqilluminafilter)
contamination_search = self.add_component("ContaminationSearch", [filtered_read1_files+filtered_read2_files, contam, reads_prefixes], parent = fastqilluminafilter)
# make some statistics on raw file
fastqc = self.add_component("FastQC", [filtered_read1_files+filtered_read2_files, (self.group_prefix is not None), self.no_group, "fastqc.tar.gz"], parent = fastqilluminafilter)
......
......@@ -88,8 +88,7 @@ class Run(object):
ng6conf.get_space_directory(self.space_id), self.DIRECTORIES_STRUCTURE, directory_name)
work_dir = os.path.join(ng6conf.get_work_directory(), ng6conf.get_space_directory(self.space_id), \
self.DIRECTORIES_STRUCTURE, directory_name)
print (work_dir)
print (os.path.isdir(work_dir))
if not os.path.isdir(save_dir) and not os.path.isdir(work_dir):
break
directory_name = uuid.uuid4().hex[:9]
......
......@@ -22,13 +22,14 @@ class Sample(object):
AVAILABLE_TYPES = ["pe", "se", "ose", "ope", "mp"]
def __init__(self, sample_id, reads1, reads2 = None, name = None, description = None, type = None,
def __init__(self, sample_id, reads1, reads2 = None, index = None, name = None, description = None, type = None,
insert_size = None, species = None, nb_sequences = None, full_size = None, id = None ):
self.sample_id = sample_id
self.name = name
self.description = description
self.reads1 = reads1
self.reads2 = reads2
self.index = index
self.insert_size = insert_size
self.nb_sequences = nb_sequences
self.full_size = full_size
......@@ -41,6 +42,9 @@ class Sample(object):
if isinstance(reads2, str) :
self.reads2 = [reads2]
if isinstance(index, str) :
self.index = [index]
if self.type is None:
if self.reads2 :
......@@ -94,12 +98,13 @@ class Sample(object):
raise UnsavedRunError()
def __str__(self, *args, **kwargs):
return "sid={sid}; name={name}; desc={desc}; r1={r1}; r2={r2}; insize={insize}; nbs={nbs}; fsize={fsize}; spec={spec}; t={t}".format(
return "sid={sid}; name={name}; desc={desc}; r1={r1}; r2={r2}; i={i}; insize={insize}; nbs={nbs}; fsize={fsize}; spec={spec}; t={t}".format(
sid = self.sample_id or '',
name = self.name or '',
desc = self.description or '',
r1 = self.reads1 or [],
r2 = self.reads2 or [],
i = self.index or [],
insize = self.insert_size or '',
nbs = self.nb_sequences or '',
fsize = self.full_size or '',
......
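A hedged construction example for the extended Sample signature, showing the new index argument next to reads1/reads2 (the sample id and file names are placeholders):

    from ng6.sample import Sample

    sp = Sample("sample_1", ["s1_R1.fastq.gz"], reads2=["s1_R2.fastq.gz"],
                index=["s1_I1.fastq.gz"], name="sample_1", type="pe")
    print(sp)   # __str__ now also reports i=['s1_I1.fastq.gz']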
......@@ -299,11 +299,14 @@ class Utils(object):
@param group_by : CASAVA_FILENAME key (ex : read)
"""
group_basenames = {}
logging.getLogger("Utils").debug("get_group_basenames. file_list = " + ",".join(file_list))
for file in file_list:
file_name_fields = os.path.basename(file).split(Utils.CASAVA_FILENAME_SEPARATOR)
group_tag = Utils.CASAVA_FILENAME_SEPARATOR.join( file_name_fields[:Utils.CASAVA_FILENAME[group_by]] )
if group_tag in group_basenames :
group_basenames[group_tag].append(file)
......
{*
Copyright (C) 2009 INRA
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. #
*}
{extends file='AnalysisTemplate.tpl'}
{block name=params_title} Parameters {/block}
{block name=params_content}
{assign var="params" value=" "|explode:$analyse.params}
<ul>
<li class="parameter">Unknown indices with a number of fragments < {$params[0]*100}% of the number of fragments in the sample with the littlest population are merged in "All others". In other words, each unknown indice with a number of fragments > {$params[0]*100}% (included) of the number of fragments in the sample with the smallest population is displayed </li>
</ul>
{/block}
{block name=results_title} Illumina metrics {/block}
{block name=results}
<table class="table table-striped table-bordered dataTable analysis-result-table">
<thead>
<tr>
<th class="string-sort">Sample {if $analyse_results|@count > 1 }({$analyse_results|@count}){/if}</th>
<th class="string-sort">Type</th>
<th class="string-sort">Index</th>
<th class="numeric-sort">% of total fragments</th>
<th class="numeric-sort">Number of fragments</th>
<th class="numeric-sort">% of passing filter</th>
</tr>
</thead>
{assign var="analyse_results_sorted" value=$analyse_results|@ksort}
{assign var="total" value=0}
{assign var="undetermined" value=0}
{assign var="determined" value=0}
{foreach from=$analyse_results_sorted key=sample item=sample_results}
{if array_key_exists("undetermined", $sample_results)}
{$total = $total+$sample_results["undetermined"].number}
{$undetermined = $undetermined+$sample_results["undetermined"].number}
{else}
{$total = $total+$sample_results["determined"].number}
{$determined = $determined+$sample_results["determined"].number}
{/if}
{/foreach}
<tbody>
{foreach from=$analyse_results_sorted key=sample item=sample_results}
<tr>
{if array_key_exists("undetermined", $sample_results)}
<td>-</td>
<td>Undetermined</td>
<td>{$sample}</td>
<td>{((($sample_results["undetermined"].number)/$total)*100)|number_format:2:'.':' '}</td>
<td>{$sample_results["undetermined"].number|number_format:0:' ':' '}</td>
<td>{($sample_results["undetermined"].passing_filter*100/$sample_results["undetermined"].number)|number_format:2:'.':' '}</td>
{else}
<td>{$sample|get_description:$descriptions}</td>
<td>Sample</td>
<td>{$sample}</td>
<td>{((($sample_results["determined"].number)/$total)*100)|number_format:2:'.':' '}</td>
<td>{$sample_results["determined"].number|number_format:0:' ':' '}</td>
<td>{($sample_results["determined"].passing_filter*100/$sample_results["determined"].number)|number_format:2:'.':' '}</td>
{/if}
</tr>
{/foreach}
</tbody>
{if $analyse_results|@count > 1 }
<tfoot>
<tr>
<th>Total Samples</th>
<th></th>
<th></th>
<th>{(($determined/$total)*100)|number_format:2:'.':' '}</th>
<th>{$determined|number_format:0:' ':' '}</th>
<th></th>
</tr>
<tr>
<th>Total Undetermined</th>
<th></th>
<th></th>
<th>{(($undetermined/$total)*100)|number_format:2:'.':' '}</th>
<th>{$undetermined|number_format:0:' ':' '}</th>
<th></th>
</tr>
</tfoot>
{/if}
</table>
{/block}
{block name=download}{/block}
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
import sys
import re
import logging
from ng6.ng6workflow import CasavaNG6Workflow
from ng6.utils import Utils
class Illumina10XQualityCheck (CasavaNG6Workflow):
def get_name(self):
return 'illumina_10X_qc'
def get_description(self):
return "illumina quality check pipeline for 10X analyses"
def define_parameters(self, function="process"):
self.add_input_file("reference_genome", "Which genome should the read being align on")
def process(self):
fastqilluminafilter, filtered_read1_files, filtered_read2_files, concat_files = self.illumina_process()
\ No newline at end of file
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from weaver.function import PythonFunction
import logging
from ng6.analysis import Analysis
from operator import __getitem__
def write_indices_stats(reads_file, stat_file, state, expected_index_seq = None):
import re
import jflow.seqio as seqio
indices_stat = {}
total_number = 0
total_passing_filter = 0
# Count indices
reader = seqio.SequenceReader(reads_file)
for id, desc, seq, qualities in reader:
match = re.search("^\d:[Y|N]:\d+:([ATGCN]+)\+*([ATGCN]*)", desc)
print(match)
if match:
index_seq = match.group(1) + match.group(2)
if index_seq not in indices_stat :
indices_stat[index_seq] = {}
indices_stat[index_seq]["number"] = 0
indices_stat[index_seq]["passing_filter"] = 0
indices_stat[index_seq]["number"] += 1
total_number += 1
if re.match("^\d:N:\d+:[ATGCN]+", desc) :
indices_stat[index_seq]["passing_filter"] += 1
total_passing_filter += 1
else:
raise ValueError("The description '" + desc + "' of the sequence " + id + " is in an invalid format.")
if state == 'determined' and len(list(indices_stat.keys())) > 1 and expected_index_seq == None :
raise Exception("Please provide the expected index sequence for the file '%s'"%reads_file)
# expected index seq for mismatch (only for determined)
if state == 'determined' and expected_index_seq :
indices_stat.clear()
indices_stat[expected_index_seq] = {'number' : total_number, "passing_filter" : total_passing_filter }
# Write indices count
fh_stat_file = open(stat_file, 'w')
for index_seq in indices_stat.keys():
fh_stat_file.write( index_seq + ";" + str(indices_stat[index_seq]["number"]) + ";" + str(indices_stat[index_seq]["passing_filter"]) + "\n" )
fh_stat_file.close()
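# For reference, a sketch of what write_indices_stats() consumes and produces (sequences and
# counts are invented examples):
#   - read descriptions are expected in CASAVA 1.8+ style, e.g. "1:N:0:ACGTACGT" or, for
#     dual/10X indexes, "1:N:0:ACGTACGT+TTGCATGC"; both captured index parts are concatenated
#     and a ":N:" flag is counted as passing filter.
#   - each line of stat_file is "<index_seq>;<number>;<passing_filter>", e.g. "ACGTACGT;12345;12000".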
class Demultiplex10XStats (Analysis):
def define_parameters(self, determined_R1_files, undetermined_R1_files, expected_indexes = None, index_count_threshold=0.8, undetermined_threshold=0.8):
self.add_input_file_list( "determined_R1_files", "determined_R1_files", default=determined_R1_files, required=True, file_format = 'fastq')
self.add_input_file_list( "undetermined_R1_files", "undetermined_R1_files", default=undetermined_R1_files, required=True, file_format = 'fastq')
# Unknown indices with a number of fragments < index_count_threshold * number_of_fragments_in_the_smallest_determined_sample are merged into "All others".
self.add_parameter("index_count_threshold", "index_count_threshold", default=index_count_threshold, type='float')
self.add_parameter("undetermined_threshold", "The percentage of undetermined indexes", default=undetermined_threshold, type='float')
self.add_parameter_list('expected_indexes', 'list of expected indexes, one per determined input file; used when demultiplexing allows some mismatches', default = expected_indexes)
self.add_output_file_list( "determined_idx_count_files", "determined_idx_count_files", pattern='{basename_woext}.stdout', items=self.determined_R1_files)
self.add_output_file_list( "undetermined_idx_count_files", "undetermined_idx_count_files", pattern='{basename_woext}.stdout', items=self.undetermined_R1_files)
def define_analysis(self):
self.name = "Demultiplex10XStats"
self.description = "Statistics about 10X demultiplexing"
self.software = "-"
self.options = str(self.index_count_threshold)
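# Worked example of the index_count_threshold parameter, following the comment in
# define_parameters (numbers are illustrative): if the smallest determined sample holds
# 1,000,000 fragments and index_count_threshold is 0.8, an unknown index needs at least
# 0.8 * 1,000,000 = 800,000 fragments to be reported on its own; smaller unknown indices
# are merged into "All others".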
def post_process(self):
# Process samples
min_determined = -1
indices_stat = self._merged_indices_stats(self.determined_idx_count_files)
for index_seq in indices_stat.keys():
if min_determined == -1 or min_determined > indices_stat[index_seq]["number"] :
min_determined = indices_stat[index_seq]["number"]
self._add_result_element(index_seq, "number", str(indices_stat[index_seq]["number"]), "determined")
self._add_result_element(index_seq, "passing_filter", str(indices_stat[index_seq]["passing_filter"]), "determined")
# Process unknown indices
other = {"number":0, "passing_filter":0}
indices_stat = self._merged_indices_stats(self.undetermined_idx_count_files)
# check undetermined
overmin = 0
for data in indices_stat.values():
if data["number"] >= min_determined :
overmin += 1
# determine the maximum number of undetermined indexes (with too many sequences) that have to be reported as new indexes
max_nbindexsaved = float(len(list(indices_stat.values()))) # maximum number of undetermined indexes reported as new indexes
if max_nbindexsaved > 100 :
max_nbindexsaved = 100
# Sort undetermined index on number of sequences
indices_stat_sorted = sorted(indices_stat, key=lambda x: indices_stat[x]['number'], reverse=True)
nbindexsaved = 0