Commit 9a66f129 authored by Romain Therville's avatar Romain Therville 🐭

Merge branch 'ng6-Slurm-10X' into 'master'

Ng6 slurm 10 x

See merge request !50
parents dd0f10c2 a3e88383
......@@ -188,6 +188,20 @@ class NG6ConfigReader(object):
except :
raise Exception("Failed when parsing the config file get_454_mids !")
def get_10X_indexs(self):
    """
    Return the 10X index table read from the "10X_barcodes" config section.

    Every entry returned by self.reader.items("10X_barcodes") is stored with
    both its first element (key) and second element (value) upper-cased.

    @return : hash table with 10X barcode list of index
    @raise Exception : if the section cannot be read or an entry cannot be
                       converted; the original error is attached as cause
    """
    try:
        return {
            entry[0].upper(): entry[1].upper()
            for entry in self.reader.items("10X_barcodes")
        }
    except Exception as err:
        # Only real errors are wrapped: KeyboardInterrupt / SystemExit must
        # keep propagating untouched instead of being turned into a
        # configuration failure.
        raise Exception("Failed when parsin the config file for 10X barcodes !") from err
def get_workflow_filters(self):
"""
Return a list of workflow class names
......
This diff is collapsed.
......@@ -88,8 +88,7 @@ class Run(object):
ng6conf.get_space_directory(self.space_id), self.DIRECTORIES_STRUCTURE, directory_name)
work_dir = os.path.join(ng6conf.get_work_directory(), ng6conf.get_space_directory(self.space_id), \
self.DIRECTORIES_STRUCTURE, directory_name)
print (work_dir)
print (os.path.isdir(work_dir))
if not os.path.isdir(save_dir) and not os.path.isdir(work_dir):
break
directory_name = uuid.uuid4().hex[:9]
......
......@@ -22,13 +22,14 @@ class Sample(object):
AVAILABLE_TYPES = ["pe", "se", "ose", "ope", "mp"]
def __init__(self, sample_id, reads1, reads2 = None, name = None, description = None, type = None,
def __init__(self, sample_id, reads1, reads2 = None,index = None, name = None, description = None, type = None,
insert_size = None, species = None, nb_sequences = None, full_size = None, id = None ):
self.sample_id = sample_id
self.name = name
self.description = description
self.reads1 = reads1
self.reads2 = reads2
self.index = index
self.insert_size = insert_size
self.nb_sequences = nb_sequences
self.full_size = full_size
......@@ -41,6 +42,9 @@ class Sample(object):
if isinstance(reads2, str) :
self.reads2 = [reads2]
if isinstance(index, str) :
self.index = [index]
if self.type is None:
if self.reads2 :
......@@ -94,12 +98,13 @@ class Sample(object):
raise UnsavedRunError()
def __str__(self, *args, **kwargs):
return "sid={sid}; name={name}; desc={desc}; r1={r1}; r2={r2}; insize={insize}; nbs={nbs}; fsize={fsize}; spec={spec}; t={t}".format(
return "sid={sid}; name={name}; desc={desc}; r1={r1}; r2={r2}; i={i}; insize={insize}; nbs={nbs}; fsize={fsize}; spec={spec}; t={t}".format(
sid = self.sample_id or '',
name = self.name or '',
desc = self.description or '',
r1 = self.reads1 or [],
r2 = self.reads2 or [],
i = self.index or [],
insize = self.insert_size or '',
nbs = self.nb_sequences or '',
fsize = self.full_size or '',
......
......@@ -299,11 +299,14 @@ class Utils(object):
@param group_by : CASAVA_FILENAME key (ex : read)
"""
group_basenames = {}
logging.getLogger("Utils").debug("get_group_basenames. file_list = " + ",".join(file_list))
for file in file_list:
file_name_fields = os.path.basename(file).split(Utils.CASAVA_FILENAME_SEPARATOR)
group_tag = Utils.CASAVA_FILENAME_SEPARATOR.join( file_name_fields[:Utils.CASAVA_FILENAME[group_by]] )
if group_tag in group_basenames :
group_basenames[group_tag].append(file)
......
{*
Copyright (C) 2009 INRA
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. #
*}
{extends file='AnalysisTemplate.tpl'}
{block name=params_title} Parameters {/block}
{block name=params_content}
{* $analyse.params is split on spaces; its first item is shown below as a percentage (value * 100). *}
{assign var="params" value=" "|explode:$analyse.params}
<ul>
{* The comparison signs are written as HTML entities: a raw "<" or ">" inside element text is invalid markup. *}
<li class="parameter">Unknown indices with a number of fragments &lt; {$params[0]*100}% of the number of fragments in the sample with the littlest population are merged in "All others". In other words, each unknown indice with a number of fragments &gt; {$params[0]*100}% (included) of the number of fragments in the sample with the smallest population is displayed </li>
</ul>
{/block}
{block name=results_title} Illumina metrics {/block}
{block name=results}
{*
Results table: one row per entry of $analyse_results, followed by a footer
with the determined / undetermined totals when there is more than one entry.
An entry holding an "undetermined" key is rendered as an unknown index,
any other entry is rendered from its "determined" key as a sample.
*}
<table class="table table-striped table-bordered dataTable analysis-result-table">
<thead>
<tr>
<th class="string-sort">Sample {if $analyse_results|@count > 1 }({$analyse_results|@count}){/if}</th>
<th class="string-sort">Type</th>
<th class="string-sort">Index</th>
<th class="numeric-sort">% of total fragments</th>
<th class="numeric-sort">Number of fragments</th>
<th class="numeric-sort">% of passing filter</th>
</tr>
</thead>
{* NOTE(review): presumably "ksort" is a project modifier returning the key-sorted array (PHP's own ksort returns a boolean) - confirm. *}
{assign var="analyse_results_sorted" value=$analyse_results|@ksort}
{* First pass: accumulate the overall, undetermined and determined fragment counts used as denominators and footer values. *}
{assign var="total" value=0}
{assign var="undetermined" value=0}
{assign var="determined" value=0}
{foreach from=$analyse_results_sorted key=sample item=sample_results}
{if array_key_exists("undetermined", $sample_results)}
{$total = $total+$sample_results["undetermined"].number}
{$undetermined = $undetermined+$sample_results["undetermined"].number}
{else}
{$total = $total+$sample_results["determined"].number}
{$determined = $determined+$sample_results["determined"].number}
{/if}
{/foreach}
<tbody>
{*
Second pass: one table row per entry.
NOTE(review): $total and each entry's "number" are used as divisors without a
zero check; an empty run or an index with 0 fragments would divide by zero - confirm
whether that can happen upstream.
*}
{foreach from=$analyse_results_sorted key=sample item=sample_results}
<tr>
{if array_key_exists("undetermined", $sample_results)}
<td>-</td>
<td>Undetermined</td>
<td>{$sample}</td>
<td>{((($sample_results["undetermined"].number)/$total)*100)|number_format:2:'.':' '}</td>
<td>{$sample_results["undetermined"].number|number_format:0:' ':' '}</td>
<td>{($sample_results["undetermined"].passing_filter*100/$sample_results["undetermined"].number)|number_format:2:'.':' '}</td>
{else}
{* get_description is not defined in this template; presumably it maps the key to a sample description using $descriptions - verify. *}
<td>{$sample|get_description:$descriptions}</td>
<td>Sample</td>
<td>{$sample}</td>
<td>{((($sample_results["determined"].number)/$total)*100)|number_format:2:'.':' '}</td>
<td>{$sample_results["determined"].number|number_format:0:' ':' '}</td>
<td>{($sample_results["determined"].passing_filter*100/$sample_results["determined"].number)|number_format:2:'.':' '}</td>
{/if}
</tr>
{/foreach}
</tbody>
{* The totals footer is only rendered when the table has more than one entry. *}
{if $analyse_results|@count > 1 }
<tfoot>
<tr>
<th>Total Samples</th>
<th></th>
<th></th>
<th>{(($determined/$total)*100)|number_format:2:'.':' '}</th>
<th>{$determined|number_format:0:' ':' '}</th>
<th></th>
</tr>
<tr>
<th>Total Undetermined</th>
<th></th>
<th></th>
<th>{(($undetermined/$total)*100)|number_format:2:'.':' '}</th>
<th>{$undetermined|number_format:0:' ':' '}</th>
<th></th>
</tr>
</tfoot>
{/if}
</table>
{/block}
{* Empty block: this template contributes no download content. *}
{block name=download}{/block}
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
import sys
import re
import logging
from ng6.ng6workflow import CasavaNG6Workflow
from ng6.utils import Utils
class Illumina10XQualityCheck (CasavaNG6Workflow):
    """
    Illumina quality check workflow for 10X analyses.

    The workflow declares a single input file parameter and delegates its
    processing to the inherited illumina_process() step.
    """

    def get_name(self):
        """
        @return : the identifier under which this workflow is registered
        """
        return 'illumina_10X_qc'

    def get_description(self):
        """
        @return : the one-line description of this workflow
        """
        return "illumina quality check pipeline for 10X analyses"

    def define_parameters(self, function="process"):
        """
        Declare the workflow parameters.
        @param function : name of the workflow function (unused here)
        """
        self.add_input_file(
            "reference_genome",
            "Which genome should the read being align on",
        )

    def process(self):
        """
        Run the inherited illumina processing step.

        The result is unpacked into four values, so illumina_process() must
        return exactly four items; none of them is used any further here.
        """
        (filter_step,
         reads1_filtered,
         reads2_filtered,
         concatenated) = self.illumina_process()
\ No newline at end of file
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from weaver.function import PythonFunction
import logging
from ng6.analysis import Analysis
from operator import __getitem__
def write_indices_stats(reads_file, stat_file, state , expected_index_seq = None):
    """
    Count the reads of a sequence file per index sequence and write the counts.

    The index is parsed from each read description, which must look like
    "<digit>:<Y|N>:<digits>:<INDEX1>[+<INDEX2>]"; both index parts are
    concatenated. A read is counted as passing filter when the second field
    of its description is "N".

    Each line of the output file is "<index>;<number>;<passing_filter>".

    @param reads_file : path of the sequence file to read
    @param stat_file : path of the statistics file to write
    @param state : 'determined' or 'undetermined'
    @param expected_index_seq : for a 'determined' file, the index under which
           all reads are reported, whatever index was actually observed (this
           tolerates index mismatches)
    @raise ValueError : if a read description does not follow the format
    @raise Exception : if a 'determined' file holds several indices and no
           expected index was given
    """
    # Imports are kept inside the function: it is handed to PythonFunction
    # elsewhere in this file - presumably executed standalone, TODO confirm.
    import re
    import jflow.seqio as seqio

    # Compiled once, outside the per-read loop. Raw strings avoid invalid
    # escape sequences, and [YN] no longer accepts a literal "|".
    index_pattern = re.compile(r"^\d:[YN]:\d+:([ATGCN]+)\+*([ATGCN]*)")
    passing_pattern = re.compile(r"^\d:N:\d+:[ATGCN]+")

    indices_stat = {}
    total_number = 0
    total_passing_filter = 0

    # Count indices
    reader = seqio.SequenceReader(reads_file)
    for seq_id, desc, seq, qualities in reader:
        match = index_pattern.search(desc)
        if match is None:
            raise ValueError("The description '" + desc + "' of the sequence " + seq_id + " is in an invalid format.")
        index_seq = match.group(1) + match.group(2)
        counts = indices_stat.setdefault(index_seq, {"number": 0, "passing_filter": 0})
        counts["number"] += 1
        total_number += 1
        if passing_pattern.match(desc):
            counts["passing_filter"] += 1
            total_passing_filter += 1

    if state == 'determined' and len(indices_stat) > 1 and expected_index_seq is None:
        raise Exception("Please provide the expected index sequence for the file '%s'"%reads_file)

    # expected index seq for mismatch (only for determined)
    if state == 'determined' and expected_index_seq:
        indices_stat.clear()
        indices_stat[expected_index_seq] = {'number': total_number, "passing_filter": total_passing_filter}

    # Write indices count; the context manager closes the file even if a
    # write fails.
    with open(stat_file, 'w') as fh_stat_file:
        for index_seq, counts in indices_stat.items():
            fh_stat_file.write(index_seq + ";" + str(counts["number"]) + ";" + str(counts["passing_filter"]) + "\n")
class Demultiplex10XStats (Analysis):
    """
    Analysis gathering per-index read counts after a 10X demultiplexing.

    process() runs write_indices_stats on every determined / undetermined R1
    file; post_process() merges the resulting count files and stores the
    numbers as result elements.
    """

    def define_parameters(self, determined_R1_files, undetermined_R1_files, expected_indexes = None, index_count_threshold=0.8, undetermined_threshold=0.8):
        """
        @param determined_R1_files : R1 fastq files of the demultiplexed samples
        @param undetermined_R1_files : R1 fastq files of the undetermined reads
        @param expected_indexes : expected index of each determined file, in
               the same order as determined_R1_files
        @param index_count_threshold : unknown indices with fewer fragments than
               index_count_threshold * (fragments of the smallest sample) are
               merged in "All others"
        @param undetermined_threshold : the percentage of undetermined indexes
        """
        self.add_input_file_list( "determined_R1_files", "determined_R1_files", default=determined_R1_files, required=True, file_format = 'fastq')
        self.add_input_file_list( "undetermined_R1_files", "undetermined_R1_files", default=undetermined_R1_files, required=True, file_format = 'fastq')
        #Unknown indices with a number of fragments < index_count_threshold*number_of_fragments_in_sample_with_littlest_population are merged in "All others".
        self.add_parameter("index_count_threshold", "index_count_threshold", default=index_count_threshold, type='float')
        self.add_parameter("undetermined_threshold", "The percentage of undetermined indexes", default=undetermined_threshold, type='float')
        self.add_parameter_list('expected_indexes', 'list of expected index for each determined input file, this is used to ensure it allows some mismatches', default = expected_indexes)
        self.add_output_file_list( "determined_idx_count_files", "determined_idx_count_files", pattern='{basename_woext}.stdout', items=self.determined_R1_files)
        self.add_output_file_list( "undetermined_idx_count_files", "undetermined_idx_count_files", pattern='{basename_woext}.stdout', items=self.undetermined_R1_files)

    def define_analysis(self):
        """
        Set the analysis name, description, software and options.
        """
        self.name = "Demultiplex10XStats"
        self.description = "Statistics about 10X demultiplexing"
        self.software = "-"
        self.options = str(self.index_count_threshold)

    def post_process(self):
        """
        Merge the count files and store the counts as result elements.

        Every determined index is stored. Unknown indices are stored
        individually, most populated first, while they reach
        index_count_threshold * (smallest determined count) and fewer than
        100 of them have been stored; all remaining ones are summed up in
        "All others".
        """
        # Process samples
        min_determined = -1
        indices_stat = self._merged_indices_stats(self.determined_idx_count_files)
        for index_seq, counts in indices_stat.items():
            if min_determined == -1 or min_determined > counts["number"]:
                min_determined = counts["number"]
            self._add_result_element(index_seq, "number", str(counts["number"]), "determined")
            self._add_result_element(index_seq, "passing_filter", str(counts["passing_filter"]), "determined")

        # Process unknown indices
        other = {"number": 0, "passing_filter": 0}
        indices_stat = self._merged_indices_stats(self.undetermined_idx_count_files)
        # maximum number of undetermined index saved as new indexs
        max_nbindexsaved = min(len(indices_stat), 100)
        # Sort undetermined index on number of sequences
        indices_stat_sorted = sorted(indices_stat, key=lambda x: indices_stat[x]['number'], reverse=True)
        # NOTE: when there is no determined index, min_determined stays at -1,
        # the threshold below is negative and every unknown index qualifies.
        nbindexsaved = 0
        for index_seq in indices_stat_sorted:
            counts = indices_stat[index_seq]
            # Strict comparison: "<=" let one index more than the maximum through.
            if counts["number"] >= self.index_count_threshold*min_determined and nbindexsaved < max_nbindexsaved:
                self._add_result_element(index_seq, "number", str(counts["number"]), "undetermined")
                self._add_result_element(index_seq, "passing_filter", str(counts["passing_filter"]), "undetermined")
                nbindexsaved = nbindexsaved + 1
            else:
                other["number"] += counts["number"]
                other["passing_filter"] += counts["passing_filter"]
        self._add_result_element("All others", "number", str(other["number"]), "undetermined")
        self._add_result_element("All others", "passing_filter", str(other["passing_filter"]), "undetermined")

    def get_version(self):
        """
        @return : "-" (no software version is reported)
        """
        return "-"

    def _merged_indices_stats(self, files):
        """
        Sum the counts of several "<index>;<number>;<passing_filter>" files.
        @param files : paths of the count files to merge
        @return : hash table index -> {"number": int, "passing_filter": int}
        """
        indices_stat = {}
        for current_file in files:
            # The context manager closes the file even if a line is malformed.
            with open(current_file, 'r') as fh_current_file:
                for line in fh_current_file:
                    line = line.rstrip()
                    if not line:
                        # Tolerate blank lines instead of failing on the split.
                        continue
                    index, number, passing_filter = line.split(';')
                    counts = indices_stat.setdefault(index, {"number": 0, "passing_filter": 0})
                    counts["number"] += int(number)
                    counts["passing_filter"] += int(passing_filter)
        return indices_stat

    def process(self):
        """
        Schedule write_indices_stats on every determined and undetermined file.
        """
        demultiplex_stats = PythonFunction(write_indices_stats, cmd_format="{EXE} {IN} {OUT} {ARG}")
        logger = logging.getLogger("ng6")
        # expected_indexes defaults to None in define_parameters, so it may be
        # missing or shorter than the list of determined files.
        expected_indexes = self.expected_indexes or []
        # determined
        for idx, infile in enumerate(self.determined_R1_files):
            logger.debug("demultiplex10X Stat :")
            logger.debug(self.expected_indexes)
            arguments = ['determined']
            # Only forward an expected index that actually exists;
            # write_indices_stats treats it as optional.
            if idx < len(expected_indexes) and expected_indexes[idx]:
                arguments.append(expected_indexes[idx])
            demultiplex_stats(inputs=infile, outputs=self.determined_idx_count_files[idx], arguments=arguments)
        # undetermined
        for idx, infile in enumerate(self.undetermined_R1_files):
            demultiplex_stats(inputs=infile, outputs=self.undetermined_idx_count_files[idx], arguments='undetermined')
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment