Commit 9a264575 authored by Maxime Manno

Merge branch 'nG6_ont_demultiplex' into 'master'

nG6 ONT demultiplex

See merge request !48
parents bbf78d99 e48d92fe
{*
Copyright (C) 2009 INRA
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*}
{extends file='AnalysisTemplate.tpl'}
{block name=params}
{/block}
{block name=results_title} Reports {/block}
{block name=results}
{* find if user is CTD *}
<input type="hidden" id="user_login" value="{$user_login}" />
{if $user_login == "CTD"}
{assign var="isCTD" value=true}
{else}
{assign var="isCTD" value=false}
{/if}
{assign var="stats_metrics" value=$analyse_results["stats_metrics"]|@ksort}
{assign var="stats_headers" value=','|explode:$analyse_results["metrics"].stats_names.headers}
{assign var="stats_names" value=['format' => 'Format','num_seqs' => 'NB Sequences','sum_len' => 'Total Length','avg_len' => 'Mean Length','min_len' => 'Min Length','max_len' => 'Max Length','N50' => 'N50']}
<table id="ont_stats_datatable" class="table table-striped table-bordered dataTable analysis-result-table" style="white-space:nowrap;">
<thead>
<tr>
<th>Sample</th>
{foreach from=$stats_headers key=k item=stat}
<th>{$stats_names.$stat}</th>
{/foreach}
</tr>
</thead>
<tbody>
{foreach from=$stats_metrics key=fastq item=fastq_metrics}
<tr><th>{$fastq}</th>
{foreach from=$stats_headers key=k item=stat}
<td>{$fastq_metrics[$stat]}</td>
{/foreach}
</tr>
{/foreach}
</tbody>
</table>
{* Help block *}
<div class="tx-nG6-pi1-help">
<img src="" alt="" class="img" />
<p>Help for ONT metrics report:</p>
<span class="meta">
<ul>
<li><strong>Format</strong> :
The format of the sample file.
</li>
<li><strong>NB Sequences</strong> :
The total number of reads/sequences for this sample.
</li>
<li><strong>Total Length</strong> :
The total number of bases for this sample.
</li>
<li><strong>Mean Length</strong> :
The mean sequence length for this sample (bases).
</li>
<li><strong>Min Length</strong> :
The minimum sequence length for this sample (bases).
</li>
<li><strong>Max Length</strong> :
The maximum sequence length for this sample (bases).
</li>
<li><strong>N50</strong> :
50% of all bases come from reads/sequences longer than this value.
</li>
</ul>
</span>
</div>
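{* Worked example for the N50 definition above (illustrative values): for reads of
   10, 8, 6, 4 and 2 kb (30 kb in total), N50 = 8 kb, because the reads of at least
   8 kb (10 kb + 8 kb = 18 kb) already contain more than 50% of all bases. *}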
{/block}
\ No newline at end of file
@@ -39,7 +39,7 @@ class OntQualityCheck (NG6Workflow):
self.add_parameter("compression", "How should the data be compressed once archived", choices= [ "none", "gz", "bz2"], default = "gz")
self.add_parameter("trimming", "use trimming with porechop or not",choices= [ "yes", "no"], default = "no")
self.add_input_file("summary_file", "Input summary basecalling file", default=None)
self.add_parameter("barcoded", "If barcoded run : correspondance file", default = None)
self.add_parameter("barcoded", "If barcoded run : yes", default = "no")
self.add_parameter("fast5dir", "path of the fast5 directory", default = None)
def process(self):
@@ -64,17 +64,19 @@ class OntQualityCheck (NG6Workflow):
# find .log files
for file in glob(exec_path+"/*.log"):
self.log_files.append(file)
for file in glob(exec_path+"/*.pdf"):
self.log_files.append(file)
logging.getLogger("jflow").debug("OntQualityCheck._process.logfile self.log_files = " + ",".join(self.log_files))
logging.getLogger("jflow").debug("OntQualityCheck._process.logfile exiting")
# add logs
if len(self.log_files) > 0 :
add_log = self.add_component("BasicAnalysis", [self.log_files,"Log Files","Log files generated during primary analysis","-","-","-","gz", "","log.gz"])
add_log = self.add_component("BasicAnalysis", [self.log_files,"Log Files","Log files generated during primary analysis","-","-","-","none", "","log.gz"])
addrawfiles = self.add_component("AddRawFiles", [self.runobj, self.get_all_reads(), self.compression])
ontstat = self.add_component("Run_stats", [self.summary_file, sample_names[0]])
if (self.barcoded != None) or (self.barcoded != "no") :
demultiplexont = self.add_component("Demultiplex_ONT", [self.get_all_reads() , self.barcoded])
if (self.barcoded == "yes"):
demultiplexont = self.add_component("Demultiplex_ONT", [self.get_all_reads()])
if self.trimming == "yes":
trim_porechop = self.add_component("Trim_porechop", [self.get_all_reads() , "discard_middle"])
if self.fast5dir != None:
......
@@ -31,18 +31,16 @@ class Demultiplex_ONT (Analysis):
def __init__(self, args={}, id=None, function= "process"):
Analysis.__init__(self, args, id, function)
def define_parameters(self, fastq_files, barcode_file, archivename="DemultiplexONT_archive.tar"):
def define_parameters(self, fastq_files, archivename="DemultiplexONT_archive.tar"):
self.add_input_file_list( "fastq_files", "fastq_files", default=fastq_files, required=True, file_format = 'fastq')
self.add_parameter("barcode_file", "Name of the barcode file", default=barcode_file, required=True , file_format = 'str')
#self.add_parameter("barcode_file", "Name of the barcode file", default=barcode_file, required=False , file_format = 'str')
self.add_parameter("archive_name", "Name of the archive", default=archivename, type='str')
self.add_parameter( "run_name", "The name of the run (from total fastq file)", pattern='{basename_woext}', items=self.fastq_files, file_format = "fastq")
#self.add_parameter( "run_name", "The name of the run (from total fastq file)", pattern='{basename_woext}', items=self.fastq_files, file_format = "fastq")
def define_analysis(self):
self.name = "DemultiplexONT"
self.description = "Demultiplexes the total fastq of a barcoded ONT run and produces stats"
self.software = "Qcat"
#if self.discard_middle == "discard_middle":
# self.options = "--discard_middle"
self.description = "Produces stats about demultiplex files from Qcat"
self.software = "Seqkit"
def __parse_stat_file (self, stat_file):
logging.getLogger("jflow").debug("Begin DemultiplexONT.__parse_stat_file! file =",stat_file)
@@ -51,52 +49,51 @@ class Demultiplex_ONT (Analysis):
@param stat_file : the "seqkit stats --all" output file
@return : {sample_file : {stat_name : value, ...}, ...}
"""
# File parsing: record each line into a list
list_stats= []
with open(stat_file, "r") as f_stat :
for line in f_stat.readlines():
list_stats.append(line.split())
# Registering file's header into a list
header = list_stats.pop(0)
# Creating a dictionary with this structure
# dico_stats[SampleName][parameterName] = Value
dico_stats = {}
for sample_number in range(len(list_stats)):
print("Le numero de sample est : "+str(sample_number))
dico_stats[list_stats[sample_number][0]] = {}
for parameter_idx in range(1, len(header)):
print(" Le numero de parametre est : " + str(parameter_idx))
print(" Dans : dico [list_stats[spl_nb][0]] [header[param_idx]]")
print(" on va mettre: list_stats[spl_nb][param_idx]")
dico_stats[list_stats[sample_number][0]][header[parameter_idx]] = list_stats[sample_number][parameter_idx]
return dico_stats
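# Illustrative example (made-up values): given a whitespace-separated
# "seqkit stats --all" table such as
#   file           format  type  num_seqs  sum_len  min_len  avg_len  max_len  N50
#   BC01.fastq.gz  FASTQ   DNA   1200      9600000  150      8000.0   52000    12000
# __parse_stat_file returns
#   {"BC01.fastq.gz": {"format": "FASTQ", "type": "DNA", "num_seqs": "1200", ...}}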
def post_process(self):
logging.getLogger("jflow").debug("Begin DemultiplexONT.post_process! ont_qc")
# Create dictionary : key = file name or prefix, value = files path
results_files = []
stats_dico = self.__parse_stat_file(os.path.join(self.output_directory, "DemultiplexONT.output"))
# add header of stats
group = "statsporechop"
self._add_result_element("metrics", "headers", ','.join(["read_trim_start", "read_total_start", "bp_removed_start", "read_trim_end", "read_total_end", "bp_removed_end"]), group)
print(os.listdir(self.output_directory))
for file in os.listdir(self.output_directory):
full_file_path = os.path.join(self.output_directory, file)
logging.getLogger("jflow").debug("Trimporechop.post_process : full_file_path "+full_file_path)
if file.endswith(".fastq"):
logging.getLogger("jflow").debug("Trimporechop.post_process match .fastq : full_file_path "+full_file_path)
results_files.append(full_file_path)
elif file.endswith(".stdout"):
logging.getLogger("jflow").debug("Trimporechop.post_process match .stdout: full_file_path "+full_file_path)
results_files.append(full_file_path)
filename = os.path.basename(file).split(".stdout")[0]
resultlist = self.__parse_stat_file(full_file_path)
read_trim_start = resultlist[0]
read_total_start = resultlist[1]
bp_removed_start = resultlist[2]
read_trim_end = resultlist[3]
read_total_end = resultlist[4]
bp_removed_end = resultlist[5]
#add stats for each fastq file
self._add_result_element("ont_sample", "read_trim_start", read_trim_start,filename)
self._add_result_element("ont_sample", "read_total_start", read_total_start,filename)
self._add_result_element("ont_sample", "bp_removed_start", bp_removed_start,filename)
self._add_result_element("ont_sample", "read_trim_end", read_trim_end,filename)
self._add_result_element("ont_sample", "read_total_end", read_total_end,filename)
self._add_result_element("ont_sample", "bp_removed_end", bp_removed_end,filename)
#Finally create and add the archive to the analysis
#self._create_and_archive(results_files,self.archive_name)
self._archive_files(results_files, "gz")
stats_names = ["format",'num_seqs','sum_len','avg_len','min_len','max_len',"N50"]
# other columns available from seqkit stats --all: 'type', 'Q1', 'Q2', 'Q3', 'sum_gap', 'Q20(%)', 'Q30(%)'
self._add_result_element("metrics", "headers", ','.join(stats_names),"stats_names")
# Add stats metrics
for fastq in stats_dico:
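# keep only the barcode part of the file name, e.g. a demultiplexed file named
# "run1_barcode01.fastq.gz" (hypothetical name) gives fastq_name = "barcode01"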
if re.search(".fastq.gz",fastq):
fastq_name = os.path.basename(fastq).replace(".fastq.gz","").split('_')[-1]
else :
fastq_name = os.path.splitext(os.path.basename(fastq))[0].split('_')[-1]
for stat in stats_dico[fastq]:
self._add_result_element("stats_metrics", stat, stats_dico[fastq][stat],fastq_name)
logging.getLogger("jflow").debug("End DemultiplexONT.post_process! ")
def get_version(self):
shell_script = "module load system/Python-3.6.3;" + self.get_exec_path("qcat") + " --version"
shell_script = self.get_exec_path("seqkit") + " version | head -n1"
logging.getLogger("jflow").debug("DemultiplexONT.get_version ! shell_script " + str(shell_script))
cmd = ["sh","-c",shell_script]
p = Popen(cmd, stdout=PIPE, stderr=PIPE)
@@ -107,15 +104,23 @@ class Demultiplex_ONT (Analysis):
def process(self):
logging.getLogger("jflow").debug("Begin DemultiplexONT.process! ont_qc")
iter = 1
str_input = ""
str_output = ""
for fastq in self.fastq_files:
str_input = str_input + " $" + str(iter)
iter = iter + 1
str_output = " $"+ str(iter)
# Create cmd
self.add_shell_execution(self.get_exec_path("qcat") +" " + self.options + "-f $1 -b " + str(self.outpath) + " -k " + str(self.kit) + " > ${" + str() + "}",
self.add_shell_execution(self.get_exec_path("seqkit") +" stats --all " + str_input + ">" + str_output,
cmd_format='{EXE} {IN} {OUT}' ,
map=False,
inputs = self.fastq_files)
inputs = [self.fastq_files],
outputs = os.path.join(self.output_directory, "DemultiplexONT.output"))
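# With two demultiplexed fastq inputs the command built above expands, roughly, to
#   seqkit stats --all $1 $2 > $3
# where $1 and $2 are the fastq files and $3 is <output_directory>/DemultiplexONT.output (illustrative)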
#archive = self.output_directory + '/' + self.archive_name + '.tar.gz'
#self.add_shell_execution('tar -czf $1 ' + self.output_directory + '/' + '*_trim.fastq ', cmd_format='{EXE} {OUT}', map=False, outputs = archive)
logging.getLogger("jflow").debug("End Trimporechop.process! ")
logging.getLogger("jflow").debug("End Seqkit.process! ")