Commit f01a98e3 authored by Maxime Manno 🍜
Browse files

Add trimporchop component

parent 6d118a0b
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import re, os
from subprocess import Popen, PIPE
import logging
import time
from ng6.analysis import Analysis
from ng6.utils import Utils
from jflow.utils import get_argument_pattern
class Trim_porechop (Analysis):
    """
    Trim the reads from ONT data with porechop.

    One porechop command is scheduled per expected stdout file; the standard
    output of each command is redirected to that file and parsed in
    post_process to collect the adapter trimming statistics.
    """

    def define_parameters(self, fastq_files, nbthreads=4, formatfile="fastq", discard_middle="discard_middle", archivename="porechop_archive"):
        """
        Declare the inputs, parameters and outputs of the component
          @param fastq_files : the read files to trim
          @param nbthreads : number of threads given to porechop (--threads)
          @param formatfile : value given to porechop --format, also used as
                              extension of the trimmed files
          @param discard_middle : "discard_middle" to run porechop with
                                  --discard_middle, "do_not_discard_middle" otherwise
          @param archivename : name of the archive created in post_process
        """
        self.add_input_file_list("fastq_files", "fastq_files", default=fastq_files, required=True, file_format='fastq')
        self.add_parameter("nbthreads", "number of threads to use", default=nbthreads, type='int')
        self.add_parameter("formatfile", "format of the input files", default=formatfile, type='str')
        self.add_parameter("discard_middle", "discard_middle", default=discard_middle, choices=["discard_middle", "do_not_discard_middle"])
        self.add_parameter("archive_name", "Name of the archive", default=archivename, type='str')

        # One trimmed file and one stdout file are expected per input file.
        output_ext = '_trim.' + self.formatfile
        self.add_output_file_list("files_trimmed", "files_trimmed", pattern='{basename_woext}' + output_ext, items=self.fastq_files, file_format=self.formatfile)
        self.add_output_file_list("stdouts", "stdouts", pattern='{basename_woext}.stdout', items=self.fastq_files)

    def define_analysis(self):
        """
        Set the name, description, software and options of the analysis
        """
        self.name = "TrimPorechop"
        self.description = "Trim the reads generated by Albacore and remove ONT adapters"
        self.software = "porechop"
        if self.discard_middle == "discard_middle":
            self.options = "--discard_middle"
        elif not getattr(self, "options", None):
            # process() always concatenates self.options into the command
            # line: make sure it is a string when no flag is requested.
            self.options = ""

    def __parse_stat_file (self, stat_file):
        """
        Parse the stat file
          @param stat_file : the stdout porechop
          @return : [read_trim_start, read_total_start, bp_removed_start,
                     read_trim_end, read_total_end, bp_removed_end]
                    Each value is the text found in the file, or 0 when the
                    corresponding line is absent.
        """
        log = logging.getLogger("jflow")
        log.debug("Begin Trimporechop.__parse_stat_file! file = %s", stat_file)

        start_pattern = re.compile(r"(.*) reads had adapters trimmed from their start (.*)")
        end_pattern = re.compile(r"(.*) reads had adapters trimmed from their end (.*)")

        read_trim_start = 0
        read_total_start = 0
        bp_removed_start = 0
        read_trim_end = 0
        read_total_end = 0
        bp_removed_end = 0

        # Block until the file has some content, sleeping between two checks
        # so that the wait neither burns CPU nor floods the log.
        while os.stat(stat_file).st_size == 0:
            log.debug("Trimporechop.__parse_stat_file! file empty : %s", stat_file)
            log.debug("Trimporechop.__parse_stat_file! spleep 10...")
            time.sleep(10)

        with open(stat_file, "r") as handle:
            for line in handle:
                line = line.strip()
                # Fields are picked by position once the line is split on
                # single spaces: 0 = trimmed reads, 2 = total reads,
                # 10 = removed bases (prefixed by an opening parenthesis).
                if start_pattern.search(line) is not None:
                    log.debug("Trimporechop.__parse_stat_file : line start %s", line)
                    fields = line.split(" ")
                    read_trim_start = fields[0]
                    read_total_start = fields[2]
                    bp_removed_start = fields[10].replace("(", "")
                if end_pattern.search(line) is not None:
                    fields = line.split(" ")
                    read_trim_end = fields[0]
                    read_total_end = fields[2]
                    bp_removed_end = fields[10].replace("(", "")

        log.debug("Trimporechop.__parse_stat_file : read_trim_start %s", read_trim_start)
        log.debug("Trimporechop.__parse_stat_file : read_trim_end %s", read_trim_end)
        log.debug("End Trimporechop.__parse_stat_file! ")
        return [read_trim_start, read_total_start, bp_removed_start, read_trim_end, read_total_end, bp_removed_end]

    def post_process(self):
        """
        Record the trimming statistics of each stdout file and archive the
        result files found in the output directory
        """
        log = logging.getLogger("jflow")
        log.debug("Begin Trimporechop.post_process! ont_qc")

        # Files to add to the archive
        results_files = []

        # add header of stats
        group = "statsporechop"
        headers = ["read_trim_start", "read_total_start", "bp_removed_start", "read_trim_end", "read_total_end", "bp_removed_end"]
        self._add_result_element("metrics", "headers", ','.join(headers), group)

        entries = os.listdir(self.output_directory)
        log.debug("Trimporechop.post_process : output directory content %s", entries)
        for entry in entries:
            full_file_path = os.path.join(self.output_directory, entry)
            log.debug("Trimporechop.post_process : full_file_path " + full_file_path)
            # NOTE(review): only ".fastq" files are archived whereas the
            # trimmed files are named after the formatfile parameter - confirm
            # this is intended for other formats.
            if entry.endswith(".fastq"):
                log.debug("Trimporechop.post_process match .fastq : full_file_path " + full_file_path)
                results_files.append(full_file_path)
            elif entry.endswith(".stdout"):
                log.debug("Trimporechop.post_process match .stdout: full_file_path " + full_file_path)
                results_files.append(full_file_path)
                filename = os.path.basename(entry).split(".stdout")[0]
                # add stats for each fastq file; the parser returns the
                # values in the same order as the headers
                stats = self.__parse_stat_file(full_file_path)
                for key, value in zip(headers, stats):
                    self._add_result_element("ont_sample", key, value, filename)

        # Finally create and add the archive to the analysis
        self._create_and_archive(results_files, self.archive_name)
        log.debug("End Trimporechop.post_process! ")

    def get_version(self):
        """
        Run "porechop --version"
          @return : what the command wrote on its standard output, as read
                    from the pipe (neither decoded nor stripped)
        """
        cmd = [self.get_exec_path("porechop"), "--version"]
        p = Popen(cmd, stdout=PIPE, stderr=PIPE)
        stdout, _ = p.communicate()
        return stdout

    def process(self):
        """
        Add one porechop shell execution per expected stdout file
        """
        log = logging.getLogger("jflow")
        log.debug("Begin Trimporechop.process! ont_qc")
        for output_pos, output in enumerate(self.stdouts):
            # Set prefix : name of the stdout file without its extension
            reg = re.search(r"(.+)\.stdout$", output)
            basename = os.path.basename(reg.group(1))

            # Build fastq list for sample read
            expected_name = basename + "." + self.formatfile
            file_group = [fastq for fastq in self.fastq_files if os.path.basename(fastq) == expected_name]

            # Create cmd : the placeholders are numbered from the value
            # returned by get_argument_pattern (input, trimmed file, stdout)
            _, next_arg_number = get_argument_pattern(file_group, 0)
            cmd = (self.get_exec_path("porechop") + " " + self.options
                   + " --input ${" + str(next_arg_number) + "}"
                   + " --output ${" + str(next_arg_number + 1) + "}"
                   + " --format " + self.formatfile
                   + " --threads " + str(self.nbthreads)
                   + " > " + " ${" + str(next_arg_number + 2) + "}")
            self.add_shell_execution(cmd,
                                     cmd_format='{EXE} {IN} {OUT}',
                                     map=False,
                                     inputs=file_group,
                                     outputs=[self.files_trimmed[output_pos], self.stdouts[output_pos]])
        log.debug("End Trimporechop.process! ")
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment