# demultiplexont.py
#
# Copyright (C) 2012 INRA
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

import re, os
from subprocess import Popen, PIPE
import logging
import time
from ng6.analysis import Analysis
from ng6.utils import Utils
from jflow.utils import get_argument_pattern

class Demultiplex_ONT (Analysis):

    """
        Analysis running `seqkit stats --all` on a list of fastq files and
        recording the resulting per-file metrics as analysis results.
    """
    def __init__(self, args={}, id=None, function= "process"):
        # Signature kept as-is for compatibility with callers; `args` is only
        # forwarded to the parent constructor and never mutated here.
        Analysis.__init__(self, args, id, function)

    def define_parameters(self, fastq_files, archivename="DemultiplexONT_archive.tar"):
        """
        Declare the inputs and parameters of the analysis
          @param fastq_files : the fastq files on which the stats are computed
          @param archivename : default value of the "archive_name" parameter
        """
        self.add_input_file_list( "fastq_files", "fastq_files", default=fastq_files, required=True, file_format = 'fastq')
        #self.add_parameter("barcode_file", "Name of the barcode file", default=barcode_file, required=False , file_format = 'str')
        self.add_parameter("archive_name", "Name of the archive", default=archivename, type='str')
        #self.add_parameter( "run_name", "The name of the run (from total fastq file)", pattern='{basename_woext}', items=self.fastq_files, file_format = "fastq")

    def define_analysis(self):
        """
        Set the name, description and software of the analysis
        """
        self.name = "DemultiplexONT"
        self.description = "Produces stats about demultiplex files from Qcat"
        self.software = "Seqkit"

    def __parse_stat_file (self, stat_file):
        """
        Parse a whitespace-separated stat table: the first non-empty line is the
        header, each following non-empty line describes one sample, and the
        first column holds the sample name.
          @param stat_file : path of the stat file to parse
          @return          : {sample_name : {column_name : value, ...}, ...}
                             values are kept as strings; an empty file gives {}
        """
        logging.getLogger("jflow").debug("Begin DemultiplexONT.__parse_stat_file! file = %s", stat_file)

        # File parsing : one list of fields per non-empty line. Blank lines are
        # skipped because they carry no sample name.
        list_stats = []
        with open(stat_file, "r") as f_stat :
            for line in f_stat:
                fields = line.split()
                if fields:
                    list_stats.append(fields)

        # Nothing to parse: avoid failing on the header extraction
        if not list_stats:
            return {}

        # The first line is the header, the remaining ones are samples
        header = list_stats[0]
        # Creating a dictionary on this model
        # dico_stats[SampleName][parameterName] = Value
        dico_stats = {}
        for fields in list_stats[1:]:
            # zip() stops at the shortest sequence, so a truncated row keeps the
            # columns it has instead of raising an IndexError
            dico_stats[fields[0]] = dict(zip(header[1:], fields[1:]))
        return dico_stats

    def post_process(self):
        """
        Read the stat file written in the output directory by process() and
        register the metric headers and every per-file metric as result elements
        """
        logging.getLogger("jflow").debug("Begin DemultiplexONT.post_process! ont_qc")
        # Create dictionary : key = file name, value = {stat name : stat value}
        stats_dico = self.__parse_stat_file(os.path.join(self.output_directory, "DemultiplexONT.output"))

        # add header of stats
        stats_names = ["format",'num_seqs','sum_len','avg_len','min_len','max_len',"N50"]
        self._add_result_element("metrics", "headers", ','.join(stats_names),"stats_names")

        # Add stats metrics
        gz_suffix = ".fastq.gz"
        for fastq, fastq_stats in stats_dico.items():
            basename = os.path.basename(fastq)
            # Drop the extension (".fastq.gz" counts as a single extension),
            # then keep what follows the last underscore as the result group name
            if basename.endswith(gz_suffix):
                stem = basename[:-len(gz_suffix)]
            else :
                stem = os.path.splitext(basename)[0]
            fastq_name = stem.split('_')[-1]
            for stat, value in fastq_stats.items():
                self._add_result_element("stats_metrics", stat, value, fastq_name)

        logging.getLogger("jflow").debug("End DemultiplexONT.post_process! ")

    def get_version(self):
        """
        Run `seqkit version` through a shell and return the first line of its output
          @return : the raw stdout of the command, as returned by Popen.communicate()
        """
        shell_script = self.get_exec_path("seqkit") + " version | head -n1"
        logging.getLogger("jflow").debug("DemultiplexONT.get_version ! shell_script " + str(shell_script))
        cmd = ["sh","-c",shell_script]
        p = Popen(cmd, stdout=PIPE, stderr=PIPE)
        stdout, stderr = p.communicate()
        logging.getLogger("jflow").debug("DemultiplexONT.get_version !" + str(stderr))
        return stdout

    def process(self):
        """
        Register the shell command running `seqkit stats --all` on all the
        fastq files and redirecting the table to DemultiplexONT.output in the
        output directory
        """
        logging.getLogger("jflow").debug("Begin DemultiplexONT.process! ont_qc")

        # One positional placeholder per input file ($1 .. $n), then one more
        # for the output file ($n+1).
        # NOTE(review): presumably substituted by jflow following cmd_format - confirm
        placeholders = [" $" + str(position) for position, _ in enumerate(self.fastq_files, start=1)]
        str_input = "".join(placeholders)
        str_output = " $" + str(len(placeholders) + 1)

        # Create cmd
        self.add_shell_execution(self.get_exec_path("seqkit") +" stats --all " + str_input + ">" + str_output,
            cmd_format='{EXE} {IN} {OUT}' ,
            map=False,
            inputs = [self.fastq_files],
            outputs = os.path.join(self.output_directory, "DemultiplexONT.output"))

        #archive = self.output_directory + '/' + self.archive_name + '.tar.gz'
        #self.add_shell_execution('tar -czf $1 ' + self.output_directory + '/' + '*_trim.fastq ', cmd_format='{EXE} {OUT}', map=False, outputs = archive)

        logging.getLogger("jflow").debug("End DemultiplexONT.process! ")