run_stats.py 15.1 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#
# Copyright (C) 2012 INRA
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#


import os
import json
import logging
import re

from ng6.analysis import Analysis

from subprocess import Popen, PIPE
from jflow.utils import get_argument_pattern
from ng6.utils import Utils

class Run_stats (Analysis):
    """
        This module make some statistic from ONT run with graphs
    """
    
    def define_parameters(self, sequencing_summary_file, barcoded=False, archive_name="RunStats_archive.tar.gz"):
        logging.getLogger("jflow").debug("Begin Run_stats parameters")
        self.add_input_file( "sequencing_summary_file", "Input ont sequencing summary file from Albacore", default=sequencing_summary_file, file_format = "txt", required=True)
        self.add_parameter("barcoded", "Indicate that barcodes are used for this run", default=barcoded, type='str')
        self.add_parameter("archive_name", "Archive name", default=archive_name)
        
        self.add_output_file_list("stderr", "stderr ouput file",pattern='Run_stats.stderr', items = self.sequencing_summary_file)
        if self.barcoded == "yes":
            self.add_output_file_list("stderr_barcoded", "stderr ouput barcoded file",pattern='Run_stats_barcoded.stderr', items = self.sequencing_summary_file)

        
    def get_version(self):
        #cmd = [self.get_exec_path("Rscript")," /save/sbsuser/analyses_scripts/mmanno/graph_albacoresummary.R"]
        #p = Popen(cmd, stdout=PIPE, stderr=PIPE)
        #stdout, stderr = p.communicate()
        #return stdout.split()[1]
        return "v1.1"
        
    def define_analysis(self):
        self.name = "RUNStats"
        self.description = "Statistics on reads and their qualities with R."
        self.software = "Rscript"
        self.options = "-"

    def __parse_stat_file (self, stat_file):
        """
        Parse the stat file
          @param stat_file : the runstatsR stats file
          @return             : {"" : "", ...}
        """
        stats = {}
        logging.getLogger("jflow").debug("Begin post_process  _parse_stat_file!")
        for line in open(stat_file, 'r').readlines():
            parts = line.strip().split("\t")
            
            if parts[0] == "nb_reads": stats["nb_reads"] = parts[1]
            if parts[0] == "total_bases": stats["total_bases"] = parts[1]
            if parts[0] == "median_read_length": stats["median_read_length"] = parts[1] 
            if parts[0] == "mean_read_length": stats["mean_read_length"] = parts[1]
            if parts[0] == "N50_read_length": stats["N50_read_length"] = parts[1]
            if parts[0] == "median_read_quality": stats["median_read_quality"] = parts[1]
            if parts[0] == "mean_read_quality": stats["mean_read_quality"] = parts[1]
            if parts[0] == "median_yield_per_sec": stats["median_yield_per_sec"] = parts[1]
            if parts[0] == "mean_yield_per_sec": stats["mean_yield_per_sec"] = parts[1]
79
80
81
            if parts[0] == "nb_read_utils": stats["nb_read_useful_data"] = parts[1]
            if parts[0] == "total_bases_utils": stats["total_bases_useful_data"] = parts[1]
            if parts[0] == "N50_read_length_utils": stats["N50_read_length_useful_data"] = parts[1]
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
        #print(stats)
        return stats
        
    def __parse_barcode_file (self, barcode_file):
        """
        Parse the barcode file
          @param barcode_file : the runstatsR barcode file
          @return             : {"" : "", ...}
        """
        stats = {}
        barcode_names = []
        logging.getLogger("jflow").debug("Begin post_process  _parse_barcode_file!")
        for line in open(barcode_file, 'r').readlines():
            parts = line.strip().split(",")
            stats[parts[0]] = parts
            barcode_names.append(parts[0])
        #print(barcode_names)
        #print(stats)
        return stats,barcode_names
        
    def post_process(self):
        logging.getLogger("jflow").debug("Begin Run_stats.post_process! "+self.output_directory)
        results_files = []
        metrics = []
        
        cmd = [self.get_exec_path("pwd")]
        p = Popen(cmd, stdout=PIPE, stderr=PIPE)
        stdout, stderr = p.communicate()
        print(stdout.split()[0])
        
        sample = "ONT_sample"
        
114
115
116
117
        
        group = "Stats"
        self._add_result_element(sample, "sequencing_summary", self._save_file(self.sequencing_summary_file, sample + ".sequencing_summary.txt"), group)
        
118
119
120
121
122
123
124
125
126
127
128
129
130
131
        #logging.getLogger("jflow").debug("Begin Nanoplot.post_process - sample "+file)
        # stat file
        statfile = os.path.join(self.output_directory,"plot_stats.txt")
        for file in os.listdir(self.output_directory):
            full_file_path = os.path.join(self.output_directory, file)
            if file.endswith(".zip"):
                results_files.append(full_file_path)
        
        if os.path.isfile(statfile): 
            
            stat_info = self.__parse_stat_file(os.path.join(self.output_directory, "plot_stats.txt"))
                    
            group = 'basic'
            metrics.append(group)
132
            self._add_result_element("metrics", "headers", ','.join(["nb_reads", "total_bases", "median_read_length","mean_read_length" , "N50_read_length", "median_yield_per_sec"]), group)
133
134
135
136
137
138
            
            self._add_result_element(sample, "nb_reads", str(stat_info["nb_reads"]),group),
            self._add_result_element(sample, "total_bases", str(stat_info["total_bases"]),group),
            self._add_result_element(sample, "median_read_length", str(stat_info["median_read_length"]),group),
            self._add_result_element(sample, "mean_read_length", str(stat_info["mean_read_length"]),group),
            self._add_result_element(sample, "N50_read_length", str(stat_info["N50_read_length"]),group),
139
140
            self._add_result_element(sample, "median_yield_per_sec", str(stat_info["median_yield_per_sec"]),group),
            self._add_result_element(sample, "mean_yield_per_sec", str(stat_info["mean_yield_per_sec"]),group),
141
142
            group = 'quality'
            metrics.append(group)
143
            self._add_result_element("metrics", "headers", ','.join(["median_read_quality","nb_read_useful_data", "total_bases_useful_data", "N50_read_length_useful_data"]), group)
144
145
            self._add_result_element(sample, "median_read_quality", str(stat_info["median_read_quality"]),group),
            self._add_result_element(sample, "mean_read_quality", str(stat_info["mean_read_quality"]),group),
146
147
148
            self._add_result_element(sample, "nb_read_useful_data", str(stat_info["nb_read_useful_data"]),group),
            self._add_result_element(sample, "total_bases_useful_data", str(stat_info["total_bases_useful_data"]),group),
            self._add_result_element(sample, "N50_read_length_useful_data", str(stat_info["N50_read_length_useful_data"]),group),
149
150
151
            
        group = 'plots'
        metrics.append(group)
152
        self._add_result_element("metrics", "headers", ','.join(["cumulate_yield_per_hour", "distribution_length", "distribution_length_useful_data", "distribution_qscore", "length_vs_qscore_density"]), group)
153
        if os.path.isfile(os.path.join(self.output_directory, "plot_cumulyieldperhour.png")):
154
            self._add_result_element(sample, "cumulate_yield_per_hour", self._save_file(os.path.join(self.output_directory, "plot_cumulyieldperhour.png"), 
155
156
157
                                                                            sample + ".cumulyieldperhour.png"), group)
            results_files.append(os.path.join(self.output_directory, "plot_cumulyieldperhour.png"))                                                                
        if os.path.isfile(os.path.join(self.output_directory, "plot_outrm_distriblength.png")):
158
            self._add_result_element(sample, "distribution_length", self._save_file(os.path.join(self.output_directory, "plot_outrm_distriblength.png"), 
159
160
                                                                            sample + ".outrm_distriblength.png"), group)
            results_files.append(os.path.join(self.output_directory, "plot_outrm_distriblength.png"))
161
162
163
164
        if os.path.isfile(os.path.join(self.output_directory, "plot_outrm_distriblength_utils.png")):
            self._add_result_element(sample, "distribution_length_useful_data", self._save_file(os.path.join(self.output_directory, "plot_outrm_distriblength_utils.png"), 
                                                                            sample + ".outrm_distriblength_utils.png"), group)
            results_files.append(os.path.join(self.output_directory, "plot_outrm_distriblength_utils.png"))
165
        if os.path.isfile(os.path.join(self.output_directory, "plot_outrm_distribqscore.png")):
166
            self._add_result_element(sample, "distribution_qscore", self._save_file(os.path.join(self.output_directory, "plot_outrm_distribqscore.png"), 
167
168
169
                                                                            sample + ".outrm_distribqscore.png"), group)
            results_files.append(os.path.join(self.output_directory, "plot_outrm_distribqscore.png"))
        if os.path.isfile(os.path.join(self.output_directory, "plot_outrm_lengthvsqscore_density.png")):
170
            self._add_result_element(sample, "length_vs_qscore_density", self._save_file(os.path.join(self.output_directory, "plot_outrm_lengthvsqscore_density.png"), 
171
172
173
174
175
176
177
178
179
180
                                                                            sample + ".outrm_lengthvsqscore_density.png"), group)
            results_files.append(os.path.join(self.output_directory, "plot_outrm_lengthvsqscore_density.png"))

        if self.barcoded == "yes":
            barcodefile = os.path.join(self.output_directory,"plot_barcoded_statsbarcodes.txt")
            if os.path.isfile(barcodefile):
                barcode_info, barcode_names = self.__parse_barcode_file(os.path.join(self.output_directory, "plot_barcoded_statsbarcodes.txt"))
                    
                group = 'barcode'
                metrics.append(group)
181
                self._add_result_element("metrics", "headers", ','.join(["barcode_score","nb_reads","total_bases","median_read_length","mean_read_length", "N50_read_length","median_read_quality","median_yield_per_sec","nb_read_useful_data","total_bases_useful_data","N50_read_length_useful_data"]), group)
182
183
184
185
186
187
188
189
190
                self._add_result_element("metrics", "names",    ','.join(barcode_names),group)
                
                for barcode in barcode_names :
                    group = barcode
                    
                    self._add_result_element(sample, "barcode_score", str(barcode_info[barcode][1]),group),
                    self._add_result_element(sample, "nb_reads", str(barcode_info[barcode][2]),group),
                    self._add_result_element(sample, "total_bases", str(barcode_info[barcode][3]),group),
                    self._add_result_element(sample, "median_read_length", str(barcode_info[barcode][4]),group),
191
                    self._add_result_element(sample, "mean_read_length", str(barcode_info[barcode][5]),group),
192
                    self._add_result_element(sample, "N50_read_length", str(barcode_info[barcode][6]),group),
193
                    self._add_result_element(sample, "median_read_quality", str(barcode_info[barcode][7]),group),
194
195
196
197
                    self._add_result_element(sample, "median_yield_per_sec", str(barcode_info[barcode][9]),group),
                    self._add_result_element(sample, "nb_read_useful_data", str(barcode_info[barcode][11]),group),
                    self._add_result_element(sample, "total_bases_useful_data", str(barcode_info[barcode][12]),group),
                    self._add_result_element(sample, "N50_read_length_useful_data", str(barcode_info[barcode][13]),group),
198
199
200
                    
                group = 'plots_barcode'
                metrics.append(group)
201
                self._add_result_element("metrics", "headers", ','.join(["qscore_boxplot", "qscore_per_time_intervals_boxplot"]), group)
202
203
                
                if os.path.isfile(os.path.join(self.output_directory, "plot_barcoded_qscoreboxplot.png")):
204
                    self._add_result_element(sample, "qscore_boxplot", self._save_file(os.path.join(self.output_directory, "plot_barcoded_qscoreboxplot.png"), 
205
206
207
208
                                                                            sample + ".barcoded_qscoreboxplot.png"), group)
                    results_files.append(os.path.join(self.output_directory, "plot_barcoded_qscoreboxplot.png"))
                    
                if os.path.isfile(os.path.join(self.output_directory, "plot_barcoded_qscorepertimeintervalsboxplot.png")):
209
                    self._add_result_element(sample, "qscore_per_time_intervals_boxplot", self._save_file(os.path.join(self.output_directory, "plot_barcoded_qscorepertimeintervalsboxplot.png"), 
210
211
212
213
214
215
216
217
218
219
220
                                                                            sample + ".barcoded_qscorepertimeintervalsboxplot.png"), group)
                    results_files.append(os.path.join(self.output_directory, "plot_barcoded_qscorepertimeintervalsboxplot.png"))
                    
        # Finaly create and add the archive to the analysis
        self._create_and_archive(results_files,self.archive_name)

    def process(self):
        logging.getLogger("jflow").debug("Begin Run_stats.process! ont_qc")
        #print (self.sequencing_summary_file)
        
        
221
        self.add_shell_execution(self.get_exec_path("Rscript") +" /work/sbsuser/test/mmanno/scripts_ngs/graph_albacoresummary.R " +' -f '+ '$1' +' --out ' + self.output_directory + " 2> " +' $2', 
222
223
224
225
226
227
                                cmd_format='{EXE} {IN} {OUT}' ,
                                map=False, 
                                inputs = self.sequencing_summary_file,
                                outputs = self.stderr)
        
        if self.barcoded == "yes" :
228
            self.add_shell_execution(self.get_exec_path("Rscript") +" /work/sbsuser/test/mmanno/scripts_ngs/graph_albacoresummary_barcode.R " +' -f '+ '$1' +' --out ' + self.output_directory + " 2> " +' $2', 
229
230
231
232
233
234
235
236
237
                                cmd_format='{EXE} {IN} {OUT}' ,
                                map=False, 
                                inputs = self.sequencing_summary_file,
                                outputs = self.stderr_barcoded)
        
        #self.add_shell_execution('tar -czf '+ self.output_directory +'/'+'Run_stats_archive.tar.gz -C '+ self.output_directory +' plot_stats.txt -C '+ self.output_directory +' *.png ', cmd_format='{EXE} {OUT}', 
        #                             map=False, outputs = self.archive_name)
        
        logging.getLogger("jflow").debug("End Run_stats.process! ")