#
# Copyright (C) 2012 INRA
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

import os
import re
import pickle
import logging

from subprocess import Popen, PIPE

from ng6.analysis import Analysis

def inserts_metrics(bam_file, pairs_count_file, metrics_file, hist_file, log_file, samtools_path, java_path, picard_path, options_dump_path, memory):
    """
    Compute insert-size metrics for one BAM file with Picard CollectInsertSizeMetrics.

    When the BAM contains no properly paired reads, Picard is skipped and all
    expected output files are created empty so downstream steps still find them.

      @param bam_file : path for bam
      @param pairs_count_file : path to the produced file with the number of reads pairs in bam
      @param metrics_file : path to the metrics file produced by collectinsertsizemetrics
      @param hist_file : path to the histogram produced by collectinsertsizemetrics
      @param log_file : path to the log produced by collectinsertsizemetrics
      @param samtools_path : path to the software samtools
      @param java_path : path to the software java
      @param picard_path : path to the software picard.jar
      @param options_dump_path : path to the pickled options string for collectinsertsizemetrics
      @param memory : JVM heap size (e.g. "1G") used to build the -Xmx option
    """
    # Local imports: this function is scheduled through add_python_execution()
    # and may run in a separate process, so it cannot rely on module-level imports.
    from subprocess import Popen, PIPE
    import pickle

    # Reload the options string dumped by InsertsSizes.process().
    with open(options_dump_path, "rb") as options_dump:
        options = pickle.load(options_dump)

    xmx = "-Xmx" + memory.lower()

    # Count properly paired reads (-f67: paired + properly paired + first in pair).
    # NOTE: the command string is passed directly to the shell; the previous
    # ["-c", cmd] form with shell=True made /bin/sh execute the literal string
    # "-c" as its command, so samtools never actually ran.
    command = Popen(samtools_path + " view -f67 " + bam_file + " | wc -l", shell=True, stdout=PIPE, stderr=PIPE)
    properly_paired_nb, stderr = command.communicate()
    properly_paired_nb = int(properly_paired_nb.decode().strip())

    if properly_paired_nb > 0 :
        # Process inserts sizes metrics
        command = Popen(java_path + " " + xmx + " -jar " + picard_path + " CollectInsertSizeMetrics " + options + " HISTOGRAM_FILE=" + hist_file + " INPUT=" + bam_file + " OUTPUT=" + metrics_file + " 2> " + log_file, shell=True, stdout=PIPE, stderr=PIPE)
        stdout, stderr = command.communicate()
        # Count nb pairs in bam file (-F384: exclude second-in-pair and secondary alignments)
        command = Popen(samtools_path + " view -F384 " + bam_file + " | wc -l", shell=True, stdout=PIPE, stderr=PIPE) # First read in pair
        pairs_nb, stderr = command.communicate()
        # Write count
        with open(pairs_count_file, "w") as pairs_count_fh:
            pairs_count_fh.write( "NB_PAIRED="+str(pairs_nb.decode().strip()) )
    else:
        # No properly paired reads: CollectInsertSizeMetrics would fail, so
        # just create every expected output file empty.
        for empty_path in (metrics_file, hist_file, log_file, pairs_count_file):
            open(empty_path, "wb").close()

class InsertsSizes (Analysis):
    """
    Analysis component computing insert-size statistics for a list of BAM
    files with Picard CollectInsertSizeMetrics, plus samtools pair counts.
    """

    def define_parameters(self, bam_files, histogram_width=700, minimum_pct=0.01, validation_stringency="LENIENT", archive_name=None):
        """
          @param bam_files : paths for bams
          @param histogram_width : explicitly sets the histogram width, overriding automatic truncation of histogram tail
          @param minimum_pct : when generating the histogram, discard any data categories (out of FR, TANDEM, RF) that have fewer than this percentage of overall reads
          @param validation_stringency : Validation stringency for all SAM files read by this program. Setting stringency to SILENT can improve performance when processing a BAM file in which variable-length data (read, qualities, tags) do not otherwise need to be decoded. Default value: STRICT. This option can be set to 'null' to clear the default value. Possible values: {STRICT, LENIENT, SILENT}
          @param archive_name : name for the output archive
        """
        self.add_input_file_list( "bam_files", "bam_filess", default=bam_files, required=True, file_format = 'bam')
        self.add_parameter("histogram_width", "histogram_width", default=histogram_width, type=int)
        self.add_parameter("minimum_pct", "minimum_pct", default=minimum_pct, type=float)
        self.add_parameter("validation_stringency", "validation_stringency", default=validation_stringency)
        self.add_parameter("archive_name", "Archive name", default=archive_name)
        # One metrics (.txt), histogram (.pdf), log and pair-count file per input BAM.
        self.add_output_file_list( "info_files", "info_files", pattern='{basename_woext}.txt', items=self.bam_files)
        self.add_output_file_list( "hist_files", "hist_files", pattern='{basename_woext}.pdf', items=self.bam_files)
        self.add_output_file_list( "log_files", "log_files", pattern='{basename_woext}.log', items=self.bam_files)
        self.add_output_file_list( "pairs_count_files", "pairs_count_files", pattern='{basename_woext}.count', items=self.bam_files)
        # JVM heap: default 1G, overridden by the component memory setting if any.
        self.memory = '1G'
        if self.get_memory() is not None :
            self.memory = self.get_memory()

    def define_analysis(self):
        # Metadata describing this analysis, plus the CollectInsertSizeMetrics
        # options string later serialized for inserts_metrics().
        self.name = "InsertsSizes"
        self.description = "Insert size statistics"
        self.software = "Picards tools - Insert size"
        self.options = "HISTOGRAM_WIDTH="+str(self.histogram_width)+" VALIDATION_STRINGENCY="+self.validation_stringency+" MINIMUM_PCT="+str(self.minimum_pct)

    def post_process(self):
        """
        Parse each Picard metrics file, record per-sample results and the
        number of omitted (empty-metrics) samples, then archive the metrics.
        """
        nb_omitted_samples = 0

        # Index pair-count files by sample name once, instead of rescanning
        # the whole list for every metrics file (previously O(n^2)).
        pairs_count_by_sample = {}
        for pairs_count_file in self.pairs_count_files:
            pairs_count_by_sample[os.path.splitext(os.path.basename(pairs_count_file))[0]] = pairs_count_file

        for info_file in self.info_files:
            # An empty metrics file means the BAM had no properly paired reads
            # (see inserts_metrics), so the sample is counted as omitted.
            if os.path.exists(info_file) and os.path.getsize(info_file) > 0:
                sample = os.path.splitext(os.path.basename(info_file))[0]
                # Parse results
                insert_sizes , metrics = self.parse_metrics_file(info_file)
                # Save inserts metrics ([:-1] drops the trailing comma)
                self._add_result_element(sample, "inserts_sizes", insert_sizes[:-1])
                for orientation in metrics:
                    self._add_result_element(sample, "nb_pair_used", metrics[orientation]["READ_PAIRS"], orientation)
                    self._add_result_element(sample, "median_size", metrics[orientation]["MEDIAN_INSERT_SIZE"], orientation)
                    self._add_result_element(sample, "mean_size", metrics[orientation]["MEAN_INSERT_SIZE"], orientation)
                    self._add_result_element(sample, "min_size", metrics[orientation]["MIN_INSERT_SIZE"], orientation)
                    self._add_result_element(sample, "max_size", metrics[orientation]["MAX_INSERT_SIZE"], orientation)
                    self._add_result_element(sample, "std_deviation", metrics[orientation]["STANDARD_DEVIATION"], orientation)
                    if "NB_INSERTS_SIZES" in metrics[orientation] :
                        self._add_result_element(sample, "nb_inserts_sizes", metrics[orientation]["NB_INSERTS_SIZES"][:-1], orientation)
                # Save paired count
                if sample in pairs_count_by_sample :
                    self._add_result_element(sample, "nb_pair_in_file", self.parse_pairs_count_file(pairs_count_by_sample[sample]))
            else:
                nb_omitted_samples += 1

        self._add_result_element("analysis", "nb_omitted_samples", nb_omitted_samples)

        # Finaly create and add the archive to the analyse
        self._create_and_archive(self.info_files, self.archive_name)

    def get_version(self):
        """
          @return : the Picard version string (CollectInsertSizeMetrics --version prints it on stderr)
        """
        xmx ="-Xmx1g"
        cmd = self.get_exec_path("javaPICARD")+" " + xmx +" -jar {} CollectInsertSizeMetrics --version".format(self.get_exec_path("Picard"))
        p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
        stdout, stderr = p.communicate()
        return(stderr.decode("utf-8").rsplit()[0])

    def process(self):
        """
        Serialize the Picard options, then schedule one inserts_metrics()
        execution per input BAM file.
        """
        # The options string is pickled to a file because inserts_metrics()
        # may run in another process and only receives file paths / strings.
        options_dump_path = self.get_temporary_file(".dump")
        options_dump = open(options_dump_path, "wb")
        pickle.dump(self.options, options_dump)
        options_dump.close()

        for i in range(len(self.bam_files)):
            self.add_python_execution(inserts_metrics,cmd_format="{EXE} {IN} {OUT} {ARG}",
                                      inputs=self.bam_files[i], outputs=[self.pairs_count_files[i], self.info_files[i], self.hist_files[i], self.log_files[i]],
                                      arguments=[self.get_exec_path("samtools"), self.get_exec_path("javaPICARD"), self.get_exec_path("Picard"), options_dump_path, self.memory])

    def parse_pairs_count_file(self, input_file):
        """
          @param input_file  : the pairs  count file path
          @return            : the number of properly paired
        """
        count_file_fh = open(input_file, "r")
        first_line = count_file_fh.readline()
        name, count = first_line.split("=")     #Line pattern : NB_PAIRED=1000
        count_file_fh.close()
        return count.strip()

    def parse_metrics_file(self, input_file):
        """
          @param input_file  : the metrics file path
          @return            : string with inserts sizes class, metrics dictionnary
        """
        inserts_sizes = ""
        metrics = {}
        metrics_titles = []
        metrics_section = False
        histogram_titles = []
        histogram_section = False
        title = False

        fh_file = open(input_file, "r")

        for line in fh_file:
            # Actually strip trailing whitespace (the previous code discarded
            # the rstrip() result, leaving the line unchanged).
            line = line.rstrip()

            if metrics_section:
                if not line.strip():
                    # Blank line closes the "## METRICS CLASS" section.
                    metrics_section = False
                else :
                    if title:
                        metrics_titles = line.split()
                        orientation_index = metrics_titles.index('PAIR_ORIENTATION')
                        title = False
                    else :
                        # One metrics row per pair orientation (FR/RF/TANDEM).
                        metrics_fields = line.split()
                        line_metrics = {}
                        for index in range(len(metrics_fields)):
                            line_metrics[metrics_titles[index]] = metrics_fields[index]
                        metrics[metrics_fields[orientation_index]] = line_metrics
            elif histogram_section:
                if not line.strip():
                    histogram_section = False
                else :
                    if title:
                        histogram_titles = line.split()
                        for index in range(1,len(histogram_titles)):
                            # Column headers look like "insert_size.fr_count":
                            # keep the upper-cased orientation (FR/RF/TANDEM).
                            m = re.search(r"\.([a-z]+)_count", histogram_titles[index])
                            histogram_titles[index] = m.group(1).upper()
                            metrics[histogram_titles[index]]["NB_INSERTS_SIZES"] = ""
                        title = False
                    else :
                        # First column: insert size; following columns: counts
                        # per orientation, accumulated as comma-separated strings.
                        histogram_fields = line.split()
                        inserts_sizes += histogram_fields[0] + ","
                        for index in range(1,len(histogram_fields)):
                            metrics[histogram_titles[index]]["NB_INSERTS_SIZES"] += histogram_fields[index] + ","
            elif line.startswith("## METRICS CLASS"):
                metrics_section = True
                title = True
            elif line.startswith("## HISTOGRAM"):
                histogram_section = True
                title = True

        fh_file.close()

        return inserts_sizes, metrics