#
# Copyright (C) 2012 INRA
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

import os, re, pickle
from subprocess import Popen, PIPE

from ng6.analysis import Analysis

def inserts_metrics(bam_file, pairs_count_file, metrics_file, hist_file, log_file, samtools_path, picard_path, options_dump_path, memory):
    """
      @param bam_file : path for bam
      @param pairs_count_file : path to the produced file with the number of reads pairs in bam
      @param metrics_file : path to the metrics file produced by collectinsertsizemetrics
      @param hist_file : path to the histogram produced by collectinsertsizemetrics
      @param log_file : path to the log produced by collectinsertsizemetrics
      @param samtools_path : path to the software samtools
43
      @param picard_path : path to the software picard.jar
Penom Nom's avatar
Penom Nom committed
44
45
      @param options : options for the software collectinsertsizemetrics
    """
    # Imports are repeated here because this function is executed in a separate process
    from subprocess import Popen, PIPE
    import pickle

    # Load the CollectInsertSizeMetrics options serialized by the parent process
    options_dump = open(options_dump_path, "rb")
    options = pickle.load(options_dump)
    options_dump.close()
    xmx = "-Xmx" + memory.lower()
    # Count properly paired reads in the BAM file (flag 67 = paired + properly paired + first in pair)
    command = Popen(samtools_path + " view -f67 " + bam_file + " | wc -l", shell=True, stdout=PIPE, stderr=PIPE)
    properly_paired_nb, stderr = command.communicate()
    properly_paired_nb = int(properly_paired_nb.decode().strip())
    if properly_paired_nb > 0:
        # Process insert size metrics
        command = Popen( ["-c", self.get_exec_path("javaPICARD")+" " +xmx+" -jar " + picard_path + " CollectInsertSizeMetrics " +options + " HISTOGRAM_FILE=" + hist_file + " INPUT=" + bam_file + " OUTPUT=" + metrics_file + " 2> " + log_file], shell=True, stdout=PIPE, stderr=PIPE )
        stdout, stderr = command.communicate()
        # Count read pairs in the BAM file: -F384 filters out secondary alignments (256)
        # and second-in-pair reads (128), keeping one read per pair
        command = Popen(samtools_path + " view -F384 " + bam_file + " | wc -l", shell=True, stdout=PIPE, stderr=PIPE)
        pairs_nb, stderr = command.communicate()
        # Write count
        pairs_count_fh = open(pairs_count_file, "w")
        pairs_count_fh.write( "NB_PAIRED=" + pairs_nb.decode().strip() )
        pairs_count_fh.close()
    else:
        # No properly paired reads: create empty output files so downstream steps still find them
        empty_fh = open(metrics_file, "wb")
        empty_fh.close()
        empty_fh = open(hist_file, "wb")
        empty_fh.close()
        empty_fh = open(log_file, "wb")
        empty_fh.close()
        empty_fh = open(pairs_count_file, "wb")
        empty_fh.close()
    

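# A minimal standalone usage sketch of inserts_metrics(); all paths and the options
# string below are hypothetical, and in the pipeline this function is launched via
# add_python_execution() rather than called directly:
#
#   import pickle
#   with open("/tmp/options.dump", "wb") as fh:
#       pickle.dump("HISTOGRAM_WIDTH=700 VALIDATION_STRINGENCY=LENIENT MINIMUM_PCT=0.01", fh)
#   inserts_metrics("sample.bam", "sample.count", "sample.txt", "sample.pdf", "sample.log",
#                   "/usr/bin/samtools", "/opt/picard/picard.jar", "/tmp/options.dump", "1G")
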
class InsertsSizes (Analysis):
    def define_parameters(self, bam_files, histogram_width=700, minimum_pct=0.01, validation_stringency="LENIENT", archive_name=None):
        """
          @param bam_files : paths for bams
          @param histogram_width : explicitly sets the histogram width, overriding automatic truncation of histogram tail
          @param minimum_pct : when generating the histogram, discard any data categories (out of FR, TANDEM, RF) that have fewer than this percentage of overall reads
          @param validation_stringency : Validation stringency for all SAM files read by this program. Setting stringency to SILENT can improve performance when processing a BAM file in which variable-length data (read, qualities, tags) do not otherwise need to be decoded. Default value: STRICT. This option can be set to 'null' to clear the default value. Possible values: {STRICT, LENIENT, SILENT}
          @param archive_name : name for the output archive
        """
        self.add_input_file_list( "bam_files", "bam_files", default=bam_files, required=True, file_format = 'bam')
        self.add_parameter("histogram_width", "histogram_width", default=histogram_width, type=int)
        self.add_parameter("minimum_pct", "minimum_pct", default=minimum_pct, type=float)
        self.add_parameter("validation_stringency", "validation_stringency", default=validation_stringency)
        self.add_parameter("archive_name", "Archive name", default=archive_name)
        self.add_output_file_list( "info_files", "info_files", pattern='{basename_woext}.txt', items=self.bam_files)
        self.add_output_file_list( "hist_files", "hist_files", pattern='{basename_woext}.pdf', items=self.bam_files)
        self.add_output_file_list( "log_files", "log_files", pattern='{basename_woext}.log', items=self.bam_files)
        self.add_output_file_list( "pairs_count_files", "pairs_count_files", pattern='{basename_woext}.count', items=self.bam_files)
        self.memory = '1G'
        if self.get_memory() is not None:
            self.memory = self.get_memory()

    def define_analysis(self):
        self.name = "InsertsSizes"
        self.description = "Insert size statistics"
        self.software = "Picard tools - Insert size"
        self.options = "HISTOGRAM_WIDTH="+str(self.histogram_width)+" VALIDATION_STRINGENCY="+self.validation_stringency+" MINIMUM_PCT="+str(self.minimum_pct)
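        # With the default parameters this yields
        # "HISTOGRAM_WIDTH=700 VALIDATION_STRINGENCY=LENIENT MINIMUM_PCT=0.01"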
    def post_process(self):
        nb_omitted_samples = 0
        
        for info_file in self.info_files:
Penom Nom's avatar
Penom Nom committed
113
114
115
116
117
118
119
120
121
122
123
124
125
            if os.path.exists(info_file) and os.path.getsize(info_file) > 0:
                sample = os.path.splitext(os.path.basename(info_file))[0]                     
                # Parse results
                insert_sizes, metrics = self.parse_metrics_file(info_file)
                # Save inserts metrics
                self._add_result_element(sample, "inserts_sizes", insert_sizes[:-1])                    
                for orientation in metrics:
                    self._add_result_element(sample, "nb_pair_used", metrics[orientation]["READ_PAIRS"], orientation)
                    self._add_result_element(sample, "median_size", metrics[orientation]["MEDIAN_INSERT_SIZE"], orientation)
                    self._add_result_element(sample, "mean_size", metrics[orientation]["MEAN_INSERT_SIZE"], orientation)
                    self._add_result_element(sample, "min_size", metrics[orientation]["MIN_INSERT_SIZE"], orientation)
                    self._add_result_element(sample, "max_size", metrics[orientation]["MAX_INSERT_SIZE"], orientation)
                    self._add_result_element(sample, "std_deviation", metrics[orientation]["STANDARD_DEVIATION"], orientation)
                    if "NB_INSERTS_SIZES" in metrics[orientation] :
127
                        self._add_result_element(sample, "nb_inserts_sizes", metrics[orientation]["NB_INSERTS_SIZES"][:-1], orientation)
                # Save paired count
                for pairs_count_file in self.pairs_count_files:
                    if os.path.splitext(os.path.basename(pairs_count_file))[0] == sample :
                        self._add_result_element(sample, "nb_pair_in_file", self.parse_pairs_count_file(pairs_count_file))    
            else:
                nb_omitted_samples += 1
        
        self._add_result_element("analysis", "nb_omitted_samples", nb_omitted_samples)
        
        # Finally create and add the archive to the analysis
        self._create_and_archive(self.info_files, self.archive_name)

    def get_version(self):
        cmd = self.get_exec_path("javaPICARD") + " -jar {} CollectInsertSizeMetrics --version".format(self.get_exec_path("Picard"))
        p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
        stdout, stderr = p.communicate()
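        # Picard writes its version to stderr, hence stderr rather than stdout is parsed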
        return stderr.decode("utf-8").rsplit()[0]
    def process(self):
        # Serialize the CollectInsertSizeMetrics options so the standalone function can reload them
        options_dump_path = self.get_temporary_file(".dump")
        options_dump = open(options_dump_path, "wb")
        pickle.dump(self.options, options_dump)
        options_dump.close()

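        # cmd_format "{EXE} {IN} {OUT} {ARG}" maps each call onto the inserts_metrics
        # signature: (bam_file, pairs_count_file, metrics_file, hist_file, log_file,
        # samtools_path, picard_path, options_dump_path, memory)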
        for i in range(len(self.bam_files)):
            self.add_python_execution(inserts_metrics, cmd_format="{EXE} {IN} {OUT} {ARG}",
                                      inputs=self.bam_files[i], outputs=[self.pairs_count_files[i], self.info_files[i], self.hist_files[i], self.log_files[i]],
                                      arguments=[self.get_exec_path("samtools"), self.get_exec_path("Picard"), options_dump_path, self.memory])

    def parse_pairs_count_file(self, input_file):
        """
          @param input_file  : the pairs count file path
          @return            : the number of read pairs in the BAM file
        """
        count_file_fh = open(input_file, "r")
        first_line = count_file_fh.readline()
        name, count = first_line.split("=")     #Line pattern : NB_PAIRED=1000
        count_file_fh.close()
        return count.strip()
 
    def parse_metrics_file(self, input_file):
        """
          @param input_file  : the metrics file path
          @return            : comma-separated insert sizes string, metrics dictionary
        """
        inserts_sizes = ""
        metrics = {}
        metrics_titles = []
        metrics_section = False
        histogram_titles = []
        histogram_section = False
        title = False
     
        fh_file = open(input_file, "r")
         
        for line in fh_file:
            line = line.rstrip()
             
            if metrics_section:
                if not line.strip():
                    metrics_section = False
                else :
                    if title:
                        metrics_titles = line.split()
                        orientation_index = metrics_titles.index('PAIR_ORIENTATION') 
                        title = False
                    else :
                        metrics_fields = line.split()
                        line_metrics = {}
                        for index in range(len(metrics_fields)):
                            line_metrics[metrics_titles[index]] = metrics_fields[index]
                        metrics[metrics_fields[orientation_index]] = line_metrics
            elif histogram_section:
                if not line.strip():
                    histogram_section = False
                else :
                    if title:
                        histogram_titles = line.split()
                        for index in range(1,len(histogram_titles)):
                            m = re.search(r"\.([a-z]+)_count", histogram_titles[index])
                            histogram_titles[index] = m.group(1).upper()
                            metrics[histogram_titles[index]]["NB_INSERTS_SIZES"] = ""
                        title = False
                    else :
                        histogram_fields = line.split()
                        inserts_sizes += histogram_fields[0] + ","
                        for index in range(1,len(histogram_fields)):
                            metrics[histogram_titles[index]]["NB_INSERTS_SIZES"] += histogram_fields[index] + ","
            elif line.startswith("## METRICS CLASS"):
                metrics_section = True
                title = True
            elif line.startswith("## HISTOGRAM"):
                histogram_section = True
                title = True
         
        fh_file.close()
         
        return inserts_sizes, metrics
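
# For reference, parse_metrics_file() consumes the standard Picard CollectInsertSizeMetrics
# text report. Abridged sketch of the expected layout (column list is not exhaustive):
#
#   ## METRICS CLASS    picard.analysis.InsertSizeMetrics
#   MEDIAN_INSERT_SIZE  ...  PAIR_ORIENTATION  READ_PAIRS  ...
#   300                 ...  FR                123456      ...
#
#   ## HISTOGRAM    java.lang.Integer
#   insert_size  All_Reads.fr_count
#   250          10
#   251          42
#
# The lower-case suffix of each histogram column ("fr" in "All_Reads.fr_count") is
# upper-cased so it keys into the same dict entries as PAIR_ORIENTATION (FR, RF, TANDEM).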