cutadapt.py 12.1 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
#
# Copyright (C) 2012 INRA
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

18
19
import os
import re
20
21
from subprocess import Popen, PIPE

22
import jflow.seqio as seqio
23

24
25
26
27
28
29
30
31
32
33
34
35
36
from ng6.analysis import Analysis

class CutAdapt (Analysis):
    def _get_length_table(self, input_file):
        """
          @param input_file  : the fastq file path
          @return            : [nb_seq, {size: nb, size:...}]
        """       
        nb_seq = 0
        reader = seqio.SequenceReader(input_file)
        sizes = {}
        for id, desc, seq, qualities in reader:
            nb_seq += 1
Penom Nom's avatar
Penom Nom committed
37
            if len(seq) in sizes:
38
39
40
41
                sizes[len(seq)] += 1
            else:
                sizes[len(seq)] = 1
        return [nb_seq, sizes]
42
43
44
    
    def define_parameters(self, input_files_R1, adapt_FWD, input_files_R2 = None, adapt_REV=None, error_rate=0.1, 
                          times=None, min_length=None, max_length=None, overlap_length=None):
45
46
47
48
49
        self.add_parameter("error_rate", "error_rate", default=error_rate, type='float')
        self.add_parameter("min_length", "min_length", default=min_length, type='int')
        self.add_parameter("max_length", "max_length", default=max_length, type='int')
        self.add_parameter("overlap_length", "overlap_length", default=overlap_length, type='int')
        self.add_parameter("times", "times", default=times, type='int')
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
        
        self.adapt_FWD = adapt_FWD
        self.add_input_file_list( "input_files_R1", "input files read1", default=input_files_R1, required=True, file_format='fastq')
        self.add_output_file_list( "output_files_R1", "output files read1", pattern='{basename_woext}_cut.fastq', items=self.input_files_R1)
        self.add_output_file_list( "log_files_R1", "log files read1", pattern='{basename_woext}.stderr', items=self.input_files_R1)
        
        self.adapt_REV = adapt_REV
        self.is_paired = False
        if input_files_R2 :
            self.is_paired = True
            self.add_input_file_list( "input_files_R2", "input files read2", default=input_files_R2, file_format='fastq')
            self.add_output_file_list( "output_files_R2", "output files read2", pattern='{basename_woext}_cut.fastq', items=self.input_files_R2)
            self.add_output_file_list( "log_files_R2", "log files read2", pattern='{basename_woext}.stderr', items=self.input_files_R2)
            
            if self.min_length or self.max_length or self.overlap_length :
                self.add_output_file_list( "tmp_files_R1", "tmp filefiles read1", pattern='{basename_woext}_tmp.fastq', items=self.input_files_R1)
                self.add_output_file_list( "tmp_files_R2", "tmp filefiles read2", pattern='{basename_woext}_tmp.fastq', items=self.input_files_R2)

68
69
70
71
72
        
    def define_analysis(self):
        self.name = "CutAdapt"
        self.description = "Remove adapters"
        self.software = "cutadapt"
73
        self.options = "MAIN "
74
        self.cmdline_options = []
75
        if self.error_rate:
76
            self.cmdline_options.extend(["--error-rate", str(self.error_rate)])
77
        if self.min_length:
78
            self.cmdline_options.extend(["--minimum-length", str(self.min_length)])
79
        if self.max_length:
80
            self.cmdline_options.extend(["--maximum-length", str(self.max_length)])
81
        if self.overlap_length:
82
            self.cmdline_options.extend(["--overlap", str(self.overlap_length)])
83
        if self.times:
84
85
86
87
88
89
            self.cmdline_options.extend(["--times", str(self.times) ])
        
        self.options += " ".join(self.cmdline_options)
        self.options += " ; FWD "
        self.options_FWD = []
        # adapter for forward
Penom Nom's avatar
Penom Nom committed
90
        for cle, valeur in list(self.adapt_FWD.items()) : 
91
92
93
94
95
96
97
98
            if cle in ['a', 'b', 'g'] :
                for adapt in valeur :
                    self.options_FWD.extend([ '-'+cle, adapt ])
                    self.options += " -%s %s "% (cle,adapt)
        
        self.options_REV = []
        if self.adapt_REV :
            self.options += " ; REV "
Penom Nom's avatar
Penom Nom committed
99
            for cle, valeur in list(self.adapt_REV.items()) : 
100
101
102
103
                if cle in ['a', 'b', 'g'] :
                    for adapt in valeur :
                        self.options_REV.extend([ '-'+cle, adapt ])
                        self.options += " -%s %s "% (cle,adapt)
104
105
106
107
108
        
    def post_process(self):
                  
        for output_file_R1 in self.log_files_R1:
            sample = os.path.splitext(os.path.basename(output_file_R1))[0]
109
                            
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
            # Parse results
            metrics = self.parse_metrics_file(output_file_R1)
            self._add_result_element(sample, "processedread", str(metrics["processedread"]))
            self._add_result_element(sample, "processedbase", str(metrics["processedbase"]))
            self._add_result_element(sample, "trimmedread", str(metrics["trimmedread"]))
            self._add_result_element(sample, "trimmedbase", str(metrics["trimmedbase"]))
            self._add_result_element(sample, "shortread", str(metrics["shortread"]))
            self._add_result_element(sample, "longread", str(metrics["longread"]))

        samples = {}
        # Process metrics from the final output reads 1
        for filepath in self.output_files_R1:
            [nb_seq, sizes] = self._get_length_table(filepath)
            x = []
            y = []
Penom Nom's avatar
Penom Nom committed
125
            for val in list(sizes.keys()):
126
127
128
129
130
131
132
133
                x.append(val)
            x = sorted(x)
            for i in x:
                y.append(sizes[i])
            if self.is_paired : 
                sample_name = os.path.basename(filepath).split("_cut")[0]
            else : 
                sample_name = os.path.splitext(os.path.basename(filepath))[0]
Penom Nom's avatar
Penom Nom committed
134
            if sample_name not in samples:
135
136
                samples[sample_name] = {}
            samples[sample_name]["final_read"] = str(nb_seq)
137
138
            samples[sample_name]["size"] = str(",".join([str(v) for v in x]))
            samples[sample_name]["nb_size"] = str(",".join([str(v) for v in y]))
Penom Nom's avatar
Penom Nom committed
139
            self._save_file(filepath, gzip=True)
140
141
142
143
144
145
146
                
        if self.is_paired : 
            
            for output_file_R2 in self.log_files_R2 : 
                sample = os.path.splitext(os.path.basename(output_file_R2))[0]
                # Parse results
                metrics = self.parse_metrics_file(output_file_R2)
147
148
149
150
151
152
                self._add_result_element(sample, "processedread", str(metrics["processedread"]))
                self._add_result_element(sample, "processedbase", str(metrics["processedbase"]))
                self._add_result_element(sample, "trimmedread", str(metrics["trimmedread"]))
                self._add_result_element(sample, "trimmedbase", str(metrics["trimmedbase"]))
                self._add_result_element(sample, "shortread", str(metrics["shortread"]))
                self._add_result_element(sample, "longread", str(metrics["longread"]))
153
154
155
156
157
158
          
            # Process metrics from the final output reads 2
            for filepath in self.output_files_R2:
                [nb_seq, sizes] = self._get_length_table(filepath)
                x = []
                y = []
Penom Nom's avatar
Penom Nom committed
159
                for val in list(sizes.keys()):
160
161
162
163
164
                    x.append(val)
                x = sorted(x)
                for i in x:
                    y.append(sizes[i])
                sample_name = os.path.basename(filepath).split("_cut")[0]
Penom Nom's avatar
Penom Nom committed
165
                if sample_name not in samples:
166
167
                    samples[sample_name] = {}
                samples[sample_name]["final_read"] = str(nb_seq)
168
169
                samples[sample_name]["size"] = str(",".join([str(v) for v in x]))
                samples[sample_name]["nb_size"] = str(",".join([str(v) for v in y]))
Penom Nom's avatar
Penom Nom committed
170
                self._save_file(filepath, gzip=True)
171
172
173
                
        for sample in samples:        
            self._add_result_element(sample, "final_read", samples[sample]["final_read"])
174
175
            self._add_result_element(sample, "size", samples[sample]["size"])
            self._add_result_element(sample, "nb_size", samples[sample]["nb_size"])
176
177
178
179
180
181
182
183
         
    def get_version(self):
        cmd = [self.get_exec_path("cutadapt"),"--version"]
        p = Popen(cmd, stdout=PIPE, stderr=PIPE)
        stdout, stderr = p.communicate()
        return stdout.split()[0]
                     
    def process(self):
184
185
186
187
188
189
        if self.is_paired :
            if self.min_length or self.max_length or self.overlap_length :
                params = " ".join(self.options_REV + self.cmdline_options)
                self.cmdline_options.extend(["--paired-output", "$3"])

                # cutadapt forward
190
191
192
                self.add_shell_execution(self.get_exec_path("cutadapt") + " " + " ".join(self.options_FWD + self.cmdline_options) 
                                         + " --paired-output $4 -o $3 $1 $2 > $5", cmd_format = '{EXE} {IN} {OUT}', map=True,
                                         inputs = [self.input_files_R1, self.input_files_R2], outputs=[self.tmp_files_R1, self.tmp_files_R2, self.log_files_R1])
193
194
                
                #cutadapt reverse
195
196
197
                self.add_shell_execution(self.get_exec_path("cutadapt") + " " + " ".join(self.options_REV + self.cmdline_options) 
                                         + " --paired-output $3 -o $4 $1 $2 > $5", cmd_format = '{EXE} {IN} {OUT}', map=True,
                                         inputs = [self.tmp_files_R2, self.tmp_files_R1], outputs = [self.output_files_R1, self.output_files_R2, self.log_files_R2])
198
199
            else :
                #forward read
200
201
202
                self.add_shell_execution(self.get_exec_path("cutadapt") + " " + " ".join(self.options_FWD + self.cmdline_options) 
                                         + " -o $2 $1 > $3", cmd_format = '{EXE} {IN} {OUT}', map=True,
                                         inputs = [self.input_files_R1], outputs=[self.output_files_R1, self.log_files_R1])
203
204
                
                # reverse read
205
206
207
                self.add_shell_execution(self.get_exec_path("cutadapt") + " " + " ".join(self.options_REV + self.cmdline_options) 
                                         + " -o $2 $1 > $3", cmd_format = '{EXE} {IN} {OUT}', map=True,
                                         inputs = [self.input_files_R2], outputs=[self.output_files_R2, self.log_files_R2])
208
                
209
        else :
210
211
212
            self.add_shell_execution(self.get_exec_path("cutadapt") + " " + " ".join(self.options_FWD + self.cmdline_options) 
                             + " -o $2 $1 > $3", cmd_format = '{EXE} {IN} {OUT}', map=True,
                             inputs = [self.input_files_R1], outputs=[self.output_files_R1, self.log_files_R1])
213
214
215
216
217
218
219
            
    def parse_metrics_file(self, input_file):
        """
          @param input_file  : the output metrics file path
          @return            : string with inserts sizes class, metrics dictionnary
        """
        stats = {}
Penom Nom's avatar
Penom Nom committed
220
        print(input_file)
221
        for line in open(input_file, 'r').readlines():
Gerald Salin's avatar
Gerald Salin committed
222
            if re.search("Total read pairs processed",line):
223
                stats["processedread"] = line.strip().split()[2]
Gerald Salin's avatar
Gerald Salin committed
224
            if re.search("Total basepairs processed:",line):
225
226
227
228
229
                stats["processedbase"] = line.strip().split()[2]
            if re.search("Trimmed reads",line):
                stats["trimmedread"] = line.strip().split()[2]
            if re.search("Trimmed bases",line):
                stats["trimmedbase"] = line.strip().split()[2]
Gerald Salin's avatar
Gerald Salin committed
230
            if re.search("Pairs that were too short",line):
231
                stats["shortread"] = line.strip().split()[3]
232
            if re.search("Too long reads",line):
233
                stats["longread"] = line.strip().split()[3]
234
235
        return stats

236