16Scleaner.py 24.2 KB
Newer Older
Jerome Mariette's avatar
Jerome Mariette committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#
# Copyright (C) 2009 INRA
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
17
18
19
# v1.1 (06/2011) : Add the option --qual option to clean and trim quality file as well
# v1.0 (09/2010) : 16Scleaner first versio
#
Jerome Mariette's avatar
Jerome Mariette committed
20
21
22
23

__author__ = 'Plateforme bioinformatique Midi Pyrenees'
__copyright__ = 'Copyright (C) 2009 INRA'
__license__ = 'GNU General Public License'
24
__version__ = '1.1'
Jerome Mariette's avatar
Jerome Mariette committed
25
26
27
28
29
30
31
32
33
34
__email__ = 'support.genopole@toulouse.inra.fr'
__status__ = 'beta'

from optparse import *
import os, sys, gzip, datetime, re
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


35
36
37
38
39
40
41
def version_string ():
    """
    Return the 16Scleaner version
    """
    return "16Scleaner " + __version__


Jerome Mariette's avatar
Jerome Mariette committed
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
def get_reads_tab (seqs, options):
    """
    Trim input seqs defined by primers
      @param seqs    : table of seqs to filter   
      @param options : the options asked by the user
      @return        : reads table with primer matches
    """
    # Cross_match reverse matches pattern
    rev_regex = re.compile("(\s+)?(\d+)\s+(\S+)\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+C\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)")
    # Cross_match forward matches pattern
    fwd_regex = re.compile("(\s+)?(\d+)\s+(\S+)\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+")

    oligo_file = open(os.path.join(options.output, os.path.basename(options.input) + ".oligo.fna"), "w")
    SeqIO.write([SeqRecord(Seq(options.forward), id = 'fwd'), SeqRecord(Seq(options.reverse), id = 'rvs')], oligo_file, "fasta")        
    oligo_file.close()

    input_file = open(os.path.join(options.output, os.path.basename(options.input) + ".cross_match.fna"), "w")
    SeqIO.write(seqs, input_file, "fasta")
    input_file.close()

    cmd = "cross_match " + os.path.join(options.output, os.path.basename(options.input) + ".cross_match.fna") + " " + os.path.join(options.output, os.path.basename(options.input) + ".oligo.fna") + " -minmatch 4 -minscore 6 -penalty -1 -gap_init -1 -gap_ext -1 -ins_gap_ext -1 -del_gap_ext -1 -raw > " + os.path.join(options.output, os.path.basename(options.input)) + ".cross_match.res"
    os.system(cmd)

    reads = {}
    for line in open(os.path.join(options.output, os.path.basename(options.input)) + ".cross_match.res", 'r'):
        rm = rev_regex.match(line)
        fm = fwd_regex.match(line)
        strand = ""
        save = False
        if rm != None: # If it's a reverse matches
            (score, percentMis, primary_match, startFirstMatch, endFirstMatch, secondary_match, endSecondMatch, startSecondMatch)=(int(rm.group(2)), float(rm.group(3)), rm.group(4), int(rm.group(5)), int(rm.group(6)), rm.group(7), int(rm.group(8)), int(rm.group(9)))
            save = True
            strand = 'rvs'
        elif fm != None: # If it's a forward matches
            (score, percentMis, primary_match, startFirstMatch, endFirstMatch, secondary_match, startSecondMatch, endSecondMatch)=(int(fm.group(2)), float(fm.group(3)), fm.group(4), int(fm.group(5)), int(fm.group(6)), fm.group(7), int(fm.group(8)), int(fm.group(9)))
            save = True
            strand = 'fwd'

        if line.startswith("Discrepancy summary:"): # This is the end of the section
            break
    
        if save :
            try :
                reads[primary_match].append([strand, secondary_match, startFirstMatch, endFirstMatch, startSecondMatch, endSecondMatch, score])
            except:
                reads[primary_match] = [[strand, secondary_match, startFirstMatch, endFirstMatch, startSecondMatch, endSecondMatch, score]]

    #Clean up temp files
    os.remove(os.path.join(options.output, os.path.basename(options.input) + ".oligo.fna"))
    os.remove(os.path.join(options.output, os.path.basename(options.input) + ".cross_match.res"))
    os.remove(os.path.join(options.output, os.path.basename(options.input) + ".cross_match.fna"))
    os.remove(os.path.join(options.output, os.path.basename(options.input) + ".cross_match.fna.log"))

    return reads


def filter_reads (seqs, options):
    """
    Filter input seqs by length, ns and complexity if options are asked by user
      @param seqs    : table of seqs to filter   
      @param options : the options asked by the user
    """
    reads_ok = []
105
    del_by_ns, del_by_homopolymers, del_by_length, del_by_areas, del_by_triming, full_sequence, reversed_sequence, del_by_full_length = 0, 0, 0, 0, 0, 0, 0, 0
Jerome Mariette's avatar
Jerome Mariette committed
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
    seqs_to_return = []
    
    if options.trim or options.clean_areas or options.switch_strand :
        reads = get_reads_tab(seqs, options)
    
    # Go throught all sequences
    for i, reads_record in enumerate(seqs) :
        
        reads_ok.append(0)
        is_reversed = False
        fwd_present = False
        rvs_present = False
        start = 0
        stop = len(reads_record)
        
        # If is asked to trim or to clean sequences if Ns appears into specifics area
        if options.trim or options.clean_areas or options.switch_strand :
            if reads.has_key(reads_record.id) :
                for match in reads[reads_record.id] :
                    if match[0] == 'fwd' and match[1] == 'fwd' and match[6] >= options.fwd_minmatch :
                        start = match[2] - 1
                        fwd_present = True
                    elif match[0] == 'fwd' and match[1] == 'rvs' and match[6] >= options.rvrs_minmatch:
                        start = match[2] - 1
                        fwd_present = True
                        is_reversed = True
                    elif match[0] == 'rvs' and match[1] == 'rvs' and match[6] >= options.rvrs_minmatch:
                        stop = match[3]
                        rvs_present = True
                    elif match[0] == 'rvs' and match[1] == 'fwd' and match[6] >= options.fwd_minmatch:
                        stop = match[3]
                        rvs_present = True
                        is_reversed = True
                # Then trim the sequence
                if options.trim and (fwd_present or rvs_present):
                    reads_record = reads_record[start:stop]
                elif options.trim :
                    del_by_triming += 1
                    reads_ok[i] = 1
                    log.write(reads_record.id + " deleted -> No primers found.\n")
            else :
                del_by_triming += 1
                reads_ok[i] = 1
                log.write(reads_record.id + " deleted -> No primers found.\n")
        
        # If is asked to clean seqeunces by length using min:max
        if options.clean_length and reads_ok[i] == 0:
            # Check if the sequence is longer than the min asked, if not flagged it as deleted
            if len(reads_record) < int(options.min_length) :
                reads_ok[i] = 1
                del_by_length += 1
                log.write(reads_record.id + " deleted -> Length ( " + str(len(reads_record)) + "<" + str(options.min_length) + " )\n")
            elif len(reads_record) > int(options.max_length) :
                reads_ok[i] = 1
                del_by_length += 1
                log.write(reads_record.id + " deleted -> Length ( " + str(len(reads_record)) + ">" + str(options.max_length) + " )\n")
        
        # If is asked to clean seqeunces with too much Ns
        # and the sequence has not been flagged as deleted
        if options.clean_ns and reads_ok[i] == 0:
            # Compute the rate of Ns into the current sequence
            nb_n = (float(reads_record.seq.count("n")+reads_record.seq.count("N"))/float(len(reads_record)))*float(100)
            # If the rate is higher than the threshold flagged it as deleted
            if nb_n > float(options.ns_percent) :
                reads_ok[i] = 1
                del_by_ns += 1
                log.write(reads_record.id + " deleted -> Ns ( Reads containing " + str(nb_n) + "% of Ns > to the limit : " + str(options.ns_percent) + "% )\n")

174
        # If is asked to clean sequences if Ns appears into specifics area
Jerome Mariette's avatar
Jerome Mariette committed
175
176
177
178
179
180
181
182
183
        # and the sequence has not been flagged as deleted
        if options.clean_areas and reads_ok[i] == 0:
            areas = options.area.split(";")
            for area in areas:
                start = int(area.split(":")[0])
                stop = int(area.split(":")[1])
                
                seq_to_check = reads_record
                if is_reversed :
184
                    seq_to_check = seq_to_check.reverse_complement(id=True, description=True)
Jerome Mariette's avatar
Jerome Mariette committed
185
186
187
188
189
190
191
                
                if seq_to_check.seq[start:stop].count("N") + seq_to_check.seq[start:stop].count("n") > 0:
                    reads_ok[i] = 1
                    del_by_areas += 1
                    log.write(reads_record.id + " deleted -> N found in [ " + str(start) + ":" + str(stop) + "].\n")
                    break

192
193
194
195
196
197
198
199
200
201
        # If is asked to clean sequences if both primers found and sequence length too short
        # and the sequence has not been flagged as deleted
        if options.clean_full_length and reads_ok[i] == 0:
            # fwd and rvrs primers found
            if fwd_present and rvs_present :
                if len(reads_record) < int(options.full_length_min_size) :
                    reads_ok[i] = 1
                    del_by_full_length += 1
                    log.write(reads_record.id + " deleted -> Full sequence length ( " + str(len(reads_record)) + "<" + str(options.full_length_min_size) + " )\n")

Jerome Mariette's avatar
Jerome Mariette committed
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
        # If is asked to clean sequences if homopolymers are found
        # and the sequence has not been flagged as deleted
        if options.clean_homopolymers and reads_ok[i] == 0:
            h_len = 1
            for j, nucl in enumerate(reads_record.seq):
                if j != 0 and reads_record[j-1] == nucl:
                    h_len += 1
                    save = False
                elif j == 0:
                    save = False
                else :
                    save = True
                if len(reads_record) == j+1:
                    save = True
                if save and h_len >= options.homopolymers_length :
                    reads_ok[i] = 1
                    del_by_homopolymers += 1
                    log.write(reads_record.id + " deleted -> " + str(h_len) + "bp homopolymer found.\n")
                    break
                elif save :
                    h_len = 1

        if reads_ok[i] == 0:
            if options.switch_strand and is_reversed :
226
                reads_record = reads_record.reverse_complement(id=True, description=True)
Jerome Mariette's avatar
Jerome Mariette committed
227
228
229
230
231
232
233
234
                reversed_sequence += 1
                log.write(reads_record.id + " information -> Sequence has been reversed and complemented.\n")
            if fwd_present and rvs_present :
                log.write(reads_record.id + " information -> Both primer found.\n")
                full_sequence += 1
            seqs_to_return.append(reads_record)
    
    # Return values
235
    return [seqs_to_return, del_by_ns, del_by_homopolymers, del_by_length, del_by_areas, del_by_triming, del_by_full_length, full_sequence, reversed_sequence]
Jerome Mariette's avatar
Jerome Mariette committed
236
237
238
239
240
241

def get_seqs (options):
    """
    Converts input seqs in a BioPython seq table
      @param options : the options asked by the user
    """
242
243
244
    
    # First get fasta or/and qual input files
    qual_file = ""
Jerome Mariette's avatar
Jerome Mariette committed
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
    if options.format == "sff":
        sff_file = options.input
        
        if sff_file.endswith(".gz"):
            '''Gunzip the given file and then remove the file.'''
            r_file = gzip.GzipFile(sff_file, 'r')
            write_file = os.path.join(options.output, string.rstrip(os.path.basename(sff_file), '.gz'))
            w_file = open(write_file, 'w')
            w_file.write(r_file.read())
            w_file.close()
            r_file.close()
            sff_file = write_file
            
        base = os.path.basename(sff_file)
        fasta_file = os.path.join(options.output, base + '.fasta')
        qual_file = os.path.join(options.output, base + '.qual')
        xml_file = os.path.join(options.output, base + '.xml')
        format = "fasta"
        if not os.path.isfile(fasta_file) or not os.path.isfile(qual_file) or not os.path.isfile(xml_file):
            #First extract the sff file to fasta, qual and xml file
265
            cmd = "sff_extract.py -c -s " + fasta_file + " -q " + qual_file + " -x " + xml_file + " " + sff_file + " >> " + options.output+"/"+options.log_file
Jerome Mariette's avatar
Jerome Mariette committed
266
            os.system(cmd)
267
268
269
270
        else :
            log = open(options.log_file, "a+")
            log.write(fasta_file + ", " + qual_file + ", " + xml_file + " already exist, working on existing files\n")
            log.close()
Jerome Mariette's avatar
Jerome Mariette committed
271
272
273
274
275
276
277
    elif options.format == "fasta" or options.format == "fastq":
        format = options.format
        fasta_file = options.input
    else :
        print "Error : " + options.format + " is not a supported format!"
        sys.exit(1)

278
279
280
281
282
283
284
285
286
287
288
289
290
291
    if options.qual_file:
        qual_file = options.qual_file

    # If we got a quality file
    if qual_file :
        quals = {}
        # If the file is gziped
        if qual_file.endswith(".gz"):
            for qual in SeqIO.parse(gzip.open(qual_file), "qual") :
                quals[qual.id] = qual
        else :
            for qual in SeqIO.parse(open(qual_file), "qual") :
                quals[qual.id] = qual
    
Jerome Mariette's avatar
Jerome Mariette committed
292
293
294
295
    # If the fasta file is gziped
    if fasta_file.endswith(".gz"):
        seqs = []
        for record in SeqIO.parse(gzip.open(fasta_file), format) :
296
297
            if qual_file :
                record.letter_annotations["phred_quality"] = quals[record.id].letter_annotations["phred_quality"]
Jerome Mariette's avatar
Jerome Mariette committed
298
299
300
301
            seqs.append(record)
    else :
        seqs = []
        for record in SeqIO.parse(open(fasta_file), format) :
302
303
            if qual_file :
                record.letter_annotations["phred_quality"] = quals[record.id].letter_annotations["phred_quality"]
Jerome Mariette's avatar
Jerome Mariette committed
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
            seqs.append(record)
    
    # Clean temporary files
    if options.format == "sff":
        try:
            os.remove(fasta_file)
            os.remove(qual_file)
            os.remove(xml_file)
        except:
            pass
    
    # Returns the table of sequences
    return seqs

def which (program):
    """
    Return if the asked program exist in the user path
      @param options : the options asked by the user
    """
    import os
    def is_exe(fpath):
        return os.path.exists(fpath) and os.access(fpath, os.X_OK)
    fpath, fname = os.path.split(program)
    if fpath:
        if is_exe(program):
            return program
    else:
        for path in os.environ["PATH"].split(os.pathsep):
            exe_file = os.path.join(path, program)
            if is_exe(exe_file):
                return exe_file
    return None

def depts_error(options):
    """
    Return the list of dependencies missing to run the program
    """
    error = ""
    if which("sff_extract.py") == None:
        error += " - sff_extract.py\n" 
    if which("cross_match") == None and options.clean_pairends != None:
        error += " - cross_match\n"
    if error != "":
        error = "Cannot execute %prog, following dependencies are missing :\n" + error + "Please install them before to run!"
    return error

if __name__ == '__main__':

352
    parser = OptionParser(usage="Usage: %prog -i FILE -f format -o DIRECTORY", description = "Filter Sequences if Ns are found in defined area", version = version_string())
Jerome Mariette's avatar
Jerome Mariette committed
353
354
355
356

    igroup = OptionGroup(parser, "Input files options","")
    igroup.add_option("-i", "--in", dest="input",
                      help="The file to clean, can be [sff|fastq|fasta]", metavar="FILE")
357
358
    igroup.add_option("-q", "--qual", dest="qual_file",
                      help="The quality file to use if input file is fasta", metavar="FILE")
Jerome Mariette's avatar
Jerome Mariette committed
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
    igroup.add_option("-m", "--format", dest="format",
                      help="The format of the input file [sff|fastq|fasta] default is sff", type="string", default="sff")
    parser.add_option_group(igroup)

    ogroup = OptionGroup(parser, "Output files options","")
    ogroup.add_option("-c", "--switch-strand", action="store_true", dest="switch_strand", 
                      default=False, help="Switch read strand if not correctly oriented")
    ogroup.add_option("-o", "--out", dest="output",
                      help="The output folder where to store results")
    ogroup.add_option("-g", "--log", dest="log_file",
                      help="The log file name (default:16Scleaner.log)", metavar="FILE", default="16Scleaner.log")
    parser.add_option_group(ogroup)

    tgroup = OptionGroup(parser, "Triming options and parameters","")
    tgroup.add_option("-t", "--trim", action="store_true", dest="trim", 
                      default=False, help="Filter reads with a length in between [x:y]")
    igroup.add_option("-f", "--forward", dest="forward",
                      help="The forward primer used", type="string")
    igroup.add_option("-r", "--reverse", dest="reverse",
                      help="The reverse primer used", type="string")
    tgroup.add_option("-s", "--rvrs-minmatch", dest="rvrs_minmatch",
                      help="The min number of match allowed for the reverse primer (default=12)", type="int", default=12)
    tgroup.add_option("-b", "--fwd-minmatch", dest="fwd_minmatch",
                      help="The min number of match allowed for the forward primer (default=12)", type="int", default=12)
    parser.add_option_group(tgroup)

    cgroup = OptionGroup(parser, "Cleaning options and parameters","")
    cgroup.add_option("-a", "--clean-areas", action="store_true", dest="clean_areas",
                      default=False, help="Clean reads if at least a N is found in specified areas (Variable regions)")
388
389
390
391
    cgroup.add_option("-d", "--clean-full-length", action="store_true", dest="clean_full_length",
                      default=False, help="Clean reads if fwd and rvrs primer found and if length shorter than full_length_min_size parameters")
    cgroup.add_option("-k", "--full-length-min-size", dest="full_length_min_size", type="int", default=350,
                      help="Sequence min length when both primers presents")
Jerome Mariette's avatar
Jerome Mariette committed
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
    cgroup.add_option("-l", "--clean-length", action="store_true", dest="clean_length", 
                      default=False, help="Filter reads with a legnth in between [x:y]")
    cgroup.add_option("-n", "--clean-ns", action="store_true", dest="clean_ns", 
                      default=False, help="Filter reads with too much N")
    cgroup.add_option("-p", "--ns-percent", dest="ns_percent",
                      help="Percentage of N to use to filter reads (default=2)", type="int", default=2)
    cgroup.add_option("-e", "--area", dest="area", type="string", default="90:154;233:339", 
                      help="Areas where variable regions are located(default:90:154;233:339)")
    cgroup.add_option("-x", "--min-length", dest="min_length", type="int", default=150,
                      help="Sequence min length")
    cgroup.add_option("-y", "--max-length", dest="max_length", type="int", default=600,
                      help="Sequence min length")
    cgroup.add_option("-z", "--clean-homopolymers", action="store_true", dest="clean_homopolymers", 
                      default=False, help="Clean sequences if homopolymers found")
    cgroup.add_option("-w", "--homopolymers-length", dest="homopolymers_length",
                      help="Max length of homopolymer accepted in a sequence", type="int", default=10) 
    parser.add_option_group(cgroup)

    (options, args) = parser.parse_args()

    if not depts_error(options):
        if options.input == None or options.output == None or options.format == None :
            parser.print_help()
            sys.exit(1)
        elif options.trim and (options.forward == None or options.reverse == None) :
            print "Error : Forward primer and reverse primer required with option trim."
            parser.print_help()
            sys.exit(1)
        elif options.switch_strand and (options.forward == None or options.reverse == None) :
            print "Error : Forward primer and reverse primer required with option switch_strand."
            parser.print_help()
            sys.exit(1)
        elif options.clean_areas and (options.forward == None or options.reverse == None) :
            print "Error : Forward primer and reverse primer required with option clean_areas."
            parser.print_help()
            sys.exit(1)
428
429
430
431
        elif options.clean_full_length and (options.forward == None or options.reverse == None) :
            print "Error : Forward primer and reverse primer required with option clean_full_length."
            parser.print_help()
            sys.exit(1)
Jerome Mariette's avatar
Jerome Mariette committed
432
433
434
435
436
437
438
439
        else :    
            global log
            log = open(os.path.join(options.output, options.log_file), "w")
            log.write("## Start processing (" + str(datetime.datetime.now()) + ")\n")
            log.write("## with the following options: \n")
            opts = ""
            if options.clean_homopolymers:
                opts += " - Clean sequences if homopolymers length higher to " + str(options.homopolymers_length) + "bp.\n" 
440
441
            if options.clean_full_length:
                opts += " - Clean sequences if both primers found and sequence length shorter than " + str(options.full_length_min_size) + "bp.\n" 
Jerome Mariette's avatar
Jerome Mariette committed
442
443
444
445
446
447
448
            if options.clean_areas :
                opts += " - Clean sequences with Ns in areas [" + str(options.area) + "].\n" 
            if options.clean_length:
                opts += " - Clean reads with a length not in between [" + str(options.min_length) + ";" + str(options.max_length) + "].\n" 
            if options.clean_ns:
                opts += " - Clean reads with a percentage of Ns higher than " + str(options.ns_percent) + "%.\n" 
            if options.trim:
Jerome Mariette's avatar
Jerome Mariette committed
449
                opts += " - Trim reads using " + options.forward +  " as forward primer and " + options.reverse + " as reverse primer as start/stop. [fwd min matches = " + str(options.fwd_minmatch) + "; rvrs min matches = " + str(options.rvrs_minmatch) + "]\n"
Jerome Mariette's avatar
Jerome Mariette committed
450
451
452
453
454
455
456
            log.write(opts)  
            
            # 1 - First get inputs from options
            iseqs = get_seqs(options)
            before_cleaning = len(iseqs)
            
            # 2 - Clean sequences
457
            [seqs, del_by_ns, del_by_homopolymers, del_by_length, del_by_areas, del_by_triming, del_by_full_length, full_sequence, reversed_sequence] = filter_reads(iseqs, options)
Jerome Mariette's avatar
Jerome Mariette committed
458
459
460
461
462
463
464
465
466
467
468
469
470
            after_cleaning = len(seqs)
            
            # 3 - Write down seqs
            if options.format == "fastq":
                output = os.path.join(options.output, os.path.splitext(os.path.basename(options.input))[0] + ".clean.fastq")
                fasta = open(output, "w")
                SeqIO.write(seqs, fasta, "fastq")
                fasta.close()
            else :
                output = os.path.join(options.output, os.path.splitext(os.path.basename(options.input))[0] + ".clean.fasta")
                fasta = open(output, "w")
                SeqIO.write(seqs, fasta, "fasta")
                fasta.close()
471
472
473
474
475
                if options.format == "sff" or options.qual_file:
                    output = os.path.join(options.output, os.path.splitext(os.path.basename(options.input))[0] + ".clean.qual")
                    qual = open(output, "w")
                    SeqIO.write(seqs, qual, "qual")
                    qual.close()
Jerome Mariette's avatar
Jerome Mariette committed
476
477
    
            # 4 - Display the result summary
478
479
            log.write("## header (global) : nb_reads_at_begining\tnb_reads_after_clean_up\tlength\tns\tns_in_area\thomopolymers\ttriming\tfull_sequence_length\tfull_sequence\treversed_sequence\n")
            log.write("## summary (global) : " + str(before_cleaning) + "\t" + str(after_cleaning)  + "\t" + str(del_by_length) + "\t" + str(del_by_ns) + "\t" + str(del_by_areas) + "\t" + str(del_by_homopolymers) + "\t" + str(del_by_triming) + "\t" + str(del_by_full_length) + "\t" + str(full_sequence) + "\t" + str(reversed_sequence) + "\n")
Jerome Mariette's avatar
Jerome Mariette committed
480
481
482
483
484
485
486
            log.write("## Ended with code 0 (" + str(datetime.datetime.now()) + ")\n")
            log.close()

            sys.exit(0)
    else :
        print depts_error(options)
        sys.exit(1)