#
# Copyright (C) 2009 INRA
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

__author__ = 'Plateforme bioinformatique Midi Pyrenees'
__copyright__ = 'Copyright (C) 2009 INRA'
__license__ = 'GNU General Public License'
__version__ = '1.0'
__email__ = 'support.genopole@toulouse.inra.fr'
__status__ = 'beta'

from optparse import OptionParser, OptionGroup
import os, sys, gzip, zlib, datetime, re
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


def version_string ():
    """
    Return the adaptorcleaner version
    """
    return "adaptorcleaner " + __version__


def mask_sequences (fafile, tab_adapt, len_adapt, options, log, minscore,minmatch):
    """
    Search and mask adaptors in the sequences with cross_match
      @param fafile    : path of the fasta file to screen
      @param tab_adapt : list of SeqRecord objects of the adaptors (all of the same length)
      @param len_adapt : length of these adaptors
      @param options   : the options asked by the user
      @param log       : the log file handle
      @param minscore  : cross_match minscore, or -1 to derive it from options.minscore
      @param minmatch  : cross_match minmatch, or -1 to derive it from options.minmatch
      @return          : path of the screened fasta file (matched adaptor bases masked with 'X')
    """
    #  30  0.00 3.23 0.00  F7QVD2L01CVLN4       15    45 (804)    primer2        1    32 (0) 
    # Cross_match primer2 matches pattern
    rev_regex = re.compile(r"(\s+)?(\d+)\s+(\S+)\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+C\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)")
    # Cross_match primer1 matches pattern
    fwd_regex = re.compile(r"(\s+)?(\d+)\s+(\S+)\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+")

    adapt_ids=""
    for adapt in tab_adapt:
        adapt_ids=adapt_ids+adapt.id+"_"
    adapt_ids=adapt_ids.rstrip("_")

    oligo_file = open(os.path.join(options.output, os.path.basename(options.fasta)  + "."+ adapt_ids + ".oligo"), "w")
    SeqIO.write(tab_adapt, oligo_file, "fasta")
    oligo_file.close()
    if (minscore == -1):
        minscore=int(len_adapt*int(options.minscore)/100)
    if (minmatch == -1) :
        minmatch=int(len_adapt*int(options.minmatch)/100)
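    # Build and run the cross_match command: very permissive penalties so short
    # adaptors still align; -screen makes cross_match write <fafile>.screen, a copy
    # of the input fasta in which every matched adaptor base is replaced by 'X'.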
    
    cmd = "cross_match " + fafile + " " + os.path.join(options.output, os.path.basename(options.fasta)  + "."+ adapt_ids + ".oligo") + " -minmatch "+str(minmatch)+" -minscore "+str(minscore)+"  -penalty -1 -gap_init -1 -gap_ext -1 -ins_gap_ext -1 -del_gap_ext -1 -raw -screen > " + os.path.join(options.output, os.path.basename(options.fasta)) + "."+ adapt_ids + ".cross_match.res"
    os.system(cmd)
    log.write("###Adaptor : "+ adapt_ids + "\n"+cmd+"\n")
    os.system ("mv "+ fafile + ".screen " + os.path.join(options.output, os.path.basename(options.fasta)+ "."+ adapt_ids + ".screen"))
    log.write("Read\tStrand\tOligo\tReadStart\tReadEnd\tOligoStart\tOligoEnd\tScore\n")
    
    adaptators_found={}
    for line in open(os.path.join(options.output, os.path.basename(options.fasta)) + "."+ adapt_ids + ".cross_match.res", 'r'):
        rm = rev_regex.match(line)
        fm = fwd_regex.match(line)
        strand = ""
        save = False
        if rm != None: # If it's a reverse-strand (primer2) match
            (score, percentMis, primary_match, startFirstMatch, endFirstMatch, secondary_match, endSecondMatch, startSecondMatch)=(int(rm.group(2)), float(rm.group(3)), rm.group(4), int(rm.group(5)), int(rm.group(6)), rm.group(7), int(rm.group(8)), int(rm.group(9)))
            save = True
            strand = 'rvs'
        elif fm != None: # If it's a forward-strand (primer1) match
            (score, percentMis, primary_match, startFirstMatch, endFirstMatch, secondary_match, startSecondMatch, endSecondMatch)=(int(fm.group(2)), float(fm.group(3)), fm.group(4), int(fm.group(5)), int(fm.group(6)), fm.group(7), int(fm.group(8)), int(fm.group(9)))
            save = True
            strand = 'fwd'

        if line.startswith("Discrepancy summary:"): # This is the end of the section
            break
    
        if save :
            try :
                adaptators_found[primary_match].append([strand, secondary_match, startFirstMatch, endFirstMatch, startSecondMatch, endSecondMatch, score])
            except KeyError:
                adaptators_found[primary_match] = [[strand, secondary_match, startFirstMatch, endFirstMatch, startSecondMatch, endSecondMatch, score]]
                log.write("%"+primary_match+"\t"+strand+"\t"+secondary_match+"\t"+str(startFirstMatch)+"\t"+str(endFirstMatch)+"\t"+str(startSecondMatch)+"\t"+str(endSecondMatch)+"\t"+str(score)+"\n")
                
    log.write("###Number of sequences with adaptor "+ adapt_ids + " " +str(len(adaptators_found.keys())) +" \n")
    
    # Clean up temp files
    os.remove(os.path.join(options.output, os.path.basename(options.fasta)  + "."+ adapt_ids + ".oligo"))
    os.remove(os.path.join(options.output, os.path.basename(options.fasta)) + "."+ adapt_ids + ".cross_match.res")
    
    return(os.path.join(options.output, os.path.basename(options.fasta) + "."+ adapt_ids + ".screen"))
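# A hypothetical example of the command built above: with options.fasta = "reads.fasta",
# output directory "out" and a single 30 bp adaptor named "adaptA" using the default
# percentages (minscore 80%, minmatch 25%), the call is roughly:
#   cross_match out/reads.fasta out/reads.fasta.adaptA.oligo -minmatch 7 -minscore 24 \
#       -penalty -1 -gap_init -1 -gap_ext -1 -ins_gap_ext -1 -del_gap_ext -1 \
#       -raw -screen > out/reads.fasta.adaptA.cross_match.res
# and the function returns "out/reads.fasta.adaptA.screen".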


def get_longest_subsequence (sequence):
    """
    Extract the longest subsequence that is not masked with 'X'
      @param sequence  : string of the sequence
      @return          : coords [start, end] of the subsequence; the kept region
                         is sequence[start-1:end-1]
    """
    coords = []
    inc = 0
    start = 1 
    indic = 0
    # search for the blocks that are not masked with 'X'
    for char in sequence:
        inc=inc+1
        if char.upper() == 'X':
            if indic != 1 :
                coords.append([start,inc])
                indic = 1            
        else :
            if indic != 0 :
                start = inc
                indic = 0    
    if indic == 0:
        coords.append([start,inc+1])
        
    maxi = 0
    inc = 0
    select = -1
    # select the longest non-masked block
    for i in coords:
        if (i[1] - i[0]) > maxi :
            select = inc
            maxi=i[1] - i[0]
        inc = inc+1
    if select > -1:
        return coords[select]
    else :
        return [0,0]
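# Worked example (hypothetical sequence): for "ACGTXXXXACGTACGT" the non-masked
# blocks are [1,5] and [9,17]; the longest, [9,17], is returned, and slicing with
# [start-1:end-1] (as done in the main section below) keeps "ACGTACGT".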

def get_adaptators (file, log): 
    """
    Return the adaptors grouped by length
      @param file : the fasta file of the adaptors
      @param log  : the log file handle
      @return     : (dict: adaptor length -> list of SeqRecord,
                     dict: adaptor length -> [minscore, minmatch] parsed from the fasta headers)
    """
    seqs_len={}
    seqs_parameters={}
    for seqrecord in SeqIO.parse(open(file,"r"), "fasta") :
        if not seqs_len.has_key(len(seqrecord.seq)) :
            seqs_len[len(seqrecord.seq)]=[]
        minscore=-1
        minmatch=-1
        if re.search("MINSCORE=",seqrecord.description) :
            m=re.search("MINSCORE=(\d+)",seqrecord.description)
            minscore=m.group(1)
        if re.search("MINMATCH=",seqrecord.description) :
            m=re.search("MINMATCH=(\d+)",seqrecord.description)
            minmatch=m.group(1)

        seqs_parameters[len(seqrecord.seq)]=[minscore,minmatch]
        log.write (">"+seqrecord.id+"\n")
        log.write (seqrecord.seq.tostring()+"\n")
        seqs_len[len(seqrecord.seq)].append(seqrecord)
    return (seqs_len, seqs_parameters)
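# Adaptor fasta headers may carry per-adaptor cross_match thresholds through the
# MINSCORE= / MINMATCH= tags parsed above, e.g. (hypothetical adaptor):
#   >adaptA MINSCORE=24 MINMATCH=7
#   CGTATCGCCTCCCTCGCGCCATCAG
# Adaptors without these tags fall back to the -s / -m percentages.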
        
        
def which (program):
    """
    Return the full path of the program if it is found in the user's PATH, None otherwise
      @param program : the program to test
    """
    def is_exe(fpath):
        return os.path.exists(fpath) and os.access(fpath, os.X_OK)
    fpath, fname = os.path.split(program)
    if fpath:
        if is_exe(program):
            return program
    else:
        for path in os.environ["PATH"].split(os.pathsep):
            exe_file = os.path.join(path, program)
            if is_exe(exe_file):
                return exe_file
    return None
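# e.g. which("cross_match") returns something like "/usr/local/bin/cross_match" when
# the phrap/cross_match package is installed, or None if it is not in the PATH.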


def depts_error(options):
    """
    Return the list of missing dependencies required to run the program
    """
    error = ""
    if which("cross_match") == None:
        error += " - cross_match\n"
    if error != "":
        error = "Cannot execute %prog, the following dependencies are missing:\n" + error + "Please install them before running!"
    return error


if __name__ == '__main__':

        parser = OptionParser(usage="Usage: %prog -f FILE -q qual -a adaptor -o DIRECTORY", description = "Delete adaptors in the fasta file.", version = version_string())

        igroup = OptionGroup(parser, "Input files options","")
        igroup.add_option("-f", "--fa", dest="fasta",
                          help="The fasta file to clean REQUIRE", metavar="FILE")
        igroup.add_option("-q", "--qual", dest="qual",
                          help="The qual file", metavar="FILE")
        parser.add_option_group(igroup)
    
        ogroup = OptionGroup(parser, "Output files options","")
        ogroup.add_option("-o", "--out", dest="output",
                          help="The output folder where to store results REQUIRE")
        ogroup.add_option("-g", "--log", dest="log_file",
                          help="The log file name (default:Adaptorcleaner.log)", metavar="FILE", default="Adaptorcleaner.log")
        parser.add_option_group(ogroup)
  
        tgroup = OptionGroup(parser, "Trimming options and parameters","")
        tgroup.add_option("-s", "--score", dest="minscore",
                          help="The percentage of the adaptor length used as cross_match minscore (default: 80)",  default=80 )
        tgroup.add_option("-m", "--match", dest="minmatch",
                          help="The percentage of the adaptor length used as cross_match minmatch (default: 25)",  default=25 )
        tgroup.add_option("-a", "--adaptor", dest="adaptator",
                          help="The adaptor file REQUIRED",  metavar="FILE")
        tgroup.add_option("-l", "--minlen", dest="minlen",
                          help="The minimum length after trimming to keep the sequence (default: 150)",  default=150 )
        parser.add_option_group(tgroup)
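
        # A typical invocation (hypothetical file names):
        #   python adaptorcleaner.py -f reads.fasta -q reads.fasta.qual -a adaptors.fasta -o results
        # The cleaned sequences end up in results/reads.adaptorcleaner.fasta and their
        # qualities in results/reads.adaptorcleaner.fasta.qual.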

        (options, args) = parser.parse_args()
    
        depts = depts_error(options)
        if depts:
            print depts
            sys.exit(1)

        if options.fasta == None or options.output == None or options.adaptator ==None :
            parser.print_help()
            sys.exit(1)
        if options.qual == None :
            if os.path.exists(options.fasta+".qual") :
                options.qual = options.fasta+".qual"
            elif os.path.exists(os.path.splitext(options.fasta)[0]+".qual"):
                options.qual =  os.path.splitext(options.fasta)[0]+".qual"
            else :
                print "A quality file is required\n"
                parser.print_help()
                sys.exit(1)
        global log
        
        if not os.path.isdir(options.output):
            os.makedirs(options.output, 0777)
        log = open(os.path.join(options.output, options.log_file), "w")
        log.write("## Start processing (" + str(datetime.datetime.now()) + ")\n")
        log.write("## with the following options: \n")
        opts = ""
        if options.adaptator :
            opts += " - Minscore = " + str(options.minscore) +"% of the adaptor length\n"
            opts += " - Minmatch = " + str(options.minmatch) +"% de la longueur de l'adaptateur\n"
            opts += " - Min seqeunce length after adaptor removal = " + str(options.minlen) +"\n"
            opts += " - Adaptor file : "+ options.adaptator+" \n"
        log.write(opts)  
        
        
        # 2 - Get hash of adaptor length -> adaptor sequences
        (iadapt,adapt_parameters) = get_adaptators(options.adaptator, log)

        # 3 - Mask the adaptors in the sequences, longest adaptors first
        start_file =os.path.join(options.output,os.path.basename(options.fasta))
        screen_file=start_file
        previous_file=start_file
        
        # Create symlink for cross_match
        os.system ("ln -s "+options.fasta+ " "+ screen_file)
        for adapt_len in sorted(iadapt.keys(),reverse=True):
            (minscore,minmatch)=adapt_parameters[adapt_len]
            screen_file=mask_sequences(previous_file, iadapt[adapt_len],adapt_len, options,log,minscore,minmatch)
            #clean output directory
            if previous_file != start_file and os.path.exists(previous_file):
                 os.system ("rm "+previous_file )
            if  os.path.exists(previous_file+ ".log"):
                 os.system ("rm " + previous_file+ ".log")
            previous_file=screen_file
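
        # At this point screen_file holds the fasta in which every adaptor hit has been
        # replaced by 'X'; trimmed records are now rebuilt from it using the quality file.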

        record_dict = SeqIO.to_dict(SeqIO.parse(open(screen_file), "fasta"))
        new_fasta = []
        for rec in SeqIO.parse(open(options.qual,"r"), "qual") :
            if record_dict.has_key(rec.id) :
                if (record_dict[rec.id].seq.tostring().find("X") != -1):
                    (x,y)=get_longest_subsequence(record_dict[rec.id].seq.tostring())
                    diff=y-x 
                    if ( diff <= 0 or diff < int(options.minlen) ) :
                        if (diff <= 0 ):
                            log.write("Discard "+rec.id+" contains only adaptor\n" )
                        elif (diff < int(options.minlen) ):
                            log.write("Discard "+rec.id+" too short "+str(diff)+"  < "+str(options.minlen)+"\n" )
                    else :
                        new_record=SeqRecord(record_dict[rec.id].seq[x-1:y-1], id=record_dict[rec.id].id, description=record_dict[rec.id].description)
                        new_record.letter_annotations["phred_quality"] = rec.letter_annotations["phred_quality"][x-1:y-1]
                        new_fasta.append(new_record)
                else :
                    if len(rec.seq) >= int(options.minlen) :
                        record_dict[rec.id].letter_annotations["phred_quality"] = rec.letter_annotations["phred_quality"]
                        new_fasta.append(record_dict[rec.id])
                    else :
                        log.write("Discard "+rec.id+" too short "+str(len(rec.seq))+"  < "+str(options.minlen)+"\n" )

        output = os.path.join(options.output, os.path.splitext(os.path.basename(options.fasta))[0] + ".adaptorcleaner.fasta")
        fasta = open(output, "w")
        SeqIO.write(new_fasta, fasta, "fasta")
        fasta.close()
        output_qual = os.path.join(options.output, os.path.splitext(os.path.basename(options.fasta))[0] + ".adaptorcleaner.fasta.qual")
        qual = open(output_qual, "w")
        SeqIO.write(new_fasta, qual, "qual")
        qual.close()
        
        # clean output directory
        os.system ("unlink "+start_file)
        os.system ("rm "+screen_file)

        # 4 - Display the result summary
        log.write("## Number of sequences analysed :"+ str(len (record_dict)) + "\n")
        log.write("## Ended with code 0 (" + str(datetime.datetime.now()) + ")\n")
        log.close()

        sys.exit(0)