bedify.py 10.6 KB
Newer Older
1
2
3
4
5
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import sys
6
import argparse
7

8
def formate_support(support_by_library, soft, indiv_list, alt_names=[] ):
9
    """
10
        transform number of support by individual from native to table format
11

12
        for each individual get the name and support. Using the indiv_list, update 
13
14
15
16
        the corresponding column of the support table row.

        :param support_by_library: original information of support/individual
        :param soft: software used to generate the SV
17
18
19
        :param indiv_list: a list of individuals' name
        :param alt_names: list of bam files used for breakdancer to process names
        
20
21
        :type support_by_library: list of strings
        :type soft: string
22
23
        :type indiv_list: list [strings]
        :type alt_names: list of string (pathways)
24

25
        :return: row of support table (1 individual/column + IDs)
26
27
28
29
        :rtype: string

        :Example:

30
31
32
33
34
35
36
37
38
39
40
41
        >>> formate_support(['A.bam|2', 'B.bam|1', 'D.bam|1'], "breakdancer", my_indiv_list)
        "A;B;D    2  1   0   1"
        
        >>> formate_support(['SAMPLE1', '0', '4', '20', '20', '25', '25', 'SAMPLE2', '0', '0', '20', '20', '28', '28', 'SAMPLE3', '0', '4', '26', '26', '24', '24']
                                            , "pindel", my_indiv_list)
        'SAMPLE1;SAMPLE2;SAMPLE3    45    48    50'
        
        >>> formate_support(["0/1:-27.8789,0,-43.8829:279:PASS:162:13:21:11:12", "0/1:-27.8789,0,-43.8829:279:PASS:162:13:21:11:12", "0/1:-27.8789,0,-43.8829:279:PASS:162:13:21:11:12"],
                                 "delly", my_indiv_list)
        'SAMPLE1;SAMPLE2;SAMPLE3    21+12    21+12    21+12'
        
        :note: for delly support = PE + SR
42
    """
43
    n = len(indiv_list)
44
45
    detected_individuals = []
    support_line = ["0"] * n
46
    if soft ==  "breakdancer":
47
48
        for i in support_by_library:
            individual, support = i.split("|")
49
50
            pos = alt_names.index(individual)   # breakdancer use file paths instead of sample names
            individual = indiv_list[pos]        # so get the corresponding name
51
52
53
            detected_individuals.append(individual)
            support_line[pos] = str(support)
    elif soft == "pindel":
54
        for i in range(0, len(support_by_library), 7):
55
56
57
58
59
60
            individual=support_by_library[i]
            upstream_support = int(support_by_library[i+4])
            downstream_support = int(support_by_library[i+6])
            support = upstream_support + downstream_support
            if support > 0:
                detected_individuals.append(individual)
61
                pos = indiv_list.index(individual)
62
63
                support_line[pos] = str(support)
    elif soft == "delly":
64
65
        for i,indi in enumerate(support_by_library):
            tmp = indi.split(":")
66
67
68
69
70
71
72
            pe = tmp[6]
            sr = tmp[8]
            if sr != "0":
                support = "+".join([pe,sr])
            else:
                support = pe
            if support != "0":
73
                individual = indiv_list[i]
74
                detected_individuals.append(individual)
75
                support_line[i] = str(support)
76
77
    support_line = "\t".join(support_line)
    id_line = ";".join(detected_individuals)
78
    return id_line +"\t"+ support_line
79

80
def parse_sv(sv, soft, indiv_list, alt_names=[]):
81
82
83
84
85
    """
        Read 1 row of pindel|breakancer|delly 's output to find essential informations

        :param sv: one strucural variant in raw native format with a tabulation split
        :param soft: software used to generate the SV
86
87
88
        :param indiv_list: a list individuals' name
        :param alt_names: list of bam files used for breakdancer to process names

89
90
91

        :type sv: list of strings
        :type soft: string
92
93
94
        :type indiv_list: list
        :type alt_names: list of string (pathways)
        
95
96
97
98
        :return: ordered essential features needed for final file
        :rtype: list of strings

        :Example:
99
name_list = sys.argv[8:]
100
101
102
        >>> sv = ['h1', '1', 'D', '1', 'NT', '0', '""', 'ChrID', '1', 'BP', '4117', '4119', 'BP_range',
        '4117', '4120', 'Supports', '5', '5', '+', '2', '2', '-', '3', '3', 'S1', '12', 'SUM_MS',
        '218', '1', 'NumSupSamples', '1', '1', 'Cow_H', '36', '36', '2', '2', '3', '3']
103
104
        >>> indiv_list = ['Cow_H','Cow_J']
        >>> parse_sv(sv, "pindel", indiv_list)
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
        "1   4117    4119    h1  D   1   5   Cow_H   5   0"
        
        :Note: Number of columns matters ! In strange case of strange behavior of this function
        please check if ID column was added on the left side of the original SV file. (e.g. h1 field
        in the example)
    """
    if soft == "breakdancer":
        pos_list = [0,1,2,5,7,8,10]
        support_by_library = sv[11].split(":")
    elif soft ==  "pindel":
        pos_list = [0,8,10,11,2,3,17] # /!\ depends de si j'ai ajoute ou modifie l'ID de pindel.
        support_by_library = sv[32:]
    elif soft == "delly":
        pos_list = [0,1,2,6,6,6,6]
        support_by_library = sv[10:]
120
121
122
123
124
125
126
127
128
129
    elif soft == "pindel_translocation":
#         pos_list = [0,2,4,6,8,10,12] # int_final
        pos_list = [0,3,4,6,7,9,11] # int
        call_id, chr1, start1, chr2, start2, seq, sup = [ sv[p] for p in pos_list ]
        if len(seq)-2:
            size = str(len(seq)-2)
        else:
            size = "?"
        new_sv = [chr1, start1, chr2, start2, call_id, size, sup] # TODo add formate support
        return new_sv
130
    call_id, chromosome, start, end, sv_type, length, support_total = [ sv[p] for p in pos_list ]
131
132
133
134
135
136
137
138
139
    if soft == "delly":
        results = re.search("TYPE=([A-Z]+).*END=([0-9]+).*PE=([0-9]+)(?:.*?SR=|)([0-9]+)?", sv[8]) # field INFOS
        sv_type, end, pe, sr = results.groups()
        try:
            support_total = pe + "+" + sr
            #raise error if sr is not matched
        except TypeError:
            support_total = pe
        length = str(int(end) - int(start))
140
    support_line = formate_support(support_by_library, soft, indiv_list, alt_names)
Penom Nom's avatar
Penom Nom committed
141
142
143
144
    if sv_type in ["D", "deletion", "DEL"]:
        sv_type = "DEL"
    elif sv_type in ["duplication", "DUP", "TDUP", "TD"]:
        sv_type = "DUP"
145
146
147
148
149
150
151
152
153
154
155
156
157
158
    elif sv_type in ["ITX", "CTX", "TRA"]: #TODO: report interK + append results
        if soft =="delly":
            results = re.search("CHR2=(\w+)(?:.*?CONSENSUS=|)([ATCG]+)?", sv[8])
            try :
                chr2, consensus = results.groups()
                size = str(len(consensus))
            except TypeError:
                chr2 = results.groups()[0]
                size="?"
            else:
                print("bad regex")
        elif soft == "breakdancer":
            chr2 = sv[4]
            size = "?"
Penom Nom's avatar
Penom Nom committed
159
        sv_type = "TRA"
160
161
        new_sv = [chromosome, str(start), chr2, str(end), call_id, sv_type, size, str(support_total), support_line]
        return new_sv
Penom Nom's avatar
Penom Nom committed
162
163
164
165
    elif sv_type in ["I"]:
        sv_type = "INS"
    elif sv_type in ["INV"]:
        sv_type = "INV"
166
167
168
169
170
171
172
173
174
175
    new_sv = [chromosome, str(start), str(end), call_id, sv_type, str(length), str(support_total), support_line]
    return new_sv

def filter_out(sv, size_min, size_max):
    size = int(sv[5])
    if size < int(size_min) or size > int(size_max):
        return True
    else:
        return False

176
def main(SV_file_path, good_path, soft, tag, len_min, len_max, indiv_list, alt_names=[]):
177
178
179
180
181
182
183
184
185
    """
        Read a structural variant file generated from a set of individuals and 
        transform it in a bed-like format.

        Open input and output files.For each line of input process the line
        How to process it depends on the software used. Finaly write it in the
        output file.

        :param SV_file_path: pathway to structural variant file
186
        :param good_path: name of the output file
187
        :param soft: name of the soft used to generate the SV file
188
        :param tag: name of the soft used to generate the SV file
189
190
        :param len_min : minimum length of a SV
        :param len_max : maximum length of a SV
191
        :param indiv_list: list of individuals' names
192
        :param alt_names: list of bam files used for breakdancer to process names
193
194
195

        :type SV_file_path: string (pathway)
        :type soft: string
196
197
198
        :type good_path: string (pathway)
        :type len_min: integer
        :type len_max: integer
199
        :type indiv_list: list of string
200
        :type alt_names: list of string (pathways)
201
202

        :return: none
203
        
204
205
206
207
208
209
210
211
212
213
214
        :output: bed-like file. The columns corresponds to the following:
            -CHR
            -START
            -END
            -SVR_ID
            -LENGTH
            -MEMBERS
            -SUPPORT   (1 col/ind)

        :Example:

215
        >>> main("/home/sangoku/kamehameha.breakdancer", "kame.bed", "fail.bed", "breakdancer", "DBZ", 50, 9000, ["Krilin","Freezer","Boo"]) 
216
217
    """
    support_table = []
218
219
    with open(SV_file_path, 'r') as SV_list, open(good_path,'w') as good_output_file :
        for c,line in enumerate(SV_list):
220
221
            if line[0] != "#":
                sv = line.split()
Penom Nom's avatar
Penom Nom committed
222
                iD = tag + "-" + str(c)
223
224
                sv.insert(0, iD)
                new_sv = parse_sv(sv, soft, indiv_list, alt_names)
225
226
227
228
                if soft == "pindel_translocation":
                    new_sv = "\t".join(new_sv) + "\n"
                    good_output_file.write(new_sv)
                elif new_sv[5] != "TRA" and filter_out(new_sv, len_min, len_max):
229
                    pass
230
                else:
231
232
233
                    new_sv = "\t".join(new_sv) + "\n"
                    good_output_file.write(new_sv)

234
235

# ================================================================
236
if __name__ == "__main__":
237
238
239
240
241
242
243
244
245
246
247
248
249
250
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_file', help='outfile of pindel/delly/breakdancer to translate to a bedlike format')
    parser.add_argument('--output_file', help='bedlike translation')
    parser.add_argument('--software', help='choose : pindel/breakdancer/delly')
    parser.add_argument('--tag', help='tag of each structural variant')
    parser.add_argument('--min_length', help='min length of a variant', type=int)
    parser.add_argument('--max_length', help='max length of a variant', type=int)
    parser.add_argument('--names', nargs='*', help='list of individual s name the input file was processed on')
    parser.add_argument('--alt_names', nargs='*', help='list of bam files the input file was processed on (only for break dancer)')
    args = parser.parse_args()
    if args.alt_names:
        main(args.input_file, args.output_file, args.software, args.tag, args.min_length, args.max_length, args.names, args.alt_names)
    else :
        main(args.input_file, args.output_file, args.software, args.tag, args.min_length, args.max_length, args.names)