paf.py 13.9 KB
Newer Older
1
2
#!/usr/bin/env python3

3
import os
Floreal Cabanettes's avatar
Floreal Cabanettes committed
4
from math import sqrt
5
from numpy import mean
6
from pathlib import Path
Floreal Cabanettes's avatar
Floreal Cabanettes committed
7

8
9

class Paf:
10
    limit_idy = [0.25, 0.5, 0.75]
11
    max_nb_lines = 100000
12

13
    def __init__(self, paf: str, idx_q: str, idx_t: str, auto_parse: bool=True):
14
15
16
        self.paf = paf
        self.idx_q = idx_q
        self.idx_t = idx_t
17
        self.sorted = False
18
        if os.path.exists(os.path.join(os.path.dirname(paf), ".sorted")):
19
20
21
            self.paf += ".sorted"
            self.idx_q += ".sorted"
            self.sorted = True
22
        self.sampled= False
23
24
25
26
        self.len_q = None
        self.len_t = None
        self.min_idy = None
        self.max_idy = None
Floreal Cabanettes's avatar
Floreal Cabanettes committed
27
28
29
30
31
        self.lines = {}
        self.q_contigs = {}
        self.q_order = []
        self.t_contigs = {}
        self.t_order = []
32
33
34
35
36
        self.name_q = None
        self.name_t = None
        self.parsed = False
        self.error = False

37
38
        if auto_parse:
            self.parse_paf()
39

40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
    @staticmethod
    def __flush_blocks(index_c, new_index_c, new_index_o, current_block):
        if len(current_block) >= 5:
            block_length = 0
            for contig in current_block:
                block_length += index_c[contig]
            b_name = "###MIX###_" + "###".join(current_block)
            new_index_c[b_name] = block_length
            new_index_o.append(b_name)
        elif len(current_block) > 0:
            for b_name in current_block:
                new_index_c[b_name] = index_c[b_name]
                new_index_o.append(b_name)
        return new_index_c, new_index_o

    def parse_index(self, index_o: list, index_c: dict, full_len: int):
        """
        Parse index and merge too small contigs
        :param index_o: index order
        :param index_c: index contigs def
        :param full_len: length of the sequence
        :return: new index orders and contigs def
        """
        new_index_o = []
        new_index_c = {}
        current_block = []
        for index in index_o:
            if index_c[index] >= 0.002 * full_len:
                new_index_c, new_index_o = self.__flush_blocks(index_c, new_index_c, new_index_o, current_block)
                current_block = []
                new_index_c[index] = index_c[index]
                new_index_o.append(index)
            else:
                current_block.append(index)
        new_index_c, new_index_o = self.__flush_blocks(index_c, new_index_c, new_index_o, current_block)
        return new_index_c, new_index_o

    def parse_paf(self, merge_index=True):
78
79
80
        min_idy = 10000000000
        max_idy = -10000000000
        lines = {
81
82
83
84
            "0": [],
            "1": [],
            "2": [],
            "3": []
85
        }
86
87
        q_abs_start = {}
        q_abs_current_start = 0
88
        q_len = 0
89
90
        try:
            with open(self.idx_q, "r") as idx_q_f:
91
                name_q = idx_q_f.readline().strip("\n")
92
93
94
95
96
97
98
                q_order = []
                q_contigs = {}
                for line in idx_q_f:
                    parts = line.strip("\n").split("\t")
                    id_c = parts[0]
                    len_c = int(parts[1])
                    q_order.append(id_c)
99
                    q_abs_start[id_c] = q_abs_current_start
100
                    q_contigs[id_c] = len_c
101
                    q_len += len_c
102
                    q_abs_current_start += len_c
103
104
            if merge_index:
                q_contigs, q_order = self.parse_index(q_order, q_contigs, q_len)
105
106
107
108
        except IOError:
            self.error = "Index file does not exist for query!"
            return False

109
110
        t_abs_start = {}
        t_abs_current_start = 0
111
        t_len = 0
112
113
        try:
            with open(self.idx_t, "r") as idx_t_f:
114
                name_t = idx_t_f.readline().strip("\n")
115
116
117
118
119
120
121
                t_order = []
                t_contigs = {}
                for line in idx_t_f:
                    parts = line.strip("\n").split("\t")
                    id_c = parts[0]
                    len_c = int(parts[1])
                    t_order.append(id_c)
122
                    t_abs_start[id_c] = t_abs_current_start
123
                    t_contigs[id_c] = len_c
124
                    t_len += len_c
125
                    t_abs_current_start += len_c
126
127
            if merge_index:
                t_contigs, t_order = self.parse_index(t_order, t_contigs, t_len)
128
129
130
131
        except IOError:
            self.error = "Index file does not exist for target!"
            return False

132
133
134
135
136
        len_q = q_abs_current_start
        len_t = t_abs_current_start

        try:
            with open(self.paf, "r") as paf_file:
137
138
139
140
                paf_lines = paf_file.readlines()
                if len(paf_lines) > self.max_nb_lines:
                    self.sampled = True
                for line in paf_lines[:self.max_nb_lines]:
141
142
143
144
                    parts = line.strip("\n").split("\t")
                    v1 = parts[0]
                    v6 = parts[5]
                    strand = 1 if parts[4] == "+" else -1
145
                    idy = int(parts[9]) / int(parts[10])
146
147
148
149
150
151
152
                    min_idy = min(min_idy, idy)
                    max_idy = max(max_idy, idy)
                    # x1, x2, y1, y2, idy
                    y1 = int(parts[2]) + q_abs_start[v1]
                    y2 = int(parts[3]) + q_abs_start[v1]
                    x1 = int(parts[7 if strand == 1 else 8]) + t_abs_start[v6]
                    x2 = int(parts[8 if strand == 1 else 7]) + t_abs_start[v6]
153
154
155
156
157
158
                    if idy < self.limit_idy[0]:
                        class_idy = "0"
                    elif idy < self.limit_idy[1]:
                        class_idy = "1"
                    elif idy < self.limit_idy[2]:
                        class_idy = "2"
159
                    else:
160
                        class_idy = "3"
Floreal Cabanettes's avatar
Floreal Cabanettes committed
161
                    lines[class_idy].append([x1, x2, y1, y2, idy, v1, v6])
162
163
164
165
        except IOError:
            self.error = "PAF file does not exist!"
            return False

166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
        self.parsed = True
        self.len_q = len_q
        self.len_t = len_t
        self.min_idy = min_idy
        self.max_idy = max_idy
        self.lines = lines
        self.q_contigs = q_contigs
        self.q_order = q_order
        self.t_contigs = t_contigs
        self.t_order = t_order
        self.name_q = name_q
        self.name_t = name_t

    def get_d3js_data(self):
        return {
181
182
            'y_len': self.len_q,
            'x_len': self.len_t,
183
184
185
            'min_idy': self.min_idy,
            'max_idy': self.max_idy,
            'lines': self.lines,
186
187
188
189
190
191
            'y_contigs': self.q_contigs,
            'y_order': self.q_order,
            'x_contigs': self.t_contigs,
            'x_order': self.t_order,
            'name_y': self.name_q,
            'name_x': self.name_t,
192
193
            'limit_idy': self.limit_idy,
            'sorted': self.sorted,
194
195
            'sampled': self.sampled,
            "max_nb_lines": self.max_nb_lines,
196
197
198
199
200
201
202
203
204
205
        }

    def save_json(self, out):
        import json
        success, data = self.parse_paf()
        if success:
            with open(out, "w") as out_f:
                out_f.write(json.dumps(data))
        else:
            raise Exception(data)
Floreal Cabanettes's avatar
Floreal Cabanettes committed
206

207
208
209
210
211
212
213
214
215
216
217
218
219
220
    def is_contig_well_oriented(self, lines: list, contig, chrom):
        """
        Returns True if the contig is well oriented. A well oriented contig
        must have y increased when x increased. We check that only for highest matchs
        (small matches must be ignores)
        :param lines: lines inside the contig
        :return: True if well oriented, False else
        """
        lines.sort(key=lambda x: -x[-1])
        max_len = lines[0][-1]
        i = 0

        # Select sample of tested lines:
        while i < len(lines) and lines[i][-1] > max_len * 0.1 \
221
                and lines[i][-1] >= 0.05 * min(self.q_contigs[contig], self.t_contigs[chrom]):
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
            i += 1
        selected_lines = lines[:i]

        # Check orientation of each line:
        if len(selected_lines) > 1:
            selected_lines.sort(key=lambda x: x[0])
            orients = []
            for i in range(1, len(selected_lines)):
                if selected_lines[i][2] > selected_lines[i-1][2]:
                    orients.append(1)
                else:
                    orients.append(-1)
            if mean(orients) > -0.1:  # We have a good orientation (-0.1 to ignore ambiguous cases)
                return True
        elif len(selected_lines) == 1:
            orient_first = "+" if selected_lines[0][3] < selected_lines[0][4] else "-"  # Get X orientation
            if selected_lines[0][-2 if orient_first == "+" else -3] > \
                    selected_lines[0][-3 if orient_first == "+" else -2]:  # Check Y according to X orientation
                return True
        elif len(selected_lines) == 0:  # None line were selected: we ignore this block
            return True

        # In all other cases the orientation is wrong:
        return False

    def reorient_contigs_in_paf(self, contigs):
        """
        Reorient contigs in the PAF file
        :param contigs: contigs to be reoriented
        """
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
        sorted_file = self.paf + ".sorted"
        with open(self.paf, "r") as source, open(sorted_file, "w") as target:
            for line in source:
                parts = line.strip("\n").split("\t")
                if parts[0] in contigs:
                    len_q = int(parts[1])
                    x1_q = int(parts[2])
                    x2_q = int(parts[3])
                    if parts[4] == "-":
                        parts[4] = "+"
                    else:
                        parts[4] = "-"
                    parts[2] = str(len_q - x2_q)
                    parts[3] = str(len_q - x1_q)
                target.write("\t".join(parts) + "\n")
        self.paf = sorted_file
        return True

    def _update_query_index(self, contigs_reoriented):
        with open(self.idx_q, "w") as idx:
            idx.write(self.name_q + "\n")
            for contig in self.q_order:
                idx.write("\t".join([contig, str(self.q_contigs[contig]), "1" if contig in contigs_reoriented else "0"])
                          + "\n")
276

277
278
279
280
281
282
283
284
285
    def set_sorted(self, is_sorted):
        self.sorted = is_sorted
        sorted_touch = os.path.join(os.path.dirname(self.paf), ".sorted")
        if is_sorted:
            Path(sorted_touch).touch()
        else:
            if os.path.exists(sorted_touch):
                os.remove(sorted_touch)

Floreal Cabanettes's avatar
Floreal Cabanettes committed
286
    def sort(self):
287
        self.parse_paf(False)
288
        if not self.sorted:  # Do the sort
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
            gravity_contig = {}
            lines_on_block = {}
            # Compute size of blocks (in term of how many big match they have), and save median of each match on each one
            # (for next step)
            for line in [j for i in list(self.lines.values()) for j in i]:
                x1 = int(line[0])
                x2 = int(line[1])
                y1 = int(line[2])
                y2 = int(line[3])
                contig = line[5]
                chrm = line[6]
                block = (contig, chrm)
                # X1 and X2 (in good orientation):
                x_1 = min(x1, x2)
                x_2 = max(x2, x1)
                med_q = x_1 + (abs(x_2 - x_1) / 2)
                # Y1 and Y2 (in good orientation):
                y_1 = min(y1, y2)
                y_2 = max(y2, y1)
                med_t = y_1 + (abs(y_2 - y_1) / 2)
                len_m = sqrt(pow(x2 - x1, 2) + pow(y2 - y1, 2))  # Pow of len
                len_m_2 = pow(1 + len_m, 2)
                if block not in lines_on_block:
                    lines_on_block[block] = []
                lines_on_block[block].append((med_q, len_m_2, med_t, x1, x2, y1, y2, len_m))
Floreal Cabanettes's avatar
Floreal Cabanettes committed
314

315
316
317
318
319
                if contig not in gravity_contig:
                    gravity_contig[contig] = {}
                if chrm not in gravity_contig[contig]:
                    gravity_contig[contig][chrm] = 0
                gravity_contig[contig][chrm] += len_m_2
Floreal Cabanettes's avatar
Floreal Cabanettes committed
320

321
322
323
324
325
326
327
328
329
330
331
            # For each contig, find best block, and deduce gravity of contig:
            gravity_on_contig = {}
            reorient_contigs = []
            for contig, chr_blocks in gravity_contig.items():
                # Find best block:
                max_number = 0
                max_chr = None
                for chrm, size in chr_blocks.items():
                    if size > max_number:
                        max_number = size
                        max_chr = chrm
Floreal Cabanettes's avatar
Floreal Cabanettes committed
332

333
334
335
336
337
338
339
340
                # Compute gravity of contig:
                nb_items = 0
                sum_items = 0
                lines_on_selected_block = lines_on_block[(contig, max_chr)]
                for med in lines_on_selected_block:
                    sum_items += med[0] * med[1]
                    nb_items += med[1]
                gravity_on_contig[contig] = sum_items / nb_items
Floreal Cabanettes's avatar
Floreal Cabanettes committed
341

342
343
344
345
                # Check if contig must be re-oriented:
                if len(lines_on_selected_block) > 0:
                    if not self.is_contig_well_oriented(lines_on_selected_block, contig, max_chr):
                        reorient_contigs.append(contig)
346

347
348
            # Sort contigs:
            self.q_order.sort(key=lambda x: gravity_on_contig[x] if x in gravity_on_contig else self.len_q + 1000)
Floreal Cabanettes's avatar
Floreal Cabanettes committed
349

350
351
            self.idx_q += ".sorted"

352
353
354
355
356
357
358
359
            with open(self.idx_q, "w") as idx_q_f:
                idx_q_f.write(self.name_q + "\n")
                for contig in self.q_order:
                    idx_q_f.write("\t".join([contig, str(self.q_contigs[contig])]) + "\n")

            # Re-orient contigs:
            if len(reorient_contigs) > 0:
                self.reorient_contigs_in_paf(reorient_contigs)
360

361
362
363
            # Update index:
            self._update_query_index(reorient_contigs)

364
            self.set_sorted(True)
365
366
367
368
369
370
371
372

        else:  # Undo the sort
            if os.path.exists(self.paf):
                os.remove(self.paf)
                self.paf = self.paf.replace(".sorted", "")
            if os.path.exists(self.idx_q):
                os.remove(self.idx_q)
                self.idx_q = self.idx_q.replace(".sorted", "")
373
            self.set_sorted(False)
374
375

        # Re parse PAF file:
Floreal Cabanettes's avatar
Floreal Cabanettes committed
376
377
        self.parsed = False
        self.parse_paf()