#!/usr/bin/env python3

import os
import shutil
from math import sqrt
from numpy import mean
from pathlib import Path

import matplotlib as mpl
mpl.use('Agg')
from matplotlib import pyplot as plt


class Paf:
    # Identity thresholds used to split matches into four similarity classes
    limit_idy = [0.25, 0.5, 0.75]
    # Maximum number of PAF matches parsed; beyond this, parsing stops and the result is flagged as sampled
    max_nb_lines = 100000

    def __init__(self, paf: str, idx_q: str, idx_t: str, auto_parse: bool = True):
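        """
        :param paf: path to the PAF file
        :param idx_q: path to the query index file
        :param idx_t: path to the target index file
        :param auto_parse: if True, parse the PAF file immediately
        """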
        self.paf = paf
        self.idx_q = idx_q
        self.idx_t = idx_t
        self.sorted = False
        if os.path.exists(os.path.join(os.path.dirname(paf), ".sorted")):
            self.paf += ".sorted"
            self.idx_q += ".sorted"
            self.sorted = True
        self.sampled = False
        self.len_q = None
        self.len_t = None
        self.min_idy = None
        self.max_idy = None
        self.lines = {}
        self.q_contigs = {}
        self.q_order = []
        self.t_contigs = {}
        self.t_order = []
        self.q_reversed = {}
        self.name_q = None
        self.name_t = None
        self.parsed = False
        self.error = False

        if auto_parse:
            self.parse_paf()

    @staticmethod
    def __flush_blocks(index_c, new_index_c, new_index_o, current_block):
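        """
        Flush the current block of small contigs: five or more consecutive small contigs are
        merged into a single "###MIX###_" pseudo-contig, smaller blocks are kept as-is
        :param index_c: original contigs definition (name -> length)
        :param new_index_c: contigs definition being built
        :param new_index_o: contigs order being built
        :param current_block: names of the small contigs accumulated so far
        :return: updated contigs definition and order
        """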
        if len(current_block) >= 5:
            block_length = 0
            for contig in current_block:
                block_length += index_c[contig]
            b_name = "###MIX###_" + "###".join(current_block)
            new_index_c[b_name] = block_length
            new_index_o.append(b_name)
        elif len(current_block) > 0:
            for b_name in current_block:
                new_index_c[b_name] = index_c[b_name]
                new_index_o.append(b_name)
        return new_index_c, new_index_o

    def parse_index(self, index_o: list, index_c: dict, full_len: int):
        """
        Parse an index and merge contigs that are too small
        :param index_o: index order (list of contig names)
        :param index_c: index contigs definition (name -> length)
        :param full_len: total length of the sequence
        :return: new index contigs definition and order
        """
        new_index_o = []
        new_index_c = {}
        current_block = []
        for index in index_o:
            if index_c[index] >= 0.002 * full_len:
                new_index_c, new_index_o = self.__flush_blocks(index_c, new_index_c, new_index_o, current_block)
                current_block = []
                new_index_c[index] = index_c[index]
                new_index_o.append(index)
            else:
                current_block.append(index)
        new_index_c, new_index_o = self.__flush_blocks(index_c, new_index_c, new_index_o, current_block)
        return new_index_c, new_index_o

    @staticmethod
    def remove_noise(lines, noise_limit):
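        """
        Remove matches whose euclidean length is below the noise limit
        :param lines: matches grouped by identity class
        :param noise_limit: minimal length to keep a match
        :return: filtered matches, still grouped by identity class
        """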
        keep_lines = {
            "0": [],
            "1": [],
            "2": [],
            "3": []
        }
        for cls, c_lines in lines.items():
            for line in c_lines:
                x1 = line[0]
                x2 = line[1]
                y1 = line[2]
                y2 = line[3]
                idy = line[4]
                len_m = sqrt(pow(x2 - x1, 2) + pow(y2 - y1, 2))
                if len_m >= noise_limit:
                    keep_lines[cls].append(line)
        return keep_lines

    def parse_paf(self, merge_index=True, noise=True):
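        """
        Parse the PAF file and the query/target index files
        :param merge_index: if True, merge too small contigs in both indexes (see parse_index)
        :param noise: if False and more than 1000 matches were read, short matches are filtered
            out with remove_noise
        """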
        min_idy = 10000000000
        max_idy = -10000000000
        lines = {
            "0": [],
            "1": [],
            "2": [],
            "3": []
        }
        q_abs_start = {}
        q_abs_current_start = 0
        q_len = 0
        try:
            with open(self.idx_q, "r") as idx_q_f:
                name_q = idx_q_f.readline().strip("\n")
                q_order = []
                q_contigs = {}
                q_reversed = {}
                for line in idx_q_f:
                    parts = line.strip("\n").split("\t")
                    id_c = parts[0]
                    len_c = int(parts[1])
                    if len(parts) > 2:
                        q_reversed[id_c] = parts[2] == "1"
                    else:
                        q_reversed[id_c] = False
                    q_order.append(id_c)
                    q_abs_start[id_c] = q_abs_current_start
                    q_contigs[id_c] = len_c
                    q_len += len_c
                    q_abs_current_start += len_c
            if merge_index:
                q_contigs, q_order = self.parse_index(q_order, q_contigs, q_len)
        except IOError:
            self.error = "Index file does not exist for query!"
            return False

        t_abs_start = {}
        t_abs_current_start = 0
        t_len = 0
        try:
            with open(self.idx_t, "r") as idx_t_f:
                name_t = idx_t_f.readline().strip("\n")
                t_order = []
                t_contigs = {}
                for line in idx_t_f:
                    parts = line.strip("\n").split("\t")
                    id_c = parts[0]
                    len_c = int(parts[1])
                    t_order.append(id_c)
                    t_abs_start[id_c] = t_abs_current_start
                    t_contigs[id_c] = len_c
                    t_len += len_c
                    t_abs_current_start += len_c
            if merge_index:
                t_contigs, t_order = self.parse_index(t_order, t_contigs, t_len)
        except IOError:
            self.error = "Index file does not exist for target!"
            return False

        len_q = q_abs_current_start
        len_t = t_abs_current_start
        lines_lens = []

        try:
            with open(self.paf, "r") as paf_file:
                nb_lines = 0
                for line in paf_file:
                    nb_lines += 1
                    if nb_lines > self.max_nb_lines:
                        self.sampled = True
                        break
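                    # PAF columns used here: 1 = query name, 3-4 = query start/end, 5 = strand,
                    # 6 = target name, 8-9 = target start/end, 10 = residue matches,
                    # 11 = alignment block length. Positions are shifted by the absolute start
                    # of their contig to get genome-wide coordinates.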
                    parts = line.strip("\n").split("\t")
                    v1 = parts[0]
                    v6 = parts[5]
                    strand = 1 if parts[4] == "+" else -1
                    idy = int(parts[9]) / int(parts[10])
                    min_idy = min(min_idy, idy)
                    max_idy = max(max_idy, idy)
                    # x1, x2, y1, y2, idy
                    y1 = int(parts[2]) + q_abs_start[v1]
                    y2 = int(parts[3]) + q_abs_start[v1]
                    x1 = int(parts[7 if strand == 1 else 8]) + t_abs_start[v6]
                    x2 = int(parts[8 if strand == 1 else 7]) + t_abs_start[v6]
                    len_m = sqrt(pow(x2 - x1, 2) + pow(y2 - y1, 2))
                    lines_lens.append(len_m)
                    if idy < self.limit_idy[0]:
                        class_idy = "0"
                    elif idy < self.limit_idy[1]:
                        class_idy = "1"
                    elif idy < self.limit_idy[2]:
                        class_idy = "2"
                    else:
                        class_idy = "3"
                    lines[class_idy].append([x1, x2, y1, y2, idy, v1, v6])
        except IOError:
            self.error = "PAF file does not exist!"
            return False

        if not noise and nb_lines > 1000:
            counts, bins, bars = plt.hist(lines_lens, bins=nb_lines//10)
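            # Noise filtering heuristic: build a histogram of match lengths, find the modal bin,
            # then walk right until counts drop below 1 % of that mode; matches shorter than the
            # corresponding bin edge are treated as noise and removed.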
            counts = list(counts)
            max_value = max(counts)
            max_index = counts.index(max_value)
            limit_index = -1
            for i in range(max_index, len(counts)):
                if counts[i] < max_value / 100:
                    limit_index = i
                    break
            if limit_index > -1:
                lines = self.remove_noise(lines, bins[limit_index])

        self.parsed = True
        self.len_q = len_q
        self.len_t = len_t
        self.min_idy = min_idy
        self.max_idy = max_idy
        self.lines = lines
        self.q_contigs = q_contigs
        self.q_order = q_order
        self.q_reversed = q_reversed
        self.t_contigs = t_contigs
        self.t_order = t_order
        self.name_q = name_q
        self.name_t = name_t

    def get_d3js_data(self):
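        """
        Return the data structure sent to the d3.js client
        (target on the x axis, query on the y axis)
        """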
        return {
            'y_len': self.len_q,
            'x_len': self.len_t,
            'min_idy': self.min_idy,
            'max_idy': self.max_idy,
            'lines': self.lines,
            'y_contigs': self.q_contigs,
            'y_order': self.q_order,
            'x_contigs': self.t_contigs,
            'x_order': self.t_order,
            'name_y': self.name_q,
            'name_x': self.name_t,
            'limit_idy': self.limit_idy,
            'sorted': self.sorted,
            'sampled': self.sampled,
            'max_nb_lines': self.max_nb_lines,
        }

    def save_json(self, out):
        import json
        # Note: parse_paf() does not return a (success, data) tuple: it fills self.error on
        # failure, and get_d3js_data() exposes the parsed data.
        self.parse_paf()
        if not self.error:
            with open(out, "w") as out_f:
                out_f.write(json.dumps(self.get_d3js_data()))
        else:
            raise Exception(self.error)

    def is_contig_well_oriented(self, lines: list, contig, chrom):
        """
        Returns True if the contig is well oriented. A well-oriented contig
        must have y increasing when x increases. We check that only for the largest matches
        (small matches are ignored)
        :param lines: lines inside the contig
        :param contig: name of the query contig
        :param chrom: name of the target chromosome
        :return: True if well oriented, False otherwise
        """
        lines.sort(key=lambda x: -x[-1])
        max_len = lines[0][-1]
        i = 0

        # Select sample of tested lines:
        while i < len(lines) and lines[i][-1] > max_len * 0.1 \
                and lines[i][-1] >= 0.05 * min(self.q_contigs[contig], self.t_contigs[chrom]):
            i += 1
        selected_lines = lines[:i]

        # Check orientation of each line:
        if len(selected_lines) > 1:
            selected_lines.sort(key=lambda x: x[0])
            orients = []
            for i in range(1, len(selected_lines)):
                if selected_lines[i][2] > selected_lines[i-1][2]:
                    orients.append(1)
                else:
                    orients.append(-1)
            if mean(orients) > -0.1:  # We have a good orientation (-0.1 to ignore ambiguous cases)
                return True
        elif len(selected_lines) == 1:
            orient_first = "+" if selected_lines[0][3] < selected_lines[0][4] else "-"  # Get X orientation
            if selected_lines[0][-2 if orient_first == "+" else -3] > \
                    selected_lines[0][-3 if orient_first == "+" else -2]:  # Check Y according to X orientation
                return True
        elif len(selected_lines) == 0:  # No line was selected: we ignore this block
            return True

        # In all other cases the orientation is wrong:
        return False

    def reorient_contigs_in_paf(self, contigs):
        """
        Reorient contigs in the PAF file
        :param contigs: contigs to be reoriented
        """
        sorted_file = self.paf + ".sorted"
        with open(self.paf, "r") as source, open(sorted_file, "w") as target:
            for line in source:
                parts = line.strip("\n").split("\t")
                if parts[0] in contigs:
                    len_q = int(parts[1])
                    x1_q = int(parts[2])
                    x2_q = int(parts[3])
                    if parts[4] == "-":
                        parts[4] = "+"
                    else:
                        parts[4] = "-"
                    parts[2] = str(len_q - x2_q)
                    parts[3] = str(len_q - x1_q)
                target.write("\t".join(parts) + "\n")
        self.paf = sorted_file
        return True

    def _update_query_index(self, contigs_reoriented):
        with open(self.idx_q, "w") as idx:
            idx.write(self.name_q + "\n")
            for contig in self.q_order:
                idx.write("\t".join([contig, str(self.q_contigs[contig]), "1" if contig in contigs_reoriented else "0"])
                          + "\n")

    def set_sorted(self, is_sorted):
        self.sorted = is_sorted
        sorted_touch = os.path.join(os.path.dirname(self.paf), ".sorted")
        if is_sorted:
            Path(sorted_touch).touch()
        else:
            if os.path.exists(sorted_touch):
                os.remove(sorted_touch)

    def compute_gravity_contigs(self):
        """
        Compute gravity for each contig on each chromosome (how many big matches they have).
        Will be used to find which chromosome has the highest value for each contig
        :return:
            - gravity for each contig and each chromosome:
                {contig1: {chr1: value, chr2: value, ...}, contig2: ...}
            - For each block save lines inside:
                [median_on_query, squared length, median_on_target, x1, x2, y1, y2, length] (x : on target, y: on query)
        """
        gravity_contig = {}
        lines_on_block = {}
        # Compute size of blocks (in term of how many big match they have), and save median of each match on each one
        # (for next step)
        for line in [j for i in list(self.lines.values()) for j in i]:
            x1 = int(line[0])
            x2 = int(line[1])
            y1 = int(line[2])
            y2 = int(line[3])
            contig = line[5]
            chrm = line[6]
            block = (contig, chrm)
            # X1 and X2 (in good orientation):
            x_1 = min(x1, x2)
            x_2 = max(x2, x1)
            med_q = x_1 + (abs(x_2 - x_1) / 2)
            # Y1 and Y2 (in good orientation):
            y_1 = min(y1, y2)
            y_2 = max(y2, y1)
            med_t = y_1 + (abs(y_2 - y_1) / 2)
            len_m = sqrt(pow(x2 - x1, 2) + pow(y2 - y1, 2))  # Len
            len_m_2 = pow(1 + len_m, 2)  # Squared (1 + length), used as the gravity weight
            if block not in lines_on_block:
                lines_on_block[block] = []
            lines_on_block[block].append((med_q, len_m_2, med_t, x1, x2, y1, y2, len_m))

            if contig not in gravity_contig:
                gravity_contig[contig] = {}
            if chrm not in gravity_contig[contig]:
                gravity_contig[contig][chrm] = 0
            gravity_contig[contig][chrm] += len_m_2
        return gravity_contig, lines_on_block

    def sort(self):
        """
        Sort contigs according to the reference target and reorient them if needed
        """
        self.parse_paf(False)
        if not self.sorted:  # Do the sort
            gravity_contig, lines_on_block = self.compute_gravity_contigs()

            # For each contig, find best block, and deduce gravity of contig:
            gravity_on_contig = {}
            reorient_contigs = []
            for contig, chr_blocks in gravity_contig.items():
                # Find best block:
                max_number = 0
                max_chr = None
                for chrm, size in chr_blocks.items():
                    if size > max_number:
                        max_number = size
                        max_chr = chrm

                # Compute gravity of contig: mean of match midpoints weighted by squared match length,
                # used below as the sort key:
                nb_items = 0
                sum_items = 0
                lines_on_selected_block = lines_on_block[(contig, max_chr)]
                for med in lines_on_selected_block:
                    sum_items += med[0] * med[1]
                    nb_items += med[1]
                gravity_on_contig[contig] = sum_items / nb_items

                # Check if contig must be re-oriented:
                if len(lines_on_selected_block) > 0:
                    if not self.is_contig_well_oriented(lines_on_selected_block, contig, max_chr):
                        reorient_contigs.append(contig)

            # Sort contigs:
            self.q_order.sort(key=lambda x: gravity_on_contig[x] if x in gravity_on_contig else self.len_q + 1000)

            self.idx_q += ".sorted"

            with open(self.idx_q, "w") as idx_q_f:
                idx_q_f.write(self.name_q + "\n")
                for contig in self.q_order:
                    idx_q_f.write("\t".join([contig, str(self.q_contigs[contig])]) + "\n")

            # Re-orient contigs:
            if len(reorient_contigs) > 0:
                self.reorient_contigs_in_paf(reorient_contigs)
            else:
                sorted_file = self.paf + ".sorted"
                shutil.copyfile(self.paf, sorted_file)
                self.paf = sorted_file

            # Update index:
            self._update_query_index(reorient_contigs)

            self.set_sorted(True)

        else:  # Undo the sort
            if os.path.exists(self.paf):
                os.remove(self.paf)
                self.paf = self.paf.replace(".sorted", "")
            if os.path.exists(self.idx_q):
                os.remove(self.idx_q)
                self.idx_q = self.idx_q.replace(".sorted", "")
            self.set_sorted(False)

        # Re-parse the PAF file:
        self.parsed = False
        self.parse_paf()

    def get_query_on_target_association(self):
        """
        For each query, get the best matching chromosome
        :return: dict associating each query contig with its best matching target chromosome (or None)
        """
        gravity_contig = self.compute_gravity_contigs()[0]
        query_on_target = {}
        for contig, chr_blocks in gravity_contig.items():
            # Find best block:
            max_number = 0
            max_chr = None
            for chrm, size in chr_blocks.items():
                if size > max_number:
                    max_number = size
                    max_chr = chrm
            if max_chr is not None:
                query_on_target[contig] = max_chr
            else:
                query_on_target[contig] = None
        return query_on_target

    def build_query_on_target_association_file(self):
        """
        For each query, get the best matching chromosome and write it to a tab-separated file.
        Queries are listed in their current order
        :return: content of the file
        """
        query_on_target = self.get_query_on_target_association()
        content = "Query\tTarget\tStrand\n"
        for contig in self.q_order:
            strand = "+"
            if contig in self.q_reversed:
                strand = "-" if self.q_reversed[contig] else "+"
            if contig in query_on_target:
                content += "%s\t%s\t%s\n" % (contig, query_on_target[contig] or "None", strand)
            else:
                content += "%s\t%s\t%s\n" % (contig, "None", strand)
        return content
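

# Minimal usage sketch (illustration only; file names below are hypothetical). The index files
# are expected to contain the sample name on their first line, then one "contig<TAB>length"
# entry per line, as read by parse_paf() above.
if __name__ == "__main__":
    paf = Paf("map.paf", "query.idx", "target.idx")
    if paf.parsed:
        data = paf.get_d3js_data()
        print("Query length:", data["y_len"], "- target length:", data["x_len"])
    else:
        print("Parsing failed:", paf.error)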