#!/usr/bin/env python3

import os
import re
import shutil
from math import sqrt
from numpy import mean
from pathlib import Path

import matplotlib as mpl
mpl.use('Agg')
from matplotlib import pyplot as plt
import json
from collections import Counter
from dgenies.bin.index import Index


class Paf:
    limit_idy = [0.25, 0.5, 0.75]
    max_nb_lines = 100000

    def __init__(self, paf: str, idx_q: str, idx_t: str, auto_parse: bool=True):
        """
        :param paf: path to the PAF file
        :param idx_q: path to the query index file
        :param idx_t: path to the target index file
        :param auto_parse: if True, parse the PAF file on initialisation
        """
        self.paf = paf
        self.idx_q = idx_q
        self.idx_t = idx_t
        self.sorted = False
        if os.path.exists(os.path.join(os.path.dirname(paf), ".sorted")):
            self.paf += ".sorted"
            self.idx_q += ".sorted"
            self.sorted = True
        self.sampled = False
        self.len_q = None
        self.len_t = None
        self.min_idy = None
        self.max_idy = None
        self.lines = {}
        self.q_contigs = {}
        self.q_order = []
        self.t_contigs = {}
        self.t_order = []
        self.q_reversed = {}
        self.name_q = None
        self.name_t = None
        self.parsed = False
        self.error = False

        if auto_parse:
            self.parse_paf()

    @staticmethod
    def __flush_blocks(index_c, new_index_c, new_index_o, current_block):
        """
        Flush the current block of small contigs: if it contains at least 5 contigs, merge
        them into a single "###MIX###" pseudo-contig, else keep them unchanged
        :param index_c: contigs definition (name -> length)
        :param new_index_c: new contigs definition, completed in place
        :param new_index_o: new contigs order, completed in place
        :param current_block: names of the small contigs accumulated so far
        :return: new contigs definition and order
        """
        if len(current_block) >= 5:
            block_length = 0
            for contig in current_block:
                block_length += index_c[contig]
            b_name = "###MIX###_" + "###".join(current_block)
            new_index_c[b_name] = block_length
            new_index_o.append(b_name)
        elif len(current_block) > 0:
            for b_name in current_block:
                new_index_c[b_name] = index_c[b_name]
                new_index_o.append(b_name)
        return new_index_c, new_index_o

    def parse_index(self, index_o: list, index_c: dict, full_len: int):
        """
        Parse index and merge too small contigs
        :param index_o: index order
        :param index_c: contigs definition (name -> length)
        :param full_len: total length of the sequence
        :return: new contigs definition and order
        """
        new_index_o = []
        new_index_c = {}
        current_block = []
        for index in index_o:
            if index_c[index] >= 0.002 * full_len:
                new_index_c, new_index_o = self.__flush_blocks(index_c, new_index_c, new_index_o, current_block)
                current_block = []
                new_index_c[index] = index_c[index]
                new_index_o.append(index)
            else:
                current_block.append(index)
        new_index_c, new_index_o = self.__flush_blocks(index_c, new_index_c, new_index_o, current_block)
        return new_index_c, new_index_o
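
    # Illustrative note (not part of the original code): parse_index() keeps any contig whose
    # length is at least 0.2% of the full sequence length as-is. Smaller contigs accumulate in
    # current_block; a run of 5 or more consecutive small contigs is merged by __flush_blocks()
    # into a single pseudo-contig named "###MIX###_<name1>###<name2>###...", while shorter runs
    # are kept unchanged.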

    @staticmethod
    def remove_noise(lines, noise_limit):
        """
        Remove matches shorter than the noise limit
        :param lines: matches classified by identity class
        :param noise_limit: minimal match length to keep
        :return: filtered matches, with the same structure as lines
        """
        keep_lines = {
            "0": [],
            "1": [],
            "2": [],
            "3": []
        }
        for cls, c_lines in lines.items():
            for line in c_lines:
                x1 = line[0]
                x2 = line[1]
                y1 = line[2]
                y2 = line[3]
                idy = line[4]
                len_m = sqrt(pow(x2 - x1, 2) + pow(y2 - y1, 2))
                if len_m >= noise_limit:
                    keep_lines[cls].append(line)
        return keep_lines
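
    # For reference (standard minimap2 PAF column layout; this comment is not from the original
    # file): column 1 = query name, 3 = query start, 4 = query end, 5 = strand, 6 = target name,
    # 8 = target start, 9 = target end, 10 = number of residue matches, 11 = alignment block
    # length. parse_paf() below computes identity as column 10 / column 11.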

    def parse_paf(self, merge_index=True, noise=True):
        """
        Parse the PAF file and classify matches by identity
        :param merge_index: if True, merge too small contigs in the indexes
        :param noise: if True, remove noise (too small matches)
        """
        min_idy = 10000000000
        max_idy = -10000000000
        lines = {
            "0": [],  # idy < 0.25
            "1": [],  # idy < 0.5
            "2": [],  # idy < 0.75
            "3": []  # idy >= 0.75
        }
        try:
            name_q, q_order, q_contigs, q_reversed, q_abs_start, len_q = Index.load(self.idx_q)
            if merge_index:
                q_contigs, q_order = self.parse_index(q_order, q_contigs, len_q)
        except IOError:
            self.error = "Index file does not exist for query!"
            return False

        try:
            name_t, t_order, t_contigs, t_reversed, t_abs_start, len_t = Index.load(self.idx_t)
            if merge_index:
                t_contigs, t_order = self.parse_index(t_order, t_contigs, len_t)
        except IOError:
            self.error = "Index file does not exist for target!"
            return False

        lines_lens = []

        try:
            with open(self.paf, "r") as paf_file:
                nb_lines = 0
                for line in paf_file:
                    nb_lines += 1
                    if nb_lines > self.max_nb_lines:
                        self.sampled = True
                        break
                    parts = line.strip("\n").split("\t")
                    v1 = parts[0]
                    v6 = parts[5]
                    strand = 1 if parts[4] == "+" else -1
                    idy = int(parts[9]) / int(parts[10])
                    min_idy = min(min_idy, idy)
                    max_idy = max(max_idy, idy)
                    # x1, x2, y1, y2, idy
                    y1 = int(parts[2]) + q_abs_start[v1]
                    y2 = int(parts[3]) + q_abs_start[v1]
                    x1 = int(parts[7 if strand == 1 else 8]) + t_abs_start[v6]
                    x2 = int(parts[8 if strand == 1 else 7]) + t_abs_start[v6]
                    len_m = sqrt(pow(x2 - x1, 2) + pow(y2 - y1, 2))
                    lines_lens.append(len_m)
                    if idy < self.limit_idy[0]:
                        class_idy = "0"
                    elif idy < self.limit_idy[1]:
                        class_idy = "1"
                    elif idy < self.limit_idy[2]:
                        class_idy = "2"
                    else:
                        class_idy = "3"
                    lines[class_idy].append([x1, x2, y1, y2, idy, v1, v6])
        except IOError:
            self.error = "PAF file does not exist!"
            return False

        if noise and nb_lines > 1000:
            # Noise filter: build a histogram of match lengths, find its peak, and remove
            # matches shorter than the first bin where counts drop below 1% of that peak
            counts, bins, bars = plt.hist(lines_lens, bins=nb_lines//10)
            counts = list(counts)
            max_value = max(counts)
            max_index = counts.index(max_value)
            limit_index = -1
            for i in range(max_index, len(counts)):
                if counts[i] < max_value / 100:
                    limit_index = i
                    break
            if limit_index > -1:
                lines = self.remove_noise(lines, bins[limit_index])

        self.parsed = True
        self.len_q = len_q
        self.len_t = len_t
        self.min_idy = min_idy
        self.max_idy = max_idy
        self.lines = lines
        self.q_contigs = q_contigs
        self.q_order = q_order
        self.q_reversed = q_reversed
        self.t_contigs = t_contigs
        self.t_order = t_order
        self.name_q = name_q
        self.name_t = name_t

    def get_d3js_data(self):
        """
        Build the data dict used by the D3.js drawer
        :return: data dict
        """
        return {
            'y_len': self.len_q,
            'x_len': self.len_t,
            'min_idy': self.min_idy,
            'max_idy': self.max_idy,
            'lines': self.lines,
            'y_contigs': self.q_contigs,
            'y_order': self.q_order,
            'x_contigs': self.t_contigs,
            'x_order': self.t_order,
            'name_y': self.name_q,
            'name_x': self.name_t,
            'limit_idy': self.limit_idy,
            'sorted': self.sorted,
            'sampled': self.sampled,
            'max_nb_lines': self.max_nb_lines,
        }

    def save_json(self, out):
        """
        Save data to a JSON file
        :param out: output file path
        """
        data = self.get_d3js_data()
        with open(out, "w") as out_f:
            out_f.write(json.dumps(data))

    def is_contig_well_oriented(self, lines: list, contig, chrom):
        """
        Returns True if the contig is well oriented. A well oriented contig
        must have y increased when x increased. We check that only for highest matchs
        (small matches must be ignores)
        :param lines: lines inside the contig
        :return: True if well oriented, False else
        """
        lines.sort(key=lambda x: -x[-1])
        max_len = lines[0][-1]
        i = 0

        # Select sample of tested lines:
        while i < len(lines) and lines[i][-1] > max_len * 0.1 \
                and lines[i][-1] >= 0.05 * min(self.q_contigs[contig], self.t_contigs[chrom]):
            i += 1
        selected_lines = lines[:i]

        # Check orientation of each line:
        if len(selected_lines) > 1:
            selected_lines.sort(key=lambda x: x[0])
            orients = []
            for i in range(1, len(selected_lines)):
                if selected_lines[i][2] > selected_lines[i-1][2]:
                    orients.append(1)
                else:
                    orients.append(-1)
            if mean(orients) > -0.1:  # We have a good orientation (-0.1 to ignore ambiguous cases)
                return True
        elif len(selected_lines) == 1:
            orient_first = "+" if selected_lines[0][3] < selected_lines[0][4] else "-"  # Get X orientation
            if selected_lines[0][-2 if orient_first == "+" else -3] > \
                    selected_lines[0][-3 if orient_first == "+" else -2]:  # Check Y according to X orientation
                return True
        elif len(selected_lines) == 0:  # No line was selected: we ignore this block
            return True

        # In all other cases the orientation is wrong:
        return False

    def reorient_contigs_in_paf(self, contigs):
        """
        Reorient contigs in the PAF file
        :param contigs: contigs to be reoriented
        """
        sorted_file = self.paf + ".sorted"
        with open(self.paf, "r") as source, open(sorted_file, "w") as target:
            for line in source:
                parts = line.strip("\n").split("\t")
                if parts[0] in contigs:
                    len_q = int(parts[1])
                    x1_q = int(parts[2])
                    x2_q = int(parts[3])
                    if parts[4] == "-":
                        parts[4] = "+"
                    else:
                        parts[4] = "-"
                    parts[2] = str(len_q - x2_q)
                    parts[3] = str(len_q - x1_q)
                target.write("\t".join(parts) + "\n")
        if self.paf.endswith(".sorted"):
            os.remove(self.paf)
            shutil.move(sorted_file, self.paf)
        else:
            self.paf = sorted_file
        return True

    def _update_query_index(self, contigs_reoriented):
        with open(self.idx_q, "w") as idx:
            idx.write(self.name_q + "\n")
            for contig in self.q_order:
                idx.write("\t".join([contig, str(self.q_contigs[contig]), "1" if contig in contigs_reoriented else "0"])
                          + "\n")

    def set_sorted(self, is_sorted):
        self.sorted = is_sorted
        sorted_touch = os.path.join(os.path.dirname(self.paf), ".sorted")
        if is_sorted:
            Path(sorted_touch).touch()
        else:
            if os.path.exists(sorted_touch):
                os.remove(sorted_touch)
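
    # Note (not part of the original code): the ".sorted" touch file handled above is the marker
    # that __init__() checks to decide whether to load the ".sorted" PAF and query index files
    # instead of the original ones.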

    def compute_gravity_contigs(self):
        """
        Compute gravity for each contig on each chromosome (how many big matches they have).
        Will be used to find which chromosome has the highest value for each contig
        :return:
            - gravity for each contig and each chromosome:
                {contig1: {chr1: value, chr2: value, ...}, contig2: ...}
            - For each block, the lines it contains:
                [median on x, squared length, median on y, x1, x2, y1, y2, length] (x: on target, y: on query)
        """
        gravity_contig = {}
        lines_on_block = {}
        # Compute the weight of each block (in terms of how many big matches it has), and save the median position
        # of each match on it (for the next step)
        for line in [j for i in list(self.lines.values()) for j in i]:
            x1 = int(line[0])
            x2 = int(line[1])
            y1 = int(line[2])
            y2 = int(line[3])
            contig = line[5]
            chrm = line[6]
            block = (contig, chrm)
            # X1 and X2 (in good orientation):
            x_1 = min(x1, x2)
            x_2 = max(x2, x1)
            med_q = x_1 + (abs(x_2 - x_1) / 2)
            # Y1 and Y2 (in good orientation):
            y_1 = min(y1, y2)
            y_2 = max(y2, y1)
            med_t = y_1 + (abs(y_2 - y_1) / 2)
            len_m = sqrt(pow(x2 - x1, 2) + pow(y2 - y1, 2))  # Len
            len_m_2 = pow(1 + len_m, 2)  # Squared length, used as weight
            if block not in lines_on_block:
                lines_on_block[block] = []
            lines_on_block[block].append((med_q, len_m_2, med_t, x1, x2, y1, y2, len_m))

            if contig not in gravity_contig:
                gravity_contig[contig] = {}
            if chrm not in gravity_contig[contig]:
                gravity_contig[contig][chrm] = 0
            gravity_contig[contig][chrm] += len_m_2
        return gravity_contig, lines_on_block
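
    # Illustrative note (not part of the original code): with the squared-length weighting above,
    # a contig with one 50 kb match on chr1 and five 10 kb matches on chr2 gets a gravity of about
    # 2.5e9 on chr1 versus 5e8 on chr2, so sort() below assigns it to chr1 and orders it among the
    # query contigs by the weighted mean midpoint of its matches there.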

    def sort(self):
        """
        Sort contigs according to reference target and reorient them if needed
        """
        self.parse_paf(False)
        if not self.sorted:  # Do the sort
            gravity_contig, lines_on_block = self.compute_gravity_contigs()

            # For each contig, find best block, and deduce gravity of contig:
            gravity_on_contig = {}
            reorient_contigs = []
            for contig, chr_blocks in gravity_contig.items():
                # Find best block:
                max_number = 0
                max_chr = None
                for chrm, size in chr_blocks.items():
                    if size > max_number:
                        max_number = size
                        max_chr = chrm

                # Compute gravity of contig:
                nb_items = 0
                sum_items = 0
                lines_on_selected_block = lines_on_block[(contig, max_chr)]
                for med in lines_on_selected_block:
                    sum_items += med[0] * med[1]
                    nb_items += med[1]
                gravity_on_contig[contig] = sum_items / nb_items

                # Check if contig must be re-oriented:
                if len(lines_on_selected_block) > 0:
                    if not self.is_contig_well_oriented(lines_on_selected_block, contig, max_chr):
                        reorient_contigs.append(contig)

            # Sort contigs:
            self.q_order.sort(key=lambda x: gravity_on_contig[x] if x in gravity_on_contig else self.len_q + 1000)

            self.idx_q += ".sorted"

            with open(self.idx_q, "w") as idx_q_f:
                idx_q_f.write(self.name_q + "\n")
                for contig in self.q_order:
                    idx_q_f.write("\t".join([contig, str(self.q_contigs[contig])]) + "\n")

            # Re-orient contigs:
            if len(reorient_contigs) > 0:
                self.reorient_contigs_in_paf(reorient_contigs)
            else:
                sorted_file = self.paf + ".sorted"
                shutil.copyfile(self.paf, sorted_file)
                self.paf = sorted_file

            # Update index:
            self._update_query_index(reorient_contigs)

            self.set_sorted(True)

        else:  # Undo the sort
            if os.path.exists(self.paf):
                os.remove(self.paf)
                self.paf = self.paf.replace(".sorted", "")
            if os.path.exists(self.idx_q):
                os.remove(self.idx_q)
                self.idx_q = self.idx_q.replace(".sorted", "")
            self.set_sorted(False)

        # Re-parse the PAF file:
        self.parsed = False
        self.parse_paf()

    def reverse_contig(self, contig_name):
        self.parse_paf(False)
        reorient_contigs = [contig_name]
        self.reorient_contigs_in_paf(reorient_contigs)
        if not self.idx_q.endswith(".sorted"):
            self.idx_q += ".sorted"
        self._update_query_index(reorient_contigs)
        self.set_sorted(True)
        self.parsed = False
        self.parse_paf()

    def get_query_on_target_association(self):
        """
        For each query, get the best matching chromosome
        :return:
        """
        gravity_contig = self.compute_gravity_contigs()[0]
        query_on_target = {}
        for contig, chr_blocks in gravity_contig.items():
            # Find best block:
            max_number = 0
            max_chr = None
            for chrm, size in chr_blocks.items():
                if size > max_number:
                    max_number = size
                    max_chr = chrm
            if max_chr is not None:
                query_on_target[contig] = max_chr
            else:
                query_on_target[contig] = None
        return query_on_target

    def build_query_on_target_association_file(self):
        """
        For each query contig, get the best matching target chromosome, as tab-separated text.
        Queries are listed in their current order
        :return: content of the association file
        """
        query_on_target = self.get_query_on_target_association()
        content = "Query\tTarget\tStrand\n"
        for contig in self.q_order:
            strand = "+"
            if contig in self.q_reversed:
                strand = "-" if self.q_reversed[contig] else "+"
            if contig in query_on_target:
                content += "%s\t%s\t%s\n" % (contig, query_on_target[contig] or "None", strand)
            else:
                content += "%s\t%s\t%s\n" % (contig, "None", strand)
        return content

    def build_list_no_assoc(self, to):
        """
        Build list of queries that match with None target, or the opposite
        :param to: query or target
        :return: content of the file
        """
        index = self.idx_q if to == "query" else self.idx_t
        name, contigs_list, contigs, reversed, abs_start, c_len = Index.load(index)
        with open(self.paf, "r") as paf:
            for line in paf:
                c_name = line.strip("\n").split("\t")[0 if to == "query" else 5]
                if c_name in contigs_list:
                    contigs_list.remove(c_name)
        return "\n".join(contigs_list) + "\n"

    def get_summary_stats(self):
        """
        Get a summary of the identity distribution over the target
        :return: dict with the percentage of the target covered by each identity category
        """
        summary_file = self.paf + ".summary"
        if os.path.exists(summary_file):
            with open(summary_file, "r") as summary_file:
                txt = summary_file.read()
                return json.loads(txt)
        self.parse_paf(False, False)
        if self.parsed:
            percents = {}
            position_idy = ["-1"] * self.len_t

            cats = sorted(self.lines.keys())

            for cat in cats:
                for line in self.lines[cat]:
                    start = line[0]
                    end = line[1] + 1
                    position_idy[start:end] = [cat] * (end - start)

            counts = Counter(position_idy)

            for cat in counts:
                percents[cat] = counts[cat] / self.len_t * 100

            with open(summary_file, "w") as summary_file:
                summary_file.write(json.dumps(percents))

            return percents
        return None
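

# Minimal usage sketch (not part of the original module). It assumes a PAF file and the two
# dgenies index files (as produced by dgenies.bin.index.Index) already exist; the command-line
# arguments below are placeholders.
if __name__ == "__main__":
    import sys

    paf_path, query_idx, target_idx = sys.argv[1:4]
    paf = Paf(paf_path, query_idx, target_idx)
    if paf.parsed:
        paf.save_json(paf_path + ".json")
    else:
        print("Parsing failed:", paf.error)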