paf.py 16.5 KB
Newer Older
1
2
#!/usr/bin/env python3

3
import os
4
import shutil
Floreal Cabanettes's avatar
Floreal Cabanettes committed
5
from math import sqrt
6
from numpy import mean
7
from pathlib import Path
Floreal Cabanettes's avatar
Floreal Cabanettes committed
8

9
10

class Paf:
11
    limit_idy = [0.25, 0.5, 0.75]
12
    max_nb_lines = 100000
13

14
    def __init__(self, paf: str, idx_q: str, idx_t: str, auto_parse: bool=True):
15
16
17
        self.paf = paf
        self.idx_q = idx_q
        self.idx_t = idx_t
18
        self.sorted = False
19
        if os.path.exists(os.path.join(os.path.dirname(paf), ".sorted")):
20
21
22
            self.paf += ".sorted"
            self.idx_q += ".sorted"
            self.sorted = True
23
        self.sampled= False
24
25
26
27
        self.len_q = None
        self.len_t = None
        self.min_idy = None
        self.max_idy = None
Floreal Cabanettes's avatar
Floreal Cabanettes committed
28
29
30
31
32
        self.lines = {}
        self.q_contigs = {}
        self.q_order = []
        self.t_contigs = {}
        self.t_order = []
33
        self.q_reversed = {}
34
35
36
37
38
        self.name_q = None
        self.name_t = None
        self.parsed = False
        self.error = False

39
40
        if auto_parse:
            self.parse_paf()
41

42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
    @staticmethod
    def __flush_blocks(index_c, new_index_c, new_index_o, current_block):
        if len(current_block) >= 5:
            block_length = 0
            for contig in current_block:
                block_length += index_c[contig]
            b_name = "###MIX###_" + "###".join(current_block)
            new_index_c[b_name] = block_length
            new_index_o.append(b_name)
        elif len(current_block) > 0:
            for b_name in current_block:
                new_index_c[b_name] = index_c[b_name]
                new_index_o.append(b_name)
        return new_index_c, new_index_o

    def parse_index(self, index_o: list, index_c: dict, full_len: int):
        """
        Parse index and merge too small contigs
        :param index_o: index order
        :param index_c: index contigs def
        :param full_len: length of the sequence
        :return: new index orders and contigs def
        """
        new_index_o = []
        new_index_c = {}
        current_block = []
        for index in index_o:
            if index_c[index] >= 0.002 * full_len:
                new_index_c, new_index_o = self.__flush_blocks(index_c, new_index_c, new_index_o, current_block)
                current_block = []
                new_index_c[index] = index_c[index]
                new_index_o.append(index)
            else:
                current_block.append(index)
        new_index_c, new_index_o = self.__flush_blocks(index_c, new_index_c, new_index_o, current_block)
        return new_index_c, new_index_o

    def parse_paf(self, merge_index=True):
80
81
82
        min_idy = 10000000000
        max_idy = -10000000000
        lines = {
83
84
85
86
            "0": [],
            "1": [],
            "2": [],
            "3": []
87
        }
88
89
        q_abs_start = {}
        q_abs_current_start = 0
90
        q_len = 0
91
92
        try:
            with open(self.idx_q, "r") as idx_q_f:
93
                name_q = idx_q_f.readline().strip("\n")
94
95
                q_order = []
                q_contigs = {}
96
                q_reversed = {}
97
98
99
100
                for line in idx_q_f:
                    parts = line.strip("\n").split("\t")
                    id_c = parts[0]
                    len_c = int(parts[1])
101
102
103
104
                    if len(parts) > 2:
                        q_reversed[id_c] = parts[2] == "1"
                    else:
                        q_reversed[id_c] = False
105
                    q_order.append(id_c)
106
                    q_abs_start[id_c] = q_abs_current_start
107
                    q_contigs[id_c] = len_c
108
                    q_len += len_c
109
                    q_abs_current_start += len_c
110
111
            if merge_index:
                q_contigs, q_order = self.parse_index(q_order, q_contigs, q_len)
112
113
114
115
        except IOError:
            self.error = "Index file does not exist for query!"
            return False

116
117
        t_abs_start = {}
        t_abs_current_start = 0
118
        t_len = 0
119
120
        try:
            with open(self.idx_t, "r") as idx_t_f:
121
                name_t = idx_t_f.readline().strip("\n")
122
123
124
125
126
127
128
                t_order = []
                t_contigs = {}
                for line in idx_t_f:
                    parts = line.strip("\n").split("\t")
                    id_c = parts[0]
                    len_c = int(parts[1])
                    t_order.append(id_c)
129
                    t_abs_start[id_c] = t_abs_current_start
130
                    t_contigs[id_c] = len_c
131
                    t_len += len_c
132
                    t_abs_current_start += len_c
133
134
            if merge_index:
                t_contigs, t_order = self.parse_index(t_order, t_contigs, t_len)
135
136
137
138
        except IOError:
            self.error = "Index file does not exist for target!"
            return False

139
140
141
142
143
        len_q = q_abs_current_start
        len_t = t_abs_current_start

        try:
            with open(self.paf, "r") as paf_file:
144
145
146
147
                paf_lines = paf_file.readlines()
                if len(paf_lines) > self.max_nb_lines:
                    self.sampled = True
                for line in paf_lines[:self.max_nb_lines]:
148
149
150
151
                    parts = line.strip("\n").split("\t")
                    v1 = parts[0]
                    v6 = parts[5]
                    strand = 1 if parts[4] == "+" else -1
152
                    idy = int(parts[9]) / int(parts[10])
153
154
155
156
157
158
159
                    min_idy = min(min_idy, idy)
                    max_idy = max(max_idy, idy)
                    # x1, x2, y1, y2, idy
                    y1 = int(parts[2]) + q_abs_start[v1]
                    y2 = int(parts[3]) + q_abs_start[v1]
                    x1 = int(parts[7 if strand == 1 else 8]) + t_abs_start[v6]
                    x2 = int(parts[8 if strand == 1 else 7]) + t_abs_start[v6]
160
161
162
163
164
165
                    if idy < self.limit_idy[0]:
                        class_idy = "0"
                    elif idy < self.limit_idy[1]:
                        class_idy = "1"
                    elif idy < self.limit_idy[2]:
                        class_idy = "2"
166
                    else:
167
                        class_idy = "3"
Floreal Cabanettes's avatar
Floreal Cabanettes committed
168
                    lines[class_idy].append([x1, x2, y1, y2, idy, v1, v6])
169
170
171
172
        except IOError:
            self.error = "PAF file does not exist!"
            return False

173
174
175
176
177
178
179
180
        self.parsed = True
        self.len_q = len_q
        self.len_t = len_t
        self.min_idy = min_idy
        self.max_idy = max_idy
        self.lines = lines
        self.q_contigs = q_contigs
        self.q_order = q_order
181
        self.q_reversed = q_reversed
182
183
184
185
186
187
188
        self.t_contigs = t_contigs
        self.t_order = t_order
        self.name_q = name_q
        self.name_t = name_t

    def get_d3js_data(self):
        return {
189
190
            'y_len': self.len_q,
            'x_len': self.len_t,
191
192
193
            'min_idy': self.min_idy,
            'max_idy': self.max_idy,
            'lines': self.lines,
194
195
196
197
198
199
            'y_contigs': self.q_contigs,
            'y_order': self.q_order,
            'x_contigs': self.t_contigs,
            'x_order': self.t_order,
            'name_y': self.name_q,
            'name_x': self.name_t,
200
201
            'limit_idy': self.limit_idy,
            'sorted': self.sorted,
202
203
            'sampled': self.sampled,
            "max_nb_lines": self.max_nb_lines,
204
205
206
207
208
209
210
211
212
213
        }

    def save_json(self, out):
        import json
        success, data = self.parse_paf()
        if success:
            with open(out, "w") as out_f:
                out_f.write(json.dumps(data))
        else:
            raise Exception(data)
Floreal Cabanettes's avatar
Floreal Cabanettes committed
214

215
216
217
218
219
220
221
222
223
224
225
226
227
228
    def is_contig_well_oriented(self, lines: list, contig, chrom):
        """
        Returns True if the contig is well oriented. A well oriented contig
        must have y increased when x increased. We check that only for highest matchs
        (small matches must be ignores)
        :param lines: lines inside the contig
        :return: True if well oriented, False else
        """
        lines.sort(key=lambda x: -x[-1])
        max_len = lines[0][-1]
        i = 0

        # Select sample of tested lines:
        while i < len(lines) and lines[i][-1] > max_len * 0.1 \
229
                and lines[i][-1] >= 0.05 * min(self.q_contigs[contig], self.t_contigs[chrom]):
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
            i += 1
        selected_lines = lines[:i]

        # Check orientation of each line:
        if len(selected_lines) > 1:
            selected_lines.sort(key=lambda x: x[0])
            orients = []
            for i in range(1, len(selected_lines)):
                if selected_lines[i][2] > selected_lines[i-1][2]:
                    orients.append(1)
                else:
                    orients.append(-1)
            if mean(orients) > -0.1:  # We have a good orientation (-0.1 to ignore ambiguous cases)
                return True
        elif len(selected_lines) == 1:
            orient_first = "+" if selected_lines[0][3] < selected_lines[0][4] else "-"  # Get X orientation
            if selected_lines[0][-2 if orient_first == "+" else -3] > \
                    selected_lines[0][-3 if orient_first == "+" else -2]:  # Check Y according to X orientation
                return True
        elif len(selected_lines) == 0:  # None line were selected: we ignore this block
            return True

        # In all other cases the orientation is wrong:
        return False

    def reorient_contigs_in_paf(self, contigs):
        """
        Reorient contigs in the PAF file
        :param contigs: contigs to be reoriented
        """
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
        sorted_file = self.paf + ".sorted"
        with open(self.paf, "r") as source, open(sorted_file, "w") as target:
            for line in source:
                parts = line.strip("\n").split("\t")
                if parts[0] in contigs:
                    len_q = int(parts[1])
                    x1_q = int(parts[2])
                    x2_q = int(parts[3])
                    if parts[4] == "-":
                        parts[4] = "+"
                    else:
                        parts[4] = "-"
                    parts[2] = str(len_q - x2_q)
                    parts[3] = str(len_q - x1_q)
                target.write("\t".join(parts) + "\n")
        self.paf = sorted_file
        return True

    def _update_query_index(self, contigs_reoriented):
        with open(self.idx_q, "w") as idx:
            idx.write(self.name_q + "\n")
            for contig in self.q_order:
                idx.write("\t".join([contig, str(self.q_contigs[contig]), "1" if contig in contigs_reoriented else "0"])
                          + "\n")
284

285
286
287
288
289
290
291
292
293
    def set_sorted(self, is_sorted):
        self.sorted = is_sorted
        sorted_touch = os.path.join(os.path.dirname(self.paf), ".sorted")
        if is_sorted:
            Path(sorted_touch).touch()
        else:
            if os.path.exists(sorted_touch):
                os.remove(sorted_touch)

294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
    def compute_gravity_contigs(self):
        """
        Compute gravity for each contig on each chromosome (how many big matches they have).
        Will be used to find which chromosome has the highest value for each contig
        :return:
            - gravity for each contig and each chromosome:
                {contig1: {chr1: value, chr2: value, ...}, contig2: ...}
            - For each block save lines inside:
                [median_on_query, squared length, median_on_target, x1, x2, y1, y2, length] (x : on target, y: on query)
        """
        gravity_contig = {}
        lines_on_block = {}
        # Compute size of blocks (in term of how many big match they have), and save median of each match on each one
        # (for next step)
        for line in [j for i in list(self.lines.values()) for j in i]:
            x1 = int(line[0])
            x2 = int(line[1])
            y1 = int(line[2])
            y2 = int(line[3])
            contig = line[5]
            chrm = line[6]
            block = (contig, chrm)
            # X1 and X2 (in good orientation):
            x_1 = min(x1, x2)
            x_2 = max(x2, x1)
            med_q = x_1 + (abs(x_2 - x_1) / 2)
            # Y1 and Y2 (in good orientation):
            y_1 = min(y1, y2)
            y_2 = max(y2, y1)
            med_t = y_1 + (abs(y_2 - y_1) / 2)
            len_m = sqrt(pow(x2 - x1, 2) + pow(y2 - y1, 2))  # Len
            len_m_2 = pow(1 + len_m, 2) # Pow of len
            if block not in lines_on_block:
                lines_on_block[block] = []
            lines_on_block[block].append((med_q, len_m_2, med_t, x1, x2, y1, y2, len_m))

            if contig not in gravity_contig:
                gravity_contig[contig] = {}
            if chrm not in gravity_contig[contig]:
                gravity_contig[contig][chrm] = 0
            gravity_contig[contig][chrm] += len_m_2
        return gravity_contig, lines_on_block

Floreal Cabanettes's avatar
Floreal Cabanettes committed
337
    def sort(self):
338
339
340
        """
        Sort contigs according to reference target and reorient them if needed
        """
341
        self.parse_paf(False)
342
        if not self.sorted:  # Do the sort
343
            gravity_contig , lines_on_block = self.compute_gravity_contigs()
Floreal Cabanettes's avatar
Floreal Cabanettes committed
344

345
346
347
348
349
350
351
352
353
354
355
            # For each contig, find best block, and deduce gravity of contig:
            gravity_on_contig = {}
            reorient_contigs = []
            for contig, chr_blocks in gravity_contig.items():
                # Find best block:
                max_number = 0
                max_chr = None
                for chrm, size in chr_blocks.items():
                    if size > max_number:
                        max_number = size
                        max_chr = chrm
Floreal Cabanettes's avatar
Floreal Cabanettes committed
356

357
358
359
360
361
362
363
364
                # Compute gravity of contig:
                nb_items = 0
                sum_items = 0
                lines_on_selected_block = lines_on_block[(contig, max_chr)]
                for med in lines_on_selected_block:
                    sum_items += med[0] * med[1]
                    nb_items += med[1]
                gravity_on_contig[contig] = sum_items / nb_items
Floreal Cabanettes's avatar
Floreal Cabanettes committed
365

366
367
368
369
                # Check if contig must be re-oriented:
                if len(lines_on_selected_block) > 0:
                    if not self.is_contig_well_oriented(lines_on_selected_block, contig, max_chr):
                        reorient_contigs.append(contig)
370

371
372
            # Sort contigs:
            self.q_order.sort(key=lambda x: gravity_on_contig[x] if x in gravity_on_contig else self.len_q + 1000)
Floreal Cabanettes's avatar
Floreal Cabanettes committed
373

374
375
            self.idx_q += ".sorted"

376
377
378
379
380
381
382
383
            with open(self.idx_q, "w") as idx_q_f:
                idx_q_f.write(self.name_q + "\n")
                for contig in self.q_order:
                    idx_q_f.write("\t".join([contig, str(self.q_contigs[contig])]) + "\n")

            # Re-orient contigs:
            if len(reorient_contigs) > 0:
                self.reorient_contigs_in_paf(reorient_contigs)
384
385
386
387
            else:
                sorted_file = self.paf + ".sorted"
                shutil.copyfile(self.paf, sorted_file)
                self.paf = sorted_file
388

389
390
391
            # Update index:
            self._update_query_index(reorient_contigs)

392
            self.set_sorted(True)
393
394
395
396
397
398
399
400

        else:  # Undo the sort
            if os.path.exists(self.paf):
                os.remove(self.paf)
                self.paf = self.paf.replace(".sorted", "")
            if os.path.exists(self.idx_q):
                os.remove(self.idx_q)
                self.idx_q = self.idx_q.replace(".sorted", "")
401
            self.set_sorted(False)
402
403

        # Re parse PAF file:
Floreal Cabanettes's avatar
Floreal Cabanettes committed
404
405
        self.parsed = False
        self.parse_paf()
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434

    def get_query_on_target_association(self):
        """
        For each query, get the best matching chromosome
        :return:
        """
        gravity_contig = self.compute_gravity_contigs()[0]
        query_on_target = {}
        for contig, chr_blocks in gravity_contig.items():
            # Find best block:
            max_number = 0
            max_chr = None
            for chrm, size in chr_blocks.items():
                if size > max_number:
                    max_number = size
                    max_chr = chrm
            if max_chr is not None:
                query_on_target[contig] = max_chr
            else:
                query_on_target[contig] = None
        return query_on_target

    def build_query_on_target_association_file(self):
        """
        For each query, get the best matching chromosome and save it to a CSV file.
        Use the order of queries
        :return: content of the file
        """
        query_on_target = self.get_query_on_target_association()
435
        content = "Query\tTarget\tStrand\n"
436
        for contig in self.q_order:
437
438
439
440
            strand = "+"
            if contig in self.q_reversed:
                print("TRUE")
                strand = "-" if self.q_reversed[contig] else "+"
441
            if contig in query_on_target:
442
                content += "%s\t%s\t%s\n" % (contig, query_on_target[contig] or "None", strand)
443
            else:
444
                content += "%s\t%s\t%s\n" % (contig, "None", strand)
445
        return content