paf.py 16.1 KB
Newer Older
1
2
#!/usr/bin/env python3

3
import os
4
import shutil
Floreal Cabanettes's avatar
Floreal Cabanettes committed
5
from math import sqrt
6
from numpy import mean
7
from pathlib import Path
Floreal Cabanettes's avatar
Floreal Cabanettes committed
8

9
10

class Paf:
11
    limit_idy = [0.25, 0.5, 0.75]
12
    max_nb_lines = 100000
13

14
    def __init__(self, paf: str, idx_q: str, idx_t: str, auto_parse: bool=True):
15
16
17
        self.paf = paf
        self.idx_q = idx_q
        self.idx_t = idx_t
18
        self.sorted = False
19
        if os.path.exists(os.path.join(os.path.dirname(paf), ".sorted")):
20
21
22
            self.paf += ".sorted"
            self.idx_q += ".sorted"
            self.sorted = True
23
        self.sampled= False
24
25
26
27
        self.len_q = None
        self.len_t = None
        self.min_idy = None
        self.max_idy = None
Floreal Cabanettes's avatar
Floreal Cabanettes committed
28
29
30
31
32
        self.lines = {}
        self.q_contigs = {}
        self.q_order = []
        self.t_contigs = {}
        self.t_order = []
33
34
35
36
37
        self.name_q = None
        self.name_t = None
        self.parsed = False
        self.error = False

38
39
        if auto_parse:
            self.parse_paf()
40

41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
    @staticmethod
    def __flush_blocks(index_c, new_index_c, new_index_o, current_block):
        if len(current_block) >= 5:
            block_length = 0
            for contig in current_block:
                block_length += index_c[contig]
            b_name = "###MIX###_" + "###".join(current_block)
            new_index_c[b_name] = block_length
            new_index_o.append(b_name)
        elif len(current_block) > 0:
            for b_name in current_block:
                new_index_c[b_name] = index_c[b_name]
                new_index_o.append(b_name)
        return new_index_c, new_index_o

    def parse_index(self, index_o: list, index_c: dict, full_len: int):
        """
        Parse index and merge too small contigs
        :param index_o: index order
        :param index_c: index contigs def
        :param full_len: length of the sequence
        :return: new index orders and contigs def
        """
        new_index_o = []
        new_index_c = {}
        current_block = []
        for index in index_o:
            if index_c[index] >= 0.002 * full_len:
                new_index_c, new_index_o = self.__flush_blocks(index_c, new_index_c, new_index_o, current_block)
                current_block = []
                new_index_c[index] = index_c[index]
                new_index_o.append(index)
            else:
                current_block.append(index)
        new_index_c, new_index_o = self.__flush_blocks(index_c, new_index_c, new_index_o, current_block)
        return new_index_c, new_index_o

    def parse_paf(self, merge_index=True):
79
80
81
        min_idy = 10000000000
        max_idy = -10000000000
        lines = {
82
83
84
85
            "0": [],
            "1": [],
            "2": [],
            "3": []
86
        }
87
88
        q_abs_start = {}
        q_abs_current_start = 0
89
        q_len = 0
90
91
        try:
            with open(self.idx_q, "r") as idx_q_f:
92
                name_q = idx_q_f.readline().strip("\n")
93
94
95
96
97
98
99
                q_order = []
                q_contigs = {}
                for line in idx_q_f:
                    parts = line.strip("\n").split("\t")
                    id_c = parts[0]
                    len_c = int(parts[1])
                    q_order.append(id_c)
100
                    q_abs_start[id_c] = q_abs_current_start
101
                    q_contigs[id_c] = len_c
102
                    q_len += len_c
103
                    q_abs_current_start += len_c
104
105
            if merge_index:
                q_contigs, q_order = self.parse_index(q_order, q_contigs, q_len)
106
107
108
109
        except IOError:
            self.error = "Index file does not exist for query!"
            return False

110
111
        t_abs_start = {}
        t_abs_current_start = 0
112
        t_len = 0
113
114
        try:
            with open(self.idx_t, "r") as idx_t_f:
115
                name_t = idx_t_f.readline().strip("\n")
116
117
118
119
120
121
122
                t_order = []
                t_contigs = {}
                for line in idx_t_f:
                    parts = line.strip("\n").split("\t")
                    id_c = parts[0]
                    len_c = int(parts[1])
                    t_order.append(id_c)
123
                    t_abs_start[id_c] = t_abs_current_start
124
                    t_contigs[id_c] = len_c
125
                    t_len += len_c
126
                    t_abs_current_start += len_c
127
128
            if merge_index:
                t_contigs, t_order = self.parse_index(t_order, t_contigs, t_len)
129
130
131
132
        except IOError:
            self.error = "Index file does not exist for target!"
            return False

133
134
135
136
137
        len_q = q_abs_current_start
        len_t = t_abs_current_start

        try:
            with open(self.paf, "r") as paf_file:
138
139
140
141
                paf_lines = paf_file.readlines()
                if len(paf_lines) > self.max_nb_lines:
                    self.sampled = True
                for line in paf_lines[:self.max_nb_lines]:
142
143
144
145
                    parts = line.strip("\n").split("\t")
                    v1 = parts[0]
                    v6 = parts[5]
                    strand = 1 if parts[4] == "+" else -1
146
                    idy = int(parts[9]) / int(parts[10])
147
148
149
150
151
152
153
                    min_idy = min(min_idy, idy)
                    max_idy = max(max_idy, idy)
                    # x1, x2, y1, y2, idy
                    y1 = int(parts[2]) + q_abs_start[v1]
                    y2 = int(parts[3]) + q_abs_start[v1]
                    x1 = int(parts[7 if strand == 1 else 8]) + t_abs_start[v6]
                    x2 = int(parts[8 if strand == 1 else 7]) + t_abs_start[v6]
154
155
156
157
158
159
                    if idy < self.limit_idy[0]:
                        class_idy = "0"
                    elif idy < self.limit_idy[1]:
                        class_idy = "1"
                    elif idy < self.limit_idy[2]:
                        class_idy = "2"
160
                    else:
161
                        class_idy = "3"
Floreal Cabanettes's avatar
Floreal Cabanettes committed
162
                    lines[class_idy].append([x1, x2, y1, y2, idy, v1, v6])
163
164
165
166
        except IOError:
            self.error = "PAF file does not exist!"
            return False

167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
        self.parsed = True
        self.len_q = len_q
        self.len_t = len_t
        self.min_idy = min_idy
        self.max_idy = max_idy
        self.lines = lines
        self.q_contigs = q_contigs
        self.q_order = q_order
        self.t_contigs = t_contigs
        self.t_order = t_order
        self.name_q = name_q
        self.name_t = name_t

    def get_d3js_data(self):
        return {
182
183
            'y_len': self.len_q,
            'x_len': self.len_t,
184
185
186
            'min_idy': self.min_idy,
            'max_idy': self.max_idy,
            'lines': self.lines,
187
188
189
190
191
192
            'y_contigs': self.q_contigs,
            'y_order': self.q_order,
            'x_contigs': self.t_contigs,
            'x_order': self.t_order,
            'name_y': self.name_q,
            'name_x': self.name_t,
193
194
            'limit_idy': self.limit_idy,
            'sorted': self.sorted,
195
196
            'sampled': self.sampled,
            "max_nb_lines": self.max_nb_lines,
197
198
199
200
201
202
203
204
205
206
        }

    def save_json(self, out):
        import json
        success, data = self.parse_paf()
        if success:
            with open(out, "w") as out_f:
                out_f.write(json.dumps(data))
        else:
            raise Exception(data)
Floreal Cabanettes's avatar
Floreal Cabanettes committed
207

208
209
210
211
212
213
214
215
216
217
218
219
220
221
    def is_contig_well_oriented(self, lines: list, contig, chrom):
        """
        Returns True if the contig is well oriented. A well oriented contig
        must have y increased when x increased. We check that only for highest matchs
        (small matches must be ignores)
        :param lines: lines inside the contig
        :return: True if well oriented, False else
        """
        lines.sort(key=lambda x: -x[-1])
        max_len = lines[0][-1]
        i = 0

        # Select sample of tested lines:
        while i < len(lines) and lines[i][-1] > max_len * 0.1 \
222
                and lines[i][-1] >= 0.05 * min(self.q_contigs[contig], self.t_contigs[chrom]):
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
            i += 1
        selected_lines = lines[:i]

        # Check orientation of each line:
        if len(selected_lines) > 1:
            selected_lines.sort(key=lambda x: x[0])
            orients = []
            for i in range(1, len(selected_lines)):
                if selected_lines[i][2] > selected_lines[i-1][2]:
                    orients.append(1)
                else:
                    orients.append(-1)
            if mean(orients) > -0.1:  # We have a good orientation (-0.1 to ignore ambiguous cases)
                return True
        elif len(selected_lines) == 1:
            orient_first = "+" if selected_lines[0][3] < selected_lines[0][4] else "-"  # Get X orientation
            if selected_lines[0][-2 if orient_first == "+" else -3] > \
                    selected_lines[0][-3 if orient_first == "+" else -2]:  # Check Y according to X orientation
                return True
        elif len(selected_lines) == 0:  # None line were selected: we ignore this block
            return True

        # In all other cases the orientation is wrong:
        return False

    def reorient_contigs_in_paf(self, contigs):
        """
        Reorient contigs in the PAF file
        :param contigs: contigs to be reoriented
        """
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
        sorted_file = self.paf + ".sorted"
        with open(self.paf, "r") as source, open(sorted_file, "w") as target:
            for line in source:
                parts = line.strip("\n").split("\t")
                if parts[0] in contigs:
                    len_q = int(parts[1])
                    x1_q = int(parts[2])
                    x2_q = int(parts[3])
                    if parts[4] == "-":
                        parts[4] = "+"
                    else:
                        parts[4] = "-"
                    parts[2] = str(len_q - x2_q)
                    parts[3] = str(len_q - x1_q)
                target.write("\t".join(parts) + "\n")
        self.paf = sorted_file
        return True

    def _update_query_index(self, contigs_reoriented):
        with open(self.idx_q, "w") as idx:
            idx.write(self.name_q + "\n")
            for contig in self.q_order:
                idx.write("\t".join([contig, str(self.q_contigs[contig]), "1" if contig in contigs_reoriented else "0"])
                          + "\n")
277

278
279
280
281
282
283
284
285
286
    def set_sorted(self, is_sorted):
        self.sorted = is_sorted
        sorted_touch = os.path.join(os.path.dirname(self.paf), ".sorted")
        if is_sorted:
            Path(sorted_touch).touch()
        else:
            if os.path.exists(sorted_touch):
                os.remove(sorted_touch)

287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
    def compute_gravity_contigs(self):
        """
        Compute gravity for each contig on each chromosome (how many big matches they have).
        Will be used to find which chromosome has the highest value for each contig
        :return:
            - gravity for each contig and each chromosome:
                {contig1: {chr1: value, chr2: value, ...}, contig2: ...}
            - For each block save lines inside:
                [median_on_query, squared length, median_on_target, x1, x2, y1, y2, length] (x : on target, y: on query)
        """
        gravity_contig = {}
        lines_on_block = {}
        # Compute size of blocks (in term of how many big match they have), and save median of each match on each one
        # (for next step)
        for line in [j for i in list(self.lines.values()) for j in i]:
            x1 = int(line[0])
            x2 = int(line[1])
            y1 = int(line[2])
            y2 = int(line[3])
            contig = line[5]
            chrm = line[6]
            block = (contig, chrm)
            # X1 and X2 (in good orientation):
            x_1 = min(x1, x2)
            x_2 = max(x2, x1)
            med_q = x_1 + (abs(x_2 - x_1) / 2)
            # Y1 and Y2 (in good orientation):
            y_1 = min(y1, y2)
            y_2 = max(y2, y1)
            med_t = y_1 + (abs(y_2 - y_1) / 2)
            len_m = sqrt(pow(x2 - x1, 2) + pow(y2 - y1, 2))  # Len
            len_m_2 = pow(1 + len_m, 2) # Pow of len
            if block not in lines_on_block:
                lines_on_block[block] = []
            lines_on_block[block].append((med_q, len_m_2, med_t, x1, x2, y1, y2, len_m))

            if contig not in gravity_contig:
                gravity_contig[contig] = {}
            if chrm not in gravity_contig[contig]:
                gravity_contig[contig][chrm] = 0
            gravity_contig[contig][chrm] += len_m_2
        return gravity_contig, lines_on_block

Floreal Cabanettes's avatar
Floreal Cabanettes committed
330
    def sort(self):
331
332
333
        """
        Sort contigs according to reference target and reorient them if needed
        """
334
        self.parse_paf(False)
335
        if not self.sorted:  # Do the sort
336
            gravity_contig , lines_on_block = self.compute_gravity_contigs()
Floreal Cabanettes's avatar
Floreal Cabanettes committed
337

338
339
340
341
342
343
344
345
346
347
348
            # For each contig, find best block, and deduce gravity of contig:
            gravity_on_contig = {}
            reorient_contigs = []
            for contig, chr_blocks in gravity_contig.items():
                # Find best block:
                max_number = 0
                max_chr = None
                for chrm, size in chr_blocks.items():
                    if size > max_number:
                        max_number = size
                        max_chr = chrm
Floreal Cabanettes's avatar
Floreal Cabanettes committed
349

350
351
352
353
354
355
356
357
                # Compute gravity of contig:
                nb_items = 0
                sum_items = 0
                lines_on_selected_block = lines_on_block[(contig, max_chr)]
                for med in lines_on_selected_block:
                    sum_items += med[0] * med[1]
                    nb_items += med[1]
                gravity_on_contig[contig] = sum_items / nb_items
Floreal Cabanettes's avatar
Floreal Cabanettes committed
358

359
360
361
362
                # Check if contig must be re-oriented:
                if len(lines_on_selected_block) > 0:
                    if not self.is_contig_well_oriented(lines_on_selected_block, contig, max_chr):
                        reorient_contigs.append(contig)
363

364
365
            # Sort contigs:
            self.q_order.sort(key=lambda x: gravity_on_contig[x] if x in gravity_on_contig else self.len_q + 1000)
Floreal Cabanettes's avatar
Floreal Cabanettes committed
366

367
368
            self.idx_q += ".sorted"

369
370
371
372
373
374
375
376
            with open(self.idx_q, "w") as idx_q_f:
                idx_q_f.write(self.name_q + "\n")
                for contig in self.q_order:
                    idx_q_f.write("\t".join([contig, str(self.q_contigs[contig])]) + "\n")

            # Re-orient contigs:
            if len(reorient_contigs) > 0:
                self.reorient_contigs_in_paf(reorient_contigs)
377
378
379
380
            else:
                sorted_file = self.paf + ".sorted"
                shutil.copyfile(self.paf, sorted_file)
                self.paf = sorted_file
381

382
383
384
            # Update index:
            self._update_query_index(reorient_contigs)

385
            self.set_sorted(True)
386
387
388
389
390
391
392
393

        else:  # Undo the sort
            if os.path.exists(self.paf):
                os.remove(self.paf)
                self.paf = self.paf.replace(".sorted", "")
            if os.path.exists(self.idx_q):
                os.remove(self.idx_q)
                self.idx_q = self.idx_q.replace(".sorted", "")
394
            self.set_sorted(False)
395
396

        # Re parse PAF file:
Floreal Cabanettes's avatar
Floreal Cabanettes committed
397
398
        self.parsed = False
        self.parse_paf()
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434

    def get_query_on_target_association(self):
        """
        For each query, get the best matching chromosome
        :return:
        """
        gravity_contig = self.compute_gravity_contigs()[0]
        query_on_target = {}
        for contig, chr_blocks in gravity_contig.items():
            # Find best block:
            max_number = 0
            max_chr = None
            for chrm, size in chr_blocks.items():
                if size > max_number:
                    max_number = size
                    max_chr = chrm
            if max_chr is not None:
                query_on_target[contig] = max_chr
            else:
                query_on_target[contig] = None
        return query_on_target

    def build_query_on_target_association_file(self):
        """
        For each query, get the best matching chromosome and save it to a CSV file.
        Use the order of queries
        :return: content of the file
        """
        query_on_target = self.get_query_on_target_association()
        content = "Query\tTarget\n"
        for contig in self.q_order:
            if contig in query_on_target:
                content += "%s\t%s\n" % (contig, query_on_target[contig] or "None")
            else:
                content += "%s\t%s\n" % (contig, "None")
        return content