RAPPEL : Opération de maintenance > ForgeMIA indisponible le 20 Janvier entre 7h et 12h

paf.py 31.8 KB
Newer Older
1
2
#!/usr/bin/env python3

3
4
from dgenies import MODE

5
import os
6
import shutil
Floreal Cabanettes's avatar
Floreal Cabanettes committed
7
from math import sqrt
8
from numpy import mean
9
from pathlib import Path
10
import json
Floreal Cabanettes's avatar
Floreal Cabanettes committed
11
from dgenies.bin.index import Index
12
from dgenies.lib.functions import Functions
13
from intervaltree import IntervalTree
14
15
16
17
18
19
import matplotlib as mpl
mpl.use('Agg')
from matplotlib import pyplot as plt
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
20
21


22
class Paf:
    # Identity thresholds splitting matches into 4 display classes:
    # "0": idy < 0.25, "1": < 0.5, "2": < 0.75, "3": >= 0.75
    limit_idy = [0.25, 0.5, 0.75]
    # Maximum number of PAF lines parsed before the dot plot is sampled
    max_nb_lines = 100000

    def __init__(self, paf: str, idx_q: str, idx_t: str, auto_parse: bool=True, mailer=None, id_job=None):
27
28
29
        self.paf = paf
        self.idx_q = idx_q
        self.idx_t = idx_t
30
        self.sorted = False
31
32
        self.data_dir = os.path.dirname(paf)
        if os.path.exists(os.path.join(self.data_dir, ".sorted")):
33
34
35
            self.paf += ".sorted"
            self.idx_q += ".sorted"
            self.sorted = True
36
        self.sampled= False
37
38
39
40
        self.len_q = None
        self.len_t = None
        self.min_idy = None
        self.max_idy = None
Floreal Cabanettes's avatar
Floreal Cabanettes committed
41
42
43
44
45
        self.lines = {}
        self.q_contigs = {}
        self.q_order = []
        self.t_contigs = {}
        self.t_order = []
46
        self.q_reversed = {}
47
48
49
50
        self.name_q = None
        self.name_t = None
        self.parsed = False
        self.error = False
51
52
        self.mailer = mailer
        self.id_job = id_job
53
54
        self.q_abs_start = {}
        self.t_abs_start = {}
55

56
57
        if auto_parse:
            self.parse_paf()
58

59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
    @staticmethod
    def __flush_blocks(index_c, new_index_c, new_index_o, current_block):
        if len(current_block) >= 5:
            block_length = 0
            for contig in current_block:
                block_length += index_c[contig]
            b_name = "###MIX###_" + "###".join(current_block)
            new_index_c[b_name] = block_length
            new_index_o.append(b_name)
        elif len(current_block) > 0:
            for b_name in current_block:
                new_index_c[b_name] = index_c[b_name]
                new_index_o.append(b_name)
        return new_index_c, new_index_o

    def parse_index(self, index_o: list, index_c: dict, full_len: int):
        """
        Parse index and merge too small contigs
        :param index_o: index order
        :param index_c: index contigs def
        :param full_len: length of the sequence
        :return: new index orders and contigs def
        """
        new_index_o = []
        new_index_c = {}
        current_block = []
        for index in index_o:
            if index_c[index] >= 0.002 * full_len:
                new_index_c, new_index_o = self.__flush_blocks(index_c, new_index_c, new_index_o, current_block)
                current_block = []
                new_index_c[index] = index_c[index]
                new_index_o.append(index)
            else:
                current_block.append(index)
        new_index_c, new_index_o = self.__flush_blocks(index_c, new_index_c, new_index_o, current_block)
        return new_index_c, new_index_o

96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
    @staticmethod
    def remove_noise(lines, noise_limit):
        keep_lines = {
            "0": [],
            "1": [],
            "2": [],
            "3": []
        }
        for cls, c_lines in lines.items():
            for line in c_lines:
                x1 = line[0]
                x2 = line[1]
                y1 = line[2]
                y2 = line[3]
                idy = line[4]
                len_m = sqrt(pow(x2 - x1, 2) + pow(y2 - y1, 2))
                if len_m >= noise_limit:
                    keep_lines[cls].append(line)
        return keep_lines

    def parse_paf(self, merge_index=True, noise=True):
        """
        Parse the PAF file and load its matches into memory.

        Fills self.lines (matches grouped into 4 identity classes), the contig
        definitions/orders for both axes, min/max identity and axis lengths.
        Stops after max_nb_lines matches (self.sampled is then set to True).

        :param merge_index: merge too-small contigs in both indexes
        :param noise: when True, filter out noise (very short matches) on large files
        :return: False on error (self.error is set), None otherwise
        """
        min_idy = 10000000000
        max_idy = -10000000000
        lines = {
            "0": [],  # idy < 0.25
            "1": [],  # idy < 0.5
            "2": [],  # idy < 0.75
            "3": []  # idy > 0.75
        }
        try:
            name_q, q_order, q_contigs, q_reversed, q_abs_start, len_q = Index.load(self.idx_q)
            self.q_abs_start = q_abs_start
            if merge_index:
                q_contigs, q_order = self.parse_index(q_order, q_contigs, len_q)
        except IOError:
            self.error = "Index file does not exist for query!"
            return False

        try:
            name_t, t_order, t_contigs, t_reversed, t_abs_start, len_t = Index.load(self.idx_t)
            self.t_abs_start = t_abs_start
            if merge_index:
                t_contigs, t_order = self.parse_index(t_order, t_contigs, len_t)
        except IOError:
            self.error = "Index file does not exist for target!"
            return False

        lines_lens = []

        try:
            with open(self.paf, "r") as paf_file:
                nb_lines = 0
                for line in paf_file:
                    nb_lines += 1
                    if nb_lines > self.max_nb_lines:
                        # Too many matches: keep only the first max_nb_lines
                        self.sampled = True
                        break
                    parts = line.strip("\n").split("\t")
                    v1 = parts[0]  # query name
                    v6 = parts[5]  # target name
                    strand = 1 if parts[4] == "+" else -1
                    idy = int(parts[9]) / int(parts[10])  # matches / alignment block length
                    min_idy = min(min_idy, idy)
                    max_idy = max(max_idy, idy)
                    # x1, x2, y1, y2, idy — absolute coordinates on the concatenated axes
                    y1 = int(parts[2]) + q_abs_start[v1]
                    y2 = int(parts[3]) + q_abs_start[v1]
                    # On "-" strand, swap target start/end so the line is drawn reversed
                    x1 = int(parts[7 if strand == 1 else 8]) + t_abs_start[v6]
                    x2 = int(parts[8 if strand == 1 else 7]) + t_abs_start[v6]
                    len_m = sqrt(pow(x2 - x1, 2) + pow(y2 - y1, 2))
                    lines_lens.append(len_m)
                    if idy < self.limit_idy[0]:
                        class_idy = "0"
                    elif idy < self.limit_idy[1]:
                        class_idy = "1"
                    elif idy < self.limit_idy[2]:
                        class_idy = "2"
                    else:
                        class_idy = "3"
                    lines[class_idy].append([x1, x2, y1, y2, idy, v1, v6])
        except IOError:
            self.error = "PAF file does not exist!"
            return False

        # BUGFIX: the condition was "if not noise ...", which inverted the meaning
        # of the noise parameter (callers pass noise=False precisely to skip
        # filtering, e.g. build_summary_stats). Filter only when noise is True.
        if noise and nb_lines > 1000:
            counts, bins, bars = plt.hist(lines_lens, bins=nb_lines//10)
            plt.close()  # free the figure created by hist (only counts/bins are needed)
            counts = list(counts)
            max_value = max(counts)
            max_index = counts.index(max_value)
            limit_index = -1
            # Find the first near-empty histogram bin after the modal length:
            # everything shorter than that bin edge is considered noise
            for i in range(max_index, len(counts)):
                if counts[i] < max_value / 100:
                    limit_index = i
                    break
            if limit_index > -1:
                lines = self.remove_noise(lines, bins[limit_index])

        self.parsed = True
        self.len_q = len_q
        self.len_t = len_t
        self.min_idy = min_idy
        self.max_idy = max_idy
        self.lines = lines
        self.q_contigs = q_contigs
        self.q_order = q_order
        self.q_reversed = q_reversed
        self.t_contigs = t_contigs
        self.t_order = t_order
        self.name_q = name_q
        self.name_t = name_t

    def get_d3js_data(self):
        return {
209
210
            'y_len': self.len_q,
            'x_len': self.len_t,
211
212
213
            'min_idy': self.min_idy,
            'max_idy': self.max_idy,
            'lines': self.lines,
214
215
216
217
218
219
            'y_contigs': self.q_contigs,
            'y_order': self.q_order,
            'x_contigs': self.t_contigs,
            'x_order': self.t_order,
            'name_y': self.name_q,
            'name_x': self.name_t,
220
221
            'limit_idy': self.limit_idy,
            'sorted': self.sorted,
222
223
            'sampled': self.sampled,
            "max_nb_lines": self.max_nb_lines,
224
225
226
227
        }

    def save_json(self, out):
        import json
Floreal Cabanettes's avatar
Floreal Cabanettes committed
228
229
230
        data = self.get_d3js_data()
        with open(out, "w") as out_f:
            out_f.write(json.dumps(data))
Floreal Cabanettes's avatar
Floreal Cabanettes committed
231

232
233
234
235
236
237
238
239
240
241
242
243
244
245
    def is_contig_well_oriented(self, lines: list, contig, chrom):
        """
        Return True if the contig is well oriented. A well-oriented contig
        must have y increasing when x increases. We check that only for the
        longest matches (small matches are ignored).

        Each item of lines is a tuple as built by compute_gravity_contigs:
        (med_q, squared_length, med_t, x1, x2, y1, y2, length).

        :param lines: match lines inside the contig
        :param contig: name of the query contig being checked
        :param chrom: name of the target chromosome the contig matches
        :return: True if well oriented, False else
        """
        # Sort by decreasing match length (last tuple item)
        lines.sort(key=lambda x: -x[-1])
        max_len = lines[0][-1]
        i = 0

        # Select sample of tested lines: keep matches that are at least 10% of
        # the longest one and at least 5% of the smaller of the two sequences
        while i < len(lines) and lines[i][-1] > max_len * 0.1 \
                and lines[i][-1] >= 0.05 * min(self.q_contigs[contig], self.t_contigs[chrom]):
            i += 1
        selected_lines = lines[:i]

        # Check orientation of each line:
        if len(selected_lines) > 1:
            # Walk matches left-to-right on the target (x midpoint, item 0) and
            # record whether the query midpoint (item 2) rises (+1) or falls (-1)
            selected_lines.sort(key=lambda x: x[0])
            orients = []
            for i in range(1, len(selected_lines)):
                if selected_lines[i][2] > selected_lines[i-1][2]:
                    orients.append(1)
                else:
                    orients.append(-1)
            if mean(orients) > -0.1:  # We have a good orientation (-0.1 to ignore ambiguous cases)
                return True
        elif len(selected_lines) == 1:
            orient_first = "+" if selected_lines[0][3] < selected_lines[0][4] else "-"  # Get X orientation
            # Items -2 / -3 are y2 / y1: compare them in the order implied by X
            if selected_lines[0][-2 if orient_first == "+" else -3] > \
                    selected_lines[0][-3 if orient_first == "+" else -2]:  # Check Y according to X orientation
                return True
        elif len(selected_lines) == 0:  # No line was selected: we ignore this block
            return True

        # In all other cases the orientation is wrong:
        return False

    def reorient_contigs_in_paf(self, contigs):
        """
        Reorient contigs in the PAF file
        :param contigs: contigs to be reoriented
        """
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
        sorted_file = self.paf + ".sorted"
        with open(self.paf, "r") as source, open(sorted_file, "w") as target:
            for line in source:
                parts = line.strip("\n").split("\t")
                if parts[0] in contigs:
                    len_q = int(parts[1])
                    x1_q = int(parts[2])
                    x2_q = int(parts[3])
                    if parts[4] == "-":
                        parts[4] = "+"
                    else:
                        parts[4] = "-"
                    parts[2] = str(len_q - x2_q)
                    parts[3] = str(len_q - x1_q)
                target.write("\t".join(parts) + "\n")
292
293
294
295
296
        if self.paf.endswith(".sorted"):
            os.remove(self.paf)
            shutil.move(sorted_file, self.paf)
        else:
            self.paf = sorted_file
297
298
299
300
301
302
303
304
        return True

    def _update_query_index(self, contigs_reoriented):
        with open(self.idx_q, "w") as idx:
            idx.write(self.name_q + "\n")
            for contig in self.q_order:
                idx.write("\t".join([contig, str(self.q_contigs[contig]), "1" if contig in contigs_reoriented else "0"])
                          + "\n")
305

306
307
308
309
310
311
312
313
314
    def set_sorted(self, is_sorted):
        self.sorted = is_sorted
        sorted_touch = os.path.join(os.path.dirname(self.paf), ".sorted")
        if is_sorted:
            Path(sorted_touch).touch()
        else:
            if os.path.exists(sorted_touch):
                os.remove(sorted_touch)

315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
    def compute_gravity_contigs(self):
        """
        Compute gravity for each contig on each chromosome (how many big matches they have).
        Will be used to find which chromosome has the highest value for each contig
        :return:
            - gravity for each contig and each chromosome:
                {contig1: {chr1: value, chr2: value, ...}, contig2: ...}
            - For each block save lines inside:
                [median_on_query, squared length, median_on_target, x1, x2, y1, y2, length] (x : on target, y: on query)
        """
        gravity_contig = {}
        lines_on_block = {}
        # Compute size of blocks (in term of how many big match they have), and save median of each match on each one
        # (for next step)
        for line in [j for i in list(self.lines.values()) for j in i]:
            x1 = int(line[0])
            x2 = int(line[1])
            y1 = int(line[2])
            y2 = int(line[3])
            contig = line[5]
            chrm = line[6]
            block = (contig, chrm)
            # X1 and X2 (in good orientation):
            x_1 = min(x1, x2)
            x_2 = max(x2, x1)
            med_q = x_1 + (abs(x_2 - x_1) / 2)
            # Y1 and Y2 (in good orientation):
            y_1 = min(y1, y2)
            y_2 = max(y2, y1)
            med_t = y_1 + (abs(y_2 - y_1) / 2)
            len_m = sqrt(pow(x2 - x1, 2) + pow(y2 - y1, 2))  # Len
            len_m_2 = pow(1 + len_m, 2) # Pow of len
            if block not in lines_on_block:
                lines_on_block[block] = []
            lines_on_block[block].append((med_q, len_m_2, med_t, x1, x2, y1, y2, len_m))

            if contig not in gravity_contig:
                gravity_contig[contig] = {}
            if chrm not in gravity_contig[contig]:
                gravity_contig[contig][chrm] = 0
            gravity_contig[contig][chrm] += len_m_2
        return gravity_contig, lines_on_block

Floreal Cabanettes's avatar
Floreal Cabanettes committed
358
    def sort(self):
        """
        Sort contigs according to reference target and reorient them if needed.

        Toggle behaviour: when the job is not yet sorted, compute the sorted
        order (and reorientations) and switch to the ".sorted" files; when it is
        already sorted, switch back to the unsorted files. The PAF is re-parsed
        at the end in both cases.
        """
        self.parse_paf(False)
        sorted_file = self.paf + ".sorted"
        if not self.sorted:  # Do the sort
            # Only compute when sorted files do not already exist from a previous run
            if not self.paf.endswith(".sorted") and not self.idx_q.endswith(".sorted") and \
                    (not os.path.exists(self.paf + ".sorted") or not os.path.exists(self.idx_q + ".sorted")):
                gravity_contig , lines_on_block = self.compute_gravity_contigs()

                # For each contig, find best block, and deduce gravity of contig:
                gravity_on_contig = {}
                reorient_contigs = []
                for contig, chr_blocks in gravity_contig.items():
                    # Find best block:
                    max_number = 0
                    max_chr = None
                    for chrm, size in chr_blocks.items():
                        if size > max_number:
                            max_number = size
                            max_chr = chrm

                    # Compute gravity of contig: weighted average of the match
                    # midpoints on the best chromosome (weight = squared length)
                    nb_items = 0
                    sum_items = 0
                    lines_on_selected_block = lines_on_block[(contig, max_chr)]
                    for med in lines_on_selected_block:
                        sum_items += med[0] * med[1]
                        nb_items += med[1]
                    gravity_on_contig[contig] = sum_items / nb_items

                    # Check if contig must be re-oriented:
                    if len(lines_on_selected_block) > 0:
                        if not self.is_contig_well_oriented(lines_on_selected_block, contig, max_chr):
                            reorient_contigs.append(contig)

                # Sort contigs by gravity; contigs with no match go last
                self.q_order.sort(key=lambda x: gravity_on_contig[x] if x in gravity_on_contig else self.len_q + 1000)

                self.idx_q += ".sorted"

                # Write the sorted query index (name header, then one contig per line)
                with open(self.idx_q, "w") as idx_q_f:
                    idx_q_f.write(self.name_q + "\n")
                    for contig in self.q_order:
                        idx_q_f.write("\t".join([contig, str(self.q_contigs[contig])]) + "\n")

                # Re-orient contigs:
                if len(reorient_contigs) > 0:
                    self.reorient_contigs_in_paf(reorient_contigs)
                else:
                    # Nothing to reorient: the sorted PAF is a plain copy
                    shutil.copyfile(self.paf, sorted_file)

                # Update index:
                self._update_query_index(reorient_contigs)

            else:
                self.idx_q += ".sorted"
            self.set_sorted(True)
            self.paf = sorted_file

        else:  # Undo the sort
            self.paf = self.paf.replace(".sorted", "")
            self.idx_q = self.idx_q.replace(".sorted", "")
            self.set_sorted(False)

        # Re parse PAF file:
        self.parsed = False
        self.parse_paf()
427

428
429
430
431
    def reverse_contig(self, contig_name):
        """
        Reverse the orientation of a single contig and persist the change.

        The PAF is rewritten with the contig mirrored, the query index is
        updated with the reorientation flag, and the file is re-parsed.

        :param contig_name: name of the contig to reverse
        """
        self.parse_paf(False)
        to_reorient = [contig_name]
        self.reorient_contigs_in_paf(to_reorient)
        if not self.idx_q.endswith(".sorted"):
            self.idx_q += ".sorted"
        self._update_query_index(to_reorient)
        self.set_sorted(True)
        self.parsed = False
        self.parse_paf()

439
    def get_query_on_target_association(self, with_coords=True):
440
441
442
443
        """
        For each query, get the best matching chromosome
        :return:
        """
444
        gravity_contig, lines_on_block = self.compute_gravity_contigs()
445
446
447
448
449
        query_on_target = {}
        for contig, chr_blocks in gravity_contig.items():
            # Find best block:
            max_number = 0
            max_chr = None
450
451
452
453
            min_target = -1
            max_target = -1
            min_query = -1
            max_query = -1
454
455
456
457
            for chrm, size in chr_blocks.items():
                if size > max_number:
                    max_number = size
                    max_chr = chrm
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
                    min_target = -1
                    max_target = -1
                    min_query = -1
                    max_query = -1
                    if with_coords:
                        for line in lines_on_block[(contig, chrm)]:
                            x1 = min(line[3], line[4]) - self.t_abs_start[chrm]
                            x2 = max(line[3], line[4]) - self.t_abs_start[chrm]
                            if x1 < min_target or min_target == -1:
                                min_target = x1
                            if x2 > max_target:
                                max_target = x2
                            y1 = min(line[5], line[6]) - self.q_abs_start[contig]
                            y2 = max(line[5], line[6]) - self.q_abs_start[contig]
                            if y1 < min_query or min_query == -1:
                                min_query = y1
                            if y2 > max_query:
                                max_query = y2
476
            if max_chr is not None:
477
                query_on_target[contig] = (max_chr, min_query, max_query, min_target, max_target)
478
479
480
481
            else:
                query_on_target[contig] = None
        return query_on_target

482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
    def get_queries_on_target_association(self):
        """
        For each target, get the list of queries associated to it
        :return:
        """
        gravity_contig = self.compute_gravity_contigs()[0]
        queries_on_target = {}
        for contig, chr_blocks in gravity_contig.items():
            # Find best block:
            max_number = 0
            max_chr = None
            for chrm, size in chr_blocks.items():
                if size > max_number:
                    max_number = size
                    max_chr = chrm
            if max_chr is not None:
                if max_chr not in queries_on_target:
                    queries_on_target[max_chr] = []
                queries_on_target[max_chr].append(contig)
        return queries_on_target

503
504
505
506
507
508
    def build_query_on_target_association_file(self):
        """
        For each query, get the best matching chromosome and save it to a CSV file.
        Use the order of queries
        :return: content of the file
        """
509
510
        query_on_target = self.get_query_on_target_association(with_coords=True)
        content = "Query\tTarget\tStrand\tQ-len\tQ-start\tQ-stop\tT-len\tT-start\tT-stop\n"
511
        for contig in self.q_order:
512
513
514
            strand = "+"
            if contig in self.q_reversed:
                strand = "-" if self.q_reversed[contig] else "+"
515
            if contig in query_on_target:
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
                min_target = str(query_on_target[contig][3])
                if min_target == "-1":
                    min_target = "na"
                max_target = str(query_on_target[contig][4])
                if max_target == "-1":
                    max_target = "na"
                min_query = str(query_on_target[contig][1])
                if min_query == "-1":
                    min_query = "na"
                max_query = str(query_on_target[contig][2])
                if max_query == "-1":
                    max_query = "na"
                chrm = query_on_target[contig][0]
                content += "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (contig, chrm or "None", strand,
                                                                     str(self.q_contigs[contig]), min_query, max_query,
                                                                     str(self.t_contigs[chrm]), min_target, max_target)
532
            else:
533
534
                content += "%s\t%s\t%s\t%s\tna\tna\tna\tna\tna\n" % (contig, "None", strand,
                                                                     str(self.q_contigs[contig]))
535
        return content
536

537
    def build_list_no_assoc(self, to):
        """
        Build list of queries that match with no target, or the opposite.

        :param to: "query" or "target"
        :return: content of the file (one contig name per line)
        """
        index = self.idx_q if to == "query" else self.idx_t
        # _reversed: renamed from "reversed", which shadowed the builtin
        name, contigs_list, contigs, _reversed, abs_start, c_len = Index.load(index)
        contigs_list = set(contigs_list)
        # Column 0 holds query names, column 5 target names (PAF format);
        # hoisted out of the loop
        col = 0 if to == "query" else 5
        with open(self.paf, "r") as paf:
            for line in paf:
                # Any contig seen in the PAF has at least one association
                contigs_list.discard(line.strip("\n").split("\t")[col])
        return "\n".join(contigs_list) + "\n"
Floreal Cabanettes's avatar
Floreal Cabanettes committed
552

553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
    def _add_percents(self, percents, item):
        i_count = item.length()
        percents[str(item.data)] += i_count
        percents["-1"] -= i_count
        return percents

    def _remove_overlaps(self, position_idy: IntervalTree, percents: dict):
        """
        Resolve overlapping intervals so each covered position of the target
        counts once, attributed to the best (highest) identity category, then
        accumulate the resulting lengths into percents.

        Pops intervals one by one; on overlap the higher category wins and the
        loser is split/trimmed back into the tree. When the popped item survives
        untouched, its length is added via _add_percents. In the diagrams below,
        "c" is the current item and "o" the overlapping one.

        NOTE(review): uses IntervalTree.search(), removed in intervaltree >= 3.0
        (replaced by overlap()) — confirm the pinned intervaltree version.

        :param position_idy: tree of intervals whose data is the identity category
        :param percents: dict of category -> covered length ("-1" = unmatched)
        :return: the updated percents dict
        """
        while len(position_idy) > 0:
            item = position_idy.pop()
            start = item.begin
            end = item.end
            cat = item.data
            overlaps = position_idy.search(start,end)
            if len(overlaps) > 0:
                has_overlap = False
                for overlap in overlaps:
                    if has_overlap:
                        break
                    o_start = overlap.begin
                    o_end = overlap.end
                    o_cat = overlap.data
                    # The tree may have changed since search(): skip stale entries
                    if not position_idy.containsi(o_start, o_end, o_cat):
                        continue
                    if start < o_start:
                        if end <= o_end:
                            # cccccccccccccc*******
                            # *****ooooooooo[ooooooo]
                            if o_cat < cat:
                                if end < o_end:
                                    # No overlap with the current item, we keep has_overlap as False
                                    position_idy.discard(overlap)
                                    position_idy[end:o_end] = o_cat
                                else:
                                    position_idy.discard(overlap)  # No kept overlap
                            elif o_cat == cat:
                                if end < o_end:
                                    # Same category: merge the two spans
                                    has_overlap = True
                                    position_idy.discard(overlap)
                                    position_idy[start:o_end] = cat
                                else:
                                    position_idy.discard(overlap)  # No kept overlap
                            else:
                                # Higher category overlap: keep it, trim current item
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[start:o_start] = cat
                                position_idy[o_start:o_end] = o_cat
                        else:  # end > o_end
                            # ccccccccccccccccccc
                            # *****oooooooooo****
                            if o_cat <= cat:
                                position_idy.discard(overlap)  # No kept overlap
                            else:  # o_cat > cat
                                # Current item is split around the better overlap
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[start:o_start] = cat
                                position_idy[o_start:o_end] = o_cat
                                position_idy[o_end:end] = cat
                    elif start == o_start:
                        if end < o_end:
                            # cccccccccccc*******
                            # ooooooooooooooooooo
                            if o_cat < cat:
                                # No overlap with the current item, we keep has_overlap as False
                                position_idy.discard(overlap)
                                position_idy[end:o_end] = o_cat
                            elif o_cat == cat:
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[start:o_end] = cat
                            else:  # o_cat > cat
                                # The overlap just contains current item
                                has_overlap = True
                        elif end == o_end:
                            # ***cccccccccccccccc***
                            # ***oooooooooooooooo***
                            if o_cat <= cat:
                                position_idy.discard(overlap)  # No kept overlap
                            else:
                                # The overlap just contains current item
                                has_overlap = True
                        else:  # end > o_end
                            # ccccccccccccccccccccccccccccc
                            # oooooooooooooooooooo*********
                            if o_cat <= cat:
                                # current item just contains the overlap
                                position_idy.discard(overlap)
                            else:
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[o_start:o_end] = o_cat
                                position_idy[o_end:end] = cat
                    else:  # start > o_start
                        if end <= o_end:
                            # ******ccccccccc*******
                            # ooooooooooooooo[ooooooo]
                            if o_cat < cat:
                                has_overlap=True
                                position_idy.discard(overlap)
                                position_idy[o_start:start] = o_cat
                                position_idy[start:end] = cat
                                if end < o_end:
                                    position_idy[end:o_end] = o_cat
                            else:  # o_cat >= cat
                                # Overlap just contains the item
                                has_overlap = True
                        else: # end > o_end
                            # ******ccccccccccccccccccccc
                            # ooooooooooooooooo**********
                            if o_cat < cat:
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[o_start:start] = o_cat
                                position_idy[start:end] = cat
                            elif o_cat == cat:
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[o_start:end] = cat
                            else:  # o_cat > cat
                                has_overlap = True
                                position_idy[o_end:end] = cat
                if not has_overlap:
                    # Item survived untouched: account its length
                    percents = self._add_percents(percents, item)

            else:
                # No overlap at all: account the item directly
                percents = self._add_percents(percents, item)

        return percents
680

681
    def build_summary_stats(self, status_file):
        """
        Get summary of identity: the fraction of the target covered by each
        identity class ("-1" is the uncovered part).

        Writes the result as JSON next to the PAF file and removes the status
        file on success; on failure the status file is renamed with ".fail".

        :param status_file: path of the status file for this computation
        :return: table with percents by category, or None when parsing failed
        """
        summary_path = self.paf + ".summary"
        self.parse_paf(False, False)
        if not self.parsed:
            shutil.move(status_file, status_file + ".fail")
            return None

        # "-1" starts as the whole target and shrinks as matches are accounted
        percents = {"-1": self.len_t}
        position_idy = IntervalTree()
        for cat in sorted(self.lines.keys()):
            percents[cat] = 0
            for line in self.lines[cat]:
                start = min(line[0], line[1])
                end = max(line[0], line[1]) + 1
                position_idy[start:end] = int(cat)

        percents = self._remove_overlaps(position_idy, percents)

        # Convert covered lengths into percents of the target length
        for cat in percents:
            percents[cat] = percents[cat] / self.len_t * 100

        with open(summary_path, "w") as summary_f:
            summary_f.write(json.dumps(percents))

        os.remove(status_file)
        return percents

    def get_summary_stats(self):
        summary_file = self.paf + ".summary"
        if os.path.exists(summary_file):
            with open(summary_file, "r") as summary_file:
                txt = summary_file.read()
                return json.loads(txt)
720
        return None
721
722
723
724
725
726
727
728
729
730

    def build_query_chr_as_reference(self):
        """
        Build a fasta of the query in which contigs are merged into
        pseudo-chromosomes: for each target chromosome, the query contigs
        mapped to it are concatenated (in sort order, reverse-complemented
        when matched on the reverse strand) separated by runs of 100 "N".
        Query contigs with no match are appended unchanged with an
        "_unaligned" suffix on their id.

        Requires the dot plot to be sorted first.

        :return: path of the produced fasta file, or the sentinel "_._"
            on failure
        """
        try:
            if not self.sorted:
                raise Exception("Contigs must be sorted to do that!")
            # The .query file stores the path of the original query fasta
            with open(os.path.join(self.data_dir, ".query")) as query_file:
                query_fasta = query_file.read().strip("\n")
            if not os.path.isfile(query_fasta):
                raise Exception("Query fasta does not exists")
            o_fasta = os.path.join(os.path.dirname(query_fasta), "as_reference_" + os.path.basename(query_fasta))
            # Output is always written uncompressed
            if o_fasta.endswith(".gz"):
                o_fasta = o_fasta[:-3]
            if not os.path.exists(o_fasta):
                uncompressed = False
                if query_fasta.endswith(".gz"):
                    uncompressed = True
                    query_fasta = Functions.uncompress(query_fasta)
                # Random access to query sequences without loading them all
                query_f = SeqIO.index(query_fasta, "fasta")
                contigs_assoc = self.get_queries_on_target_association()
                mapped_queries = set()
                with open(o_fasta, "w") as out:
                    for target in self.t_order:
                        if target in contigs_assoc:
                            # Keep the on-screen (sorted) order of query contigs
                            queries = sorted(contigs_assoc[target], key=lambda x: self.q_order.index(x))
                            seq = SeqRecord(Seq(""))
                            for query in queries:
                                mapped_queries.add(query)
                                new_seq = query_f[query]
                                if self.q_reversed[query]:
                                    new_seq = new_seq.reverse_complement()
                                seq += new_seq
                                seq += 100 * "N"  # spacer between merged contigs
                            seq = seq[:-100]  # drop the trailing spacer
                            seq.id = seq.name = seq.description = target
                            SeqIO.write(seq, out, "fasta")
                    # Append query contigs that matched nothing
                    for contig in self.q_order:
                        if contig not in mapped_queries:
                            seq = query_f[contig]
                            seq.id += "_unaligned"
                            SeqIO.write(seq, out, "fasta")
                query_f.close()
                if uncompressed:
                    # Remove the temporary uncompressed copy
                    os.remove(query_fasta)
            status = "success"
        except Exception:
            # Log the failure instead of swallowing it silently; the
            # sentinel path still signals the error to the caller.
            import traceback
            traceback.print_exc()
            o_fasta = "_._"
            status = "fail"

        if MODE == "webserver":
            # NOTE(review): assumes o_fasta always contains a "." so that
            # rsplit yields two parts — holds for the "_._" sentinel and
            # for fasta files with an extension; verify for exotic names.
            parts = os.path.basename(o_fasta).rsplit(".", 1)
            Functions.send_fasta_ready(mailer=self.mailer,
                                       job_name=self.id_job,
                                       sample_name=parts[0],
                                       ext=parts[1],
                                       compressed=False,
                                       path="download",
                                       status=status)
        return o_fasta