#!/usr/bin/env python3

import os
import shutil
from math import sqrt
from numpy import mean
from pathlib import Path
import json

from dgenies.bin.index import Index
from dgenies.lib.functions import Functions
from intervaltree import IntervalTree

import matplotlib as mpl
mpl.use('Agg')
from matplotlib import pyplot as plt
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


class Paf:
    limit_idy = [0.25, 0.5, 0.75]
    max_nb_lines = 100000

    def __init__(self, paf: str, idx_q: str, idx_t: str, auto_parse: bool = True, mailer=None, id_job=None):
        """
        :param paf: path of the PAF file
        :param idx_q: path of the query index file
        :param idx_t: path of the target index file
        :param auto_parse: if True, parse the PAF file immediately
        :param mailer: mailer object, used to notify the user by email
        :param id_job: job identifier
        """
        self.paf = paf
        self.idx_q = idx_q
        self.idx_t = idx_t
        self.sorted = False
        self.data_dir = os.path.dirname(paf)
        if os.path.exists(os.path.join(self.data_dir, ".sorted")):
            self.paf += ".sorted"
            self.idx_q += ".sorted"
            self.sorted = True
        self.sampled = False
        self.len_q = None
        self.len_t = None
        self.min_idy = None
        self.max_idy = None
        self.lines = {}
        self.q_contigs = {}
        self.q_order = []
        self.t_contigs = {}
        self.t_order = []
        self.q_reversed = {}
        self.name_q = None
        self.name_t = None
        self.parsed = False
        self.error = False
        self.mailer = mailer
        self.id_job = id_job

        if auto_parse:
            self.parse_paf()

    @staticmethod
    def __flush_blocks(index_c, new_index_c, new_index_o, current_block):
        """
        Flush the pending block of small contigs: merge them into a single
        pseudo-contig if the block holds at least 5 contigs, else keep them as-is
        :param index_c: original contigs def
        :param new_index_c: new contigs def (updated in place)
        :param new_index_o: new index order (updated in place)
        :param current_block: names of the contigs in the pending block
        :return: new contigs def and index order
        """
        if len(current_block) >= 5:
            block_length = 0
            for contig in current_block:
                block_length += index_c[contig]
            b_name = "###MIX###_" + "###".join(current_block)
            new_index_c[b_name] = block_length
            new_index_o.append(b_name)
        elif len(current_block) > 0:
            for b_name in current_block:
                new_index_c[b_name] = index_c[b_name]
                new_index_o.append(b_name)
        return new_index_c, new_index_o

    def parse_index(self, index_o: list, index_c: dict, full_len: int):
        """
        Parse index and merge too small contigs
        :param index_o: index order
        :param index_c: index contigs def
        :param full_len: length of the sequence
        :return: new contigs def and new index order
        """
        new_index_o = []
        new_index_c = {}
        current_block = []
        for index in index_o:
            if index_c[index] >= 0.002 * full_len:
                new_index_c, new_index_o = self.__flush_blocks(index_c, new_index_c, new_index_o, current_block)
                current_block = []
                new_index_c[index] = index_c[index]
                new_index_o.append(index)
            else:
                current_block.append(index)
        new_index_c, new_index_o = self.__flush_blocks(index_c, new_index_c, new_index_o, current_block)
        return new_index_c, new_index_o
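
    # A worked sketch of the merging above (hypothetical sizes): with
    # full_len = 1_000_000 the threshold is 0.002 * full_len = 2000 bp, so in
    #   p = Paf("map.paf", "query.idx", "target.idx", auto_parse=False)
    #   contigs = {"c1": 5000, "s1": 3, "s2": 2, "s3": 4, "s4": 1, "s5": 2, "c2": 3000}
    #   new_c, new_o = p.parse_index(["c1", "s1", "s2", "s3", "s4", "s5", "c2"], contigs, 1_000_000)
    # the run of five small contigs is collapsed and new_o becomes
    # ["c1", "###MIX###_s1###s2###s3###s4###s5", "c2"].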

    @staticmethod
    def remove_noise(lines, noise_limit):
        """
        Remove matches whose length is below the noise limit
        :param lines: matches, classified by identity class
        :param noise_limit: minimal match length to keep
        :return: filtered matches, same structure as input
        """
        keep_lines = {
            "0": [],
            "1": [],
            "2": [],
            "3": []
        }
        for cls, c_lines in lines.items():
            for line in c_lines:
                x1 = line[0]
                x2 = line[1]
                y1 = line[2]
                y2 = line[3]
                len_m = sqrt(pow(x2 - x1, 2) + pow(y2 - y1, 2))
                if len_m >= noise_limit:
                    keep_lines[cls].append(line)
        return keep_lines
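
    # For instance (hypothetical values): a match spanning (x1, y1) = (0, 0) to
    # (x2, y2) = (300, 400) has length sqrt(300**2 + 400**2) = 500 and is
    # dropped whenever noise_limit > 500.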

    def parse_paf(self, merge_index=True, noise=True):
        """
        Parse the PAF file and load the matches, classified by identity
        :param merge_index: if True, merge too small contigs in the indexes
        :param noise: if False, filter out very small matches (noise)
        :return: False on error
        """
        min_idy = 10000000000
        max_idy = -10000000000
        lines = {
            "0": [],  # idy < 0.25
            "1": [],  # idy < 0.5
            "2": [],  # idy < 0.75
            "3": []   # idy >= 0.75
        }
        try:
            name_q, q_order, q_contigs, q_reversed, q_abs_start, len_q = Index.load(self.idx_q)
            if merge_index:
                q_contigs, q_order = self.parse_index(q_order, q_contigs, len_q)
        except IOError:
            self.error = "Index file does not exist for query!"
            return False

        try:
            name_t, t_order, t_contigs, t_reversed, t_abs_start, len_t = Index.load(self.idx_t)
            if merge_index:
                t_contigs, t_order = self.parse_index(t_order, t_contigs, len_t)
        except IOError:
            self.error = "Index file does not exist for target!"
            return False

        lines_lens = []

        try:
            with open(self.paf, "r") as paf_file:
                nb_lines = 0
                for line in paf_file:
                    nb_lines += 1
                    if nb_lines > self.max_nb_lines:
                        self.sampled = True
                        break
                    parts = line.strip("\n").split("\t")
                    v1 = parts[0]  # query name
                    v6 = parts[5]  # target name
                    strand = 1 if parts[4] == "+" else -1
                    idy = int(parts[9]) / int(parts[10])  # number of matches / alignment block length
                    min_idy = min(min_idy, idy)
                    max_idy = max(max_idy, idy)
                    # x1, x2, y1, y2, idy
                    y1 = int(parts[2]) + q_abs_start[v1]
                    y2 = int(parts[3]) + q_abs_start[v1]
                    x1 = int(parts[7 if strand == 1 else 8]) + t_abs_start[v6]
                    x2 = int(parts[8 if strand == 1 else 7]) + t_abs_start[v6]
                    len_m = sqrt(pow(x2 - x1, 2) + pow(y2 - y1, 2))
                    lines_lens.append(len_m)
                    if idy < self.limit_idy[0]:
                        class_idy = "0"
                    elif idy < self.limit_idy[1]:
                        class_idy = "1"
                    elif idy < self.limit_idy[2]:
                        class_idy = "2"
                    else:
                        class_idy = "3"
                    lines[class_idy].append([x1, x2, y1, y2, idy, v1, v6])
        except IOError:
            self.error = "PAF file does not exist!"
            return False

        if not noise and nb_lines > 1000:
            # Detect a noise threshold from the histogram of match lengths: drop
            # matches shorter than the first bin whose count falls below 1% of the peak
            counts, bins, bars = plt.hist(lines_lens, bins=nb_lines // 10)
            counts = list(counts)
            max_value = max(counts)
            max_index = counts.index(max_value)
            limit_index = -1
            for i in range(max_index, len(counts)):
                if counts[i] < max_value / 100:
                    limit_index = i
                    break
            if limit_index > -1:
                lines = self.remove_noise(lines, bins[limit_index])

        self.parsed = True
        self.len_q = len_q
        self.len_t = len_t
        self.min_idy = min_idy
        self.max_idy = max_idy
        self.lines = lines
        self.q_contigs = q_contigs
        self.q_order = q_order
        self.q_reversed = q_reversed
        self.t_contigs = t_contigs
        self.t_order = t_order
        self.name_q = name_q
        self.name_t = name_t
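
    # For reference, a PAF line is tab-separated with at least 12 columns: query
    # name, query length, query start, query end, strand, target name, target
    # length, target start, target end, number of matches, alignment block
    # length, mapping quality. A hypothetical line:
    #   ctg1  8000  100  900  +  chr1  50000  2000  2800  750  800  60
    # gives idy = 750 / 800 = 0.9375, which falls in class "3" (idy >= 0.75).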

    def get_d3js_data(self):
        """
        Build the data dict expected by the D3.js dot plot client
        :return: dot plot data
        """
        return {
            'y_len': self.len_q,
            'x_len': self.len_t,
            'min_idy': self.min_idy,
            'max_idy': self.max_idy,
            'lines': self.lines,
            'y_contigs': self.q_contigs,
            'y_order': self.q_order,
            'x_contigs': self.t_contigs,
            'x_order': self.t_order,
            'name_y': self.name_q,
            'name_x': self.name_t,
            'limit_idy': self.limit_idy,
            'sorted': self.sorted,
            'sampled': self.sampled,
            "max_nb_lines": self.max_nb_lines,
        }

    def save_json(self, out):
        """
        Save graphical data to a JSON file
        :param out: path of the output file
        """
        data = self.get_d3js_data()
        with open(out, "w") as out_f:
            out_f.write(json.dumps(data))

    def is_contig_well_oriented(self, lines: list, contig, chrom):
        """
        Returns True if the contig is well oriented. A well oriented contig
        must have y increasing when x increases. We check that only for the
        highest matches (small matches are ignored)
        :param lines: lines inside the contig
        :param contig: contig name
        :param chrom: chromosome name
        :return: True if well oriented, False else
        """
        lines.sort(key=lambda x: -x[-1])
        max_len = lines[0][-1]
        i = 0

        # Select sample of tested lines:
        while i < len(lines) and lines[i][-1] > max_len * 0.1 \
                and lines[i][-1] >= 0.05 * min(self.q_contigs[contig], self.t_contigs[chrom]):
            i += 1
        selected_lines = lines[:i]

        # Check orientation of each line:
        if len(selected_lines) > 1:
            selected_lines.sort(key=lambda x: x[0])
            orients = []
            for i in range(1, len(selected_lines)):
                if selected_lines[i][2] > selected_lines[i-1][2]:
                    orients.append(1)
                else:
                    orients.append(-1)
            if mean(orients) > -0.1:  # We have a good orientation (-0.1 to ignore ambiguous cases)
                return True
        elif len(selected_lines) == 1:
            orient_first = "+" if selected_lines[0][3] < selected_lines[0][4] else "-"  # Get X orientation
            if selected_lines[0][-2 if orient_first == "+" else -3] > \
                    selected_lines[0][-3 if orient_first == "+" else -2]:  # Check Y according to X orientation
                return True
        elif len(selected_lines) == 0:  # No line was selected: we ignore this block
            return True

        # In all other cases the orientation is wrong:
        return False
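
    # Example (hypothetical medians): three selected matches whose y medians go
    # 100 -> 500 -> 450 as x increases give orients = [1, -1]; the mean 0.0 is
    # above -0.1, so the contig is considered well oriented.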

    def reorient_contigs_in_paf(self, contigs):
        """
        Reorient contigs in the PAF file
        :param contigs: contigs to be reoriented
        """
        sorted_file = self.paf + ".sorted"
        with open(self.paf, "r") as source, open(sorted_file, "w") as target:
            for line in source:
                parts = line.strip("\n").split("\t")
                if parts[0] in contigs:
                    len_q = int(parts[1])
                    x1_q = int(parts[2])
                    x2_q = int(parts[3])
                    if parts[4] == "-":
                        parts[4] = "+"
                    else:
                        parts[4] = "-"
                    parts[2] = str(len_q - x2_q)
                    parts[3] = str(len_q - x1_q)
                target.write("\t".join(parts) + "\n")
        if self.paf.endswith(".sorted"):
            os.remove(self.paf)
            shutil.move(sorted_file, self.paf)
        else:
            self.paf = sorted_file
        return True

    def _update_query_index(self, contigs_reoriented):
        """
        Rewrite the query index file, flagging reoriented contigs
        :param contigs_reoriented: names of contigs that have been reoriented
        """
        with open(self.idx_q, "w") as idx:
            idx.write(self.name_q + "\n")
            for contig in self.q_order:
                idx.write("\t".join([contig, str(self.q_contigs[contig]), "1" if contig in contigs_reoriented else "0"])
                          + "\n")

    def set_sorted(self, is_sorted):
        """
        Set the sorted state and maintain the ".sorted" touch file accordingly
        :param is_sorted: True if the PAF is now sorted
        """
        self.sorted = is_sorted
        sorted_touch = os.path.join(os.path.dirname(self.paf), ".sorted")
        if is_sorted:
            Path(sorted_touch).touch()
        else:
            if os.path.exists(sorted_touch):
                os.remove(sorted_touch)

    def compute_gravity_contigs(self):
        """
        Compute gravity for each contig on each chromosome (how many big matches they have).
        Will be used to find which chromosome has the highest value for each contig
        :return:
            - gravity for each contig and each chromosome:
                {contig1: {chr1: value, chr2: value, ...}, contig2: ...}
            - for each block, the lines inside:
                [x_median, squared_length, y_median, x1, x2, y1, y2, length] (x: on target, y: on query)
        """
        gravity_contig = {}
        lines_on_block = {}
        # Compute size of blocks (in terms of how many big matches they have), and save
        # the median of each match on each one (for the next step)
        for line in [j for i in list(self.lines.values()) for j in i]:
            x1 = int(line[0])
            x2 = int(line[1])
            y1 = int(line[2])
            y2 = int(line[3])
            contig = line[5]
            chrm = line[6]
            block = (contig, chrm)
            # X1 and X2 (in correct orientation):
            x_1 = min(x1, x2)
            x_2 = max(x2, x1)
            med_x = x_1 + (abs(x_2 - x_1) / 2)  # median position on target
            # Y1 and Y2 (in correct orientation):
            y_1 = min(y1, y2)
            y_2 = max(y2, y1)
            med_y = y_1 + (abs(y_2 - y_1) / 2)  # median position on query
            len_m = sqrt(pow(x2 - x1, 2) + pow(y2 - y1, 2))  # match length
            len_m_2 = pow(1 + len_m, 2)  # squared length (+1 so very small matches still count)
            if block not in lines_on_block:
                lines_on_block[block] = []
            lines_on_block[block].append((med_x, len_m_2, med_y, x1, x2, y1, y2, len_m))

            if contig not in gravity_contig:
                gravity_contig[contig] = {}
            if chrm not in gravity_contig[contig]:
                gravity_contig[contig][chrm] = 0
            gravity_contig[contig][chrm] += len_m_2
        return gravity_contig, lines_on_block
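
    # Gravity weights each match by (1 + length)^2, so long matches dominate:
    # e.g. (hypothetical) a single 1000 bp match on chr1 contributes about 1e6,
    # while ten 10 bp matches on chr2 contribute only 10 * 121 = 1210, and the
    # contig is assigned to chr1.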

    def sort(self):
        """
        Sort contigs according to the reference target and reorient them if needed
        """
        self.parse_paf(False)
        sorted_file = self.paf + ".sorted"
        if not self.sorted:  # Do the sort
            if not self.paf.endswith(".sorted") and not self.idx_q.endswith(".sorted") and \
                    (not os.path.exists(self.paf + ".sorted") or not os.path.exists(self.idx_q + ".sorted")):
                gravity_contig, lines_on_block = self.compute_gravity_contigs()

                # For each contig, find best block, and deduce gravity of contig:
                gravity_on_contig = {}
                reorient_contigs = []
                for contig, chr_blocks in gravity_contig.items():
                    # Find best block:
                    max_number = 0
                    max_chr = None
                    for chrm, size in chr_blocks.items():
                        if size > max_number:
                            max_number = size
                            max_chr = chrm

                    # Compute gravity of contig:
                    nb_items = 0
                    sum_items = 0
                    lines_on_selected_block = lines_on_block[(contig, max_chr)]
                    for med in lines_on_selected_block:
                        sum_items += med[0] * med[1]
                        nb_items += med[1]
                    gravity_on_contig[contig] = sum_items / nb_items

                    # Check if contig must be re-oriented:
                    if len(lines_on_selected_block) > 0:
                        if not self.is_contig_well_oriented(lines_on_selected_block, contig, max_chr):
                            reorient_contigs.append(contig)

                # Sort contigs (contigs without matches are pushed to the end):
                self.q_order.sort(key=lambda x: gravity_on_contig[x] if x in gravity_on_contig else self.len_q + 1000)

                self.idx_q += ".sorted"

                with open(self.idx_q, "w") as idx_q_f:
                    idx_q_f.write(self.name_q + "\n")
                    for contig in self.q_order:
                        idx_q_f.write("\t".join([contig, str(self.q_contigs[contig])]) + "\n")

                # Re-orient contigs:
                if len(reorient_contigs) > 0:
                    self.reorient_contigs_in_paf(reorient_contigs)
                else:
                    shutil.copyfile(self.paf, sorted_file)

                # Update index:
                self._update_query_index(reorient_contigs)

            else:
                self.idx_q += ".sorted"
            self.set_sorted(True)
            self.paf = sorted_file

        else:  # Undo the sort
            self.paf = self.paf.replace(".sorted", "")
            self.idx_q = self.idx_q.replace(".sorted", "")
            self.set_sorted(False)

        # Re-parse PAF file:
        self.parsed = False
        self.parse_paf()
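
    # Note: sort() is a toggle. On an unsorted Paf it writes "<paf>.sorted" and
    # "<idx_q>.sorted" and switches to them; called again, it switches back to
    # the original files and removes the ".sorted" touch file.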

    def reverse_contig(self, contig_name):
        """
        Reverse one contig and re-parse the PAF file
        :param contig_name: name of the contig to reverse
        """
        self.parse_paf(False)
        reorient_contigs = [contig_name]
        self.reorient_contigs_in_paf(reorient_contigs)
        if not self.idx_q.endswith(".sorted"):
            self.idx_q += ".sorted"
        self._update_query_index(reorient_contigs)
        self.set_sorted(True)
        self.parsed = False
        self.parse_paf()

    def get_query_on_target_association(self):
        """
        For each query, get the best matching chromosome
        :return: dict associating each query to its best matching target (or None)
        """
        gravity_contig = self.compute_gravity_contigs()[0]
        query_on_target = {}
        for contig, chr_blocks in gravity_contig.items():
            # Find best block:
            max_number = 0
            max_chr = None
            for chrm, size in chr_blocks.items():
                if size > max_number:
                    max_number = size
                    max_chr = chrm
            if max_chr is not None:
                query_on_target[contig] = max_chr
            else:
                query_on_target[contig] = None
        return query_on_target
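
    # The result maps each query to its best target, e.g. (hypothetical):
    #   {"ctg1": "chr1", "ctg2": "chr5", "ctg3": None}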

    def get_queries_on_target_association(self):
        """
        For each target, get the list of queries associated to it
        :return: dict associating each target to the list of its queries
        """
        gravity_contig = self.compute_gravity_contigs()[0]
        queries_on_target = {}
        for contig, chr_blocks in gravity_contig.items():
            # Find best block:
            max_number = 0
            max_chr = None
            for chrm, size in chr_blocks.items():
                if size > max_number:
                    max_number = size
                    max_chr = chrm
            if max_chr is not None:
                if max_chr not in queries_on_target:
                    queries_on_target[max_chr] = []
                queries_on_target[max_chr].append(contig)
        return queries_on_target

    def build_query_on_target_association_file(self):
        """
        For each query, get the best matching chromosome and build the content
        of a tabulated association file, following the order of queries
        :return: content of the file
        """
        query_on_target = self.get_query_on_target_association()
        content = "Query\tTarget\tStrand\n"
        for contig in self.q_order:
            strand = "+"
            if contig in self.q_reversed:
                strand = "-" if self.q_reversed[contig] else "+"
            if contig in query_on_target:
                content += "%s\t%s\t%s\n" % (contig, query_on_target[contig] or "None", strand)
            else:
                content += "%s\t%s\t%s\n" % (contig, "None", strand)
        return content

    def build_list_no_assoc(self, to):
        """
        Build the list of queries that match no target, or the opposite
        :param to: "query" or "target"
        :return: content of the file
        """
        index = self.idx_q if to == "query" else self.idx_t
        name, contigs_list, contigs, reversed_c, abs_start, c_len = Index.load(index)
        with open(self.paf, "r") as paf:
            for line in paf:
                c_name = line.strip("\n").split("\t")[0 if to == "query" else 5]
                if c_name in contigs_list:
                    contigs_list.remove(c_name)
        return "\n".join(contigs_list) + "\n"

    def _add_percents(self, percents, item):
        """
        Count an interval into the percents table and decrease the not-aligned part
        :param percents: base counts by identity class ("-1" = not aligned)
        :param item: interval to count
        :return: updated percents
        """
        i_count = item.length()
        percents[str(item.data)] += i_count
        percents["-1"] -= i_count
        return percents

    def _remove_overlaps(self, position_idy: IntervalTree, percents: dict):
        """
        Resolve overlaps between matches, keeping the best identity class at each
        position, and count all bases into the percents table
        :param position_idy: interval tree of matches with their identity class
        :param percents: base counts by identity class
        :return: updated percents
        """
        while len(position_idy) > 0:
            item = position_idy.pop()
            start = item.begin
            end = item.end
            cat = item.data
            overlaps = position_idy.search(start, end)
            if len(overlaps) > 0:
                has_overlap = False
                for overlap in overlaps:
                    if has_overlap:
                        break
                    o_start = overlap.begin
                    o_end = overlap.end
                    o_cat = overlap.data
                    if not position_idy.containsi(o_start, o_end, o_cat):
                        continue
                    if start < o_start:
                        if end <= o_end:
                            # cccccccccccccc*******
                            # *****ooooooooo[ooooooo]
                            if o_cat < cat:
                                if end < o_end:
                                    # No overlap with the current item, has_overlap stays False
                                    position_idy.discard(overlap)
                                    position_idy[end:o_end] = o_cat
                                else:
                                    position_idy.discard(overlap)  # No kept overlap
                            elif o_cat == cat:
                                if end < o_end:
                                    has_overlap = True
                                    position_idy.discard(overlap)
                                    position_idy[start:o_end] = cat
                                else:
                                    position_idy.discard(overlap)  # No kept overlap
                            else:
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[start:o_start] = cat
                                position_idy[o_start:o_end] = o_cat
                        else:  # end > o_end
                            # ccccccccccccccccccc
                            # *****oooooooooo****
                            if o_cat <= cat:
                                position_idy.discard(overlap)  # No kept overlap
                            else:  # o_cat > cat
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[start:o_start] = cat
                                position_idy[o_start:o_end] = o_cat
                                position_idy[o_end:end] = cat
                    elif start == o_start:
                        if end < o_end:
                            # cccccccccccc*******
                            # ooooooooooooooooooo
                            if o_cat < cat:
                                # No overlap with the current item, has_overlap stays False
                                position_idy.discard(overlap)
                                position_idy[end:o_end] = o_cat
                            elif o_cat == cat:
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[start:o_end] = cat
                            else:  # o_cat > cat
                                # The overlap just contains current item
                                has_overlap = True
                        elif end == o_end:
                            # ***cccccccccccccccc***
                            # ***oooooooooooooooo***
                            if o_cat <= cat:
                                position_idy.discard(overlap)  # No kept overlap
                            else:
                                # The overlap just contains current item
                                has_overlap = True
                        else:  # end > o_end
                            # ccccccccccccccccccccccccccccc
                            # oooooooooooooooooooo*********
                            if o_cat <= cat:
                                # current item just contains the overlap
                                position_idy.discard(overlap)
                            else:
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[o_start:o_end] = o_cat
                                position_idy[o_end:end] = cat
                    else:  # start > o_start
                        if end <= o_end:
                            # ******ccccccccc*******
                            # ooooooooooooooo[ooooooo]
                            if o_cat < cat:
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[o_start:start] = o_cat
                                position_idy[start:end] = cat
                                if end < o_end:
                                    position_idy[end:o_end] = o_cat
                            else:  # o_cat >= cat
                                # Overlap just contains the item
                                has_overlap = True
                        else:  # end > o_end
                            # ******ccccccccccccccccccccc
                            # ooooooooooooooooo**********
                            if o_cat < cat:
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[o_start:start] = o_cat
                                position_idy[start:end] = cat
                            elif o_cat == cat:
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[o_start:end] = cat
                            else:  # o_cat > cat
                                has_overlap = True
                                position_idy[o_end:end] = cat
                if not has_overlap:
                    percents = self._add_percents(percents, item)

            else:
                percents = self._add_percents(percents, item)

        return percents
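
    # Overlap resolution keeps the best class per base, e.g. (hypothetical):
    # with [0, 10) in class 3 and [5, 15) in class 1, bases 0-10 are counted in
    # class 3 and only bases 10-15 in class 1, so no base is counted twice.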

    def build_summary_stats(self, status_file):
        """
        Get summary of identity
        :return: table with percents by category
        """
        summary_file = self.paf + ".summary"
        self.parse_paf(False, False)
        if self.parsed:
            percents = {"-1": self.len_t}
            position_idy = IntervalTree()

            cats = sorted(self.lines.keys())

            for cat in cats:
                percents[cat] = 0
                for line in self.lines[cat]:
                    start = min(line[0], line[1])
                    end = max(line[0], line[1]) + 1
                    position_idy[start:end] = int(cat)

            percents = self._remove_overlaps(position_idy, percents)

            for cat in percents:
                percents[cat] = percents[cat] / self.len_t * 100

            with open(summary_file, "w") as summary_f:
                summary_f.write(json.dumps(percents))

            os.remove(status_file)
            return percents
        shutil.move(status_file, status_file + ".fail")
        return None
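
    # The summary maps identity classes to the percent of the target they cover,
    # e.g. (hypothetical): {"-1": 35.2, "0": 1.1, "1": 3.4, "2": 10.3, "3": 50.0},
    # where "-1" is the fraction of the target not covered by any match.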

    def get_summary_stats(self):
        """
        Load the summary of identity from the summary file, if it exists
        :return: table with percents by category, or None if not computed yet
        """
        summary_file = self.paf + ".summary"
        if os.path.exists(summary_file):
            with open(summary_file, "r") as summary_f:
                txt = summary_f.read()
                return json.loads(txt)
        return None

    def build_query_chr_as_reference(self):
        """
        Build a FASTA where query contigs are concatenated by their associated
        target chromosome (requires sorted contigs), then notify the user
        :return: path of the built FASTA, or None on failure
        """
        try:
            if not self.sorted:
                raise Exception("Contigs must be sorted to do that!")
            with open(os.path.join(self.data_dir, ".query")) as query_file:
                query_fasta = query_file.read().strip("\n")
            if not os.path.isfile(query_fasta):
                raise Exception("Query fasta does not exist")
            o_fasta = os.path.join(os.path.dirname(query_fasta), "as_reference_" + os.path.basename(query_fasta))
            if not os.path.exists(o_fasta):
                uncompressed = False
                if query_fasta.endswith(".gz"):
                    uncompressed = True
                    query_fasta = Functions.uncompress(query_fasta)
                query_f = SeqIO.index(query_fasta, "fasta")
                contigs_assoc = self.get_queries_on_target_association()
                mapped_queries = set()
                with open(o_fasta, "w") as out:
                    for target in self.t_order:
                        if target in contigs_assoc:
                            queries = sorted(contigs_assoc[target], key=lambda x: self.q_order.index(x))
                            seq = SeqRecord(Seq(""))
                            for query in queries:
                                mapped_queries.add(query)
                                new_seq = query_f[query]
                                if self.q_reversed[query]:
                                    new_seq = new_seq.reverse_complement()
                                seq += new_seq
                                seq += 100 * "N"
                            seq = seq[:-100]
                            seq.id = seq.name = seq.description = target
                            SeqIO.write(seq, out, "fasta")
                    for contig in self.q_order:
                        if contig not in mapped_queries:
                            seq = query_f[contig]
                            seq.id += "_unaligned"
                            SeqIO.write(seq, out, "fasta")
                if uncompressed:
                    os.remove(query_fasta)
            status = "success"
        except Exception:
            o_fasta = None
            status="fail"
            query_fasta = "_._"

        parts = os.path.basename(query_fasta).rsplit(".", 1)
        Functions.send_fasta_ready(mailer=self.mailer,
                                   job_name=self.id_job,
                                   sample_name="as_reference_" + parts[0],
                                   ext=parts[1],
                                   compressed=False,
                                   path="download",
                                   status=status)
        return o_fasta
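

if __name__ == "__main__":
    # Minimal usage sketch (hypothetical paths): load a PAF file with its two
    # index files, then dump the dot plot data for the D3.js client.
    paf = Paf("/path/to/map.paf", "/path/to/query.idx", "/path/to/target.idx")
    if paf.parsed:
        paf.save_json("/path/to/map.json")
    else:
        print(paf.error)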