#!/usr/bin/env python3


class Paf:
    """Load a PAF alignment file together with its query and target indexes.

    Parsing happens at construction time. On success ``parsed`` is ``True``
    and the result attributes (``len_q``, ``lines``, ...) are populated; when
    one of the input files cannot be opened, ``parsed`` stays ``False`` and
    ``error`` holds a human-readable message.
    """

    # Threshold on the absolute identity used to split alignments into the
    # "+" (>= limit) and "-" (< limit) classes of each strand.
    limit_idy = 0.5

    def __init__(self, paf, idx_q, idx_t):
        """Store the input paths and parse them immediately.

        :param paf: path of the PAF alignment file
        :param idx_q: path of the query index file
        :param idx_t: path of the target index file
        """
        self.paf = paf
        self.idx_q = idx_q
        self.idx_t = idx_t

        self.len_q = None
        self.len_t = None
        self.min_idy = None
        self.max_idy = None
        self.lines = None
        self.q_contigs = None
        self.q_order = None
        self.t_contigs = None
        self.t_order = None
        self.name_q = None
        self.name_t = None
        self.parsed = False
        self.error = False

        self.parse_paf()

    @staticmethod
    def _read_index(path):
        """Read an index file: a name line, then ``id<TAB>length`` lines.

        :param path: path of the index file
        :return: tuple ``(name, order, contigs, abs_start, total_len)`` where
            ``order`` lists contig ids in file order, ``contigs`` maps id to
            length, ``abs_start`` maps id to the cumulated length of all the
            contigs before it, and ``total_len`` is the sum of all lengths
        :raises IOError: if the file cannot be opened
        """
        order = []
        contigs = {}
        abs_start = {}
        offset = 0
        with open(path, "r") as idx_f:
            name = idx_f.readline().strip("\n")
            for raw in idx_f:
                if not raw.strip():
                    # Tolerate blank lines (e.g. a trailing empty line).
                    continue
                fields = raw.strip("\n").split("\t")
                contig_id = fields[0]
                contig_len = int(fields[1])
                order.append(contig_id)
                abs_start[contig_id] = offset
                contigs[contig_id] = contig_len
                offset += contig_len
        return name, order, contigs, abs_start, offset

    def parse_paf(self):
        """Parse the index files and the PAF file into the instance attributes.

        Each alignment is stored as ``[x1, x2, y1, y2, idy]`` where x is the
        position on the concatenated target contigs, y the position on the
        concatenated query contigs, and ``idy`` is column 10 divided by
        column 11 of the PAF line, negated for reverse-strand alignments.

        :return: ``True`` on success, ``False`` if an input file could not be
            opened (``self.error`` then holds the reason)
        """
        # Start from a clean state so a re-parse never reports stale results.
        self.parsed = False
        self.error = False

        min_idy = 10000000000
        max_idy = -10000000000
        lines = {
            "pos+": [],
            "pos-": [],
            "neg+": [],
            "neg-": []
        }

        try:
            name_q, q_order, q_contigs, q_abs_start, len_q = \
                self._read_index(self.idx_q)
        except IOError:
            self.error = "Index file does not exist for query!"
            return False

        try:
            name_t, t_order, t_contigs, t_abs_start, len_t = \
                self._read_index(self.idx_t)
        except IOError:
            self.error = "Index file does not exist for target!"
            return False

        try:
            with open(self.paf, "r") as paf_file:
                for line in paf_file:
                    if not line.strip():
                        # Tolerate blank lines (e.g. a trailing empty line).
                        continue
                    parts = line.strip("\n").split("\t")
                    v1 = parts[0]
                    v6 = parts[5]
                    strand = 1 if parts[4] == "+" else -1
                    # Signed identity: the sign carries the strand.
                    idy = int(parts[9]) / int(parts[10]) * strand
                    min_idy = min(min_idy, idy)
                    max_idy = max(max_idy, idy)
                    # Shift contig-local coordinates by the contig's offset
                    # in the concatenated sequence.
                    y1 = int(parts[2]) + q_abs_start[v1]
                    y2 = int(parts[3]) + q_abs_start[v1]
                    # Reverse-strand hits swap the target start and end.
                    x1 = int(parts[7 if strand == 1 else 8]) + t_abs_start[v6]
                    x2 = int(parts[8 if strand == 1 else 7]) + t_abs_start[v6]
                    if idy < -self.limit_idy:
                        class_idy = "neg-"
                    elif idy < 0:
                        class_idy = "neg+"
                    elif idy < self.limit_idy:
                        class_idy = "pos-"
                    else:
                        class_idy = "pos+"
                    lines[class_idy].append([x1, x2, y1, y2, idy])
        except IOError:
            self.error = "PAF file does not exist!"
            return False

        self.parsed = True
        self.len_q = len_q
        self.len_t = len_t
        self.min_idy = min_idy
        self.max_idy = max_idy
        self.lines = lines
        self.q_contigs = q_contigs
        self.q_order = q_order
        self.t_contigs = t_contigs
        self.t_order = t_order
        self.name_q = name_q
        self.name_t = name_t
        return True

    def get_d3js_data(self):
        """Return the parsed data as a dict (query on the y axis, target on x)."""
        return {
            'y_len': self.len_q,
            'x_len': self.len_t,
            'min_idy': self.min_idy,
            'max_idy': self.max_idy,
            'lines': self.lines,
            'y_contigs': self.q_contigs,
            'y_order': self.q_order,
            'x_contigs': self.t_contigs,
            'x_order': self.t_order,
            'name_y': self.name_q,
            'name_x': self.name_t,
            'limit_idy': self.limit_idy
        }

    def save_json(self, out):
        """Write the data returned by ``get_d3js_data`` to ``out`` as JSON.

        :param out: path of the JSON file to write
        :raises Exception: with ``self.error`` as message when parsing failed
        """
        import json
        if not self.parsed:
            raise Exception(self.error)
        with open(out, "w") as out_f:
            out_f.write(json.dumps(self.get_d3js_data()))