fixreference.py 3.43 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python3


def get_valid_maps(maps):
    matchs = []
    bad_matchs = []
    with open(maps, 'r') as mps:
        for line in mps:
            cols = line.rstrip().split("\t")
            ref = cols[0]
            ref_len = int(cols[1])
            prot = cols[2]
            prot_len = int(cols[3])
            start_ref = int(cols[4])
            end_ref = int(cols[5])
            start_prot = int(cols[6])
            end_prot = int(cols[7])
            ref_seq = cols[8]
            prot_seq = cols[9]
            e_value = cols[10]
            bitscore = cols[11]
            match_len = int(cols[12])
            btop = cols[13]
            ref_frame = int(cols[14])
            if match_len / prot_len >= 0.98:
                to_matchs = matchs
            else:
                to_matchs = bad_matchs
            to_matchs.append({
                "ref": {"id": ref, "len": ref_len, "seq": ref_seq, "start": start_ref, "end": end_ref,
                        "frame": ref_frame},
                "prot": {"id": prot, "len": prot_len, "seq": prot_seq, "start": start_prot, "end": end_prot},
                "e_value": e_value,
                "bitscore": bitscore,
                "btop": btop
            })

            return matchs, bad_matchs


def get_gap_sequence(seq, pos_gap):
    i = 0
    gap_seq = ""
    end_seq = False
    while i < len(seq) and not end_seq:
        if pos_gap == 0:
            next_base = seq[i+1]
            if seq[i] != "-":
                end_seq = True
        else:
            next_base = seq[i]
            if seq[i+1] != "-":
                end_seq = True
        if not end_seq:
            gap_seq += next_base
    return gap_seq


def parse_btop(btop, rseq, pseq):
    """
    Parse btop

    :param btop: btop string
    :param rseq: reference sequence
    :param pseq: protein sequence
    :return: parsed btop
    """
    reference = []
    protein = []
    i = 0
    while i < len(btop):
        char = btop[i]
        if char.isalpha():
            pchar = btop[i+1]
            if pchar.isalpha():
                reference.append(char)
                protein.append(btop[i+1])
                i += 2
            else:
                gap_seq = get_gap_sequence(rseq[i:], 1)
                reference.append(gap_seq)
                protein.append("-" * len(gap_seq))
                i += len(gap_seq) + 1
        elif char.isdigit():
            j = i+1
            while btop[i:j].isdigit():
                j += 1
            j -= 1
            m_size = int(btop[i:j])
            i = j
        elif char == "-":
            gap_seq =get_gap_sequence(pseq[i:], 0)
            protein.append(gap_seq)
            reference.append("-" * len(gap_seq))
        elif char == "/":
            pass  # TODO
        elif char == "\\":
            pass  # TODO

    return reference, protein


def parse_valid_matches(matches):
    for match in matches:
        reference, protein = parse_btop(match["btop"], match["ref"]["seq"], match["prot"]["seq"])


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Fix reference using diamond mapping of reference proteins on"
                                                 "reference file")
    parser.add_argument("-r", "--reference", help="Reference fasta file", required=True)
    parser.add_argument("-d", "--diamond", help="Diamond mapping file", required=True)
    parser.add_argument("-o", "--output-prefix", help="Output prefix", required=True)