Commit 55115024 authored by Floreal Cabanettes's avatar Floreal Cabanettes
Browse files

Start working on fixreference script

parent 0544eb2e
#!/usr/bin/env python3
def get_valid_maps(maps):
matchs = []
bad_matchs = []
with open(maps, 'r') as mps:
for line in mps:
cols = line.rstrip().split("\t")
ref = cols[0]
ref_len = int(cols[1])
prot = cols[2]
prot_len = int(cols[3])
start_ref = int(cols[4])
end_ref = int(cols[5])
start_prot = int(cols[6])
end_prot = int(cols[7])
ref_seq = cols[8]
prot_seq = cols[9]
e_value = cols[10]
bitscore = cols[11]
match_len = int(cols[12])
btop = cols[13]
ref_frame = int(cols[14])
if match_len / prot_len >= 0.98:
to_matchs = matchs
else:
to_matchs = bad_matchs
to_matchs.append({
"ref": {"id": ref, "len": ref_len, "seq": ref_seq, "start": start_ref, "end": end_ref,
"frame": ref_frame},
"prot": {"id": prot, "len": prot_len, "seq": prot_seq, "start": start_prot, "end": end_prot},
"e_value": e_value,
"bitscore": bitscore,
"btop": btop
})
return matchs, bad_matchs
def get_gap_sequence(seq, pos_gap):
i = 0
gap_seq = ""
end_seq = False
while i < len(seq) and not end_seq:
if pos_gap == 0:
next_base = seq[i+1]
if seq[i] != "-":
end_seq = True
else:
next_base = seq[i]
if seq[i+1] != "-":
end_seq = True
if not end_seq:
gap_seq += next_base
return gap_seq
def parse_btop(btop, rseq, pseq):
"""
Parse btop
:param btop: btop string
:param rseq: reference sequence
:param pseq: protein sequence
:return: parsed btop
"""
reference = []
protein = []
i = 0
while i < len(btop):
char = btop[i]
if char.isalpha():
pchar = btop[i+1]
if pchar.isalpha():
reference.append(char)
protein.append(btop[i+1])
i += 2
else:
gap_seq = get_gap_sequence(rseq[i:], 1)
reference.append(gap_seq)
protein.append("-" * len(gap_seq))
i += len(gap_seq) + 1
elif char.isdigit():
j = i+1
while btop[i:j].isdigit():
j += 1
j -= 1
m_size = int(btop[i:j])
i = j
elif char == "-":
gap_seq =get_gap_sequence(pseq[i:], 0)
protein.append(gap_seq)
reference.append("-" * len(gap_seq))
elif char == "/":
pass # TODO
elif char == "\\":
pass # TODO
return reference, protein
def parse_valid_matches(matches):
for match in matches:
reference, protein = parse_btop(match["btop"], match["ref"]["seq"], match["prot"]["seq"])
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Fix reference using diamond mapping of reference proteins on"
"reference file")
parser.add_argument("-r", "--reference", help="Reference fasta file", required=True)
parser.add_argument("-d", "--diamond", help="Diamond mapping file", required=True)
parser.add_argument("-o", "--output-prefix", help="Output prefix", required=True)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment