Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
genotoul-bioinfo
miniannotator
Commits
55115024
Commit
55115024
authored
Aug 03, 2018
by
Floreal Cabanettes
Browse files
Start working on fixreference script
parent
0544eb2e
Changes
1
Show whitespace changes
Inline
Side-by-side
fixreference.py
0 → 100644
View file @
55115024
#!/usr/bin/env python3
def
get_valid_maps
(
maps
):
matchs
=
[]
bad_matchs
=
[]
with
open
(
maps
,
'r'
)
as
mps
:
for
line
in
mps
:
cols
=
line
.
rstrip
().
split
(
"
\t
"
)
ref
=
cols
[
0
]
ref_len
=
int
(
cols
[
1
])
prot
=
cols
[
2
]
prot_len
=
int
(
cols
[
3
])
start_ref
=
int
(
cols
[
4
])
end_ref
=
int
(
cols
[
5
])
start_prot
=
int
(
cols
[
6
])
end_prot
=
int
(
cols
[
7
])
ref_seq
=
cols
[
8
]
prot_seq
=
cols
[
9
]
e_value
=
cols
[
10
]
bitscore
=
cols
[
11
]
match_len
=
int
(
cols
[
12
])
btop
=
cols
[
13
]
ref_frame
=
int
(
cols
[
14
])
if
match_len
/
prot_len
>=
0.98
:
to_matchs
=
matchs
else
:
to_matchs
=
bad_matchs
to_matchs
.
append
({
"ref"
:
{
"id"
:
ref
,
"len"
:
ref_len
,
"seq"
:
ref_seq
,
"start"
:
start_ref
,
"end"
:
end_ref
,
"frame"
:
ref_frame
},
"prot"
:
{
"id"
:
prot
,
"len"
:
prot_len
,
"seq"
:
prot_seq
,
"start"
:
start_prot
,
"end"
:
end_prot
},
"e_value"
:
e_value
,
"bitscore"
:
bitscore
,
"btop"
:
btop
})
return
matchs
,
bad_matchs
def
get_gap_sequence
(
seq
,
pos_gap
):
i
=
0
gap_seq
=
""
end_seq
=
False
while
i
<
len
(
seq
)
and
not
end_seq
:
if
pos_gap
==
0
:
next_base
=
seq
[
i
+
1
]
if
seq
[
i
]
!=
"-"
:
end_seq
=
True
else
:
next_base
=
seq
[
i
]
if
seq
[
i
+
1
]
!=
"-"
:
end_seq
=
True
if
not
end_seq
:
gap_seq
+=
next_base
return
gap_seq
def
parse_btop
(
btop
,
rseq
,
pseq
):
"""
Parse btop
:param btop: btop string
:param rseq: reference sequence
:param pseq: protein sequence
:return: parsed btop
"""
reference
=
[]
protein
=
[]
i
=
0
while
i
<
len
(
btop
):
char
=
btop
[
i
]
if
char
.
isalpha
():
pchar
=
btop
[
i
+
1
]
if
pchar
.
isalpha
():
reference
.
append
(
char
)
protein
.
append
(
btop
[
i
+
1
])
i
+=
2
else
:
gap_seq
=
get_gap_sequence
(
rseq
[
i
:],
1
)
reference
.
append
(
gap_seq
)
protein
.
append
(
"-"
*
len
(
gap_seq
))
i
+=
len
(
gap_seq
)
+
1
elif
char
.
isdigit
():
j
=
i
+
1
while
btop
[
i
:
j
].
isdigit
():
j
+=
1
j
-=
1
m_size
=
int
(
btop
[
i
:
j
])
i
=
j
elif
char
==
"-"
:
gap_seq
=
get_gap_sequence
(
pseq
[
i
:],
0
)
protein
.
append
(
gap_seq
)
reference
.
append
(
"-"
*
len
(
gap_seq
))
elif
char
==
"/"
:
pass
# TODO
elif
char
==
"
\\
"
:
pass
# TODO
return
reference
,
protein
def
parse_valid_matches
(
matches
):
for
match
in
matches
:
reference
,
protein
=
parse_btop
(
match
[
"btop"
],
match
[
"ref"
][
"seq"
],
match
[
"prot"
][
"seq"
])
if
__name__
==
"__main__"
:
import
argparse
parser
=
argparse
.
ArgumentParser
(
description
=
"Fix reference using diamond mapping of reference proteins on"
"reference file"
)
parser
.
add_argument
(
"-r"
,
"--reference"
,
help
=
"Reference fasta file"
,
required
=
True
)
parser
.
add_argument
(
"-d"
,
"--diamond"
,
help
=
"Diamond mapping file"
,
required
=
True
)
parser
.
add_argument
(
"-o"
,
"--output-prefix"
,
help
=
"Output prefix"
,
required
=
True
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment