Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
genotoul-bioinfo
miniannotator
Commits
0544eb2e
Commit
0544eb2e
authored
Jul 26, 2018
by
Floreal Cabanettes
Browse files
Make exon ID unique by gene, not by transcript
parent
392c7ce0
Changes
1
Hide whitespace changes
Inline
Side-by-side
miniannotator.py
View file @
0544eb2e
...
...
@@ -301,6 +301,72 @@ class Miniannotator:
return
full_genes
@
staticmethod
def
_rename_exons
(
gene
):
"""
Rename exons: exons of a same gene which have the same coordinates must have the same name
special name for two exons which overlap but does not have the same coordinates
:param gene: First item: gene id. Second item: gene object, containing transcripts, exons and indels
:type gene: list
:return: gene with exons renamed
:rtype: str, dict
"""
gene_id
=
gene
[
0
]
gene_o
=
gene
[
1
]
exons_corr
=
{}
exons_by_coord
=
{}
# Save exons by coordinates
for
transcript
in
gene_o
[
"transcripts"
].
values
():
for
ex_id
,
exon
in
transcript
[
"exons"
].
items
():
coords
=
(
exon
[
"start"
],
exon
[
"end"
])
if
coords
in
exons_by_coord
:
exons_by_coord
[
coords
].
append
(
ex_id
)
else
:
exons_by_coord
[
coords
]
=
[
ex_id
]
# Build new exons name
ex_order
=
sorted
(
exons_by_coord
.
keys
())
i
=
0
ex_nb
=
1
for
coord
in
ex_order
:
create_new
=
True
if
i
>
-
1
:
for
k
in
range
(
i
,
0
,
-
1
):
o_coord
=
ex_order
[
k
]
if
coord
[
0
]
<
o_coord
[
0
]
<
coord
[
1
]
or
coord
[
0
]
<
o_coord
[
1
]
<
coord
[
1
]
or
\
o_coord
[
0
]
<
coord
[
0
]
<
o_coord
[
1
]
or
o_coord
[
0
]
<
coord
[
1
]
<
o_coord
[
1
]:
n_alt
=
2
new_id_basis
=
exons_corr
[
exons_by_coord
[
o_coord
][
0
]].
split
(
"_"
)[
0
]
new_id
=
new_id_basis
+
"_%d"
%
n_alt
while
new_id
in
exons_corr
.
values
():
n_alt
+=
1
new_id
=
new_id_basis
+
"_%d"
%
n_alt
create_new
=
False
break
if
create_new
:
new_id
=
gene_id
+
".e%d"
%
ex_nb
ex_nb
+=
1
for
ex_id
in
exons_by_coord
[
coord
]:
exons_corr
[
ex_id
]
=
new_id
i
+=
1
# Rename exons in transcripts
for
transcript
in
gene_o
[
"transcripts"
].
values
():
# Rename exons:
new_exons
=
OrderedDict
()
for
ex_id
,
exon
in
transcript
[
"exons"
].
items
():
new_exons
[
exons_corr
[
ex_id
]]
=
exon
transcript
[
"exons"
]
=
new_exons
# Rename exons in indels:
for
indel
in
transcript
[
"indels"
]:
indel
[
"exon"
]
=
exons_corr
[
indel
[
"exon"
]]
return
gene_id
,
gene_o
def
write_gtf_file
(
self
,
gtf_file
,
genes
):
"""
Write genes, transcripts, exons and indels to a GTF file
...
...
@@ -400,8 +466,12 @@ class Miniannotator:
full_genes
=
self
.
_build_genes_transcripts
(
genes
=
genes
,
exons
=
exons
,
indels
=
indels
)
print
(
"Renaming exons..."
,
flush
=
True
)
pool
=
multiprocessing
.
Pool
(
processes
=
self
.
conf
[
"threads"
])
full_genes_2
=
OrderedDict
(
pool
.
map
(
Miniannotator
.
_rename_exons
,
list
(
full_genes
.
items
())))
print
(
"Writing to GTF file..."
,
flush
=
True
)
self
.
write_gtf_file
(
gtf_file
=
gtf_file
,
genes
=
full_genes
)
self
.
write_gtf_file
(
gtf_file
=
gtf_file
,
genes
=
full_genes
_2
)
if
__name__
==
"__main__"
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment