Commit 0544eb2e authored by Floreal Cabanettes's avatar Floreal Cabanettes
Browse files

Make exon ID unique by gene, not by transcript

parent 392c7ce0
......@@ -301,6 +301,72 @@ class Miniannotator:
return full_genes
@staticmethod
def _rename_exons(gene):
"""
Rename exons: exons of a same gene which have the same coordinates must have the same name
special name for two exons which overlap but does not have the same coordinates
:param gene: First item: gene id. Second item: gene object, containing transcripts, exons and indels
:type gene: list
:return: gene with exons renamed
:rtype: str, dict
"""
gene_id = gene[0]
gene_o = gene[1]
exons_corr = {}
exons_by_coord = {}
# Save exons by coordinates
for transcript in gene_o["transcripts"].values():
for ex_id, exon in transcript["exons"].items():
coords = (exon["start"], exon["end"])
if coords in exons_by_coord:
exons_by_coord[coords].append(ex_id)
else:
exons_by_coord[coords] = [ex_id]
# Build new exons name
ex_order = sorted(exons_by_coord.keys())
i = 0
ex_nb = 1
for coord in ex_order:
create_new = True
if i > -1:
for k in range(i, 0, -1):
o_coord = ex_order[k]
if coord[0] < o_coord[0] < coord[1] or coord[0] < o_coord[1] < coord[1] or \
o_coord[0] < coord[0] < o_coord[1] or o_coord[0] < coord[1] < o_coord[1]:
n_alt = 2
new_id_basis = exons_corr[exons_by_coord[o_coord][0]].split("_")[0]
new_id = new_id_basis + "_%d" % n_alt
while new_id in exons_corr.values():
n_alt += 1
new_id = new_id_basis + "_%d" % n_alt
create_new = False
break
if create_new:
new_id = gene_id + ".e%d" % ex_nb
ex_nb += 1
for ex_id in exons_by_coord[coord]:
exons_corr[ex_id] = new_id
i += 1
# Rename exons in transcripts
for transcript in gene_o["transcripts"].values():
# Rename exons:
new_exons = OrderedDict()
for ex_id, exon in transcript["exons"].items():
new_exons[exons_corr[ex_id]] = exon
transcript["exons"] = new_exons
# Rename exons in indels:
for indel in transcript["indels"]:
indel["exon"] = exons_corr[indel["exon"]]
return gene_id, gene_o
def write_gtf_file(self, gtf_file, genes):
"""
Write genes, transcripts, exons and indels to a GTF file
......@@ -400,8 +466,12 @@ class Miniannotator:
full_genes = self._build_genes_transcripts(genes=genes, exons=exons, indels=indels)
print("Renaming exons...", flush=True)
pool = multiprocessing.Pool(processes=self.conf["threads"])
full_genes_2 = OrderedDict(pool.map(Miniannotator._rename_exons, list(full_genes.items())))
print("Writing to GTF file...", flush=True)
self.write_gtf_file(gtf_file=gtf_file, genes=full_genes)
self.write_gtf_file(gtf_file=gtf_file, genes=full_genes_2)
if __name__ == "__main__":
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment