Commit 4171d11b authored by Christophe Klopp
Browse files

add semicolon to gtf format and new gff output file

parent c211be8c
......@@ -381,7 +381,7 @@ class Miniannotator:
with open(gtf_file, "w") as gtf:
for gene_id, gene in genes.items():
line = '{seqname}\tminiannotator\t{feature}\t{start}\t{end}\t.\t{strand}\t.\tgene_id "{gene_id}"' \
' {attrs}\n'
'{attrs}\n'
gtf.write(line.format(
seqname=gene["seqname"],
feature="gene",
......@@ -393,7 +393,7 @@ class Miniannotator:
))
for tr_id, transcript in gene["transcripts"].items():
attrs = 'transcript_id "{tr_id}"'.format(tr_id=tr_id)
attrs = '; transcript_id "{tr_id}"'.format(tr_id=tr_id)
gtf.write(line.format(
seqname=gene["seqname"],
feature="transcript",
......@@ -405,7 +405,7 @@ class Miniannotator:
))
for ex_id, exon in transcript["exons"].items():
attrs_ex = attrs + ' exon_id "{ex_id}"'.format(ex_id=ex_id)
attrs_ex = attrs + '; exon_id "{ex_id}"'.format(ex_id=ex_id)
gtf.write(line.format(
seqname=gene["seqname"],
feature="exon",
......@@ -417,7 +417,7 @@ class Miniannotator:
))
for indel in transcript["indels"]:
attrs_ex = attrs + ' exon_id "{ex_id}"'.format(ex_id=indel["exon"])
attrs_ex = attrs + '; exon_id "{ex_id}"'.format(ex_id=indel["exon"])
if indel["type"] == "ins":
attrs_ex += ' sequence "%s" contig_pos "%s"' % (indel["seq"], indel["contig"])
gtf.write(line.format(
......@@ -430,6 +430,57 @@ class Miniannotator:
attrs=attrs_ex
))
def write_gff_file(self, gff_file, genes):
    """
    Write genes, transcripts and exons to a GFF3 file

    The file starts with the mandatory ``##gff-version 3`` directive. Each gene is written as a
    ``gene`` feature, followed by its transcripts (``Parent`` set to the gene ID) and, for each
    transcript, its exons (``Parent`` set to the transcript ID). Transcripts and exons reuse the
    seqname and strand of their gene. Indels stored in the transcripts are not written.

    :param gff_file: GFF file path
    :type gff_file: str
    :param genes: final genes object. Each gene has position and transcripts of this gene. Each transcript contains
        associated exons and list of indels
    :type genes: dict
    """
    # Nine tab-separated GFF columns; source is fixed, score and phase are left undefined (".")
    row = '{seqname}\tminiannotator\t{feature}\t{start}\t{end}\t.\t{strand}\t.\t{attrs}\n'
    with open(gff_file, "w") as gff:
        # GFF3 requires the version directive as the very first line of the file
        gff.write("##gff-version 3\n")
        for gene_id, gene in genes.items():
            seqname = gene["seqname"]
            strand = gene["strand"]
            gff.write(row.format(
                seqname=seqname,
                feature="gene",
                start=gene["start"],
                end=gene["end"],
                strand=strand,
                attrs='ID={gene_id}'.format(gene_id=gene_id)
            ))
            for tr_id, transcript in gene["transcripts"].items():
                # Attributes are separated by a bare ";": a space after it would become part of the next tag name
                gff.write(row.format(
                    seqname=seqname,
                    feature="transcript",
                    start=transcript["start"],
                    end=transcript["end"],
                    strand=strand,
                    attrs='ID={tr_id};Parent={gene_id}'.format(tr_id=tr_id, gene_id=gene_id)
                ))
                # NOTE(review): GFF3 IDs must be unique within the file - confirm exon IDs are not
                # shared between transcripts
                for ex_id, exon in transcript["exons"].items():
                    gff.write(row.format(
                        seqname=seqname,
                        feature="exon",
                        start=exon["start"],
                        end=exon["end"],
                        strand=strand,
                        attrs='ID={ex_id};Parent={tr_id}'.format(ex_id=ex_id, tr_id=tr_id)
                    ))
def search_genes(self, gtf_file):
"""
Parse BAM file to search genes and exons positions
......@@ -437,7 +488,7 @@ class Miniannotator:
Splice reads define exons: introns are the "N" in the cigarline
Save also DEL/INS events in genes
:param gtf_file: output GTF file path
:param gtf_file: output GTF and GFF file path
:type gtf_file: str
"""
print("Searching genes in genome...", flush=True)
......@@ -473,6 +524,9 @@ class Miniannotator:
print("Writing to GTF file...", flush=True)
self.write_gtf_file(gtf_file=gtf_file, genes=full_genes_2)
print("Writing to GFF file...", flush=True)
gff_file = os.path.splitext(gtf_file)[0]+".gff"
self.write_gff_file(gff_file=gff_file, genes=full_genes_2)
if __name__ == "__main__":
import argparse
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment