Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Maintenance - Mise à jour mensuelle Lundi 6 Février entre 7h00 et 9h00
Open sidebar
genotoul-bioinfo
miniannotator
Commits
a40f987f
Commit
a40f987f
authored
Jul 23, 2018
by
Floreal Cabanettes
Browse files
Add writing of output GTF file
parent
14b056db
Changes
1
Hide whitespace changes
Inline
Side-by-side
miniannotator.py
View file @
a40f987f
...
@@ -2,20 +2,22 @@
...
@@ -2,20 +2,22 @@
import
os
import
os
import
pysam
import
pysam
import
re
import
sys
import
sys
import
yaml
import
yaml
import
subprocess
import
subprocess
from
collections
import
OrderedDict
PRG_PATH
=
os
.
path
.
dirname
(
os
.
path
.
realpath
(
__file__
))
PRG_PATH
=
os
.
path
.
dirname
(
os
.
path
.
realpath
(
__file__
))
class
Miniannotator
:
class
Miniannotator
:
def __init__(self, reference, assembly, qoverlap=0.9, map=None):
    """
    Build an annotator for a reference / assembly pair

    :param reference: reference file path (presumably a fasta file - TODO confirm)
    :type reference: str
    :param assembly: de novo assembly fasta file path, opened with pysam.FastaFile
    :type assembly: str
    :param qoverlap: minimal query overlap, as a fraction [0-1], used to filter reads
    :type qoverlap: float
    :param map: BAM map file path, or None when no mapping is available yet
    :type map: str
    """
    # Configuration is loaded first, as in the historical implementation
    self.conf = self._load_config()

    # Plain attributes
    self.reference = reference
    self.assembly = assembly
    self.qoverlap = qoverlap
    self.map = map

    # Handle on the assembly fasta, used to fetch inserted sequences
    self.afasta = pysam.FastaFile(assembly)
@
staticmethod
@
staticmethod
...
@@ -55,20 +57,211 @@ class Miniannotator:
...
@@ -55,20 +57,211 @@ class Miniannotator:
self
.
map
=
map_f
self
.
map
=
map_f
return
rcode
return
rcode
def
search_genes
(
self
):
def
_query_overlap
(
self
,
read
):
"""
Get query mapping length
:param read: read pysam object
:type read: pysam.libcalignedsegment.AlignedSegment
:return: read mapping length on target percent [0-1]
:rtype: float
"""
return
(
read
.
query_alignment_length
/
read
.
infer_query_length
())
if
read
.
cigarstring
is
not
None
else
-
1
# Remove reads without cigarstring
def
_read_pass
(
self
,
read
):
"""
Check if read pass the mapping size filter
:param read: read pysam object
:type read: pysam.libcalignedsegment.AlignedSegment
:return: True if pass the filter, else False
:rtype: float
"""
return
self
.
_query_overlap
(
read
)
>=
self
.
qoverlap
def
_read_tposition
(
self
,
read
):
"""
Get mapping position of read on the reference
:param read: read pysam object
:type read: pysam.libcalignedsegment.AlignedSegment
:return: position (start, end)
:rtype: int, int
"""
return
read
.
reference_start
,
read
.
reference_end
def
_get_query_sequence
(
self
,
contig
,
start
,
end
):
return
str
(
self
.
afasta
.
fetch
(
reference
=
contig
,
start
=
start
,
end
=
end
))
def
_exons
(
self
,
read
,
start
):
"""
Define exons on reference with read split, and get indels
:param read: read pysam object
:type read: pysam.libcalignedsegment.AlignedSegment
:param start: start position of the read on the reference
:type start: int
:return:
"""
exons
=
[]
indels
=
[]
exon_start
=
start
exon_end
=
start
qstart
=
0
complete
=
False
for
cigar
in
read
.
cigartuples
:
operation
=
cigar
[
0
]
length
=
cigar
[
1
]
if
operation
==
0
:
# MATCH
exon_end
+=
length
complete
=
False
qstart
+=
length
elif
operation
==
1
:
# INSERTION
indels
.
append
({
"type"
:
"ins"
,
"start"
:
exon_end
,
"end"
:
exon_end
+
length
,
"seq"
:
self
.
_get_query_sequence
(
read
.
query_name
,
qstart
,
qstart
+
length
),
"contig"
:
"%s:%d-%d"
%
(
read
.
query_name
,
qstart
+
1
,
qstart
+
length
)})
# 1-based coords
qstart
+=
length
elif
operation
==
2
:
# DELETION
indels
.
append
({
"type"
:
"del"
,
"start"
:
exon_end
,
"end"
:
exon_end
+
length
})
exon_end
+=
length
elif
operation
==
3
:
# SPLIT
exons
.
append
((
exon_start
,
exon_end
))
complete
=
True
exon_start
=
exon_end
+
length
exon_end
=
exon_start
elif
operation
==
4
:
# SOFT CLIP
exon_start
+=
length
qstart
+=
length
elif
operation
==
5
:
# HARD CLIP
pass
else
:
print
(
"Operation not expected: %d"
%
operation
)
if
not
complete
:
exons
.
append
((
exon_start
,
exon_end
))
return
exons
,
indels
@
staticmethod
def
_sort_genes_ids
(
item
):
match
=
re
.
match
(
r
"([^:]+):(\d+)-(\d+)([+-])(_(\d))?"
,
item
)
return
match
.
group
(
1
),
int
(
match
.
group
(
2
)),
int
(
match
.
group
(
3
)),
match
.
group
(
4
),
\
int
(
match
.
group
(
6
))
if
match
.
group
(
6
)
is
not
None
else
0
def
_write_gene
(
self
,
gtf
,
gene
,
exons
,
indels
):
"""
Write gene into GTF file
:param gtf: GTF file handler
:type gtf: _io.TextIOWrapper
:param gene: dictionnary describing gene. Required keys : id, seqname, start, end, strand
:type gene: dict
:param exons: list of exons. Each exon is a tuple (start, end)
:type exons: list
:param indels: list of indels. Each indel is a dictionnary with keys type (ins or del), start and end
:type indels: list
:return:
"""
line
=
'{seqname}
\t
miniannotator
\t
{feature}
\t
{start}
\t
{end}
\t
.
\t
{strand}
\t
.
\t
gene_id "{gene_id}" {attrs}
\n
'
gene_line
=
line
.
format
(
seqname
=
gene
[
"seqname"
],
feature
=
"gene"
,
start
=
gene
[
"start"
],
end
=
gene
[
"end"
],
strand
=
gene
[
"strand"
],
gene_id
=
gene
[
"id"
],
attrs
=
""
)
gtf
.
write
(
gene_line
)
for
exon
in
sorted
(
exons
):
exon_line
=
line
.
format
(
seqname
=
gene
[
"seqname"
],
feature
=
"exon"
,
start
=
exon
[
0
]
+
1
,
# Transform 0-based => 1-based
end
=
exon
[
1
],
strand
=
gene
[
"strand"
],
gene_id
=
gene
[
"id"
],
attrs
=
""
)
gtf
.
write
(
exon_line
)
if
len
(
indels
)
>
0
:
for
indel
in
sorted
(
indels
,
key
=
lambda
x
:
(
x
[
"start"
],
x
[
"end"
])):
indel_line
=
line
.
format
(
seqname
=
gene
[
"seqname"
],
feature
=
indel
[
"type"
],
start
=
indel
[
"start"
]
+
1
,
# Transform 0-based => 1-based
end
=
indel
[
"end"
],
strand
=
gene
[
"strand"
],
gene_id
=
gene
[
"id"
],
attrs
=
""
if
indel
[
"type"
]
==
"del"
else
(
'sequence "%s" contig_pos "%s"'
%
(
indel
[
"seq"
],
indel
[
"contig"
]))
)
gtf
.
write
(
indel_line
)
def search_genes(self, gtf_file):
    """
    Parse BAM file to search genes and exons positions

    Query match position on the reference defines gene position
    Splice reads define exons: introns are the "N" in the cigarline
    Save also DEL/INS events in genes

    Only reads passing the mapping size filter are kept. When several reads map on the
    same reference interval and strand, the first one keeps the plain identifier and the
    following ones get a "_2", "_3"... suffix, so that none of them is lost. In the GTF
    file, suffixed genes share the number of the gene written just before them.

    :param gtf_file: output GTF file path
    :type gtf_file: str
    """
    # \d+ for the suffix: a single \d would truncate "_12" to "_1"
    gene_pattern = re.compile(r"([^:]+):(\d+)-(\d+)([+-])(_(\d+))?")

    print("Searching genes in genome...", flush=True)
    exons = {}
    indels = {}
    # Context manager: the BAM file is closed even if a read makes the parsing fail
    with pysam.AlignmentFile(self.map) as align:
        for read in align:
            if not self._read_pass(read):
                continue
            start, end = self._read_tposition(read)
            g_exons, g_indels = self._exons(read, start)
            base_id = "%s:%d-%d" % (read.reference_name, start, end) + ("-" if read.is_reverse else "+")
            # Find the first free identifier, then store the read under it
            # (the suffixed identifier used to be computed but never used)
            gene_id = base_id
            it = 1
            while gene_id in exons:
                it += 1
                gene_id = base_id + "_" + str(it)
            exons[gene_id] = g_exons
            indels[gene_id] = g_indels

    print("Sorting genes...", flush=True)
    genes_order = sorted(exons, key=self._sort_genes_ids)

    print("Writing genes in GTF file...", flush=True)
    gene_id_nb = 0
    with open(gtf_file, "w") as gtf:
        for gene in genes_order:
            match = gene_pattern.match(gene)
            seqname, start, end, strand, _, copy_nb = match.groups()
            if copy_nb is None:
                gene_id_nb += 1
                gene_id = "GENE%0.10d" % gene_id_nb
            else:
                # Suffixed genes sort right after their plain identifier: same number, plus suffix
                gene_id = ("GENE%0.10d_" % gene_id_nb) + copy_nb
            gene_c = {
                "id": gene_id,
                "seqname": seqname,
                "start": int(start) + 1,  # Transform 0-based => 1-based
                "end": int(end),
                "strand": strand
            }
            self._write_gene(gtf=gtf, gene=gene_c, exons=exons[gene], indels=indels[gene])
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
@@ -79,6 +272,8 @@ if __name__ == "__main__":
...
@@ -79,6 +272,8 @@ if __name__ == "__main__":
parser
.
add_argument
(
"-a"
,
"--assembly"
,
help
=
"De novo assembly fasta file"
,
required
=
True
)
parser
.
add_argument
(
"-a"
,
"--assembly"
,
help
=
"De novo assembly fasta file"
,
required
=
True
)
parser
.
add_argument
(
"-m"
,
"--map"
,
help
=
"BAM map file build with minimap2 (if not given, will be computed now)"
,
parser
.
add_argument
(
"-m"
,
"--map"
,
help
=
"BAM map file build with minimap2 (if not given, will be computed now)"
,
required
=
False
)
required
=
False
)
parser
.
add_argument
(
"-q"
,
"--min-qoverlap"
,
help
=
"Minimal query overlap [0-100]"
,
type
=
int
,
default
=
90
,
required
=
False
)
parser
.
add_argument
(
"-o"
,
"--output-dir"
,
help
=
"Output folder path"
,
required
=
False
,
default
=
"."
)
parser
.
add_argument
(
"-o"
,
"--output-dir"
,
help
=
"Output folder path"
,
required
=
False
,
default
=
"."
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
...
@@ -88,12 +283,20 @@ if __name__ == "__main__":
...
@@ -88,12 +283,20 @@ if __name__ == "__main__":
elif
not
os
.
path
.
isdir
(
args
.
output_dir
):
elif
not
os
.
path
.
isdir
(
args
.
output_dir
):
print
(
"Error: output folder %s exists but is not a folder"
%
args
.
output_dir
,
file
=
sys
.
stderr
)
print
(
"Error: output folder %s exists but is not a folder"
%
args
.
output_dir
,
file
=
sys
.
stderr
)
annotator
=
Miniannotator
(
args
.
reference
,
args
.
assembly
)
# Map:
# Map:
if
args
.
map
is
None
:
if
args
.
map
is
None
:
annotator
=
Miniannotator
(
reference
=
args
.
reference
,
assembly
=
args
.
assembly
,
qoverlap
=
args
.
min_qoverlap
/
100
)
map_file
=
os
.
path
.
join
(
args
.
output_dir
,
"map.bam"
)
map_file
=
os
.
path
.
join
(
args
.
output_dir
,
"map.bam"
)
annotator
.
launch_minimap
(
map_file
)
annotator
.
launch_minimap
(
map_file
)
else
:
else
:
print
(
"Using map from %s..."
%
args
.
map
)
print
(
"Using map from %s..."
%
args
.
map
,
flush
=
True
)
map_file
=
args
.
map
annotator
=
Miniannotator
(
reference
=
args
.
reference
,
assembly
=
args
.
assembly
,
qoverlap
=
args
.
min_qoverlap
/
100
,
map
=
args
.
map
)
# Search genes
annotator
.
search_genes
(
os
.
path
.
join
(
args
.
output_dir
,
"annotations.gtf"
))
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment