Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
magatt
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
UMR GDEC
magatt
Commits
a16a78fd
Commit
a16a78fd
authored
5 years ago
by
Helene Rimbert
Browse files
Options
Downloads
Patches
Plain Diff
save last modif recalcCoords
parent
dbb390cd
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
bin/recalcCoordsFromBlat.py
+135
-117
135 additions, 117 deletions
bin/recalcCoordsFromBlat.py
with
135 additions
and
117 deletions
bin/recalcCoordsFromBlat.py
+
135
−
117
View file @
a16a78fd
...
...
@@ -7,6 +7,7 @@ from Bio.Seq import Seq
from
Bio.SeqRecord
import
SeqRecord
from
Bio.Alphabet
import
IUPAC
from
Bio
import
SeqIO
from
pprint
import
pprint
import
argparse
import
pysam
import
sys
...
...
@@ -31,7 +32,7 @@ class recalcCoords (object):
self
.
newCoord
=
defaultdict
()
self
.
outputGff
=
''
self
.
geneMapping
=
defaultdict
()
self
.
g
eneAnnot
=
=
defaultdict
()
self
.
newG
eneAnnot
=
defaultdict
()
def
main
(
self
):
...
...
@@ -89,7 +90,7 @@ class recalcCoords (object):
'''
Recalc the new coord
'''
self
.
recalcCoord
(
self
.
newGeneAnnot
[
geneID
]
=
self
.
recalcCoord
(
strand
=
blat
[
'
hitStrandOnTarget
'
],
queryChrom
=
blat
[
'
queryChrom
'
],
queryStart
=
blat
[
'
queryStart
'
],
...
...
@@ -101,7 +102,9 @@ class recalcCoords (object):
hitStop
=
blat
[
'
hitStopOnTarget
'
],
annot
=
annot
,
blatMapping
=
blat
)
input
()
pprint
(
self
.
newGeneAnnot
[
geneID
])
input
()
def
recalcCoord
(
self
,
strand
,
queryChrom
,
queryStart
,
queryStop
,
targetChrom
,
targetStart
,
targetStop
,
hitStart
,
hitStop
,
annot
,
blatMapping
):
...
...
@@ -111,24 +114,26 @@ class recalcCoords (object):
newCdsSequence
=
''
queryCdsSequence
=
''
if
strand
==
'
+
'
:
print
(
'
Hit on strand + of the target
'
)
print
(
'
target chrom {} starting from {} and ending at {}
'
.
format
(
targetChrom
,
targetStart
,
targetStop
))
print
(
'
Hit on strand {} of the target
'
.
format
(
strand
))
print
(
'
target chrom {} starting from {} and ending at {}
'
.
format
(
targetChrom
,
targetStart
,
targetStop
))
for
feature
in
annot
:
feature
=
feature
.
rstrip
(
'
\n
'
).
split
(
"
\t
"
)
print
(
'
************ recalc coord for feature {}
'
.
format
(
feature
))
for
feature
in
annot
:
feature
=
feature
.
rstrip
(
'
\n
'
).
split
(
"
\t
"
)
print
(
'
************ recalc coord for feature {}
'
.
format
(
feature
))
(
newStart
,
newStop
,
newFeatureLength
,
newStrand
,
newChrom
)
=
(
0
,
0
,
0
,
0
,
0
)
# extract infos from col #9
gffInfoTable
=
feature
[
8
].
split
(
'
;
'
)
# extract infos from col #9
gffInfoTable
=
feature
[
8
].
split
(
'
;
'
)
# and transform it into a dict
gffInfoDict
=
defaultdict
()
for
infoPair
in
gffInfoTable
:
(
key
,
value
)
=
infoPair
.
split
(
'
=
'
)
gffInfoDict
[
key
]
=
value
# and transform it into a dict
gffInfoDict
=
defaultdict
()
for
infoPair
in
gffInfoTable
:
(
key
,
value
)
=
infoPair
.
split
(
'
=
'
)
gffInfoDict
[
key
]
=
value
# recalc the start and stop of current feature
# recalc the start and stop of current feature
if
strand
==
'
+
'
:
#FORMULA: NewCoord = targetStart + hitStart -1 +(oldCoord - geneStart)
newStart
=
int
(
targetStart
)
+
int
(
hitStart
)
-
1
+
(
int
(
feature
[
3
])
-
int
(
queryStart
))
newStop
=
int
(
targetStart
)
+
int
(
hitStart
)
+
(
int
(
feature
[
4
])
-
int
(
queryStart
))
newFeatureLength
=
newStop
-
newStart
+
1
...
...
@@ -136,121 +141,134 @@ class recalcCoords (object):
newStrand
=
feature
[
6
]
newChrom
=
targetChrom
print
(
'
New start = {} new stop = {} : new feature length = {}
'
.
format
(
newStart
,
newStop
,
newFeatureLength
))
# save the new coords for mrna and gene features
if
feature
[
2
]
in
[
'
gene
'
,
'
mRNA
'
]:
newFeature
=
feature
newFeature
[
0
]
=
newChrom
newFeature
[
3
]
=
newStart
newFeature
[
4
]
=
newStop
newFeature
[
6
]
=
newStrand
print
(
"
new feature coords on target genome: {}
"
.
format
(
newFeature
))
newAnnotCoord
.
append
(
newFeature
)
continue
elif
strand
==
'
-
'
:
#FORMULA: NewCoord = targetStart + hitStart -1 +(geneStop - oldCoord)
newStart
=
int
(
targetStart
)
+
int
(
hitStart
)
+
(
int
(
queryStop
)
-
int
(
feature
[
4
]))
newStop
=
int
(
targetStart
)
+
int
(
hitStart
)
+
1
+
(
int
(
queryStop
)
-
int
(
feature
[
3
]))
newFeatureLength
=
newStop
-
newStart
+
1
# strand oposit from the query
if
feature
[
6
]
==
'
+
'
:
newStrand
=
'
-
'
elif
feature
[
6
]
==
'
-
'
:
newStrand
=
'
+
'
newChrom
=
targetChrom
"""
CHECK FOR CDS/UTR/EXONS integrity:
extract the fasta sequence to compare the UTR/EXON/CDS features before and after recalc
1. from query sequence
"""
queryFeatureSeq
=
self
.
getFastaSeq
(
fasta
=
self
.
queryFasta
,
chrom
=
queryChrom
,
start
=
int
(
feature
[
3
])
-
1
,
stop
=
int
(
feature
[
4
]))
querySeqRecord
=
SeqRecord
(
Seq
(
queryFeatureSeq
,
IUPAC
.
ambiguous_dna
),
id
=
gffInfoDict
[
'
ID
'
],
name
=
gffInfoDict
[
'
ID
'
],
description
=
'
feature
'
+
feature
[
2
]
+
'
of sequence extracted from
'
+
self
.
options
.
queryGenome
)
#print(querySeqRecord,len(querySeqRecord))
"""
2. From target Sequence
"""
targetFeatureSeq
=
self
.
getFastaSeq
(
fasta
=
self
.
targetFasta
,
chrom
=
newChrom
,
start
=
int
(
newStart
),
stop
=
int
(
newStop
))
targetSeqRecord
=
SeqRecord
(
Seq
(
targetFeatureSeq
,
IUPAC
.
ambiguous_dna
),
id
=
gffInfoDict
[
'
ID
'
],
name
=
gffInfoDict
[
'
ID
'
],
description
=
'
feature
'
+
feature
[
2
]
+
'
of sequence extracted from
'
+
self
.
options
.
targetGenome
)
#print(targetSeqRecord,len(targetSeqRecord))
"""
Concat the cds sequence if current feature is cds
"""
if
feature
[
2
]
==
'
CDS
'
:
newCdsSequence
+=
str
(
targetSeqRecord
.
seq
)
queryCdsSequence
+=
str
(
querySeqRecord
.
seq
)
"""
Test if the 2 sequences are the same to validate the recalculation
"""
seq1
=
str
(
querySeqRecord
.
seq
).
upper
()
seq2
=
str
(
targetSeqRecord
.
seq
).
upper
()
if
seq1
==
seq2
:
print
(
"
the two sequences are the same
\n
"
)
newFeature
=
feature
newFeature
[
0
]
=
newChrom
newFeature
[
3
]
=
newStart
newFeature
[
4
]
=
newStop
newFeature
[
6
]
=
newStrand
print
(
"
new feature coords on target genome: {}
"
.
format
(
newFeature
))
newAnnotCoord
.
append
(
newFeature
)
elif
seq1
!=
seq2
and
blatMapping
[
'
mappingStatus
'
]
==
'
fullMatchWithMissmatches
'
:
print
(
"
The two sequences are different but the blat status is fullMatchWithMissmatches so we will be lenient
"
)
newFeature
=
feature
newFeature
[
0
]
=
newChrom
newFeature
[
3
]
=
newStart
newFeature
[
4
]
=
newStop
newFeature
[
6
]
=
newStrand
print
(
"
new feature coords on target genome: {}
"
.
format
(
newFeature
))
newAnnotCoord
.
append
(
newFeature
)
else
:
sys
.
stderr
.
write
(
"
Cannot parse strand of the hit !
"
)
exit
()
else
:
sys
.
stderr
.
write
(
"
error with feature {} :
"
.
format
(
gffInfoDict
[
'
ID
'
]))
sys
.
stderr
.
write
(
"
the 2 sequences are not the same: cannot transfert this gene !
\n
"
)
#print('Query: {}'.format(seq1))
#print(' Target: {}'.format(seq2))
transfertStatus
=
1
print
(
'
New start = {} new stop = {} : new feature length = {}
'
.
format
(
newStart
,
newStop
,
newFeatureLength
))
elif
strand
==
'
-
'
:
# save the new coords for mrna and gene features
if
feature
[
2
]
in
[
'
gene
'
,
'
mRNA
'
]:
newFeature
=
feature
newFeature
[
0
]
=
newChrom
newFeature
[
3
]
=
newStart
newFeature
[
4
]
=
newStop
newFeature
[
6
]
=
newStrand
print
(
"
new feature coords on target genome: {}
"
.
format
(
newFeature
))
newAnnotCoord
.
append
(
newFeature
)
continue
print
(
'
Hit on strand - of the target
'
)
print
(
'
target chrom {} starting from {} and ending at {}
'
.
format
(
targetChrom
,
targetStart
,
targetStop
))
print
(
'
we reverse the strand on the target
'
)
"""
CHECK FOR CDS/UTR/EXONS integrity:
extract the fasta sequence to compare the UTR/EXON/CDS features before and after recalc
1. from query sequence
"""
queryFeatureSeq
=
self
.
getFastaSeq
(
fasta
=
self
.
queryFasta
,
chrom
=
queryChrom
,
start
=
int
(
feature
[
3
])
-
1
,
stop
=
int
(
feature
[
4
]))
querySeqRecord
=
SeqRecord
(
Seq
(
queryFeatureSeq
,
IUPAC
.
ambiguous_dna
),
id
=
gffInfoDict
[
'
ID
'
],
name
=
gffInfoDict
[
'
ID
'
],
description
=
'
feature
'
+
feature
[
2
]
+
'
of sequence extracted from
'
+
self
.
options
.
queryGenome
)
#print(querySeqRecord,len(querySeqRecord))
"""
2. From target Sequence
"""
targetFeatureSeq
=
self
.
getFastaSeq
(
fasta
=
self
.
targetFasta
,
chrom
=
newChrom
,
start
=
int
(
newStart
),
stop
=
int
(
newStop
))
targetSeqRecord
=
SeqRecord
(
Seq
(
targetFeatureSeq
,
IUPAC
.
ambiguous_dna
),
id
=
gffInfoDict
[
'
ID
'
],
name
=
gffInfoDict
[
'
ID
'
],
description
=
'
feature
'
+
feature
[
2
]
+
'
of sequence extracted from
'
+
self
.
options
.
targetGenome
)
#print(targetSeqRecord,len(targetSeqRecord))
"""
Concat the cds sequence if current feature is cds
"""
if
feature
[
2
]
==
'
CDS
'
:
newCdsSequence
+=
str
(
targetSeqRecord
.
seq
)
queryCdsSequence
+=
str
(
querySeqRecord
.
seq
)
"""
Test if the 2 sequences are the same to validate the recalculation
"""
seq1
=
str
(
querySeqRecord
.
seq
).
upper
()
if
newStrand
==
'
-
'
:
# need to revComp the Sequence
seq2
=
str
(
targetSeqRecord
.
reverse_complement
().
seq
).
upper
()
elif
newStrand
==
'
+
'
:
seq2
=
str
(
targetSeqRecord
.
seq
).
upper
()
else
:
sys
.
stderr
.
write
(
"
Error when converting sequence to uppercase for comparison
"
)
sys
.
stderr
.
write
(
"
Cannot gues strand of the feature on the target sequence
"
)
exit
()
if
seq1
==
seq2
:
print
(
"
the two sequences are the same
\n
"
)
newFeature
=
feature
newFeature
[
0
]
=
newChrom
newFeature
[
3
]
=
newStart
newFeature
[
4
]
=
newStop
newFeature
[
6
]
=
newStrand
print
(
"
new feature coords on target genome: {}
"
.
format
(
newFeature
))
newAnnotCoord
.
append
(
newFeature
)
elif
seq1
!=
seq2
and
blatMapping
[
'
mappingStatus
'
]
==
'
fullMatchWithMissmatches
'
:
print
(
"
The two sequences are different but the blat status is fullMatchWithMissmatches so we will be lenient
"
)
print
(
'
Query: {}
'
.
format
(
seq1
))
print
(
'
Target: {}
'
.
format
(
seq2
))
newFeature
=
feature
newFeature
[
0
]
=
newChrom
newFeature
[
3
]
=
newStart
newFeature
[
4
]
=
newStop
newFeature
[
6
]
=
newStrand
print
(
"
new feature coords on target genome: {}
"
.
format
(
newFeature
))
newAnnotCoord
.
append
(
newFeature
)
for
feature
in
annot
:
feature
=
feature
.
rstrip
(
'
\n
'
).
split
(
"
\t
"
)
print
(
'
recalc coord for feature {}
'
.
format
(
feature
))
else
:
sys
.
stderr
.
write
(
"
error with feature {} :
"
.
format
(
gffInfoDict
[
'
ID
'
]))
sys
.
stderr
.
write
(
"
the 2 sequences are not the same: cannot transfert this gene !
\n
"
)
print
(
'
Query: {}
'
.
format
(
seq1
))
print
(
'
Target: {}
'
.
format
(
seq2
))
transfertStatus
=
1
else
:
sys
.
stderr
.
write
(
'
Cannot parse strand of blat hit on target
'
)
"""
If at least one feature failed when recal coords,
aka the sequences are not the same in target and query,
we do not return the gene and its subfeatures
"""
if
(
transfertStatus
==
1
and
strand
==
'
+
'
):
if
(
transfertStatus
==
1
):
sys
.
stderr
.
write
(
"
At least one UTR/EXON/CDS feature failed when recalc coords for gene {}.
\n
"
.
format
(
geneid
))
return
0
elif
(
strand
==
'
+
'
):
print
(
"
New GFF:
"
)
print
(
*
newAnnotCoord
,
sep
=
"
\n
"
)
if
self
.
getTranslatedCDS
(
seq
=
newCdsSequence
,
strand
=
newStrand
)
!=
self
.
getTranslatedCDS
(
seq
=
queryCdsSequence
,
strand
=
feature
[
6
]):
print
(
"
Translated CDS are NOT the same
"
)
newAnnotCoord
=
self
.
addCdsWarningTag
(
annotArray
=
newAnnotCoord
)
return
newAnnotCoord
else
:
print
(
"
next
"
)
print
(
"
New GFF:
"
)
print
(
*
newAnnotCoord
,
sep
=
"
\n
"
)
if
self
.
getTranslatedCDS
(
seq
=
newCdsSequence
,
strand
=
newStrand
)
!=
self
.
getTranslatedCDS
(
seq
=
queryCdsSequence
,
strand
=
feature
[
6
]):
print
(
"
Translated CDS are NOT the same
"
)
newAnnotCoord
=
self
.
addCdsWarningTag
(
annotArray
=
newAnnotCoord
)
return
newAnnotCoord
def
addCdsWarningTag
(
self
,
annotArray
):
for
featureArray
in
annotArray
:
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment