Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
magatt
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
UMR GDEC
magatt
Commits
f494725a
Commit
f494725a
authored
5 years ago
by
Helene Rimbert
Browse files
Options
Downloads
Patches
Plain Diff
refactoring in an OO way. recalc for strand + ok
parent
fa8d2c66
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
bin/recalcCoordsFromBlat.py
+394
-307
394 additions, 307 deletions
bin/recalcCoordsFromBlat.py
with
394 additions
and
307 deletions
bin/recalcCoordsFromBlat.py
+
394
−
307
View file @
f494725a
...
...
@@ -7,336 +7,423 @@ from Bio.Seq import Seq
from
Bio.SeqRecord
import
SeqRecord
from
Bio.Alphabet
import
IUPAC
from
Bio
import
SeqIO
import
argparse
import
pysam
import
sys
"""
Command line arguments
"""
blatMapping
=
sys
.
argv
[
1
]
# Blat result file from microMappingPipeline.py
summaryMapping
=
sys
.
argv
[
2
]
# Summary of the blat mapping from microMappingPipeline.py
outputDir
=
sys
.
argv
[
3
]
# output directory ...
genomeQuery
=
sys
.
argv
[
4
]
# fasta file of the v1 genome (must be fai indexed)
genomeTarget
=
sys
.
argv
[
5
]
# fasta file of the v2 genome (must be fai indexed)
annotQuery
=
sys
.
argv
[
6
]
# initial GFF file of the annotation
"""
Global var
"""
geneAnnot
=
defaultdict
()
geneMapping
=
defaultdict
()
newCoord
=
defaultdict
()
outputGff
=
outputDir
+
'
/targetAnnot.gff3
'
def
main
():
"""
check for input files and output directory
"""
sys
.
stdout
.
write
(
'
Check input files:
'
)
checkFile
(
file
=
blatMapping
)
checkFile
(
file
=
summaryMapping
)
checkFile
(
file
=
annotQuery
)
"""
Save blat mapping info into dict
"""
geneMapping
=
saveBlatResults
(
summary
=
summaryMapping
,
blat
=
blatMapping
)
"""
Save Annotation infirmation from GFF file
geneAnnot is a dict with gene id as key
"""
geneAnnot
=
saveAnnotFromGFF
(
gff
=
annotQuery
)
"""
For each gene, with full match (with or without missmatches)
"""
for
geneID
in
sorted
(
geneAnnot
.
keys
()):
print
(
"
Dealing with gene {}
\n
"
.
format
(
geneID
))
blat
=
''
annot
=
geneAnnot
[
geneID
]
if
geneID
in
geneMapping
.
keys
():
blat
=
geneMapping
[
geneID
]
else
:
sys
.
stderr
.
srite
(
"
ERROR cannot find gene {} in blat data
\n
"
.
format
(
geneID
))
continue
numFeatures
=
len
(
annot
)
# some logs
print
(
"
Annotation features:
"
)
print
(
annot
)
print
(
"
Found {} features for gene {}
\n
"
.
format
(
numFeatures
,
geneID
))
print
(
"
Blat mapping:
"
)
print
(
blat
)
'''
Recalc the new coord
'''
newCoord
=
recalcCoord
(
strand
=
blat
[
'
hitStrandOnTarget
'
],
queryChrom
=
blat
[
'
queryChrom
'
],
queryStart
=
blat
[
'
queryStart
'
],
queryStop
=
blat
[
'
queryStop
'
],
targetChrom
=
blat
[
'
targetChrom
'
],
targetStart
=
blat
[
'
targetStart
'
],
targetStop
=
blat
[
'
targetStop
'
],
hitStart
=
blat
[
'
hitStartOnTarget
'
],
hitStop
=
blat
[
'
hitStopOnTarget
'
],
annot
=
annot
)
input
()
def
recalcCoord
(
strand
,
queryChrom
,
queryStart
,
queryStop
,
targetChrom
,
targetStart
,
targetStop
,
hitStart
,
hitStop
,
annot
):
newAnnotCoord
=
[]
transfertStatus
=
0
geneid
=
annot
[
0
].
split
(
"
\t
"
)[
8
].
split
(
'
;
'
)[
0
]
if
strand
==
'
+
'
:
print
(
'
Hit on strand + of the target
'
)
print
(
'
target chrom {} starting from {} and ending at {}
'
.
format
(
targetChrom
,
targetStart
,
targetStop
))
for
feature
in
annot
:
feature
=
feature
.
rstrip
(
'
\n
'
).
split
(
"
\t
"
)
print
(
'
************ recalc coord for feature {}
'
.
format
(
feature
))
# extract infos from col #9
gffInfoTable
=
feature
[
8
].
split
(
'
;
'
)
# and transform it into a dict
gffInfoDict
=
defaultdict
()
for
infoPair
in
gffInfoTable
:
(
key
,
value
)
=
infoPair
.
split
(
'
=
'
)
gffInfoDict
[
key
]
=
value
# recalc the start and stop of current feature
newStart
=
int
(
targetStart
)
+
int
(
hitStart
)
-
1
+
(
int
(
feature
[
3
])
-
int
(
queryStart
))
newStop
=
int
(
targetStart
)
+
int
(
hitStart
)
+
(
int
(
feature
[
4
])
-
int
(
queryStart
))
newFeatureLength
=
newStop
-
newStart
+
1
# strand stays the same as in query gff
newStrand
=
feature
[
6
]
newChrom
=
targetChrom
print
(
'
New start = {} new stop = {} : new feature length = {}
'
.
format
(
newStart
,
newStop
,
newFeatureLength
))
"""
extract the fasta sequence to compare the features before and after recalc
1. from query
"""
queryFeatureSeq
=
getFastaSeq
(
fasta
=
queryFasta
,
chrom
=
queryChrom
,
start
=
int
(
feature
[
3
])
-
1
,
stop
=
int
(
feature
[
4
]))
querySeqRecord
=
SeqRecord
(
Seq
(
queryFeatureSeq
,
IUPAC
.
ambiguous_dna
),
id
=
gffInfoDict
[
'
ID
'
],
name
=
gffInfoDict
[
'
ID
'
],
description
=
'
Sequence extracted from
'
+
genomeQuery
)
print
(
querySeqRecord
)
"""
Extract FASTA
2. From target Sequence
"""
targetFeatureSeq
=
getFastaSeq
(
fasta
=
targetFasta
,
chrom
=
newChrom
,
start
=
int
(
newStart
),
stop
=
int
(
newStop
))
targetSeqRecord
=
SeqRecord
(
Seq
(
targetFeatureSeq
,
IUPAC
.
ambiguous_dna
),
id
=
gffInfoDict
[
'
ID
'
],
name
=
gffInfoDict
[
'
ID
'
],
description
=
'
Sequence extracted from
'
+
genomeTarget
)
print
(
targetSeqRecord
)
"""
Test if the 2 sequences are the same to validate the recalculation
"""
if
queryFeatureSeq
==
targetFeatureSeq
:
print
(
"
the two sequences are the same
\n
"
)
newFeature
=
feature
newFeature
[
0
]
=
newChrom
newFeature
[
3
]
=
newStart
newFeature
[
4
]
=
newStop
newFeature
[
6
]
=
newStrand
print
(
"
new feature coords on target genome: {}
"
.
format
(
newFeature
))
newAnnotCoord
.
append
(
newFeature
)
# blatMapping = sys.argv[1] # Blat result file from microMappingPipeline.py
# summaryMapping = sys.argv[2] # Summary of the blat mapping from microMappingPipeline.py
# outputDir = sys.argv[3] # output directory ...
# genomeQuery = sys.argv[4] # fasta file of the v1 genome (must be fai indexed)
# genomeTarget = sys.argv[5] # fasta file of the v2 genome (must be fai indexed)
# annotQuery = sys.argv[6] # initial GFF file of the annotation
class
recalcCoords
(
object
):
def
__init__
(
self
):
"""
Global var
"""
self
.
geneAnnot
=
defaultdict
()
self
.
geneMapping
=
defaultdict
()
self
.
newCoord
=
defaultdict
()
self
.
outputGff
=
''
self
.
geneMapping
=
defaultdict
()
self
.
geneAnnot
==
defaultdict
()
def
main
(
self
):
self
.
getOptions
()
self
.
checkInputs
()
self
.
openFastaHandlers
()
self
.
openOutputGffHandler
()
# Save blat mapping info into dict. self.geneMapping is a dict with gene id as key
self
.
saveBlatResults
(
summary
=
self
.
options
.
summaryMapping
,
blat
=
self
.
options
.
blat
)
# Save Annotation information from GFF file. self.geneAnnot is a dict with gene id as key
self
.
saveAnnotFromGFF
()
# For each gene, with full match (with or without missmatches), we try to recalc the coordinates
self
.
parseAnnotAndRecalc
()
def
parseAnnotAndRecalc
(
self
):
for
geneID
in
sorted
(
self
.
geneAnnot
.
keys
()):
blat
=
''
annot
=
''
print
(
"
Dealing with gene {}
\n
"
.
format
(
geneID
))
# save current anno for the working gene.
annot
=
self
.
geneAnnot
[
geneID
]
if
geneID
in
self
.
geneMapping
.
keys
():
blat
=
self
.
geneMapping
[
geneID
]
print
(
'
Blast summary found: {}
'
.
format
(
blat
))
else
:
sys
.
stderr
.
write
(
"
error with feature {} :
"
.
format
(
gffInfoDict
[
'
ID
'
]))
sys
.
stderr
.
write
(
"
the 2 sequences are not the same: cannot transfert this gene !
\n
"
)
print
(
'
Query: {}
'
.
format
(
queryFeatureSeq
))
print
(
'
Target: {}
'
.
format
(
targetFeatureSeq
))
transfertStatus
=
1
input
()
elif
strand
==
'
-
'
:
print
(
'
Hit on strand - of the target
'
)
print
(
'
target chrom {} starting from {} and ending at {}
'
.
format
(
targetChrom
,
targetStart
,
targetStop
))
print
(
'
we reverse the strand on the target
'
)
for
feature
in
annot
:
feature
=
feature
.
rstrip
(
'
\n
'
).
split
(
"
\t
"
)
print
(
'
recalc coord for feature {}
'
.
format
(
feature
))
else
:
sys
.
stderr
.
write
(
'
Cannot parse strand of blat hit on target
'
)
"""
If at least one feature failed when recal coords,
aka the sequences are not the same in target and query,
we do not return the gene and its subfeatures
"""
if
(
transfertStatus
==
1
):
sys
.
stderr
.
write
(
"
At least one feature failed when recalc coords for gene {}.
\n
"
.
format
(
geneid
))
return
0
else
:
print
(
"
New GFF:
"
)
print
(
newAnnotCoord
)
return
newAnnotCoord
def
saveAnnotFromGFF
(
gff
):
sys
.
stdout
.
write
(
"
2. saving Query Annotation from file {} into Dict
\n
"
.
format
(
gff
))
sys
.
stdout
.
write
(
"
===========================================================================
\n
"
)
dataDict
=
defaultdict
()
geneID
=
''
with
open
(
gff
)
as
gffRecords
:
for
line
in
gffRecords
.
readlines
():
if
line
.
startswith
(
'
#
'
):
sys
.
stderr
.
write
(
"
ERROR cannot find gene {} in blat data
\n
"
.
format
(
geneID
))
# TODO:
continue
# transform gff line into array
(
chrom
,
source
,
featureType
,
start
,
stop
,
score
,
strand
,
frame
,
info
)
=
line
.
rstrip
(
'
\n
'
).
split
(
'
\t
'
)
numFeatures
=
len
(
annot
)
# save the last column with info tags
gffInfoTable
=
info
.
split
(
'
;
'
)
# some logs
print
(
"
Annotation features:
"
)
print
(
annot
)
print
(
"
Found {} features for gene {}
\n
"
.
format
(
numFeatures
,
geneID
))
print
(
"
Blat mapping data {}:
"
.
format
(
blat
))
# and transform it into a dict
gffInfoDict
=
defaultdict
()
for
infoPair
in
gffInfoTable
:
(
key
,
value
)
=
infoPair
.
split
(
'
=
'
)
gffInfoDict
[
key
]
=
value
if
blat
[
'
mappingStatus
'
]
==
'
unmapped
'
:
print
(
"
Blat mapping status is {}
"
.
format
(
blat
[
'
mappingStatus
'
]))
print
(
"
We cannot go further for now
"
)
# TODO:
if
featureType
==
'
gene
'
:
geneID
=
gffInfoDict
[
'
ID
'
]
dataDict
[
geneID
]
=
[
line
.
rstrip
(
'
\n
'
)]
elif
blat
[
'
mappingStatus
'
]
==
'
imperfectMatch
'
:
print
(
"
Blat mapping status is {}
"
.
format
(
blat
[
'
mappingStatus
'
]))
print
(
"
refine mapping with exonerate or something
"
)
# TODO:
else
:
dataDict
[
geneID
].
append
(
line
.
rstrip
(
'
\n
'
))
numGenes
=
len
(
dataDict
.
keys
())
sys
.
stdout
.
write
(
"
Found {} genes in GFF file {}
\n
"
.
format
(
numGenes
,
gff
))
return
dataDict
def
saveBlatResults
(
summary
,
blat
):
sys
.
stdout
.
write
(
"
1. saving blat results {} and summary {} into Dict
\n
"
.
format
(
blat
,
summary
))
sys
.
stdout
.
write
(
"
===========================================================================
\n
"
)
dataDict
=
defaultdict
()
# SUMMARY DATA
with
open
(
summary
)
as
summaryData
:
for
line
in
summaryData
.
readlines
():
#sys.stdout.write(' current record is {}'.format(line))
(
geneName
,
markerPair
,
mappingStatus
,
queryChrom
,
markerStatus
,
markerPair2
,
geneName2
,
geneQueryStart
,
geneQueryEnd
,
geneQueryStrand
,
leftMakerId
,
leftMakerQueryLoc
,
leftMarkerTargetLoc
,
rightMarkerId
,
rightMakerQueryLoc
,
rightMarkerTargetLoc
)
=
line
.
rstrip
(
'
\n
'
).
split
(
"
\t
"
)
dataDict
[
geneName
]
=
{
'
geneId
'
:
geneName
,
'
gffRecords
'
:
'
NA
'
,
'
gffSubfeatures
'
:
[],
'
targetStart
'
:
'
NA
'
,
'
targetStop
'
:
'
NA
'
,
'
targetChrom
'
:
'
NA
'
,
'
hitStrandOnTarget
'
:
'
NA
'
,
'
hitStartOnTarget
'
:
'
NA
'
,
'
hitStopOnTarget
'
:
'
NA
'
,
'
hitStrandOnTarget
'
:
'
NA
'
,
'
queryStart
'
:
geneQueryStart
,
'
queryStop
'
:
geneQueryEnd
,
'
queryChrom
'
:
queryChrom
,
'
queryStrand
'
:
geneQueryStrand
,
'
mappingStatus
'
:
mappingStatus
}
numGenesInDict
=
len
(
dataDict
.
keys
())
# BLAT MAPPING DATA
with
open
(
blat
)
as
blatData
:
for
line
in
blatData
.
readlines
():
(
geneName
,
markerPair
,
mappingStatus
,
mappingStatus2
,
blatCoverage
,
blatIndelBases
,
blatMissmatches
,
hitStartOnTarget
,
hitStopOnTarget
,
hitStrand
,
targetLength
,
targetName
)
=
line
.
rstrip
(
'
\n
'
).
split
(
'
\t
'
)
if
geneName
in
dataDict
.
keys
():
if
mappingStatus
==
'
unmapped
'
:
sys
.
stdout
.
write
(
"
gene {} is not mapped: no target information to recover.
\n
"
.
format
(
geneName
))
'''
Recalc the new coord
'''
self
.
recalcCoord
(
strand
=
blat
[
'
hitStrandOnTarget
'
],
queryChrom
=
blat
[
'
queryChrom
'
],
queryStart
=
blat
[
'
queryStart
'
],
queryStop
=
blat
[
'
queryStop
'
],
targetChrom
=
blat
[
'
targetChrom
'
],
targetStart
=
blat
[
'
targetStart
'
],
targetStop
=
blat
[
'
targetStop
'
],
hitStart
=
blat
[
'
hitStartOnTarget
'
],
hitStop
=
blat
[
'
hitStopOnTarget
'
],
annot
=
annot
,
blatMapping
=
blat
)
input
()
def
recalcCoord
(
self
,
strand
,
queryChrom
,
queryStart
,
queryStop
,
targetChrom
,
targetStart
,
targetStop
,
hitStart
,
hitStop
,
annot
,
blatMapping
):
newAnnotCoord
=
[]
transfertStatus
=
0
geneid
=
annot
[
0
].
split
(
"
\t
"
)[
8
].
split
(
'
;
'
)[
0
]
newCdsSequence
=
''
queryCdsSequence
=
''
if
strand
==
'
+
'
:
print
(
'
Hit on strand + of the target
'
)
print
(
'
target chrom {} starting from {} and ending at {}
'
.
format
(
targetChrom
,
targetStart
,
targetStop
))
for
feature
in
annot
:
feature
=
feature
.
rstrip
(
'
\n
'
).
split
(
"
\t
"
)
print
(
'
************ recalc coord for feature {}
'
.
format
(
feature
))
# extract infos from col #9
gffInfoTable
=
feature
[
8
].
split
(
'
;
'
)
# and transform it into a dict
gffInfoDict
=
defaultdict
()
for
infoPair
in
gffInfoTable
:
(
key
,
value
)
=
infoPair
.
split
(
'
=
'
)
gffInfoDict
[
key
]
=
value
# recalc the start and stop of current feature
newStart
=
int
(
targetStart
)
+
int
(
hitStart
)
-
1
+
(
int
(
feature
[
3
])
-
int
(
queryStart
))
newStop
=
int
(
targetStart
)
+
int
(
hitStart
)
+
(
int
(
feature
[
4
])
-
int
(
queryStart
))
newFeatureLength
=
newStop
-
newStart
+
1
# strand stays the same as in query gff
newStrand
=
feature
[
6
]
newChrom
=
targetChrom
print
(
'
New start = {} new stop = {} : new feature length = {}
'
.
format
(
newStart
,
newStop
,
newFeatureLength
))
# save the new coords for mrna and gene features
if
feature
[
2
]
in
[
'
gene
'
,
'
mRNA
'
]:
newFeature
=
feature
newFeature
[
0
]
=
newChrom
newFeature
[
3
]
=
newStart
newFeature
[
4
]
=
newStop
newFeature
[
6
]
=
newStrand
print
(
"
new feature coords on target genome: {}
"
.
format
(
newFeature
))
newAnnotCoord
.
append
(
newFeature
)
continue
"""
CHECK FOR CDS/UTR/EXONS integrity:
extract the fasta sequence to compare the UTR/EXON/CDS features before and after recalc
1. from query sequence
"""
queryFeatureSeq
=
self
.
getFastaSeq
(
fasta
=
self
.
queryFasta
,
chrom
=
queryChrom
,
start
=
int
(
feature
[
3
])
-
1
,
stop
=
int
(
feature
[
4
]))
querySeqRecord
=
SeqRecord
(
Seq
(
queryFeatureSeq
,
IUPAC
.
ambiguous_dna
),
id
=
gffInfoDict
[
'
ID
'
],
name
=
gffInfoDict
[
'
ID
'
],
description
=
'
feature
'
+
feature
[
2
]
+
'
of sequence extracted from
'
+
self
.
options
.
queryGenome
)
#print(querySeqRecord,len(querySeqRecord))
"""
2. From target Sequence
"""
targetFeatureSeq
=
self
.
getFastaSeq
(
fasta
=
self
.
targetFasta
,
chrom
=
newChrom
,
start
=
int
(
newStart
),
stop
=
int
(
newStop
))
targetSeqRecord
=
SeqRecord
(
Seq
(
targetFeatureSeq
,
IUPAC
.
ambiguous_dna
),
id
=
gffInfoDict
[
'
ID
'
],
name
=
gffInfoDict
[
'
ID
'
],
description
=
'
feature
'
+
feature
[
2
]
+
'
of sequence extracted from
'
+
self
.
options
.
targetGenome
)
#print(targetSeqRecord,len(targetSeqRecord))
"""
Concat the cds sequence if current feature is cds
"""
if
feature
[
2
]
==
'
CDS
'
:
newCdsSequence
+=
str
(
targetSeqRecord
.
seq
)
queryCdsSequence
+=
str
(
querySeqRecord
.
seq
)
"""
Test if the 2 sequences are the same to validate the recalculation
"""
seq1
=
str
(
querySeqRecord
.
seq
).
upper
()
seq2
=
str
(
targetSeqRecord
.
seq
).
upper
()
if
seq1
==
seq2
:
print
(
"
the two sequences are the same
\n
"
)
newFeature
=
feature
newFeature
[
0
]
=
newChrom
newFeature
[
3
]
=
newStart
newFeature
[
4
]
=
newStop
newFeature
[
6
]
=
newStrand
print
(
"
new feature coords on target genome: {}
"
.
format
(
newFeature
))
newAnnotCoord
.
append
(
newFeature
)
elif
seq1
!=
seq2
and
blatMapping
[
'
mappingStatus
'
]
==
'
fullMatchWithMissmatches
'
:
print
(
"
The two sequences are different but the blat status is fullMatchWithMissmatches so we will be lenient
"
)
newFeature
=
feature
newFeature
[
0
]
=
newChrom
newFeature
[
3
]
=
newStart
newFeature
[
4
]
=
newStop
newFeature
[
6
]
=
newStrand
print
(
"
new feature coords on target genome: {}
"
.
format
(
newFeature
))
newAnnotCoord
.
append
(
newFeature
)
else
:
(
prefix
,
targetChrom
,
targetStart
,
targetStop
)
=
targetName
.
split
(
'
_
'
)
dataDict
[
geneName
][
'
targetStart
'
]
=
targetStart
dataDict
[
geneName
][
'
targetStop
'
]
=
targetStop
dataDict
[
geneName
][
'
targetChrom
'
]
=
targetChrom
dataDict
[
geneName
][
'
hitStrandOnTarget
'
]
=
hitStrand
dataDict
[
geneName
][
'
hitStartOnTarget
'
]
=
hitStartOnTarget
dataDict
[
geneName
][
'
hitStopOnTarget
'
]
=
hitStopOnTarget
sys
.
stderr
.
write
(
"
error with feature {} :
"
.
format
(
gffInfoDict
[
'
ID
'
]))
sys
.
stderr
.
write
(
"
the 2 sequences are not the same: cannot transfert this gene !
\n
"
)
#print('Query: {}'.format(seq1))
#print(' Target: {}'.format(seq2))
transfertStatus
=
1
else
:
sys
.
stderr
.
write
(
"
ERROR: gene {} found in blat results but not in summary data!
\n
"
.
format
(
geneName
))
sys
.
exit
()
sys
.
stdout
.
write
(
"
Number of records in Gene Dict based on summaryData: {}
\n
"
.
format
(
numGenesInDict
))
return
dataDict
def
getFastaSeq
(
fasta
,
chrom
,
start
,
stop
):
seq
=
fasta
.
fetch
(
reference
=
chrom
,
start
=
start
,
end
=
stop
)
print
(
"
fasta sequence for chrom %s from %s to %s
\n
"
%
(
chrom
,
start
,
stop
))
#print(seq)
return
seq
def
checkFile
(
file
):
if
os
.
path
.
isfile
(
file
):
sys
.
stdout
.
write
(
"
File %s found
\n
"
%
str
(
file
))
else
:
sys
.
stdout
.
write
(
"
ERROR: Cannot find file %s
\n
"
%
str
(
file
))
sys
.
exit
()
def
checkDir
(
directory
):
if
os
.
path
.
isdir
(
directory
):
sys
.
stdout
.
write
(
"
Directory %s found
\n
"
%
str
(
directory
))
return
1
else
:
sys
.
stdout
.
write
(
"
Cannot find directory %s
\n
"
%
str
(
directory
))
return
0
elif
strand
==
'
-
'
:
if
__name__
==
"
__main__
"
:
if
(
checkDir
(
directory
=
outputDir
)
):
sys
.
stdout
.
write
(
"
0. Output directory already exists, no need to create it
\n
"
)
print
(
'
Hit on strand - of the target
'
)
print
(
'
target chrom {} starting from {} and ending at {}
'
.
format
(
targetChrom
,
targetStart
,
targetStop
))
print
(
'
we reverse the strand on the target
'
)
for
feature
in
annot
:
feature
=
feature
.
rstrip
(
'
\n
'
).
split
(
"
\t
"
)
print
(
'
recalc coord for feature {}
'
.
format
(
feature
))
else
:
sys
.
stderr
.
write
(
'
Cannot parse strand of blat hit on target
'
)
"""
If at least one feature failed when recal coords,
aka the sequences are not the same in target and query,
we do not return the gene and its subfeatures
"""
if
(
transfertStatus
==
1
and
strand
==
'
+
'
):
sys
.
stderr
.
write
(
"
At least one UTR/EXON/CDS feature failed when recalc coords for gene {}.
\n
"
.
format
(
geneid
))
return
0
elif
(
strand
==
'
+
'
):
print
(
"
New GFF:
"
)
print
(
*
newAnnotCoord
,
sep
=
"
\n
"
)
if
self
.
getTranslatedCDS
(
seq
=
newCdsSequence
,
strand
=
newStrand
)
!=
self
.
getTranslatedCDS
(
seq
=
queryCdsSequence
,
strand
=
feature
[
6
]):
print
(
"
Translated CDS are NOT the same
"
)
newAnnotCoord
=
self
.
addCdsWarningTag
(
annotArray
=
newAnnotCoord
)
return
newAnnotCoord
else
:
print
(
"
next
"
)
def
addCdsWarningTag
(
self
,
annotArray
):
for
featureArray
in
annotArray
:
featureArray
[
8
]
+=
'
;CDS_altered=True
'
return
annotArray
def
getTranslatedCDS
(
self
,
seq
,
strand
):
#print(" Evaluating CDS sequence for {}".format(seq))
seqObject
=
Seq
(
seq
,
IUPAC
.
ambiguous_dna
)
if
(
strand
==
'
-
'
):
seqObject
=
seqObject
.
reverse_complement
()
#print(" Rev Comp sequence is {}".format(seqObject))
pepObject
=
seqObject
.
translate
()
#print(" Translated sequence is {}".format(pepObject))
def
saveAnnotFromGFF
(
self
):
sys
.
stdout
.
write
(
"
2. saving Query Annotation from file {} into Dict
\n
"
.
format
(
self
.
options
.
gff
))
sys
.
stdout
.
write
(
"
===========================================================================
\n
"
)
geneID
=
''
with
open
(
self
.
options
.
gff
)
as
gffRecords
:
for
line
in
gffRecords
.
readlines
():
if
line
.
startswith
(
'
#
'
):
continue
# transform gff line into array
(
chrom
,
source
,
featureType
,
start
,
stop
,
score
,
strand
,
frame
,
info
)
=
line
.
rstrip
(
'
\n
'
).
split
(
'
\t
'
)
# save the last column with info tags
gffInfoTable
=
info
.
split
(
'
;
'
)
else
:
sys
.
stdout
.
write
(
"
0. Creating output directory %s
"
%
str
(
outputDir
))
os
.
mkdir
(
outputDir
,
0o750
)
# and transform it into a dict
gffInfoDict
=
defaultdict
()
for
infoPair
in
gffInfoTable
:
(
key
,
value
)
=
infoPair
.
split
(
'
=
'
)
gffInfoDict
[
key
]
=
value
if
featureType
==
'
gene
'
:
geneID
=
gffInfoDict
[
'
ID
'
]
self
.
geneAnnot
[
geneID
]
=
[
line
.
rstrip
(
'
\n
'
)]
else
:
self
.
geneAnnot
[
geneID
].
append
(
line
.
rstrip
(
'
\n
'
))
numGenes
=
len
(
self
.
geneAnnot
.
keys
())
sys
.
stdout
.
write
(
"
Found {} genes in GFF file {}
\n
"
.
format
(
numGenes
,
self
.
options
.
gff
))
def
saveBlatResults
(
self
,
summary
,
blat
):
sys
.
stdout
.
write
(
"
1. saving blat results {} and summary {} into Dict
\n
"
.
format
(
blat
,
summary
))
sys
.
stdout
.
write
(
"
===========================================================================
\n
"
)
# SUMMARY DATA
with
open
(
summary
)
as
summaryData
:
for
line
in
summaryData
.
readlines
():
#sys.stdout.write(' current record is {}'.format(line))
(
geneName
,
markerPair
,
mappingStatus
,
queryChrom
,
markerStatus
,
markerPair2
,
geneName2
,
geneQueryStart
,
geneQueryEnd
,
geneQueryStrand
,
leftMakerId
,
leftMakerQueryLoc
,
leftMarkerTargetLoc
,
rightMarkerId
,
rightMakerQueryLoc
,
rightMarkerTargetLoc
)
=
line
.
rstrip
(
'
\n
'
).
split
(
"
\t
"
)
self
.
geneMapping
[
geneName
]
=
{
'
geneId
'
:
geneName
,
'
gffRecords
'
:
'
NA
'
,
'
gffSubfeatures
'
:
[],
'
targetStart
'
:
'
NA
'
,
'
targetStop
'
:
'
NA
'
,
'
targetChrom
'
:
'
NA
'
,
'
hitStrandOnTarget
'
:
'
NA
'
,
'
hitStartOnTarget
'
:
'
NA
'
,
'
hitStopOnTarget
'
:
'
NA
'
,
'
hitStrandOnTarget
'
:
'
NA
'
,
'
queryStart
'
:
geneQueryStart
,
'
queryStop
'
:
geneQueryEnd
,
'
queryChrom
'
:
queryChrom
,
'
queryStrand
'
:
geneQueryStrand
,
'
mappingStatus
'
:
mappingStatus
}
numGenesInDict
=
len
(
self
.
geneMapping
.
keys
())
# BLAT MAPPING DATA
with
open
(
blat
)
as
blatData
:
for
line
in
blatData
.
readlines
():
(
geneName
,
markerPair
,
mappingStatus
,
mappingStatus2
,
blatCoverage
,
blatIndelBases
,
blatMissmatches
,
hitStartOnTarget
,
hitStopOnTarget
,
hitStrand
,
targetLength
,
targetName
)
=
line
.
rstrip
(
'
\n
'
).
split
(
'
\t
'
)
if
geneName
in
self
.
geneMapping
.
keys
():
if
mappingStatus
==
'
unmapped
'
:
#sys.stdout.write(" gene {} is not mapped: no target information to recover.\n".format(geneName))
continue
else
:
(
prefix
,
targetChrom
,
targetStart
,
targetStop
)
=
targetName
.
split
(
'
_
'
)
self
.
geneMapping
[
geneName
][
'
targetStart
'
]
=
targetStart
self
.
geneMapping
[
geneName
][
'
targetStop
'
]
=
targetStop
self
.
geneMapping
[
geneName
][
'
targetChrom
'
]
=
targetChrom
self
.
geneMapping
[
geneName
][
'
hitStrandOnTarget
'
]
=
hitStrand
self
.
geneMapping
[
geneName
][
'
hitStartOnTarget
'
]
=
hitStartOnTarget
self
.
geneMapping
[
geneName
][
'
hitStopOnTarget
'
]
=
hitStopOnTarget
else
:
sys
.
stderr
.
write
(
"
ERROR: gene {} found in blat results but not in summary data!
\n
"
.
format
(
geneName
))
sys
.
exit
()
sys
.
stdout
.
write
(
"
Number of records in Gene Dict based on summaryData: {}
\n
"
.
format
(
numGenesInDict
))
def
getFastaSeq
(
self
,
fasta
,
chrom
,
start
,
stop
):
seq
=
fasta
.
fetch
(
reference
=
chrom
,
start
=
start
,
end
=
stop
)
print
(
"
fasta sequence for chrom %s from %s to %s
\n
"
%
(
chrom
,
start
,
stop
))
#print(seq)
return
seq
def
checkFile
(
self
,
file
):
if
os
.
path
.
isfile
(
file
):
sys
.
stdout
.
write
(
"
File %s found
\n
"
%
str
(
file
))
else
:
sys
.
stdout
.
write
(
"
ERROR: Cannot find file %s
\n
"
%
str
(
file
))
sys
.
exit
()
def
checkDir
(
self
,
directory
):
if
os
.
path
.
isdir
(
directory
):
sys
.
stdout
.
write
(
"
Directory %s found
\n
"
%
str
(
directory
))
return
1
else
:
sys
.
stdout
.
write
(
"
Cannot find directory %s
\n
"
%
str
(
directory
))
return
0
def
getOptions
(
self
):
parser
=
argparse
.
ArgumentParser
(
description
=
'
Recalculate coordinates on a new reference sequence based on BLAT summary mapping
'
)
parser
.
add_argument
(
'
-b
'
,
'
--blat
'
,
help
=
'
Blat result file from microMappingPipeline.py
'
,
required
=
True
)
parser
.
add_argument
(
'
-s
'
,
'
--summaryMapping
'
,
help
=
'
Summary of the blat mapping from microMappingPipeline.py
'
,
required
=
True
)
parser
.
add_argument
(
'
-o
'
,
'
--outputDir
'
,
help
=
'
Ouput directory
'
,
required
=
True
)
parser
.
add_argument
(
'
-q
'
,
'
--queryGenome
'
,
help
=
'
fasta file of the v1 genome (must be fai indexed)
'
,
required
=
True
)
parser
.
add_argument
(
'
-t
'
,
'
--targetGenome
'
,
help
=
'
fasta file of the v2 genome (must be fai indexed)
'
,
required
=
True
)
parser
.
add_argument
(
'
-g
'
,
'
--gff
'
,
help
=
'
initial GFF file of the annotation from the v1 genome
'
,
required
=
True
)
self
.
options
=
parser
.
parse_args
()
def
openFastaHandlers
(
self
):
"""
open for reading the query and target reference file
This will be used to extract the dna sequences before and after recalculation to ensure that the sequences are the same
and that the calculation is correct
"""
sys
.
stdout
.
write
(
'
Open FASTA files for sequence extraction
\n
'
)
self
.
queryFasta
=
pysam
.
FastaFile
(
self
.
options
.
queryGenome
)
self
.
targetFasta
=
pysam
.
FastaFile
(
self
.
options
.
targetGenome
)
def
openOutputGffHandler
(
self
):
self
.
outputGff
=
self
.
options
.
outputDir
+
'
/targetAnnot.gff3
'
self
.
outputGffFH
=
open
(
self
.
outputGff
,
'
w
'
)
def
checkInputs
(
self
):
"""
check for input files and output directory
"""
sys
.
stdout
.
write
(
'
Check input files:
'
)
self
.
checkFile
(
file
=
self
.
options
.
blat
)
self
.
checkFile
(
file
=
self
.
options
.
summaryMapping
)
self
.
checkFile
(
file
=
self
.
options
.
gff
)
if
(
self
.
checkDir
(
directory
=
self
.
options
.
outputDir
)
):
sys
.
stdout
.
write
(
"
0. Output directory already exists, no need to create it
\n
"
)
else
:
sys
.
stdout
.
write
(
"
0. Creating output directory %s
"
%
str
(
self
.
options
.
outputDir
))
os
.
mkdir
(
self
.
options
.
outputDir
,
0o750
)
self
.
checkFile
(
file
=
self
.
options
.
queryGenome
)
self
.
checkFile
(
file
=
self
.
options
.
targetGenome
)
if
__name__
==
"
__main__
"
:
checkFile
(
file
=
genomeQuery
)
checkFile
(
file
=
genomeTarget
)
run
=
recalcCoords
(
)
run
.
main
(
)
"""
open for reading the query and target reference file
This will be used to extract the dna sequences before and after recalculation to ensure that the sequences are the same
and that the calculation is correct
"""
sys
.
stdout
.
write
(
'
Open FASTA files for sequence extraction
\n
'
)
queryFasta
=
pysam
.
FastaFile
(
genomeQuery
)
targetFasta
=
pysam
.
FastaFile
(
genomeTarget
)
outputGffFH
=
open
(
outputGff
,
'
w
'
)
main
()
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment