Skip to content
Snippets Groups Projects
Commit a16a78fd authored by Helene Rimbert's avatar Helene Rimbert
Browse files

save last modif recalcCoords

parent dbb390cd
No related branches found
No related tags found
No related merge requests found
......@@ -7,6 +7,7 @@ from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio import SeqIO
from pprint import pprint
import argparse
import pysam
import sys
......@@ -31,7 +32,7 @@ class recalcCoords (object):
self.newCoord=defaultdict()
self.outputGff=''
self.geneMapping=defaultdict()
self.geneAnnot==defaultdict()
self.newGeneAnnot=defaultdict()
def main(self):
......@@ -89,7 +90,7 @@ class recalcCoords (object):
'''
Recalc the new coord
'''
self.recalcCoord(
self.newGeneAnnot[geneID]=self.recalcCoord(
strand=blat['hitStrandOnTarget'],
queryChrom=blat['queryChrom'],
queryStart=blat['queryStart'],
......@@ -101,7 +102,9 @@ class recalcCoords (object):
hitStop=blat['hitStopOnTarget'],
annot=annot,
blatMapping=blat)
input()
pprint(self.newGeneAnnot[geneID])
input()
def recalcCoord(self,strand,queryChrom,queryStart,queryStop,targetChrom,targetStart,targetStop,hitStart,hitStop,annot,blatMapping):
......@@ -111,24 +114,26 @@ class recalcCoords (object):
newCdsSequence=''
queryCdsSequence=''
if strand == '+':
print(' Hit on strand + of the target')
print(' target chrom {} starting from {} and ending at {}'.format(targetChrom,targetStart,targetStop))
print(' Hit on strand {} of the target'.format(strand))
print(' target chrom {} starting from {} and ending at {}'.format(targetChrom,targetStart,targetStop))
for feature in annot:
feature=feature.rstrip('\n').split("\t")
print('************ recalc coord for feature {}'.format(feature))
for feature in annot:
feature=feature.rstrip('\n').split("\t")
print('************ recalc coord for feature {}'.format(feature))
(newStart,newStop,newFeatureLength,newStrand,newChrom)=(0,0,0,0,0)
# extract infos from col #9
gffInfoTable=feature[8].split(';')
# extract infos from col #9
gffInfoTable=feature[8].split(';')
# and transform it into a dict
gffInfoDict=defaultdict()
for infoPair in gffInfoTable:
(key,value)=infoPair.split('=')
gffInfoDict[key] = value
# and transform it into a dict
gffInfoDict=defaultdict()
for infoPair in gffInfoTable:
(key,value)=infoPair.split('=')
gffInfoDict[key] = value
# recalc the start and stop of current feature
# recalc the start and stop of current feature
if strand == '+':
#FORMULA: NewCoord = targetStart + hitStart -1 +(oldCoord - geneStart)
newStart=int(targetStart) + int(hitStart) -1 + (int(feature[3]) - int(queryStart))
newStop=int(targetStart) + int(hitStart) + (int(feature[4]) - int(queryStart))
newFeatureLength = newStop - newStart + 1
......@@ -136,121 +141,134 @@ class recalcCoords (object):
newStrand=feature[6]
newChrom=targetChrom
print(' New start = {} new stop = {} : new feature length = {}'.format(newStart,newStop,newFeatureLength))
# save the new coords for mrna and gene features
if feature[2] in ['gene','mRNA']:
newFeature = feature
newFeature[0]=newChrom
newFeature[3]=newStart
newFeature[4]=newStop
newFeature[6]=newStrand
print(" new feature coords on target genome: {}".format(newFeature))
newAnnotCoord.append(newFeature)
continue
elif strand == '-':
#FORMULA: NewCoord = targetStart + hitStart -1 +(geneStop - oldCoord)
newStart=int(targetStart) + int(hitStart) + (int(queryStop) - int(feature[4]))
newStop=int(targetStart) + int(hitStart) +1 + (int(queryStop) - int(feature[3]))
newFeatureLength = newStop - newStart + 1
# strand oposit from the query
if feature[6] == '+':
newStrand='-'
elif feature[6] == '-':
newStrand='+'
newChrom=targetChrom
"""
CHECK FOR CDS/UTR/EXONS integrity:
extract the fasta sequence to compare the UTR/EXON/CDS features before and after recalc
1. from query sequence
"""
queryFeatureSeq=self.getFastaSeq(fasta=self.queryFasta,
chrom=queryChrom,
start=int(feature[3])-1,
stop=int(feature[4]))
querySeqRecord=SeqRecord(
Seq(queryFeatureSeq, IUPAC.ambiguous_dna),
id=gffInfoDict['ID'],
name=gffInfoDict['ID'],
description='feature ' + feature[2] + ' of sequence extracted from ' + self.options.queryGenome)
#print(querySeqRecord,len(querySeqRecord))
"""
2. From target Sequence
"""
targetFeatureSeq=self.getFastaSeq(fasta=self.targetFasta,
chrom=newChrom,
start=int(newStart),
stop=int(newStop))
targetSeqRecord=SeqRecord(
Seq(targetFeatureSeq, IUPAC.ambiguous_dna),
id=gffInfoDict['ID'],
name=gffInfoDict['ID'],
description='feature ' + feature[2] + ' of sequence extracted from ' + self.options.targetGenome)
#print(targetSeqRecord,len(targetSeqRecord))
"""
Concat the cds sequence if current feature is cds
"""
if feature[2] == 'CDS':
newCdsSequence+=str(targetSeqRecord.seq)
queryCdsSequence+=str(querySeqRecord.seq)
"""
Test if the 2 sequences are the same to validate the recalculation
"""
seq1=str(querySeqRecord.seq).upper()
seq2=str(targetSeqRecord.seq).upper()
if seq1 == seq2:
print( "the two sequences are the same \n" )
newFeature = feature
newFeature[0]=newChrom
newFeature[3]=newStart
newFeature[4]=newStop
newFeature[6]=newStrand
print(" new feature coords on target genome: {}".format(newFeature))
newAnnotCoord.append(newFeature)
elif seq1 != seq2 and blatMapping['mappingStatus'] == 'fullMatchWithMissmatches':
print(" The two sequences are different but the blat status is fullMatchWithMissmatches so we will be lenient ")
newFeature = feature
newFeature[0]=newChrom
newFeature[3]=newStart
newFeature[4]=newStop
newFeature[6]=newStrand
print(" new feature coords on target genome: {}".format(newFeature))
newAnnotCoord.append(newFeature)
else:
sys.stderr.write(" Cannot parse strand of the hit !")
exit()
else:
sys.stderr.write(" error with feature {} :".format(gffInfoDict['ID']))
sys.stderr.write( "the 2 sequences are not the same: cannot transfert this gene ! \n" )
#print('Query: {}'.format(seq1))
#print(' Target: {}'.format(seq2))
transfertStatus=1
print(' New start = {} new stop = {} : new feature length = {}'.format(newStart,newStop,newFeatureLength))
elif strand == '-':
# save the new coords for mrna and gene features
if feature[2] in ['gene','mRNA']:
newFeature = feature
newFeature[0]=newChrom
newFeature[3]=newStart
newFeature[4]=newStop
newFeature[6]=newStrand
print(" new feature coords on target genome: {}".format(newFeature))
newAnnotCoord.append(newFeature)
continue
print(' Hit on strand - of the target')
print(' target chrom {} starting from {} and ending at {}'.format(targetChrom,targetStart,targetStop))
print(' we reverse the strand on the target')
"""
CHECK FOR CDS/UTR/EXONS integrity:
extract the fasta sequence to compare the UTR/EXON/CDS features before and after recalc
1. from query sequence
"""
queryFeatureSeq=self.getFastaSeq(fasta=self.queryFasta,
chrom=queryChrom,
start=int(feature[3])-1,
stop=int(feature[4]))
querySeqRecord=SeqRecord(
Seq(queryFeatureSeq, IUPAC.ambiguous_dna),
id=gffInfoDict['ID'],
name=gffInfoDict['ID'],
description='feature ' + feature[2] + ' of sequence extracted from ' + self.options.queryGenome)
#print(querySeqRecord,len(querySeqRecord))
"""
2. From target Sequence
"""
targetFeatureSeq=self.getFastaSeq(fasta=self.targetFasta,
chrom=newChrom,
start=int(newStart),
stop=int(newStop))
targetSeqRecord=SeqRecord(
Seq(targetFeatureSeq, IUPAC.ambiguous_dna),
id=gffInfoDict['ID'],
name=gffInfoDict['ID'],
description='feature ' + feature[2] + ' of sequence extracted from ' + self.options.targetGenome)
#print(targetSeqRecord,len(targetSeqRecord))
"""
Concat the cds sequence if current feature is cds
"""
if feature[2] == 'CDS':
newCdsSequence+=str(targetSeqRecord.seq)
queryCdsSequence+=str(querySeqRecord.seq)
"""
Test if the 2 sequences are the same to validate the recalculation
"""
seq1=str(querySeqRecord.seq).upper()
if newStrand == '-':
# need to revComp the Sequence
seq2=str(targetSeqRecord.reverse_complement().seq).upper()
elif newStrand == '+':
seq2=str(targetSeqRecord.seq).upper()
else:
sys.stderr.write(" Error when converting sequence to uppercase for comparison")
sys.stderr.write(" Cannot gues strand of the feature on the target sequence")
exit()
if seq1 == seq2:
print( "the two sequences are the same \n" )
newFeature = feature
newFeature[0]=newChrom
newFeature[3]=newStart
newFeature[4]=newStop
newFeature[6]=newStrand
print(" new feature coords on target genome: {}".format(newFeature))
newAnnotCoord.append(newFeature)
elif seq1 != seq2 and blatMapping['mappingStatus'] == 'fullMatchWithMissmatches':
print(" The two sequences are different but the blat status is fullMatchWithMissmatches so we will be lenient ")
print(' Query: {}'.format(seq1))
print(' Target: {}'.format(seq2))
newFeature = feature
newFeature[0]=newChrom
newFeature[3]=newStart
newFeature[4]=newStop
newFeature[6]=newStrand
print(" new feature coords on target genome: {}".format(newFeature))
newAnnotCoord.append(newFeature)
for feature in annot:
feature=feature.rstrip('\n').split("\t")
print(' recalc coord for feature {}'.format(feature))
else:
sys.stderr.write(" error with feature {} :".format(gffInfoDict['ID']))
sys.stderr.write( "the 2 sequences are not the same: cannot transfert this gene ! \n" )
print(' Query: {}'.format(seq1))
print(' Target: {}'.format(seq2))
transfertStatus=1
else:
sys.stderr.write(' Cannot parse strand of blat hit on target ')
"""
If at least one feature failed when recal coords,
aka the sequences are not the same in target and query,
we do not return the gene and its subfeatures
"""
if (transfertStatus == 1 and strand == '+'):
if (transfertStatus == 1):
sys.stderr.write(" At least one UTR/EXON/CDS feature failed when recalc coords for gene {}.\n".format(geneid))
return 0
elif (strand == '+'):
print(" New GFF:")
print(*newAnnotCoord, sep="\n")
if self.getTranslatedCDS(seq=newCdsSequence,strand=newStrand) != self.getTranslatedCDS(seq=queryCdsSequence,strand=feature[6]):
print("Translated CDS are NOT the same")
newAnnotCoord = self.addCdsWarningTag(annotArray=newAnnotCoord)
return newAnnotCoord
else:
print("next")
print(" New GFF:")
print(*newAnnotCoord, sep="\n")
if self.getTranslatedCDS(seq=newCdsSequence,strand=newStrand) != self.getTranslatedCDS(seq=queryCdsSequence,strand=feature[6]):
print("Translated CDS are NOT the same")
newAnnotCoord = self.addCdsWarningTag(annotArray=newAnnotCoord)
return newAnnotCoord
def addCdsWarningTag(self, annotArray):
for featureArray in annotArray:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment