save last modif recalcCoords

a16a78fd · Helene Rimbert · dbb390cd · a16a78fd
Commit a16a78fd authored 5 years ago by Helene Rimbert
--- a/bin/recalcCoordsFromBlat.py
+++ b/bin/recalcCoordsFromBlat.py
@@ -7,6 +7,7 @@ from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
 from Bio.Alphabet import IUPAC
 from Bio import SeqIO
+from pprint import pprint
 import argparse
 import pysam
 import sys
@@ -31,7 +32,7 @@ class recalcCoords (object):
 		self.newCoord=defaultdict()
 		self.outputGff=''
 		self.geneMapping=defaultdict()
-		self.geneAnnot==defaultdict()
+		self.newGeneAnnot=defaultdict()


 	def main(self):
@@ -89,7 +90,7 @@ class recalcCoords (object):
 				'''
 				Recalc the new coord
 				'''
-				self.recalcCoord(
+				self.newGeneAnnot[geneID]=self.recalcCoord(
 					strand=blat['hitStrandOnTarget'],
 					queryChrom=blat['queryChrom'],
 					queryStart=blat['queryStart'],
@@ -101,7 +102,9 @@ class recalcCoords (object):
 					hitStop=blat['hitStopOnTarget'],
 					annot=annot,
 					blatMapping=blat)
-				input()
+			pprint(self.newGeneAnnot[geneID])
+			input()
+


 	def recalcCoord(self,strand,queryChrom,queryStart,queryStop,targetChrom,targetStart,targetStop,hitStart,hitStop,annot,blatMapping):
@@ -111,24 +114,26 @@ class recalcCoords (object):
 		newCdsSequence=''
 		queryCdsSequence=''

-		if strand == '+':
-			print(' Hit on strand + of the target')
-			print(' target chrom {} starting from {} and ending at {}'.format(targetChrom,targetStart,targetStop))
+		print(' Hit on strand {} of the target'.format(strand))
+		print(' target chrom {} starting from {} and ending at {}'.format(targetChrom,targetStart,targetStop))

-			for feature in annot:
-				feature=feature.rstrip('\n').split("\t")
-				print('************ recalc coord for feature {}'.format(feature))
+		for feature in annot:
+			feature=feature.rstrip('\n').split("\t")
+			print('************ recalc coord for feature {}'.format(feature))
+			(newStart,newStop,newFeatureLength,newStrand,newChrom)=(0,0,0,0,0)

-				# extract infos from col #9
-				gffInfoTable=feature[8].split(';')
+			# extract infos from col #9
+			gffInfoTable=feature[8].split(';')

-				# and transform it into a dict
-				gffInfoDict=defaultdict()
-				for infoPair in gffInfoTable:
-					(key,value)=infoPair.split('=')
-					gffInfoDict[key] = value
+			# and transform it into a dict
+			gffInfoDict=defaultdict()
+			for infoPair in gffInfoTable:
+				(key,value)=infoPair.split('=')
+				gffInfoDict[key] = value

-				# recalc the start and stop of current feature
+			# recalc the start and stop of current feature
+			if strand == '+':
+				#FORMULA: NewCoord = targetStart + hitStart -1 +(oldCoord - geneStart)
 				newStart=int(targetStart) + int(hitStart) -1 + (int(feature[3]) - int(queryStart))
 				newStop=int(targetStart) + int(hitStart) + (int(feature[4]) - int(queryStart))
 				newFeatureLength = newStop - newStart + 1
@@ -136,121 +141,134 @@ class recalcCoords (object):
 				newStrand=feature[6]
 				newChrom=targetChrom

-				print(' New start = {} new stop = {} : new feature length = {}'.format(newStart,newStop,newFeatureLength))
-
-				# save the new coords for mrna and gene features
-				if feature[2] in ['gene','mRNA']:
-					newFeature = feature
-					newFeature[0]=newChrom
-					newFeature[3]=newStart
-					newFeature[4]=newStop
-					newFeature[6]=newStrand
-					print(" new feature coords on target genome: {}".format(newFeature))
-					newAnnotCoord.append(newFeature)
-					continue
+			elif strand == '-':
+				#FORMULA: NewCoord = targetStart + hitStart -1 +(geneStop - oldCoord)
+				newStart=int(targetStart) + int(hitStart) + (int(queryStop) - int(feature[4]))
+				newStop=int(targetStart) + int(hitStart) +1 + (int(queryStop) - int(feature[3]))
+				newFeatureLength = newStop - newStart + 1
+				# strand oposit from the query
+				if feature[6] == '+':
+					newStrand='-'
+				elif feature[6] == '-':
+					newStrand='+'
+				newChrom=targetChrom

-				"""
-				CHECK FOR CDS/UTR/EXONS integrity:
-				extract the fasta sequence to compare the UTR/EXON/CDS features before and after recalc
-				1. from query sequence
-				"""
-				queryFeatureSeq=self.getFastaSeq(fasta=self.queryFasta,
-					chrom=queryChrom,
-					start=int(feature[3])-1,
-					stop=int(feature[4]))
-				querySeqRecord=SeqRecord(
-					Seq(queryFeatureSeq, IUPAC.ambiguous_dna),
-					id=gffInfoDict['ID'],
-					name=gffInfoDict['ID'],
-					description='feature ' + feature[2] + ' of sequence extracted from ' + self.options.queryGenome)
-				#print(querySeqRecord,len(querySeqRecord))
-
-				"""
-				2. From target Sequence
-				"""
-				targetFeatureSeq=self.getFastaSeq(fasta=self.targetFasta,
-					chrom=newChrom,
-					start=int(newStart),
-					stop=int(newStop))
-				targetSeqRecord=SeqRecord(
-					Seq(targetFeatureSeq, IUPAC.ambiguous_dna),
-					id=gffInfoDict['ID'],
-					name=gffInfoDict['ID'],
-					description='feature ' + feature[2] + ' of sequence extracted from ' + self.options.targetGenome)
-				#print(targetSeqRecord,len(targetSeqRecord))
-
-				"""
-				Concat the cds sequence if current feature is cds
-				"""
-				if feature[2] == 'CDS':
-					newCdsSequence+=str(targetSeqRecord.seq)
-					queryCdsSequence+=str(querySeqRecord.seq)
-
-				"""
-				Test if the 2 sequences are the same to validate the recalculation
-				"""
-				seq1=str(querySeqRecord.seq).upper()
-				seq2=str(targetSeqRecord.seq).upper()
-				if seq1 == seq2:
-					print( "the two sequences are the same \n" )
-					newFeature = feature
-					newFeature[0]=newChrom
-					newFeature[3]=newStart
-					newFeature[4]=newStop
-					newFeature[6]=newStrand
-					print(" new feature coords on target genome: {}".format(newFeature))
-					newAnnotCoord.append(newFeature)
-
-				elif seq1 != seq2 and blatMapping['mappingStatus'] == 'fullMatchWithMissmatches':
-					print(" The two sequences are different but the blat status is fullMatchWithMissmatches so we will be lenient ")
-
-					newFeature = feature
-					newFeature[0]=newChrom
-					newFeature[3]=newStart
-					newFeature[4]=newStop
-					newFeature[6]=newStrand
-					print(" new feature coords on target genome: {}".format(newFeature))
-					newAnnotCoord.append(newFeature)
+			else:
+				sys.stderr.write(" Cannot parse strand of the hit !")
+				exit()

-				else:
-					sys.stderr.write(" error with feature {} :".format(gffInfoDict['ID']))
-					sys.stderr.write( "the 2 sequences are not the same: cannot transfert this gene ! \n" )
-					#print('Query: {}'.format(seq1))
-					#print(' Target: {}'.format(seq2))
-					transfertStatus=1

+			print(' New start = {} new stop = {} : new feature length = {}'.format(newStart,newStop,newFeatureLength))

-		elif strand == '-':
+			# save the new coords for mrna and gene features
+			if feature[2] in ['gene','mRNA']:
+				newFeature = feature
+				newFeature[0]=newChrom
+				newFeature[3]=newStart
+				newFeature[4]=newStop
+				newFeature[6]=newStrand
+				print(" new feature coords on target genome: {}".format(newFeature))
+				newAnnotCoord.append(newFeature)
+				continue

-			print(' Hit on strand - of the target')
-			print(' target chrom {} starting from {} and ending at {}'.format(targetChrom,targetStart,targetStop))
-			print(' we reverse the strand on the target')
+			"""
+			CHECK FOR CDS/UTR/EXONS integrity:
+			extract the fasta sequence to compare the UTR/EXON/CDS features before and after recalc
+			1. from query sequence
+			"""
+			queryFeatureSeq=self.getFastaSeq(fasta=self.queryFasta,
+				chrom=queryChrom,
+				start=int(feature[3])-1,
+				stop=int(feature[4]))
+			querySeqRecord=SeqRecord(
+				Seq(queryFeatureSeq, IUPAC.ambiguous_dna),
+				id=gffInfoDict['ID'],
+				name=gffInfoDict['ID'],
+				description='feature ' + feature[2] + ' of sequence extracted from ' + self.options.queryGenome)
+			#print(querySeqRecord,len(querySeqRecord))
+
+			"""
+			2. From target Sequence
+			"""
+			targetFeatureSeq=self.getFastaSeq(fasta=self.targetFasta,
+				chrom=newChrom,
+				start=int(newStart),
+				stop=int(newStop))
+			targetSeqRecord=SeqRecord(
+				Seq(targetFeatureSeq, IUPAC.ambiguous_dna),
+				id=gffInfoDict['ID'],
+				name=gffInfoDict['ID'],
+				description='feature ' + feature[2] + ' of sequence extracted from ' + self.options.targetGenome)
+			#print(targetSeqRecord,len(targetSeqRecord))
+
+			"""
+			Concat the cds sequence if current feature is cds
+			"""
+			if feature[2] == 'CDS':
+				newCdsSequence+=str(targetSeqRecord.seq)
+				queryCdsSequence+=str(querySeqRecord.seq)
+
+			"""
+			Test if the 2 sequences are the same to validate the recalculation
+			"""
+			seq1=str(querySeqRecord.seq).upper()
+			if newStrand == '-':
+				# need to revComp the Sequence
+				seq2=str(targetSeqRecord.reverse_complement().seq).upper()
+			elif newStrand == '+':
+				seq2=str(targetSeqRecord.seq).upper()
+			else:
+				sys.stderr.write(" Error when converting sequence to uppercase for comparison")
+				sys.stderr.write(" Cannot gues strand of the feature on the target sequence")
+				exit()
+
+			if seq1 == seq2:
+				print( "the two sequences are the same \n" )
+				newFeature = feature
+				newFeature[0]=newChrom
+				newFeature[3]=newStart
+				newFeature[4]=newStop
+				newFeature[6]=newStrand
+				print(" new feature coords on target genome: {}".format(newFeature))
+				newAnnotCoord.append(newFeature)
+
+			elif seq1 != seq2 and blatMapping['mappingStatus'] == 'fullMatchWithMissmatches':
+				print(" The two sequences are different but the blat status is fullMatchWithMissmatches so we will be lenient ")
+				print('  Query: {}'.format(seq1))
+				print(' Target: {}'.format(seq2))
+
+				newFeature = feature
+				newFeature[0]=newChrom
+				newFeature[3]=newStart
+				newFeature[4]=newStop
+				newFeature[6]=newStrand
+				print(" new feature coords on target genome: {}".format(newFeature))
+				newAnnotCoord.append(newFeature)

-			for feature in annot:
-				feature=feature.rstrip('\n').split("\t")
-				print(' recalc coord for feature {}'.format(feature))
+			else:
+				sys.stderr.write(" error with feature {} :".format(gffInfoDict['ID']))
+				sys.stderr.write( "the 2 sequences are not the same: cannot transfert this gene ! \n" )
+				print('  Query: {}'.format(seq1))
+				print(' Target: {}'.format(seq2))
+				transfertStatus=1

-		else:
-			sys.stderr.write(' Cannot parse strand of blat hit on target ')

 		"""
 		If at least one feature failed when recal coords,
 		aka the sequences are not the same in target and query,
 		we do not return the gene and its subfeatures
 		"""
-		if (transfertStatus == 1 and strand == '+'):
+		if (transfertStatus == 1):
 			sys.stderr.write(" At least one UTR/EXON/CDS feature failed when recalc coords for gene {}.\n".format(geneid))
 			return 0
-		elif (strand == '+'):
-			print(" New GFF:")
-			print(*newAnnotCoord, sep="\n")
-
-			if self.getTranslatedCDS(seq=newCdsSequence,strand=newStrand) != self.getTranslatedCDS(seq=queryCdsSequence,strand=feature[6]):
-				print("Translated CDS are NOT the same")
-				newAnnotCoord = self.addCdsWarningTag(annotArray=newAnnotCoord)
-			return newAnnotCoord
-		else:
-			print("next")
+
+		print(" New GFF:")
+		print(*newAnnotCoord, sep="\n")
+		if self.getTranslatedCDS(seq=newCdsSequence,strand=newStrand) != self.getTranslatedCDS(seq=queryCdsSequence,strand=feature[6]):
+			print("Translated CDS are NOT the same")
+			newAnnotCoord = self.addCdsWarningTag(annotArray=newAnnotCoord)
+		return newAnnotCoord
+

 	def addCdsWarningTag(self, annotArray):
 		for featureArray in annotArray: