refactoring in an OO way. recalc for strand + ok

f494725a · Helene Rimbert · fa8d2c66 · f494725a
Commit f494725a authored 5 years ago by Helene Rimbert
--- a/bin/recalcCoordsFromBlat.py
+++ b/bin/recalcCoordsFromBlat.py
@@ -7,336 +7,423 @@ from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
 from Bio.Alphabet import IUPAC
 from Bio import SeqIO
+import argparse
 import pysam
 import sys

 """
 Command line arguments
 """
-blatMapping = sys.argv[1] # Blat result file from microMappingPipeline.py
-summaryMapping = sys.argv[2] # Summary of the blat mapping from microMappingPipeline.py
-outputDir = sys.argv[3] # output directory ...
-genomeQuery = sys.argv[4] # fasta file of the v1 genome (must be fai indexed)
-genomeTarget = sys.argv[5] # fasta file of the v2 genome (must be fai indexed)
-annotQuery = sys.argv[6] # initial GFF file of the annotation
-
-"""
-Global var
-"""
-geneAnnot=defaultdict()
-geneMapping=defaultdict()
-newCoord=defaultdict()
-outputGff=outputDir+'/targetAnnot.gff3'
-
-
-def main():
-
-	"""
-	check for input files and output directory
-	"""
-	sys.stdout.write(' Check input files:')
-	checkFile(file=blatMapping)
-	checkFile(file=summaryMapping)
-	checkFile(file=annotQuery)
-
-	"""
-	Save blat mapping info into dict
-	"""
-	geneMapping=saveBlatResults(summary=summaryMapping, blat=blatMapping)
-
-	"""
-	Save Annotation infirmation from GFF file
-	geneAnnot is a dict with gene id as key
-	"""
-	geneAnnot=saveAnnotFromGFF(gff=annotQuery)
-
-	"""
-	For each gene, with full match (with or without missmatches)
-	"""
-	for geneID in sorted(geneAnnot.keys()):
-		print(" Dealing with gene {}\n".format(geneID))
-
-		blat=''
-		annot=geneAnnot[geneID]
-
-		if geneID in geneMapping.keys():
-			blat=geneMapping[geneID]
-		else:
-			sys.stderr.srite(" ERROR cannot find gene {} in blat data\n".format(geneID))
-			continue
-
-		numFeatures = len(annot)
-
-		# some logs
-		print(" Annotation features:")
-		print(annot)
-		print(" Found {} features for gene {}\n".format(numFeatures,geneID))
-		print(" Blat mapping:")
-		print(blat)
-
-		'''
-		Recalc the new coord
-		'''
-		newCoord = recalcCoord(
-			strand=blat['hitStrandOnTarget'],
-			queryChrom=blat['queryChrom'],
-			queryStart=blat['queryStart'],
-			queryStop=blat['queryStop'],
-			targetChrom=blat['targetChrom'],
-			targetStart=blat['targetStart'],
-			targetStop=blat['targetStop'],
-			hitStart=blat['hitStartOnTarget'],
-			hitStop=blat['hitStopOnTarget'],
-			annot=annot
-			)
-		input()
-
-
-def recalcCoord(strand,queryChrom,queryStart,queryStop,targetChrom,targetStart,targetStop,hitStart,hitStop,annot):
-	newAnnotCoord=[]
-	transfertStatus=0
-	geneid=annot[0].split("\t")[8].split(';')[0]
-
-	if strand == '+':
-		print(' Hit on strand + of the target')
-		print(' target chrom {} starting from {} and ending at {}'.format(targetChrom,targetStart,targetStop))
-
-		for feature in annot:
-			feature=feature.rstrip('\n').split("\t")
-			print('************ recalc coord for feature {}'.format(feature))
-
-			# extract infos from col #9
-			gffInfoTable=feature[8].split(';')
-
-			# and transform it into a dict
-			gffInfoDict=defaultdict()
-			for infoPair in gffInfoTable:
-				(key,value)=infoPair.split('=')
-				gffInfoDict[key] = value
-
-
-			# recalc the start and stop of current feature
-			newStart=int(targetStart) + int(hitStart) -1 + (int(feature[3]) - int(queryStart))
-			newStop=int(targetStart) + int(hitStart) + (int(feature[4]) - int(queryStart))
-			newFeatureLength = newStop - newStart + 1
-
-			# strand stays the same as in query gff
-			newStrand=feature[6]
-			newChrom=targetChrom
-
-			print(' New start = {} new stop = {} : new feature length = {}'.format(newStart,newStop,newFeatureLength))
-
-			"""
-			extract the fasta sequence to compare the features before and after recalc
-			1. from query
-			"""
-			queryFeatureSeq=getFastaSeq(fasta=queryFasta,
-				chrom=queryChrom,
-				start=int(feature[3])-1,
-				stop=int(feature[4]))
-			querySeqRecord=SeqRecord(
-				Seq(queryFeatureSeq, IUPAC.ambiguous_dna),
-				id=gffInfoDict['ID'],
-				name=gffInfoDict['ID'],
-				description='Sequence extracted from ' + genomeQuery)
-			print(querySeqRecord)
-
-			"""
-			Extract FASTA
-			2. From target Sequence
-			"""
-			targetFeatureSeq=getFastaSeq(fasta=targetFasta,
-				chrom=newChrom,
-				start=int(newStart),
-				stop=int(newStop))
-			targetSeqRecord=SeqRecord(
-				Seq(targetFeatureSeq, IUPAC.ambiguous_dna),
-				id=gffInfoDict['ID'],
-				name=gffInfoDict['ID'],
-				description='Sequence extracted from ' + genomeTarget)
-			print(targetSeqRecord)
-
-			"""
-			Test if the 2 sequences are the same to validate the recalculation
-			"""
-			if queryFeatureSeq == targetFeatureSeq:
-				print( "the two sequences are the same \n" )
-				newFeature = feature
-				newFeature[0]=newChrom
-				newFeature[3]=newStart
-				newFeature[4]=newStop
-				newFeature[6]=newStrand
-				print(" new feature coords on target genome: {}".format(newFeature))
-				newAnnotCoord.append(newFeature)
-
+# blatMapping = sys.argv[1] # Blat result file from microMappingPipeline.py
+# summaryMapping = sys.argv[2] # Summary of the blat mapping from microMappingPipeline.py
+# outputDir = sys.argv[3] # output directory ...
+# genomeQuery = sys.argv[4] # fasta file of the v1 genome (must be fai indexed)
+# genomeTarget = sys.argv[5] # fasta file of the v2 genome (must be fai indexed)
+# annotQuery = sys.argv[6] # initial GFF file of the annotation
+
+class recalcCoords (object):
+	def __init__(self):
+		"""
+		Global var
+		"""
+		self.geneAnnot=defaultdict()
+		self.geneMapping=defaultdict()
+		self.newCoord=defaultdict()
+		self.outputGff=''
+		self.geneMapping=defaultdict()
+		self.geneAnnot==defaultdict()
+
+
+	def main(self):
+
+		self.getOptions()
+		self.checkInputs()
+		self.openFastaHandlers()
+		self.openOutputGffHandler()
+
+		# Save blat mapping info into dict. self.geneMapping is a dict with gene id as key
+		self.saveBlatResults(summary=self.options.summaryMapping,blat=self.options.blat)
+
+		# Save Annotation information from GFF file. self.geneAnnot is a dict with gene id as key
+		self.saveAnnotFromGFF()
+
+		# For each gene, with full match (with or without missmatches), we try to recalc the coordinates
+		self.parseAnnotAndRecalc()
+
+	def parseAnnotAndRecalc(self):
+
+		for geneID in sorted(self.geneAnnot.keys()):
+			blat=''
+			annot=''
+			print(" Dealing with gene {}\n".format(geneID))
+
+			# save current anno for the working gene.
+			annot=self.geneAnnot[geneID]
+			if geneID in self.geneMapping.keys():
+				blat=self.geneMapping[geneID]
+				print(' Blast summary found: {}'.format(blat))
 			else:
-				sys.stderr.write(" error with feature {} :".format(gffInfoDict['ID']))
-				sys.stderr.write( "the 2 sequences are not the same: cannot transfert this gene ! \n" )
-				print('Query: {}'.format(queryFeatureSeq))
-				print(' Target: {}'.format(targetFeatureSeq))
-				transfertStatus=1
-
-			input()
-
-	elif strand == '-':
-		print(' Hit on strand - of the target')
-		print(' target chrom {} starting from {} and ending at {}'.format(targetChrom,targetStart,targetStop))
-		print(' we reverse the strand on the target')
-
-		for feature in annot:
-			feature=feature.rstrip('\n').split("\t")
-			print(' recalc coord for feature {}'.format(feature))
-
-	else:
-		sys.stderr.write(' Cannot parse strand of blat hit on target ')
-
-	"""
-	If at least one feature failed when recal coords,
-	aka the sequences are not the same in target and query,
-	we do not return the gene and its subfeatures
-	"""
-	if (transfertStatus == 1):
-		sys.stderr.write(" At least one feature failed when recalc coords for gene {}.\n".format(geneid))
-		return 0
-	else:
-		print(" New GFF:")
-		print(newAnnotCoord)
-		return newAnnotCoord
-
-def saveAnnotFromGFF(gff):
-	sys.stdout.write(" 2. saving Query Annotation from file {} into Dict \n".format(gff))
-	sys.stdout.write("===========================================================================\n")
-	dataDict=defaultdict()
-	geneID=''
-	with open(gff) as gffRecords:
-		for line in gffRecords.readlines():
-			if line.startswith('#'):
+				sys.stderr.write(" ERROR cannot find gene {} in blat data\n".format(geneID))
+				# TODO:
 				continue

-			# transform gff line into array
-			(chrom,source,featureType,start,stop,score,strand,frame,info)=line.rstrip('\n').split('\t')
+			numFeatures = len(annot)

-			# save the last column with info tags
-			gffInfoTable=info.split(';')
+			# some logs
+			print(" Annotation features:")
+			print(annot)
+			print(" Found {} features for gene {}\n".format(numFeatures,geneID))
+			print(" Blat mapping data {}:".format(blat))

-			# and transform it into a dict
-			gffInfoDict=defaultdict()
-			for infoPair in gffInfoTable:
-				(key,value)=infoPair.split('=')
-				gffInfoDict[key] = value
+			if blat['mappingStatus'] == 'unmapped':
+				print(" Blat mapping status is {}".format(blat['mappingStatus']))
+				print(" We cannot go further for now")
+				# TODO:

-			if featureType == 'gene':
-				geneID=gffInfoDict['ID']
-				dataDict[geneID] = [line.rstrip('\n')]
+			elif blat['mappingStatus'] == 'imperfectMatch':
+				print(" Blat mapping status is {}".format(blat['mappingStatus']))
+				print(" refine mapping with exonerate or something")
+				# TODO:

 			else:
-				dataDict[geneID].append(line.rstrip('\n'))
-
-	numGenes=len(dataDict.keys())
-	sys.stdout.write(" Found {} genes in GFF file {}\n".format(numGenes,gff))
-	return dataDict
-
-def saveBlatResults(summary, blat):
-	sys.stdout.write(" 1. saving blat results {} and summary {} into Dict \n".format(blat,summary))
-	sys.stdout.write("===========================================================================\n")
-	dataDict=defaultdict()
-
-	# SUMMARY DATA
-	with open(summary) as summaryData:
-		for line in summaryData.readlines():
-			#sys.stdout.write(' current record is {}'.format(line))
-
-			(geneName,markerPair,mappingStatus,queryChrom,markerStatus,markerPair2,geneName2,geneQueryStart,geneQueryEnd,geneQueryStrand,leftMakerId,leftMakerQueryLoc,leftMarkerTargetLoc,rightMarkerId,rightMakerQueryLoc,rightMarkerTargetLoc) = line.rstrip('\n').split("\t")
-
-			dataDict[geneName] = {
-				'geneId': geneName,
-				'gffRecords': 'NA',
-				'gffSubfeatures': [],
-				'targetStart':'NA',
-				'targetStop':'NA',
-				'targetChrom':'NA',
-				'hitStrandOnTarget':'NA',
-				'hitStartOnTarget':'NA',
-				'hitStopOnTarget':'NA',
-				'hitStrandOnTarget':'NA',
-				'queryStart':geneQueryStart,
-				'queryStop':geneQueryEnd,
-				'queryChrom':queryChrom,
-				'queryStrand':geneQueryStrand,
-				'mappingStatus':mappingStatus
-			}
-	numGenesInDict=len(dataDict.keys())
-
-	# BLAT MAPPING DATA
-	with open(blat) as blatData:
-		for line in blatData.readlines():
-
-			(geneName,markerPair,mappingStatus,mappingStatus2,blatCoverage,blatIndelBases,blatMissmatches,hitStartOnTarget,hitStopOnTarget,hitStrand,targetLength,targetName)=line.rstrip('\n').split('\t')
-
-			if geneName in dataDict.keys():
-
-				if mappingStatus == 'unmapped':
-					sys.stdout.write(" gene {} is not mapped: no target information to recover.\n".format(geneName))
+				'''
+				Recalc the new coord
+				'''
+				self.recalcCoord(
+					strand=blat['hitStrandOnTarget'],
+					queryChrom=blat['queryChrom'],
+					queryStart=blat['queryStart'],
+					queryStop=blat['queryStop'],
+					targetChrom=blat['targetChrom'],
+					targetStart=blat['targetStart'],
+					targetStop=blat['targetStop'],
+					hitStart=blat['hitStartOnTarget'],
+					hitStop=blat['hitStopOnTarget'],
+					annot=annot,
+					blatMapping=blat)
+				input()
+
+
+	def recalcCoord(self,strand,queryChrom,queryStart,queryStop,targetChrom,targetStart,targetStop,hitStart,hitStop,annot,blatMapping):
+		newAnnotCoord=[]
+		transfertStatus=0
+		geneid=annot[0].split("\t")[8].split(';')[0]
+		newCdsSequence=''
+		queryCdsSequence=''
+
+		if strand == '+':
+			print(' Hit on strand + of the target')
+			print(' target chrom {} starting from {} and ending at {}'.format(targetChrom,targetStart,targetStop))
+
+			for feature in annot:
+				feature=feature.rstrip('\n').split("\t")
+				print('************ recalc coord for feature {}'.format(feature))
+
+				# extract infos from col #9
+				gffInfoTable=feature[8].split(';')
+
+				# and transform it into a dict
+				gffInfoDict=defaultdict()
+				for infoPair in gffInfoTable:
+					(key,value)=infoPair.split('=')
+					gffInfoDict[key] = value
+
+				# recalc the start and stop of current feature
+				newStart=int(targetStart) + int(hitStart) -1 + (int(feature[3]) - int(queryStart))
+				newStop=int(targetStart) + int(hitStart) + (int(feature[4]) - int(queryStart))
+				newFeatureLength = newStop - newStart + 1
+				# strand stays the same as in query gff
+				newStrand=feature[6]
+				newChrom=targetChrom
+
+				print(' New start = {} new stop = {} : new feature length = {}'.format(newStart,newStop,newFeatureLength))
+
+				# save the new coords for mrna and gene features
+				if feature[2] in ['gene','mRNA']:
+					newFeature = feature
+					newFeature[0]=newChrom
+					newFeature[3]=newStart
+					newFeature[4]=newStop
+					newFeature[6]=newStrand
+					print(" new feature coords on target genome: {}".format(newFeature))
+					newAnnotCoord.append(newFeature)
 					continue
+
+				"""
+				CHECK FOR CDS/UTR/EXONS integrity:
+				extract the fasta sequence to compare the UTR/EXON/CDS features before and after recalc
+				1. from query sequence
+				"""
+				queryFeatureSeq=self.getFastaSeq(fasta=self.queryFasta,
+					chrom=queryChrom,
+					start=int(feature[3])-1,
+					stop=int(feature[4]))
+				querySeqRecord=SeqRecord(
+					Seq(queryFeatureSeq, IUPAC.ambiguous_dna),
+					id=gffInfoDict['ID'],
+					name=gffInfoDict['ID'],
+					description='feature ' + feature[2] + ' of sequence extracted from ' + self.options.queryGenome)
+				#print(querySeqRecord,len(querySeqRecord))
+
+				"""
+				2. From target Sequence
+				"""
+				targetFeatureSeq=self.getFastaSeq(fasta=self.targetFasta,
+					chrom=newChrom,
+					start=int(newStart),
+					stop=int(newStop))
+				targetSeqRecord=SeqRecord(
+					Seq(targetFeatureSeq, IUPAC.ambiguous_dna),
+					id=gffInfoDict['ID'],
+					name=gffInfoDict['ID'],
+					description='feature ' + feature[2] + ' of sequence extracted from ' + self.options.targetGenome)
+				#print(targetSeqRecord,len(targetSeqRecord))
+
+				"""
+				Concat the cds sequence if current feature is cds
+				"""
+				if feature[2] == 'CDS':
+					newCdsSequence+=str(targetSeqRecord.seq)
+					queryCdsSequence+=str(querySeqRecord.seq)
+
+				"""
+				Test if the 2 sequences are the same to validate the recalculation
+				"""
+				seq1=str(querySeqRecord.seq).upper()
+				seq2=str(targetSeqRecord.seq).upper()
+				if seq1 == seq2:
+					print( "the two sequences are the same \n" )
+					newFeature = feature
+					newFeature[0]=newChrom
+					newFeature[3]=newStart
+					newFeature[4]=newStop
+					newFeature[6]=newStrand
+					print(" new feature coords on target genome: {}".format(newFeature))
+					newAnnotCoord.append(newFeature)
+
+				elif seq1 != seq2 and blatMapping['mappingStatus'] == 'fullMatchWithMissmatches':
+					print(" The two sequences are different but the blat status is fullMatchWithMissmatches so we will be lenient ")
+
+					newFeature = feature
+					newFeature[0]=newChrom
+					newFeature[3]=newStart
+					newFeature[4]=newStop
+					newFeature[6]=newStrand
+					print(" new feature coords on target genome: {}".format(newFeature))
+					newAnnotCoord.append(newFeature)
+
 				else:
-					(prefix,targetChrom,targetStart,targetStop)=targetName.split('_')
-					dataDict[geneName]['targetStart'] = targetStart
-					dataDict[geneName]['targetStop'] = targetStop
-					dataDict[geneName]['targetChrom'] = targetChrom
-					dataDict[geneName]['hitStrandOnTarget'] = hitStrand
-					dataDict[geneName]['hitStartOnTarget'] = hitStartOnTarget
-					dataDict[geneName]['hitStopOnTarget'] = hitStopOnTarget
+					sys.stderr.write(" error with feature {} :".format(gffInfoDict['ID']))
+					sys.stderr.write( "the 2 sequences are not the same: cannot transfert this gene ! \n" )
+					#print('Query: {}'.format(seq1))
+					#print(' Target: {}'.format(seq2))
+					transfertStatus=1

-			else:
-				sys.stderr.write(" ERROR: gene {} found in blat results but not in summary data! \n".format(geneName))
-				sys.exit()
-
-	sys.stdout.write(" Number of records in Gene Dict based on summaryData: {}\n".format(numGenesInDict))
-	return dataDict
-
-def getFastaSeq(fasta,chrom,start,stop):
-	seq=fasta.fetch(reference=chrom, start=start,end=stop)
-	print(" fasta sequence for chrom %s from %s to %s\n" %(chrom,start,stop))
-	#print(seq)
-	return seq
-
-def checkFile (file):
-	if os.path.isfile(file):
-		sys.stdout.write(" File %s found\n" % str(file))
-	else:
-		sys.stdout.write(" ERROR: Cannot find file %s \n" % str(file))
-		sys.exit()
-
-def checkDir(directory):
-	if os.path.isdir(directory):
-		sys.stdout.write(" Directory %s found\n" % str(directory))
-		return 1
-	else:
-		sys.stdout.write(" Cannot find directory %s \n" % str(directory))
-		return 0

+		elif strand == '-':

-if __name__== "__main__":
-	if ( checkDir(directory=outputDir) ):
-		sys.stdout.write( " 0. Output directory already exists, no need to create it\n")
+			print(' Hit on strand - of the target')
+			print(' target chrom {} starting from {} and ending at {}'.format(targetChrom,targetStart,targetStop))
+			print(' we reverse the strand on the target')
+
+			for feature in annot:
+				feature=feature.rstrip('\n').split("\t")
+				print(' recalc coord for feature {}'.format(feature))
+
+		else:
+			sys.stderr.write(' Cannot parse strand of blat hit on target ')
+
+		"""
+		If at least one feature failed when recal coords,
+		aka the sequences are not the same in target and query,
+		we do not return the gene and its subfeatures
+		"""
+		if (transfertStatus == 1 and strand == '+'):
+			sys.stderr.write(" At least one UTR/EXON/CDS feature failed when recalc coords for gene {}.\n".format(geneid))
+			return 0
+		elif (strand == '+'):
+			print(" New GFF:")
+			print(*newAnnotCoord, sep="\n")
+
+			if self.getTranslatedCDS(seq=newCdsSequence,strand=newStrand) != self.getTranslatedCDS(seq=queryCdsSequence,strand=feature[6]):
+				print("Translated CDS are NOT the same")
+				newAnnotCoord = self.addCdsWarningTag(annotArray=newAnnotCoord)
+			return newAnnotCoord
+		else:
+			print("next")
+
+	def addCdsWarningTag(self, annotArray):
+		for featureArray in annotArray:
+			featureArray[8]+=';CDS_altered=True'
+		return annotArray
+
+	def getTranslatedCDS(self, seq, strand):
+		#print(" Evaluating CDS sequence for {}".format(seq))
+		seqObject=Seq(seq, IUPAC.ambiguous_dna)
+
+		if (strand == '-'):
+			seqObject=seqObject.reverse_complement()
+			#print(" Rev Comp sequence is {}".format(seqObject))
+
+		pepObject=seqObject.translate()
+		#print(" Translated sequence is {}".format(pepObject))
+
+	def saveAnnotFromGFF(self):
+		sys.stdout.write(" 2. saving Query Annotation from file {} into Dict \n".format(self.options.gff))
+		sys.stdout.write("===========================================================================\n")
+		geneID=''
+		with open(self.options.gff) as gffRecords:
+			for line in gffRecords.readlines():
+				if line.startswith('#'):
+					continue
+
+				# transform gff line into array
+				(chrom,source,featureType,start,stop,score,strand,frame,info)=line.rstrip('\n').split('\t')
+
+				# save the last column with info tags
+				gffInfoTable=info.split(';')

-	else:
-		sys.stdout.write( " 0. Creating output directory %s" % str(outputDir))
-		os.mkdir( outputDir, 0o750 )
+				# and transform it into a dict
+				gffInfoDict=defaultdict()
+				for infoPair in gffInfoTable:
+					(key,value)=infoPair.split('=')
+					gffInfoDict[key] = value

+				if featureType == 'gene':
+					geneID=gffInfoDict['ID']
+					self.geneAnnot[geneID] = [line.rstrip('\n')]
+
+				else:
+					self.geneAnnot[geneID].append(line.rstrip('\n'))
+
+		numGenes=len(self.geneAnnot.keys())
+		sys.stdout.write(" Found {} genes in GFF file {}\n".format(numGenes,self.options.gff))
+
+	def saveBlatResults(self, summary, blat):
+		sys.stdout.write(" 1. saving blat results {} and summary {} into Dict \n".format(blat,summary))
+		sys.stdout.write("===========================================================================\n")
+
+		# SUMMARY DATA
+		with open(summary) as summaryData:
+			for line in summaryData.readlines():
+				#sys.stdout.write(' current record is {}'.format(line))
+
+				(geneName,markerPair,mappingStatus,queryChrom,markerStatus,markerPair2,geneName2,geneQueryStart,geneQueryEnd,geneQueryStrand,leftMakerId,leftMakerQueryLoc,leftMarkerTargetLoc,rightMarkerId,rightMakerQueryLoc,rightMarkerTargetLoc) = line.rstrip('\n').split("\t")
+
+				self.geneMapping[geneName] = {
+					'geneId': geneName,
+					'gffRecords': 'NA',
+					'gffSubfeatures': [],
+					'targetStart':'NA',
+					'targetStop':'NA',
+					'targetChrom':'NA',
+					'hitStrandOnTarget':'NA',
+					'hitStartOnTarget':'NA',
+					'hitStopOnTarget':'NA',
+					'hitStrandOnTarget':'NA',
+					'queryStart':geneQueryStart,
+					'queryStop':geneQueryEnd,
+					'queryChrom':queryChrom,
+					'queryStrand':geneQueryStrand,
+					'mappingStatus':mappingStatus
+				}
+		numGenesInDict=len(self.geneMapping.keys())
+
+		# BLAT MAPPING DATA
+		with open(blat) as blatData:
+			for line in blatData.readlines():
+
+				(geneName,markerPair,mappingStatus,mappingStatus2,blatCoverage,blatIndelBases,blatMissmatches,hitStartOnTarget,hitStopOnTarget,hitStrand,targetLength,targetName)=line.rstrip('\n').split('\t')
+
+				if geneName in self.geneMapping.keys():
+
+					if mappingStatus == 'unmapped':
+						#sys.stdout.write(" gene {} is not mapped: no target information to recover.\n".format(geneName))
+						continue
+					else:
+						(prefix,targetChrom,targetStart,targetStop)=targetName.split('_')
+						self.geneMapping[geneName]['targetStart'] = targetStart
+						self.geneMapping[geneName]['targetStop'] = targetStop
+						self.geneMapping[geneName]['targetChrom'] = targetChrom
+						self.geneMapping[geneName]['hitStrandOnTarget'] = hitStrand
+						self.geneMapping[geneName]['hitStartOnTarget'] = hitStartOnTarget
+						self.geneMapping[geneName]['hitStopOnTarget'] = hitStopOnTarget
+
+				else:
+					sys.stderr.write(" ERROR: gene {} found in blat results but not in summary data! \n".format(geneName))
+					sys.exit()
+
+		sys.stdout.write(" Number of records in Gene Dict based on summaryData: {}\n".format(numGenesInDict))
+
+	def getFastaSeq(self,fasta,chrom,start,stop):
+		seq=fasta.fetch(reference=chrom, start=start,end=stop)
+		print(" fasta sequence for chrom %s from %s to %s\n" %(chrom,start,stop))
+		#print(seq)
+		return seq
+
+	def checkFile (self,file):
+		if os.path.isfile(file):
+			sys.stdout.write(" File %s found\n" % str(file))
+		else:
+			sys.stdout.write(" ERROR: Cannot find file %s \n" % str(file))
+			sys.exit()
+
+	def checkDir(self,directory):
+		if os.path.isdir(directory):
+			sys.stdout.write(" Directory %s found\n" % str(directory))
+			return 1
+		else:
+			sys.stdout.write(" Cannot find directory %s \n" % str(directory))
+			return 0
+
+	def getOptions(self):
+		parser=argparse.ArgumentParser(description='Recalculate coordinates on a new reference sequence based on BLAT summary mapping')
+		parser.add_argument('-b', '--blat', help='Blat result file from microMappingPipeline.py', required=True)
+		parser.add_argument('-s', '--summaryMapping', help='Summary of the blat mapping from microMappingPipeline.py', required=True)
+		parser.add_argument('-o', '--outputDir', help='Ouput directory', required=True)
+		parser.add_argument('-q', '--queryGenome', help='fasta file of the v1 genome (must be fai indexed)', required=True)
+		parser.add_argument('-t', '--targetGenome', help='fasta file of the v2 genome (must be fai indexed)', required=True)
+		parser.add_argument('-g', '--gff', help='initial GFF file of the annotation from the v1 genome', required=True)
+		self.options=parser.parse_args()
+
+	def openFastaHandlers(self):
+		"""
+		open for reading the query and target reference file
+		This will be used to extract the dna sequences before and after recalculation to ensure that the sequences are the same
+		and that the calculation is correct
+		"""
+		sys.stdout.write(' Open FASTA files for sequence extraction\n')
+		self.queryFasta = pysam.FastaFile(self.options.queryGenome)
+		self.targetFasta = pysam.FastaFile(self.options.targetGenome)
+
+	def openOutputGffHandler(self):
+		self.outputGff=self.options.outputDir+'/targetAnnot.gff3'
+		self.outputGffFH=open(self.outputGff, 'w')
+
+	def checkInputs(self):
+		"""
+		check for input files and output directory
+		"""
+		sys.stdout.write(' Check input files:')
+		self.checkFile(file=self.options.blat)
+		self.checkFile(file=self.options.summaryMapping)
+		self.checkFile(file=self.options.gff)
+		if ( self.checkDir(directory=self.options.outputDir) ):
+			sys.stdout.write( " 0. Output directory already exists, no need to create it\n")
+
+		else:
+			sys.stdout.write( " 0. Creating output directory %s" % str(self.options.outputDir))
+			os.mkdir( self.options.outputDir, 0o750 )
+
+		self.checkFile(file=self.options.queryGenome)
+		self.checkFile(file=self.options.targetGenome)
+
+
+if __name__== "__main__":

-	checkFile(file=genomeQuery)
-	checkFile(file=genomeTarget)
+	run=recalcCoords()
+	run.main()

-	"""
-	open for reading the query and target reference file
-	This will be used to extract the dna sequences before and after recalculation to ensure that the sequences are the same
-	and that the calculation is correct
-	"""
-	sys.stdout.write(' Open FASTA files for sequence extraction\n')
-	queryFasta = pysam.FastaFile(genomeQuery)
-	targetFasta = pysam.FastaFile(genomeTarget)

-	outputGffFH=open(outputGff, 'w')

 	main()