update recalcCoordsFromGmap.py

8fd794b3 · Helene Rimbert · 674db69b · 8fd794b3
Commit 8fd794b3 authored 5 years ago by Helene Rimbert
--- a/bin/recalcCoordsFromGmap.py
+++ b/bin/recalcCoordsFromGmap.py
@@ -8,6 +8,7 @@ import numpy as np
 import argparse # pour gerer les options
 import re # pour les regexp
 from pprint import pprint
+from pyfaidx import Fasta


 class recalcCoords (object):
@@ -16,10 +17,14 @@ class recalcCoords (object):
 		# initialize mains objects here
 		self.parameters = None
 		self.annot = {}
+		self.anchoredGenes = {}
+		self.oldRef = {}
+		self.newRef = {}

 		self.stats = { 'num_perfect_anchored' : 0, 
 			'num_perfect_multi_anchored' : 0,
-			'num_ambigous_match_anchored':0}
+			'num_ambigous_match_anchored':0,
+			'num_ambigous_multi_anchored':0}

 		self.version='1.0'
 		self.author='Helene Rimbert'
@@ -31,13 +36,17 @@ class recalcCoords (object):
 		self.parseOptions()
 		self.createWorkingEnv()
 		self.loadAnnot(gff=self.parameters.gff)
+		self.indexReferences()
+		self.recalcFromGMAP()
+		self.closeOutputFiles()

-		pprint(self.stats)
-
-		self.outoutPerfectMatch.close()
-		self.outoutPerfectMatchMulti.close()
-		self.outoutAmbigousMatch.close()
+		pprint(len(self.annot.keys()))

+	def closeOutputFiles(self):
+		self.outputPerfectMatch.close()
+		self.outputPerfectMatchMulti.close()
+		self.outputAmbigousMatch.close()
+		self.outputAmbigousMatchMulti.close()

 	def parseOptions(self):
 		"""
@@ -46,6 +55,8 @@ class recalcCoords (object):
 		parser = argparse.ArgumentParser()
 		parser.add_argument("-i", "--inputDir", help="Parsed GMAP results directory", required=True)
 		parser.add_argument("-g", "--gff", help="Original GFF Annotation", required=True)
+		parser.add_argument("-r", "--refold", help="Original reference sequence (FASTA)", required=True)
+		parser.add_argument("-R", "--refnew", help="New reference sequence (FASTA)", required=True)
 		parser.add_argument("-o", "--outputdir", help="output directory", required=True)
 		self.parameters=parser.parse_args()

@@ -60,14 +71,140 @@ class recalcCoords (object):
 			print (" Successfully created the directory %s " % self.parameters.outputdir)

 		# create filehandler for results
-		self.outputGFF = open(self.parameters.outputdir+"/perfectMatches.gff3", "w")
+		self.outputPerfectMatch = open(self.parameters.outputdir+"/perfectMatches.gff3", "w")
 		self.outputPerfectMatchMulti = open(self.parameters.outputdir+"/perfectmatch_multi.gff3", "w")
 		self.outputAmbigousMatch = open(self.parameters.outputdir+"/ambigous_match.gff3", "w")
+		self.outputAmbigousMatchMulti = open(self.parameters.outputdir+"/ambigous_match_multi.gff3", "w")
+
+	def indexReferences(self):
+
+		# index previous reference sequence with pyfaidx
+		try:
+			self.oldRef = Fasta(self.parameters.refold)
+
+		except Exception as e:
+			print(" Could not index FASTA file %s " % str(self.parameters.refold))
+			raise e
+
+		else:
+			print(" Index Successfully created for %s " % str(self.parameters.refold))
+		
+		# index new reference sequence with pyfaidx
+		try:
+			self.newRef = Fasta(self.parameters.refnew)
+
+		except Exception as e:
+			print(" Could not index FASTA file %s " % str(self.parameters.refnew))
+
+			raise e
+		else:
+			print(" Index Successfully created for %s " % str(self.parameters.refnew))
+		

 	def loadAnnot(self, gff):
 		"""
 		loading original GFF annotation
 		"""
+		geneID=''
+		if os.path.isfile(gff):
+			with open(gff, 'r') as myGff:
+				gffContent = myGff.readlines()
+
+			for gffLine in gffContent:
+				if not gffLine.startswith('#'):
+					(chrom, source, feature_type, start,stop,score,strand,frame,attributes) = gffLine.split("\t")
+
+					attributes = self.getGffAttributes(attribute_string = attributes)
+
+					if feature_type == "gene":
+						#print("\n\n ###########################################################")
+						#print(" Found new gene feature %s with attributes " % str(geneID))
+						#pprint(attributes)
+
+						# set current gene ID
+						geneID=attributes['ID']
+
+						self.annot[geneID] = [gffLine]
+					else:
+						self.annot[geneID].append(gffLine)
+
+						#print(" >>>>>>>>>>> Found new sub-feature %s with attributes " % str(attributes['ID']))
+						#pprint(attributes)
+			
+	def recalcFromGMAP(self):
+		"""
+		get different classes of anchored genes and recalculate their coords them
+		"""
+
+		self.recalcPerfect(psl = self.parameters.inputDir+'/perfectmatch.psl')
+
+	def recalcPerfect(self, psl):
+		print("  Recalc coord for genes with perfect match: %s" % str(psl))
+
+		with open(psl, 'r') as mypsl:
+			pslContent = mypsl.readlines()
+		
+		for pslLine in pslContent:
+			if not pslLine.startswith('#'):
+
+				psl_result = pslLine.rstrip().split("\t")
+				regionstart=self.getStartRegionFromQueryName(queryname=psl_result[9])
+				geneID = self.getGeneIdFromQueryName(queryname=psl_result[9])
+				annot = self.annot[geneID]
+				print(" current Perfect Hit for query %s: %s" % (str(geneID),str(psl_result)))
+
+				for feature in annot:
+					gff=feature.split("\t")
+					attributes=self.getGffAttributes(gff[-1])
+					print(" current feature:")
+					pprint(gff)
+
+					newchrom=psl_result[13]
+					tstart = int(psl_result[15])
+					tstop = int(psl_result[16])
+					tstrand = psl_result[8]
+
+					qchrom=gff[0]
+					qstart=int(gff[3])
+					qstop=int(gff[4])
+					qstrand=gff[6]
+
+					if tstrand == '+':
+						newstart = (tstart + ( qstart - regionstart ))
+						newstop = (tstart + (qstop - qstart +1 ) )
+
+						# check sequences
+						fastaold=self.oldRef[qchrom][qstart:qstop].seq 
+						fastanew=self.newRef[newchrom][newstart:newstop].seq
+
+
+						if fastaold == fastanew:
+							print(" old and new sequence are the same")
+						else:
+							print(" error while converting sequences: the sequences string are not the same")
+							pprint(fastaold)
+							pprint(fastanew)
+
+
+					input()
+
+	def getGeneIdFromQueryName(self, queryname):
+		geneID=queryname.split(':')[0]
+		return geneID
+
+	def getStartRegionFromQueryName(self,queryname):
+		start = queryname.split(':')[-1].split('-')[0]
+		return int(start)
+
+	def getGffAttributes(self, attribute_string):
+		attributes_list = attribute_string.split(';')
+		attributes_dict = {}
+
+		for attr in attributes_list:
+			(tag,value) = attr.split('=')
+			attributes_dict[tag] = value
+
+		return(attributes_dict)


 if __name__ == "__main__":