Skip to content
Snippets Groups Projects
Commit 8fd794b3 authored by Helene Rimbert's avatar Helene Rimbert
Browse files

update recalcCoordsFromGmap.py

parent 674db69b
No related branches found
No related tags found
Loading
......@@ -8,6 +8,7 @@ import numpy as np
import argparse # pour gerer les options
import re # pour les regexp
from pprint import pprint
from pyfaidx import Fasta
class recalcCoords (object):
......@@ -16,10 +17,14 @@ class recalcCoords (object):
# initialize mains objects here
self.parameters = None
self.annot = {}
self.anchoredGenes = {}
self.oldRef = {}
self.newRef = {}
self.stats = { 'num_perfect_anchored' : 0,
'num_perfect_multi_anchored' : 0,
'num_ambigous_match_anchored':0}
'num_ambigous_match_anchored':0,
'num_ambigous_multi_anchored':0}
self.version='1.0'
self.author='Helene Rimbert'
......@@ -31,13 +36,17 @@ class recalcCoords (object):
self.parseOptions()
self.createWorkingEnv()
self.loadAnnot(gff=self.parameters.gff)
self.indexReferences()
self.recalcFromGMAP()
self.closeOutputFiles()
pprint(self.stats)
self.outoutPerfectMatch.close()
self.outoutPerfectMatchMulti.close()
self.outoutAmbigousMatch.close()
pprint(len(self.annot.keys()))
def closeOutputFiles(self):
self.outputPerfectMatch.close()
self.outputPerfectMatchMulti.close()
self.outputAmbigousMatch.close()
self.outputAmbigousMatchMulti.close()
def parseOptions(self):
"""
......@@ -46,6 +55,8 @@ class recalcCoords (object):
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--inputDir", help="Parsed GMAP results directory", required=True)
parser.add_argument("-g", "--gff", help="Original GFF Annotation", required=True)
parser.add_argument("-r", "--refold", help="Original reference sequence (FASTA)", required=True)
parser.add_argument("-R", "--refnew", help="New reference sequence (FASTA)", required=True)
parser.add_argument("-o", "--outputdir", help="output directory", required=True)
self.parameters=parser.parse_args()
......@@ -60,14 +71,140 @@ class recalcCoords (object):
print (" Successfully created the directory %s " % self.parameters.outputdir)
# create filehandler for results
self.outputGFF = open(self.parameters.outputdir+"/perfectMatches.gff3", "w")
self.outputPerfectMatch = open(self.parameters.outputdir+"/perfectMatches.gff3", "w")
self.outputPerfectMatchMulti = open(self.parameters.outputdir+"/perfectmatch_multi.gff3", "w")
self.outputAmbigousMatch = open(self.parameters.outputdir+"/ambigous_match.gff3", "w")
self.outputAmbigousMatchMulti = open(self.parameters.outputdir+"/ambigous_match_multi.gff3", "w")
def indexReferences(self):
# index previous reference sequence with pyfaidx
try:
self.oldRef = Fasta(self.parameters.refold)
except Exception as e:
print(" Could not index FASTA file %s " % str(self.parameters.refold))
raise e
else:
print(" Index Successfully created for %s " % str(self.parameters.refold))
# index new reference sequence with pyfaidx
try:
self.newRef = Fasta(self.parameters.refnew)
except Exception as e:
print(" Could not index FASTA file %s " % str(self.parameters.refnew))
raise e
else:
print(" Index Successfully created for %s " % str(self.parameters.refnew))
def loadAnnot(self, gff):
"""
loading original GFF annotation
"""
geneID=''
if os.path.isfile(gff):
with open(gff, 'r') as myGff:
gffContent = myGff.readlines()
for gffLine in gffContent:
if not gffLine.startswith('#'):
(chrom, source, feature_type, start,stop,score,strand,frame,attributes) = gffLine.split("\t")
attributes = self.getGffAttributes(attribute_string = attributes)
if feature_type == "gene":
#print("\n\n ###########################################################")
#print(" Found new gene feature %s with attributes " % str(geneID))
#pprint(attributes)
# set current gene ID
geneID=attributes['ID']
self.annot[geneID] = [gffLine]
else:
self.annot[geneID].append(gffLine)
#print(" >>>>>>>>>>> Found new sub-feature %s with attributes " % str(attributes['ID']))
#pprint(attributes)
def recalcFromGMAP(self):
"""
get different classes of anchored genes and recalculate their coords them
"""
self.recalcPerfect(psl = self.parameters.inputDir+'/perfectmatch.psl')
def recalcPerfect(self, psl):
print(" Recalc coord for genes with perfect match: %s" % str(psl))
with open(psl, 'r') as mypsl:
pslContent = mypsl.readlines()
for pslLine in pslContent:
if not pslLine.startswith('#'):
psl_result = pslLine.rstrip().split("\t")
regionstart=self.getStartRegionFromQueryName(queryname=psl_result[9])
geneID = self.getGeneIdFromQueryName(queryname=psl_result[9])
annot = self.annot[geneID]
print(" current Perfect Hit for query %s: %s" % (str(geneID),str(psl_result)))
for feature in annot:
gff=feature.split("\t")
attributes=self.getGffAttributes(gff[-1])
print(" current feature:")
pprint(gff)
newchrom=psl_result[13]
tstart = int(psl_result[15])
tstop = int(psl_result[16])
tstrand = psl_result[8]
qchrom=gff[0]
qstart=int(gff[3])
qstop=int(gff[4])
qstrand=gff[6]
if tstrand == '+':
newstart = (tstart + ( qstart - regionstart ))
newstop = (tstart + (qstop - qstart +1 ) )
# check sequences
fastaold=self.oldRef[qchrom][qstart:qstop].seq
fastanew=self.newRef[newchrom][newstart:newstop].seq
if fastaold == fastanew:
print(" old and new sequence are the same")
else:
print(" error while converting sequences: the sequences string are not the same")
pprint(fastaold)
pprint(fastanew)
input()
def getGeneIdFromQueryName(self, queryname):
geneID=queryname.split(':')[0]
return geneID
def getStartRegionFromQueryName(self,queryname):
start = queryname.split(':')[-1].split('-')[0]
return int(start)
def getGffAttributes(self, attribute_string):
attributes_list = attribute_string.split(';')
attributes_dict = {}
for attr in attributes_list:
(tag,value) = attr.split('=')
attributes_dict[tag] = value
return(attributes_dict)
if __name__ == "__main__":
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment