Skip to content
Snippets Groups Projects
Commit f494725a authored by Helene Rimbert's avatar Helene Rimbert
Browse files

refactoring in an OO way. recalc for strand + ok

parent fa8d2c66
No related branches found
No related tags found
No related merge requests found
......@@ -7,336 +7,423 @@ from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio import SeqIO
import argparse
import pysam
import sys
"""
Command line arguments
"""
blatMapping = sys.argv[1] # Blat result file from microMappingPipeline.py
summaryMapping = sys.argv[2] # Summary of the blat mapping from microMappingPipeline.py
outputDir = sys.argv[3] # output directory ...
genomeQuery = sys.argv[4] # fasta file of the v1 genome (must be fai indexed)
genomeTarget = sys.argv[5] # fasta file of the v2 genome (must be fai indexed)
annotQuery = sys.argv[6] # initial GFF file of the annotation
"""
Global var
"""
geneAnnot=defaultdict()
geneMapping=defaultdict()
newCoord=defaultdict()
outputGff=outputDir+'/targetAnnot.gff3'
def main():
"""
check for input files and output directory
"""
sys.stdout.write(' Check input files:')
checkFile(file=blatMapping)
checkFile(file=summaryMapping)
checkFile(file=annotQuery)
"""
Save blat mapping info into dict
"""
geneMapping=saveBlatResults(summary=summaryMapping, blat=blatMapping)
"""
Save Annotation infirmation from GFF file
geneAnnot is a dict with gene id as key
"""
geneAnnot=saveAnnotFromGFF(gff=annotQuery)
"""
For each gene, with full match (with or without missmatches)
"""
for geneID in sorted(geneAnnot.keys()):
print(" Dealing with gene {}\n".format(geneID))
blat=''
annot=geneAnnot[geneID]
if geneID in geneMapping.keys():
blat=geneMapping[geneID]
else:
sys.stderr.srite(" ERROR cannot find gene {} in blat data\n".format(geneID))
continue
numFeatures = len(annot)
# some logs
print(" Annotation features:")
print(annot)
print(" Found {} features for gene {}\n".format(numFeatures,geneID))
print(" Blat mapping:")
print(blat)
'''
Recalc the new coord
'''
newCoord = recalcCoord(
strand=blat['hitStrandOnTarget'],
queryChrom=blat['queryChrom'],
queryStart=blat['queryStart'],
queryStop=blat['queryStop'],
targetChrom=blat['targetChrom'],
targetStart=blat['targetStart'],
targetStop=blat['targetStop'],
hitStart=blat['hitStartOnTarget'],
hitStop=blat['hitStopOnTarget'],
annot=annot
)
input()
def recalcCoord(strand,queryChrom,queryStart,queryStop,targetChrom,targetStart,targetStop,hitStart,hitStop,annot):
newAnnotCoord=[]
transfertStatus=0
geneid=annot[0].split("\t")[8].split(';')[0]
if strand == '+':
print(' Hit on strand + of the target')
print(' target chrom {} starting from {} and ending at {}'.format(targetChrom,targetStart,targetStop))
for feature in annot:
feature=feature.rstrip('\n').split("\t")
print('************ recalc coord for feature {}'.format(feature))
# extract infos from col #9
gffInfoTable=feature[8].split(';')
# and transform it into a dict
gffInfoDict=defaultdict()
for infoPair in gffInfoTable:
(key,value)=infoPair.split('=')
gffInfoDict[key] = value
# recalc the start and stop of current feature
newStart=int(targetStart) + int(hitStart) -1 + (int(feature[3]) - int(queryStart))
newStop=int(targetStart) + int(hitStart) + (int(feature[4]) - int(queryStart))
newFeatureLength = newStop - newStart + 1
# strand stays the same as in query gff
newStrand=feature[6]
newChrom=targetChrom
print(' New start = {} new stop = {} : new feature length = {}'.format(newStart,newStop,newFeatureLength))
"""
extract the fasta sequence to compare the features before and after recalc
1. from query
"""
queryFeatureSeq=getFastaSeq(fasta=queryFasta,
chrom=queryChrom,
start=int(feature[3])-1,
stop=int(feature[4]))
querySeqRecord=SeqRecord(
Seq(queryFeatureSeq, IUPAC.ambiguous_dna),
id=gffInfoDict['ID'],
name=gffInfoDict['ID'],
description='Sequence extracted from ' + genomeQuery)
print(querySeqRecord)
"""
Extract FASTA
2. From target Sequence
"""
targetFeatureSeq=getFastaSeq(fasta=targetFasta,
chrom=newChrom,
start=int(newStart),
stop=int(newStop))
targetSeqRecord=SeqRecord(
Seq(targetFeatureSeq, IUPAC.ambiguous_dna),
id=gffInfoDict['ID'],
name=gffInfoDict['ID'],
description='Sequence extracted from ' + genomeTarget)
print(targetSeqRecord)
"""
Test if the 2 sequences are the same to validate the recalculation
"""
if queryFeatureSeq == targetFeatureSeq:
print( "the two sequences are the same \n" )
newFeature = feature
newFeature[0]=newChrom
newFeature[3]=newStart
newFeature[4]=newStop
newFeature[6]=newStrand
print(" new feature coords on target genome: {}".format(newFeature))
newAnnotCoord.append(newFeature)
# blatMapping = sys.argv[1] # Blat result file from microMappingPipeline.py
# summaryMapping = sys.argv[2] # Summary of the blat mapping from microMappingPipeline.py
# outputDir = sys.argv[3] # output directory ...
# genomeQuery = sys.argv[4] # fasta file of the v1 genome (must be fai indexed)
# genomeTarget = sys.argv[5] # fasta file of the v2 genome (must be fai indexed)
# annotQuery = sys.argv[6] # initial GFF file of the annotation
class recalcCoords (object):
def __init__(self):
"""
Global var
"""
self.geneAnnot=defaultdict()
self.geneMapping=defaultdict()
self.newCoord=defaultdict()
self.outputGff=''
self.geneMapping=defaultdict()
self.geneAnnot==defaultdict()
def main(self):
self.getOptions()
self.checkInputs()
self.openFastaHandlers()
self.openOutputGffHandler()
# Save blat mapping info into dict. self.geneMapping is a dict with gene id as key
self.saveBlatResults(summary=self.options.summaryMapping,blat=self.options.blat)
# Save Annotation information from GFF file. self.geneAnnot is a dict with gene id as key
self.saveAnnotFromGFF()
# For each gene, with full match (with or without missmatches), we try to recalc the coordinates
self.parseAnnotAndRecalc()
def parseAnnotAndRecalc(self):
for geneID in sorted(self.geneAnnot.keys()):
blat=''
annot=''
print(" Dealing with gene {}\n".format(geneID))
# save current anno for the working gene.
annot=self.geneAnnot[geneID]
if geneID in self.geneMapping.keys():
blat=self.geneMapping[geneID]
print(' Blast summary found: {}'.format(blat))
else:
sys.stderr.write(" error with feature {} :".format(gffInfoDict['ID']))
sys.stderr.write( "the 2 sequences are not the same: cannot transfert this gene ! \n" )
print('Query: {}'.format(queryFeatureSeq))
print(' Target: {}'.format(targetFeatureSeq))
transfertStatus=1
input()
elif strand == '-':
print(' Hit on strand - of the target')
print(' target chrom {} starting from {} and ending at {}'.format(targetChrom,targetStart,targetStop))
print(' we reverse the strand on the target')
for feature in annot:
feature=feature.rstrip('\n').split("\t")
print(' recalc coord for feature {}'.format(feature))
else:
sys.stderr.write(' Cannot parse strand of blat hit on target ')
"""
If at least one feature failed when recal coords,
aka the sequences are not the same in target and query,
we do not return the gene and its subfeatures
"""
if (transfertStatus == 1):
sys.stderr.write(" At least one feature failed when recalc coords for gene {}.\n".format(geneid))
return 0
else:
print(" New GFF:")
print(newAnnotCoord)
return newAnnotCoord
def saveAnnotFromGFF(gff):
sys.stdout.write(" 2. saving Query Annotation from file {} into Dict \n".format(gff))
sys.stdout.write("===========================================================================\n")
dataDict=defaultdict()
geneID=''
with open(gff) as gffRecords:
for line in gffRecords.readlines():
if line.startswith('#'):
sys.stderr.write(" ERROR cannot find gene {} in blat data\n".format(geneID))
# TODO:
continue
# transform gff line into array
(chrom,source,featureType,start,stop,score,strand,frame,info)=line.rstrip('\n').split('\t')
numFeatures = len(annot)
# save the last column with info tags
gffInfoTable=info.split(';')
# some logs
print(" Annotation features:")
print(annot)
print(" Found {} features for gene {}\n".format(numFeatures,geneID))
print(" Blat mapping data {}:".format(blat))
# and transform it into a dict
gffInfoDict=defaultdict()
for infoPair in gffInfoTable:
(key,value)=infoPair.split('=')
gffInfoDict[key] = value
if blat['mappingStatus'] == 'unmapped':
print(" Blat mapping status is {}".format(blat['mappingStatus']))
print(" We cannot go further for now")
# TODO:
if featureType == 'gene':
geneID=gffInfoDict['ID']
dataDict[geneID] = [line.rstrip('\n')]
elif blat['mappingStatus'] == 'imperfectMatch':
print(" Blat mapping status is {}".format(blat['mappingStatus']))
print(" refine mapping with exonerate or something")
# TODO:
else:
dataDict[geneID].append(line.rstrip('\n'))
numGenes=len(dataDict.keys())
sys.stdout.write(" Found {} genes in GFF file {}\n".format(numGenes,gff))
return dataDict
def saveBlatResults(summary, blat):
sys.stdout.write(" 1. saving blat results {} and summary {} into Dict \n".format(blat,summary))
sys.stdout.write("===========================================================================\n")
dataDict=defaultdict()
# SUMMARY DATA
with open(summary) as summaryData:
for line in summaryData.readlines():
#sys.stdout.write(' current record is {}'.format(line))
(geneName,markerPair,mappingStatus,queryChrom,markerStatus,markerPair2,geneName2,geneQueryStart,geneQueryEnd,geneQueryStrand,leftMakerId,leftMakerQueryLoc,leftMarkerTargetLoc,rightMarkerId,rightMakerQueryLoc,rightMarkerTargetLoc) = line.rstrip('\n').split("\t")
dataDict[geneName] = {
'geneId': geneName,
'gffRecords': 'NA',
'gffSubfeatures': [],
'targetStart':'NA',
'targetStop':'NA',
'targetChrom':'NA',
'hitStrandOnTarget':'NA',
'hitStartOnTarget':'NA',
'hitStopOnTarget':'NA',
'hitStrandOnTarget':'NA',
'queryStart':geneQueryStart,
'queryStop':geneQueryEnd,
'queryChrom':queryChrom,
'queryStrand':geneQueryStrand,
'mappingStatus':mappingStatus
}
numGenesInDict=len(dataDict.keys())
# BLAT MAPPING DATA
with open(blat) as blatData:
for line in blatData.readlines():
(geneName,markerPair,mappingStatus,mappingStatus2,blatCoverage,blatIndelBases,blatMissmatches,hitStartOnTarget,hitStopOnTarget,hitStrand,targetLength,targetName)=line.rstrip('\n').split('\t')
if geneName in dataDict.keys():
if mappingStatus == 'unmapped':
sys.stdout.write(" gene {} is not mapped: no target information to recover.\n".format(geneName))
'''
Recalc the new coord
'''
self.recalcCoord(
strand=blat['hitStrandOnTarget'],
queryChrom=blat['queryChrom'],
queryStart=blat['queryStart'],
queryStop=blat['queryStop'],
targetChrom=blat['targetChrom'],
targetStart=blat['targetStart'],
targetStop=blat['targetStop'],
hitStart=blat['hitStartOnTarget'],
hitStop=blat['hitStopOnTarget'],
annot=annot,
blatMapping=blat)
input()
def recalcCoord(self,strand,queryChrom,queryStart,queryStop,targetChrom,targetStart,targetStop,hitStart,hitStop,annot,blatMapping):
newAnnotCoord=[]
transfertStatus=0
geneid=annot[0].split("\t")[8].split(';')[0]
newCdsSequence=''
queryCdsSequence=''
if strand == '+':
print(' Hit on strand + of the target')
print(' target chrom {} starting from {} and ending at {}'.format(targetChrom,targetStart,targetStop))
for feature in annot:
feature=feature.rstrip('\n').split("\t")
print('************ recalc coord for feature {}'.format(feature))
# extract infos from col #9
gffInfoTable=feature[8].split(';')
# and transform it into a dict
gffInfoDict=defaultdict()
for infoPair in gffInfoTable:
(key,value)=infoPair.split('=')
gffInfoDict[key] = value
# recalc the start and stop of current feature
newStart=int(targetStart) + int(hitStart) -1 + (int(feature[3]) - int(queryStart))
newStop=int(targetStart) + int(hitStart) + (int(feature[4]) - int(queryStart))
newFeatureLength = newStop - newStart + 1
# strand stays the same as in query gff
newStrand=feature[6]
newChrom=targetChrom
print(' New start = {} new stop = {} : new feature length = {}'.format(newStart,newStop,newFeatureLength))
# save the new coords for mrna and gene features
if feature[2] in ['gene','mRNA']:
newFeature = feature
newFeature[0]=newChrom
newFeature[3]=newStart
newFeature[4]=newStop
newFeature[6]=newStrand
print(" new feature coords on target genome: {}".format(newFeature))
newAnnotCoord.append(newFeature)
continue
"""
CHECK FOR CDS/UTR/EXONS integrity:
extract the fasta sequence to compare the UTR/EXON/CDS features before and after recalc
1. from query sequence
"""
queryFeatureSeq=self.getFastaSeq(fasta=self.queryFasta,
chrom=queryChrom,
start=int(feature[3])-1,
stop=int(feature[4]))
querySeqRecord=SeqRecord(
Seq(queryFeatureSeq, IUPAC.ambiguous_dna),
id=gffInfoDict['ID'],
name=gffInfoDict['ID'],
description='feature ' + feature[2] + ' of sequence extracted from ' + self.options.queryGenome)
#print(querySeqRecord,len(querySeqRecord))
"""
2. From target Sequence
"""
targetFeatureSeq=self.getFastaSeq(fasta=self.targetFasta,
chrom=newChrom,
start=int(newStart),
stop=int(newStop))
targetSeqRecord=SeqRecord(
Seq(targetFeatureSeq, IUPAC.ambiguous_dna),
id=gffInfoDict['ID'],
name=gffInfoDict['ID'],
description='feature ' + feature[2] + ' of sequence extracted from ' + self.options.targetGenome)
#print(targetSeqRecord,len(targetSeqRecord))
"""
Concat the cds sequence if current feature is cds
"""
if feature[2] == 'CDS':
newCdsSequence+=str(targetSeqRecord.seq)
queryCdsSequence+=str(querySeqRecord.seq)
"""
Test if the 2 sequences are the same to validate the recalculation
"""
seq1=str(querySeqRecord.seq).upper()
seq2=str(targetSeqRecord.seq).upper()
if seq1 == seq2:
print( "the two sequences are the same \n" )
newFeature = feature
newFeature[0]=newChrom
newFeature[3]=newStart
newFeature[4]=newStop
newFeature[6]=newStrand
print(" new feature coords on target genome: {}".format(newFeature))
newAnnotCoord.append(newFeature)
elif seq1 != seq2 and blatMapping['mappingStatus'] == 'fullMatchWithMissmatches':
print(" The two sequences are different but the blat status is fullMatchWithMissmatches so we will be lenient ")
newFeature = feature
newFeature[0]=newChrom
newFeature[3]=newStart
newFeature[4]=newStop
newFeature[6]=newStrand
print(" new feature coords on target genome: {}".format(newFeature))
newAnnotCoord.append(newFeature)
else:
(prefix,targetChrom,targetStart,targetStop)=targetName.split('_')
dataDict[geneName]['targetStart'] = targetStart
dataDict[geneName]['targetStop'] = targetStop
dataDict[geneName]['targetChrom'] = targetChrom
dataDict[geneName]['hitStrandOnTarget'] = hitStrand
dataDict[geneName]['hitStartOnTarget'] = hitStartOnTarget
dataDict[geneName]['hitStopOnTarget'] = hitStopOnTarget
sys.stderr.write(" error with feature {} :".format(gffInfoDict['ID']))
sys.stderr.write( "the 2 sequences are not the same: cannot transfert this gene ! \n" )
#print('Query: {}'.format(seq1))
#print(' Target: {}'.format(seq2))
transfertStatus=1
else:
sys.stderr.write(" ERROR: gene {} found in blat results but not in summary data! \n".format(geneName))
sys.exit()
sys.stdout.write(" Number of records in Gene Dict based on summaryData: {}\n".format(numGenesInDict))
return dataDict
def getFastaSeq(fasta,chrom,start,stop):
seq=fasta.fetch(reference=chrom, start=start,end=stop)
print(" fasta sequence for chrom %s from %s to %s\n" %(chrom,start,stop))
#print(seq)
return seq
def checkFile (file):
if os.path.isfile(file):
sys.stdout.write(" File %s found\n" % str(file))
else:
sys.stdout.write(" ERROR: Cannot find file %s \n" % str(file))
sys.exit()
def checkDir(directory):
if os.path.isdir(directory):
sys.stdout.write(" Directory %s found\n" % str(directory))
return 1
else:
sys.stdout.write(" Cannot find directory %s \n" % str(directory))
return 0
elif strand == '-':
if __name__== "__main__":
if ( checkDir(directory=outputDir) ):
sys.stdout.write( " 0. Output directory already exists, no need to create it\n")
print(' Hit on strand - of the target')
print(' target chrom {} starting from {} and ending at {}'.format(targetChrom,targetStart,targetStop))
print(' we reverse the strand on the target')
for feature in annot:
feature=feature.rstrip('\n').split("\t")
print(' recalc coord for feature {}'.format(feature))
else:
sys.stderr.write(' Cannot parse strand of blat hit on target ')
"""
If at least one feature failed when recal coords,
aka the sequences are not the same in target and query,
we do not return the gene and its subfeatures
"""
if (transfertStatus == 1 and strand == '+'):
sys.stderr.write(" At least one UTR/EXON/CDS feature failed when recalc coords for gene {}.\n".format(geneid))
return 0
elif (strand == '+'):
print(" New GFF:")
print(*newAnnotCoord, sep="\n")
if self.getTranslatedCDS(seq=newCdsSequence,strand=newStrand) != self.getTranslatedCDS(seq=queryCdsSequence,strand=feature[6]):
print("Translated CDS are NOT the same")
newAnnotCoord = self.addCdsWarningTag(annotArray=newAnnotCoord)
return newAnnotCoord
else:
print("next")
def addCdsWarningTag(self, annotArray):
for featureArray in annotArray:
featureArray[8]+=';CDS_altered=True'
return annotArray
def getTranslatedCDS(self, seq, strand):
#print(" Evaluating CDS sequence for {}".format(seq))
seqObject=Seq(seq, IUPAC.ambiguous_dna)
if (strand == '-'):
seqObject=seqObject.reverse_complement()
#print(" Rev Comp sequence is {}".format(seqObject))
pepObject=seqObject.translate()
#print(" Translated sequence is {}".format(pepObject))
def saveAnnotFromGFF(self):
sys.stdout.write(" 2. saving Query Annotation from file {} into Dict \n".format(self.options.gff))
sys.stdout.write("===========================================================================\n")
geneID=''
with open(self.options.gff) as gffRecords:
for line in gffRecords.readlines():
if line.startswith('#'):
continue
# transform gff line into array
(chrom,source,featureType,start,stop,score,strand,frame,info)=line.rstrip('\n').split('\t')
# save the last column with info tags
gffInfoTable=info.split(';')
else:
sys.stdout.write( " 0. Creating output directory %s" % str(outputDir))
os.mkdir( outputDir, 0o750 )
# and transform it into a dict
gffInfoDict=defaultdict()
for infoPair in gffInfoTable:
(key,value)=infoPair.split('=')
gffInfoDict[key] = value
if featureType == 'gene':
geneID=gffInfoDict['ID']
self.geneAnnot[geneID] = [line.rstrip('\n')]
else:
self.geneAnnot[geneID].append(line.rstrip('\n'))
numGenes=len(self.geneAnnot.keys())
sys.stdout.write(" Found {} genes in GFF file {}\n".format(numGenes,self.options.gff))
def saveBlatResults(self, summary, blat):
sys.stdout.write(" 1. saving blat results {} and summary {} into Dict \n".format(blat,summary))
sys.stdout.write("===========================================================================\n")
# SUMMARY DATA
with open(summary) as summaryData:
for line in summaryData.readlines():
#sys.stdout.write(' current record is {}'.format(line))
(geneName,markerPair,mappingStatus,queryChrom,markerStatus,markerPair2,geneName2,geneQueryStart,geneQueryEnd,geneQueryStrand,leftMakerId,leftMakerQueryLoc,leftMarkerTargetLoc,rightMarkerId,rightMakerQueryLoc,rightMarkerTargetLoc) = line.rstrip('\n').split("\t")
self.geneMapping[geneName] = {
'geneId': geneName,
'gffRecords': 'NA',
'gffSubfeatures': [],
'targetStart':'NA',
'targetStop':'NA',
'targetChrom':'NA',
'hitStrandOnTarget':'NA',
'hitStartOnTarget':'NA',
'hitStopOnTarget':'NA',
'hitStrandOnTarget':'NA',
'queryStart':geneQueryStart,
'queryStop':geneQueryEnd,
'queryChrom':queryChrom,
'queryStrand':geneQueryStrand,
'mappingStatus':mappingStatus
}
numGenesInDict=len(self.geneMapping.keys())
# BLAT MAPPING DATA
with open(blat) as blatData:
for line in blatData.readlines():
(geneName,markerPair,mappingStatus,mappingStatus2,blatCoverage,blatIndelBases,blatMissmatches,hitStartOnTarget,hitStopOnTarget,hitStrand,targetLength,targetName)=line.rstrip('\n').split('\t')
if geneName in self.geneMapping.keys():
if mappingStatus == 'unmapped':
#sys.stdout.write(" gene {} is not mapped: no target information to recover.\n".format(geneName))
continue
else:
(prefix,targetChrom,targetStart,targetStop)=targetName.split('_')
self.geneMapping[geneName]['targetStart'] = targetStart
self.geneMapping[geneName]['targetStop'] = targetStop
self.geneMapping[geneName]['targetChrom'] = targetChrom
self.geneMapping[geneName]['hitStrandOnTarget'] = hitStrand
self.geneMapping[geneName]['hitStartOnTarget'] = hitStartOnTarget
self.geneMapping[geneName]['hitStopOnTarget'] = hitStopOnTarget
else:
sys.stderr.write(" ERROR: gene {} found in blat results but not in summary data! \n".format(geneName))
sys.exit()
sys.stdout.write(" Number of records in Gene Dict based on summaryData: {}\n".format(numGenesInDict))
def getFastaSeq(self,fasta,chrom,start,stop):
seq=fasta.fetch(reference=chrom, start=start,end=stop)
print(" fasta sequence for chrom %s from %s to %s\n" %(chrom,start,stop))
#print(seq)
return seq
def checkFile (self,file):
if os.path.isfile(file):
sys.stdout.write(" File %s found\n" % str(file))
else:
sys.stdout.write(" ERROR: Cannot find file %s \n" % str(file))
sys.exit()
def checkDir(self,directory):
if os.path.isdir(directory):
sys.stdout.write(" Directory %s found\n" % str(directory))
return 1
else:
sys.stdout.write(" Cannot find directory %s \n" % str(directory))
return 0
def getOptions(self):
parser=argparse.ArgumentParser(description='Recalculate coordinates on a new reference sequence based on BLAT summary mapping')
parser.add_argument('-b', '--blat', help='Blat result file from microMappingPipeline.py', required=True)
parser.add_argument('-s', '--summaryMapping', help='Summary of the blat mapping from microMappingPipeline.py', required=True)
parser.add_argument('-o', '--outputDir', help='Ouput directory', required=True)
parser.add_argument('-q', '--queryGenome', help='fasta file of the v1 genome (must be fai indexed)', required=True)
parser.add_argument('-t', '--targetGenome', help='fasta file of the v2 genome (must be fai indexed)', required=True)
parser.add_argument('-g', '--gff', help='initial GFF file of the annotation from the v1 genome', required=True)
self.options=parser.parse_args()
def openFastaHandlers(self):
"""
open for reading the query and target reference file
This will be used to extract the dna sequences before and after recalculation to ensure that the sequences are the same
and that the calculation is correct
"""
sys.stdout.write(' Open FASTA files for sequence extraction\n')
self.queryFasta = pysam.FastaFile(self.options.queryGenome)
self.targetFasta = pysam.FastaFile(self.options.targetGenome)
def openOutputGffHandler(self):
self.outputGff=self.options.outputDir+'/targetAnnot.gff3'
self.outputGffFH=open(self.outputGff, 'w')
def checkInputs(self):
"""
check for input files and output directory
"""
sys.stdout.write(' Check input files:')
self.checkFile(file=self.options.blat)
self.checkFile(file=self.options.summaryMapping)
self.checkFile(file=self.options.gff)
if ( self.checkDir(directory=self.options.outputDir) ):
sys.stdout.write( " 0. Output directory already exists, no need to create it\n")
else:
sys.stdout.write( " 0. Creating output directory %s" % str(self.options.outputDir))
os.mkdir( self.options.outputDir, 0o750 )
self.checkFile(file=self.options.queryGenome)
self.checkFile(file=self.options.targetGenome)
if __name__== "__main__":
checkFile(file=genomeQuery)
checkFile(file=genomeTarget)
run=recalcCoords()
run.main()
"""
open for reading the query and target reference file
This will be used to extract the dna sequences before and after recalculation to ensure that the sequences are the same
and that the calculation is correct
"""
sys.stdout.write(' Open FASTA files for sequence extraction\n')
queryFasta = pysam.FastaFile(genomeQuery)
targetFasta = pysam.FastaFile(genomeTarget)
outputGffFH=open(outputGff, 'w')
main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment