Newer
Older
from collections import defaultdict
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio import SeqIO
#### command line arguments
inputMarkersFlankingGenes = sys.argv[1] # table with mapped flanking ISBPs from genes
outputDir = sys.argv[2] # output directory ...
genomeQuery = sys.argv[3] # fasta file of the v1 genome (must be fai indexed)
genomeTarget = sys.argv[4] # fasta file of the v2 genome (must be fai indexed)
markerPosOnTarget = sys.argv[5] # bed file of new coords of markers on target reference genome
#### generic data structure
# dict with summary of blat alignements: target region, coverage, indels, missmatches, anchoring status
blatResults = defaultdict()
def main():
### check for input files and output directory
checkFile(file=inputMarkersFlankingGenes)
if ( checkDir(directory=outputDir) ):
sys.stdout.write( " Output directory already exists, no need to create it\n")
else:
sys.stdout.write( " Creating output directory %s" % str(outputDir))
os.mkdir( outputDir, 0o750 )
### open for reading the query and target reference file
queryFasta = pysam.FastaFile(genomeQuery)
targetFasta = pysam.FastaFile(genomeTarget)
### save coords of marker on target reference
markersOnTarget_dict = getCoordsFromBed(bedfile=markerPosOnTarget)
### read input file containing the flanking IBSP markers for each gene
##### variables
numlines=0
index_result_dir=0
prefixResultDir=outputDir+'/temp/'+str(index_result_dir).zfill(6)
##### output files
with open(inputMarkersFlankingGenes) as file:
for line in file.readlines():
# increment counters
# because to avoid having a big result dir with +100k subdirs,
# the scripts creates subdirs for every 1k genes
numlines+=1
if numlines % 1000 == 0:
index_result_dir+=1
prefixResultDir=outputDir+'/temp/'+str(index_result_dir).zfill(6)
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# define the working dir for the current gene analysis
workingDir=prefixResultDir+'/'+str(numlines).zfill(6)
os.makedirs(workingDir, 0o750)
# deal with data
lineArray = line.rstrip("\n").split("\t")
print("\n\n------------------------------------------")
print(" Results of gene %s going into subdir %s" % (lineArray[0], str(workingDir)))
print(" array of data: ")
print(lineArray)
# check if both anchors are mapped on the same chromosome
geneDict={ 'geneId': lineArray[0],
'geneChrom':lineArray[1],
'geneStart':lineArray[2],
'geneStop':lineArray[3],
'geneScore':lineArray[4],
'geneStrand':lineArray[5],
'marker5pChrom':lineArray[6],
'marker5pStart':lineArray[7],
'marker5pStop':lineArray[8],
'marker5pId':lineArray[9],
'marker5pDistance':lineArray[10],
'marker3pChrom':lineArray[11],
'marker3pStart':lineArray[12],
'marker3pStop':lineArray[13],
'marker3pId':lineArray[14],
'marker3pDistance':lineArray[15],
'regionLengthOnQuery':lineArray[16]
}
print(" Gene directory:")
print(geneDict)
anchor5pTargetCoods_dict= {'chrom': 0, 'start':0, 'stop':0}
anchor3pTargetCoods_dict= {}
if geneDict['marker5pChrom'] == '.' or geneDict['marker3pChrom'] == '.':
# deal with genes with one anchor missing (basecally start and end of chromosomes)
sys.stderr.write(" current gene %s has missing anchor on one side \n" % geneDict['geneId'])
if geneDict['marker5pId'] == '.':
print(' Missing 5prime anchor: using genomic sequence from the start of chromosome')
geneDict['marker5pChrom'] = geneDict['marker3pChrom'].replace('chr','Chr')
geneDict['marker5pStart'] = 0
geneDict['marker5pStop'] = 0
geneDict['marker5pId'] = geneDict['marker5pChrom']
anchor5pTargetCoods_dict.update(chrom= geneDict['marker5pChrom'], start= 1, stop=1)
anchor3pTargetCoods_dict.update(markersOnTarget_dict[geneDict['marker3pId']])
elif geneDict['marker3pId'] == '.':
print(' Missing 3prime anchor: using genomic sequence to the end of chromosome')
geneDict['marker3pChrom'] = geneDict['marker5pChrom'].replace('chr','Chr')
geneDict['marker3pStart'] = targetFasta.get_reference_length(geneDict['marker3pChrom'])
geneDict['marker3pStop'] = targetFasta.get_reference_length(geneDict['marker3pChrom'])
geneDict['marker3pId'] = geneDict['marker3pChrom']
anchor3pTargetCoods_dict.update(chrom= geneDict['marker3pChrom'], start=geneDict['marker3pStart'], stop=geneDict['marker3pStop'])
anchor5pTargetCoods_dict.update(markersOnTarget_dict[geneDict['marker5pId']])
else:
print(" Problem with the anchors: canno find 5prime and 3prime anchors for the gene {}".format(geneDict['geneId']))
else:
# get mapping of the flanking anchors on the target genome
anchor5pTargetCoods_dict.update(markersOnTarget_dict[geneDict['marker5pId']])
anchor3pTargetCoods_dict.update(markersOnTarget_dict[geneDict['marker3pId']])
print(" anchors 5prime:\n")
print(anchor5pTargetCoods_dict)
print(" anchors 3prime:\n")
print(anchor3pTargetCoods_dict)
if anchor5pTargetCoods_dict['chrom'] == anchor3pTargetCoods_dict['chrom']:
print(" both anchors are mapped on the same chrom. we can proceed to sequence extraction and mapping\n")
### deal with query sequence: extract the fasta seq to map
queryGeneSeq=getFastaSeq(fasta=queryFasta,
chrom=geneDict['geneChrom'],
start=int(geneDict['geneStart']),
stop=int(geneDict['geneStop']))
Seq(queryGeneSeq, IUPAC.ambiguous_dna),
id=geneDict['geneId'],
description='coords '+geneDict['geneChrom']+'_'+str(geneDict['geneStart'])+'-'+str(geneDict['geneStop'])+', flanking markers '+ geneDict['marker5pId']+'-'+geneDict['marker3pId'])
#print(querySeqRecord)
queryFasta2blast = workingDir+'/query.fasta'
SeqIO.write(querySeqRecord, queryFasta2blast, 'fasta')
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
### deal with target sequence: extract the fasta seq to use a db
# fisrt check orientation:
if anchor5pTargetCoods_dict['start'] > anchor3pTargetCoods_dict['stop']:
# invert start and stop
anchor5pTargetCoods_dict['start'],anchor3pTargetCoods_dict['stop'] = anchor3pTargetCoods_dict['stop'],anchor5pTargetCoods_dict['start']
targetGenomeSeq=getFastaSeq(fasta=targetFasta,
chrom=anchor5pTargetCoods_dict['chrom'],
start=int(anchor5pTargetCoods_dict['start']),
stop=int(anchor3pTargetCoods_dict['stop']))
targetSeqRecord= SeqRecord(
Seq(targetGenomeSeq, IUPAC.ambiguous_dna),
id='target_'+anchor5pTargetCoods_dict['chrom']+'_'+str(anchor5pTargetCoods_dict['start'])+'_'+str(anchor3pTargetCoods_dict['stop']))
targetFasta2blast = workingDir+'/target.fasta'
SeqIO.write(targetSeqRecord, targetFasta2blast, 'fasta')
################### run Blat of the genomic sequence on the target region on new assembly
blatResultFile=runBlat(db=targetFasta2blast,
query=queryFasta2blast,
blatResult=workingDir+'/blat.txt',
outFormat='psl')
############ check if the genomic align perfectly on the new reference
checkPerfectHit(blatresult=blatResultFile,
workingDir=workingDir,
maxhit=1)
os.remove(queryFasta2blast)
os.remove(targetFasta2blast)
sys.stderr.write(" 5prime anchor %s and 3prime anchor %s are not on the same chromsome\n" % (geneDict['marker5pId'],geneDict['marker3pId']))
sys.stderr.write(" Cannot anchoring gene %s \n" % geneDict['geneId'])
# #input()
# if numlines == 150:
# break
def checkPerfectHit(blatresult, workingDir, maxhit):
print(" blatResults %s" % blatresult)
hitIndex=0
with open(blatresult) as blat:
for hit in blat.readlines():
hitIndex+=1
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
if hitIndex > maxhit:
sys.stderr.write(" Tried more than {} hits: giving up for this gene".format(maxhit))
break
print(" -> #%i Blat hit %s " % (hitIndex,hit))
if parsePSL(pslRecord=hit) :
print(" we can anchor this gene")
break
else:
print(" we cannot anchor this gene\n")
print(" adding mummer/nucmer alignment info to this gene\n")
### add more alignemnt info using mummer/nucmer
# nucmer: perform the alignment
nucmerPref=workingDir+'/nucmer'
ref=workingDir+'/target.fasta'
query=workingDir+'/query.fasta'
os.system('nucmer -p {} {} {}'.format(nucmerPref,ref,query))
# mummerplot: generate the dot plot in png format
mummerplotPref=workingDir+'/mummerplot'
os.system('mummerplot --SNP -t png -p {} {}'.format(mummerplotPref,nucmerPref+'.delta'))
# dnadiff: show potential snps and gaps
dnadiffPref=workingDir+'/dnadiff'
os.system('dnadiff -p {} -d {}'.format(dnadiffPref,nucmerPref+'.delta'))
### add another blat format output for visualisation with ACT
blatfile=workingDir+'/blast_m8.tab'
os.system(' blat -extendThroughN -out=blast8 {} {} {}'.format(ref,query,blatfile))
run_act_file=workingDir+'/run_act.sh'
os.system(' echo act query.fasta blast_m8.tab target.fasta > {}'.format(blatfile, run_act_file))
def parsePSL(pslRecord):
cov = ''
missmatches = ''
indels = ''
pslData=pslRecord.rstrip('\n').split('\t')
##### calc the % coverage
# pc_cov = (num_matche + num_mm + num_N) / length(querySeq) * 100
# add N count if we have some
# BLAT take into acount NN stretches with the '-extendThroughN' parameter.
sumCov = int(pslData[0]) + int(pslData[1]) + int(pslData[3])
pc_cov = ( sumCov / int(pslData[10])) * 100
print(" - Coverage: {}".format(pc_cov))
##### get missmatches info
missmatches = int(pslData[1])
pc_mm = (missmatches / sumCov) * 100
print(" - missmatches: {} ({} %)".format(missmatches, pc_mm))
##### get indels info
# based on the column 8: Number of bases inserted into target
indelBases = int(pslData[7])
pc_indels = (indelBases / sumCov) * 100
print(" - indels: {} ({} %)".format(indelBases, pc_indels))
##### decide if we remap or not the gene
if int(float(pc_cov)) < 100 or int(indelBases) > 0:
print(" Gene {} has indel or is not fully covered: CANNOT ANCHOR IT\n".format(pslData[9]))
return 0
else:
if missmatches == 0:
print(" Gene {} is a perfect full match: anchoring can be done!\n".format(pslData[9]))
return 1
else:
print(" Gene {} has missmatches warning in anchoring!\n".format(pslData[9]))
return 1
def runBlat(db,query,blatResult, outFormat):
cmd='blat -noHead -extendThroughN -out='+outFormat+' '+db+' '+query+' '+blatResult
print(' blat command to run: %s'% str(cmd))
os.system('blat -noHead -extendThroughN -out={} {} {} {}'.format(outFormat,db,query,blatResult))
return blatResult
def getFastaSeq(fasta,chrom,start,stop):
seq=fasta.fetch(reference=chrom, start=start,end=stop)
print(" fasta sequence for chrom %s from %s to %s\n" %(chrom,start,stop))
#print(seq)
def getCoordsFromBed (bedfile):
checkFile(file=bedfile)
bedDict = defaultdict()
with open(bedfile) as file:
for line in file.readlines():
bed=line.rstrip("\n").split("\t")
if len(bed) < 4:
sys.stderr.write(" ERROR PARSING BEDFILE %s " % str(bedfile))
sys.stderr.write(" File do ot have 4 columns or is not tab delimited\n")
sys.stderr.write(" Please check the format\n")
sys.exit()
#print(" bed array content ")
#print(bed)
bedDict[str(bed[3])] ={'chrom': bed[0],'start':bed[1],'stop':bed[2],'mapQ': bed[4],'strand':bed[5]}
numRecords=len(bedDict.keys())
sys.stdout.write(" Found %i records in Bed file %s " % (numRecords, bedfile))
return bedDict
def checkFile (file):
if os.path.isfile(file):
sys.stdout.write(" File %s found\n" % str(file))
else:
sys.stderr.write(" ERROR: Cannot find file %s \n" % str(file))
sys.exit()
def checkDir(directory):
if os.path.isdir(directory):
sys.stdout.write(" Directory %s found\n" % str(directory))
sys.stderr.write(" Cannot find directory %s \n" % str(directory))
return 0