Newer
Older
from collections import defaultdict
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio import SeqIO
#### command line arguments
#inputMarkersFlankingGenes = sys.argv[1] # table with mapped flanking ISBPs from genes
marker5prime = sys.argv[1] # output of bedtools closest for the upstreams markers (5 first markers, in case the closest is not on the same chromosome)
marker3prime = sys.argv[2] # output of bedtools closest for the downstreams markers (5 first markers, in case the closest is not on the same chromosome)
outputDir = sys.argv[3] # output directory ...
genomeQuery = sys.argv[4] # fasta file of the v1 genome (must be fai indexed)
genomeTarget = sys.argv[5] # fasta file of the v2 genome (must be fai indexed)
markerPosOnTarget = sys.argv[6] # bed file of new coords of markers on target reference genome
#### generic data structure
# dict with summary of blat alignements: target region, coverage, indels, missmatches, anchoring status
blatResults = defaultdict()
outputTable_file=outputDir+'/mappingSummary.csv'
outputBlatFile=outputDir+'/allBlat.csv'
mappingResults = defaultdict()
def main():
### check for input files and output directory
checkFile(file=marker5prime)
checkFile(file=marker3prime)
### open for reading the query and target reference file
queryFasta = pysam.FastaFile(genomeQuery)
targetFasta = pysam.FastaFile(genomeTarget)
### save coords of marker on target reference
markersOnTarget_dict = getCoordsFromBed(bedfile=markerPosOnTarget)
### save list of flanking markers for each genes , and also their coordinates on the query genome
flankingMarkersPerGenes,markersOnQuery_dict = getFlankFromBedtoolsClosest(upstream=marker5prime, downstream = marker3prime)
### read input file containing the flanking IBSP markers for each gene
##### variables
numlines=0
index_result_dir=0
prefixResultDir=outputDir+'/temp/'+str(index_result_dir).zfill(6)
##### now loop over all genes in flankingMarkersPerGenes dict
for gene in flankingMarkersPerGenes:
numlines+=1
if numlines % 1000 == 0:
index_result_dir+=1
prefixResultDir=outputDir+'/temp/'+str(index_result_dir).zfill(6)
### define the working dir for the current gene analysis
workingDir=prefixResultDir+'/'+str(numlines).zfill(6)
os.makedirs(workingDir, 0o750)
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
### deal with data output dir
print("\n\n------------------------------------------")
print(" Results of gene %s going into subdir %s" % (gene, str(workingDir)))
# print(" gene data:")
# print(flankingMarkersPerGenes[gene])
##################################################################################################################################################################
### Generate all paris of flanking markers and check if we can anchor the gene on target genome
##################################################################################################################################################################
# remember that for each gene, I have 5 closest markers saved into tuple,
# I will try first pair and check if they are on the same chromosome or not ...
markerPairs=generateMarkerPairs(
array5prime=flankingMarkersPerGenes[gene]['5prime'],
array3prime=flankingMarkersPerGenes[gene]['3prime'])
### initialize dict with all coordinates of markers on query and target seq
markerCoords=defaultdict()
pairUsed = 'NA'
markerPairStatus = 0
### check if markers are mapped on same target chromosome
for pair in markerPairs:
upstreamMarker,downsteamMarker = pair.split('|')
print(" check pair with upstream marker {} and downstream marker {}".format(upstreamMarker,downsteamMarker))
if( upstreamMarker in markersOnTarget_dict) and (downsteamMarker in markersOnTarget_dict) and (markersOnTarget_dict[upstreamMarker]['chrom'] == markersOnTarget_dict[downsteamMarker]['chrom']):
print(" Markers are on same chrom on target genome. this is good")
markerCoords['upstream'] = {
'id': upstreamMarker,
'query': markersOnQuery_dict[upstreamMarker],
'target': markersOnTarget_dict[upstreamMarker]
}
markerCoords['downstream'] = {
'id': downsteamMarker,
'query' : markersOnQuery_dict[downsteamMarker],
'target': markersOnTarget_dict[downsteamMarker]
}
markerPairStatus = 1
pairUsed = pair
break
elif upstreamMarker == 'NA' and downsteamMarker in markersOnTarget_dict:
"""
Here, we do not have any marker upstream:
we use the start of the target chromosome from the downstream marker as coordinates
"""
print(" Missing marker upstream of gene")
print(" using the start of the target chromosome of the downstream marker")
markerCoords['upstream'] = {
'id': 'start'+markersOnTarget_dict[downsteamMarker]['chrom'],
'query': markersOnQuery_dict[upstreamMarker],
'target': {
'strand': '+',
'mapQ':'NA',
'stop':1,
'start':0,
'chrom':markersOnTarget_dict[downsteamMarker]['chrom']
}
markerCoords['downstream'] = {
'id': downsteamMarker,
'query' : markersOnQuery_dict[downsteamMarker],
'target': markersOnTarget_dict[downsteamMarker]
}
markerPairStatus = 1
pairUsed = 'start'+markersOnTarget_dict[downsteamMarker]['chrom']+'|'+downsteamMarker
break
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
elif downsteamMarker == 'NA' and upstreamMarker in markersOnTarget_dict:
"""
Here, we do not have any marker downstream:
we use the end of the target chromosome from the upstream marker as coordinates
targetFasta.get_reference_length(geneDict['marker3pChrom'])
"""
print(" Missing marker downstream of gene")
print(" using the end of the target chromosome of the upstream marker")
markerCoords['upstream'] = {
'id': upstreamMarker,
'query': markersOnQuery_dict[upstreamMarker],
'target': markersOnTarget_dict[upstreamMarker]
}
markerCoords['downstream'] = {
'id': 'end'+markersOnTarget_dict[upstreamMarker]['chrom'],
'query' : markersOnQuery_dict[downsteamMarker],
'target': {
'strand': '+',
'mapQ':'NA',
'stop':targetFasta.get_reference_length(markersOnTarget_dict[upstreamMarker]['chrom']),
'start':targetFasta.get_reference_length(markersOnTarget_dict[upstreamMarker]['chrom']),
'chrom':markersOnTarget_dict[upstreamMarker]['chrom']
}
}
markerPairStatus = 1
pairUsed = 'end'+markersOnTarget_dict[upstreamMarker]['chrom']+'|'+upstreamMarker
break
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
print(" Markers are not on the same target chromosome: {} and {}.".format(markersOnTarget_dict[upstreamMarker]['chrom'],markersOnTarget_dict[downsteamMarker]['chrom']))
print(" trying with the next pair of markers")
continue
if markerPairStatus == 0:
"""
We could not identify a pair of flanking marker which map on the target genome
This gene cannot be anchored
we move on to the next gene
"""
print(" Could not identify a pair of marker to use to anchor the gene {}.".format(gene))
print(" going to next gene")
flankingMarkersPerGenes[gene]['markerPairStatus'] = 'noAnchorsOnTargetGenome'
flankingMarkersPerGenes[gene]['markerPairUsed'] = pairUsed
continue
else:
"""
We found a pair of markers on the target genome to use as anchor for the gene
We can extract the genomic region between these markers and use it as a target for blat
"""
print(" Pair of markers found to anchor the gene on the target genome")
print(" try now to extract the genomic sequences of query gene and target genome")
flankingMarkersPerGenes[gene]['markerPairStatus'] = 'markerPairedOnTarget'
flankingMarkersPerGenes[gene]['markerPairUsed'] = pairUsed
print(" marker IDs of flanking markers: {} ".format(pair))
print(markerCoords)
# print(" Generic gene info:")
# print(flankingMarkersPerGenes[gene])
"""
Extract fasta sequences of target and query genomic regions
============================================================================================================
"""
# 1) deal with query sequence: extract the fasta seq to map
queryGeneSeq=getFastaSeq(fasta=queryFasta,
chrom=flankingMarkersPerGenes[gene]['chrom'],
start=int(flankingMarkersPerGenes[gene]['start']),
stop=int(flankingMarkersPerGenes[gene]['stop']))
querySeqRecord= SeqRecord(
Seq(queryGeneSeq, IUPAC.ambiguous_dna),
id=flankingMarkersPerGenes[gene]['name'],
description='coords '+flankingMarkersPerGenes[gene]['chrom']+'_'+str(flankingMarkersPerGenes[gene]['start'])+'-'+str(flankingMarkersPerGenes[gene]['stop'])+', flanking markers '+ flankingMarkersPerGenes[gene]['markerPairUsed'])
#print(querySeqRecord)
queryFasta2blast = workingDir+'/query.fasta'
SeqIO.write(querySeqRecord, queryFasta2blast, 'fasta')
# 2) deal with target sequence: extract the fasta seq to use a db
# fisrt check orientation:
if int(markerCoords['upstream']['target']['start']) > int(markerCoords['downstream']['target']['stop']):
# invert start and stop
markerCoords['upstream']['target']['start'],markerCoords['downstream']['target']['stop'] = markerCoords['downstream']['target']['stop'] , markerCoords['upstream']['target']['start']
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
targetGenomeSeq=getFastaSeq(fasta=targetFasta,
chrom=markerCoords['upstream']['target']['chrom'],
start=int(markerCoords['upstream']['target']['start']),
stop=int(markerCoords['downstream']['target']['stop']))
targetSeqRecord= SeqRecord(
Seq(targetGenomeSeq, IUPAC.ambiguous_dna),
id='target_'+markerCoords['upstream']['target']['chrom']+'_'+str(markerCoords['upstream']['target']['start'])+'_'+str(markerCoords['downstream']['target']['stop']))
targetFasta2blast = workingDir+'/target.fasta'
SeqIO.write(targetSeqRecord, targetFasta2blast, 'fasta')
#print(targetSeqRecord)
"""
run Blat of the genomic sequence on the target region on new assembly
============================================================================================================
"""
blatResultFile=runBlat(db=targetFasta2blast,
query=queryFasta2blast,
blatResult=workingDir+'/blat.txt',
outFormat='psl')
"""
check if the genomic align perfectly on the new reference
============================================================================================================
"""
blatresult=checkPerfectHit(blatresult=blatResultFile,
workingDir=workingDir,
maxhit=1)
if blatresult['blatStatus'] == 'unmapped':
print(" No BLAT results found for gene {}".format(gene))
outputSummaryLine = createOutputLine(blat=blatresult, gene=flankingMarkersPerGenes[gene], markers=markerCoords )
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
else:
print(" BLAT result found for gene {} with status {}".format(gene,blatresult['blatStatus']))
#print(blatresult)
outputSummaryLine = createOutputLine(blat=blatresult, gene=flankingMarkersPerGenes[gene], markers=markerCoords )
if blatresult['blatStatus'] == 'fullPerfectMatch':
os.remove(queryFasta2blast)
os.remove(targetFasta2blast)
#input()
def createOutputLine(blat,gene,markers):
print("output line")
outputArray=[gene['name'],gene['markerPairUsed'],blat['blatStatus']]
blatArray=[gene['name'],gene['markerPairUsed'],blat['blatStatus']]
# loop over gene data content
for key,value in sorted(gene.items()):
#print(" key: {} value: {}".format(key,value))
if key == '3prime' or key == '5prime':
continue
outputArray.append(str(value))
# loop over marker info
upstreamMarkerInfo = [markers['upstream']['id'],
markers['upstream']['query']['chrom']+':'+str(markers['upstream']['query']['start'])+'-'+str(markers['upstream']['query']['stop']),
markers['upstream']['target']['chrom']+':'+str(markers['upstream']['target']['start'])+'-'+str(markers['upstream']['target']['stop'])]
downstreamMarkerInfo = [markers['downstream']['id'],
markers['downstream']['query']['chrom']+':'+str(markers['downstream']['query']['start'])+'-'+str(markers['downstream']['query']['stop']),
markers['downstream']['target']['chrom']+':'+str(markers['downstream']['target']['start'])+'-'+str(markers['downstream']['target']['stop'])]
outputArray+=upstreamMarkerInfo
outputArray+=downstreamMarkerInfo
# aggregate blat results
for key,value in sorted(blat.items()):
#print(" key: {} value: {}".format(key,value))
blatArray.append(str(value))
print('\t'.join(outputArray))
print('\t'.join(blatArray))
outputTableFH.write('\t'.join(outputArray)+"\n")
outputBlatFH.write('\t'.join(blatArray)+"\n")
def generateMarkerPairs(array5prime, array3prime):
"""
Generate all pairs of markers flanking the gene to anchor
This should be used when the first pair of flanking markers are not on the same target chromosome,
Because when this hapen, we cannot anchor the gene on the target genome
"""
pairs=[]
for up in array5prime:
for down in array3prime:
pairs.append(up+'|'+down)
print(" all marker pairs: {}".format(pairs))
return pairs
def getFlankFromBedtoolsClosest(upstream, downstream):
print(" Storing flanking markers info from bedtools closest results")
genesWithFlankISBP = defaultdict()
markersOnQuery = defaultdict()
with open(upstream) as upstreamresults:
for line in upstreamresults.readlines():
(gchrom, gstart,gstop,gid,gscore,gstrand,mchrom,mstart,mstop,mid,mdistance) = line.rstrip("\n").split("\t")
if mid == '.':
mchrom,mstart,mstop,mid,mdistance = 'NA','NA','NA','NA','NA'
if not gid in genesWithFlankISBP:
genesWithFlankISBP[gid] = {
'name':gid,
'chrom':gchrom,
'start':gstart,
'stop':gstop,
'strand':gstrand,
'5prime': [mid],
'3prime':[]
}
else:
genesWithFlankISBP[gid]['5prime'].append(mid)
markersOnQuery[mid]={'start':mstart,'chrom':mchrom, 'stop':mstop}
with open(downstream) as downstreamresults:
for line in downstreamresults.readlines():
(gchrom, gstart,gstop,gid,gscore,gstrand,mchrom,mstart,mstop,mid,mdistance) = line.rstrip("\n").split("\t")
if mid == '.':
mchrom,mstart,mstop,mid,mdistance = 'NA','NA','NA','NA','NA'
if gid in genesWithFlankISBP:
genesWithFlankISBP[gid]['3prime'].append(mid)
print(" Problem with gene {}: not already in the dict!!".format(gid))
markersOnQuery[mid]={'start':mstart,'chrom':mchrom, 'stop':mstop}
return genesWithFlankISBP,markersOnQuery
def checkPerfectHit(blatresult, workingDir, maxhit):
print(" blatResults %s" % blatresult)
hitIndex=0
blatResultDict = {
'blatStatus':'unmapped',
'cov':'NA',
'missmatches':'NA',
'indelBases':'NA',
'start':'NA',
'stop':'NA',
'strand':'NA',
'targetLength':'NA',
'targetName':'NA'}
cov = ''
missmatches = ''
indelBases = ''
if os.stat(blatresult).st_size == 0:
sys.stdout.write(" Blat result file is empty!")
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
sys.stdout.write(" Found Blat results to parse")
with open(blatresult) as blat:
for hit in blat.readlines():
hitIndex+=1
if hitIndex > maxhit:
sys.stdout.write(" Tried more than {} hits: giving up for this gene".format(maxhit))
break
print(" -> #%i Blat hit %s " % (hitIndex,hit))
pslData=hit.rstrip('\n').split('\t')
"""
calc the % coverage
pc_cov = (num_matche + num_mm + num_N) / length(querySeq) * 100
add N count if we have some
BLAT take into acount NN stretches with the '-extendThroughN' parameter.
"""
sumCov = int(pslData[0]) + int(pslData[1]) + int(pslData[3])
cov = ( sumCov / int(pslData[10])) * 100
print(" - Coverage: {}".format(cov))
"""
get missmatches info
"""
missmatches = int(pslData[1])
pc_mm = (missmatches / sumCov) * 100
print(" - missmatches: {} ({} %)".format(missmatches, pc_mm))
"""
get indels info
based on the column 8 & 5: Number of bases inserted into target and query
"""
indelBases = int(pslData[7]) + int(pslData[5])
pc_indels = (indelBases / sumCov) * 100
print(" - indels: {} ({} %)".format(indelBases, pc_indels))
blatResultDict['cov'] = cov
blatResultDict['missmatches'] = missmatches
blatResultDict['indelBases'] = indelBases
blatResultDict['start'] = pslData[15]
blatResultDict['stop'] = pslData[16]
blatResultDict['strand'] = pslData[8]
blatResultDict['targetLength'] = pslData[14]
blatResultDict['targetName'] = pslData[13]
##### decide if we remap or not the gene
if int(float(cov)) < 100 or int(indelBases) > 0:
print(" Gene {} has indel or is not fully covered: CANNOT ANCHOR IT\n".format(pslData[9]))
blatResultDict['blatStatus'] = 'imperfectMatch'
print(" we cannot anchor this gene\n")
print(" adding mummer/nucmer alignment info to this gene\n")
### add more alignemnt info using mummer/nucmer
# nucmer: perform the alignment
nucmerPref=workingDir+'/nucmer'
ref=workingDir+'/target.fasta'
query=workingDir+'/query.fasta'
os.system('nucmer -p {} {} {}'.format(nucmerPref,ref,query))
# mummerplot: generate the dot plot in png format
mummerplotPref=workingDir+'/mummerplot'
os.system('mummerplot --SNP -t png -p {} {}'.format(mummerplotPref,nucmerPref+'.delta'))
### add another blat format output for visualisation with ACT
blatfile=workingDir+'/blast_m8.tab'
os.system(' blat -extendThroughN -out=blast8 {} {} {}'.format(ref,query,blatfile))
run_act_file=workingDir+'/run_act.sh'
os.system(' echo act query.fasta blast_m8.tab target.fasta > {}'.format(blatfile, run_act_file))
else:
if missmatches == 0:
print(" Gene {} is a perfect full match: anchoring can be done!\n".format(pslData[9]))
blatResultDict['blatStatus'] = 'fullPerfectMatch'
print(" we can anchor this gene")
break
else:
print(" Gene {} has missmatches warning in anchoring!\n".format(pslData[9]))
blatResultDict['blatStatus'] = 'fullMatchWithMissmatches'
print(" we can anchor this gene")
break
return blatResultDict
def runBlat(db,query,blatResult, outFormat):
cmd='blat -noHead -extendThroughN -out='+outFormat+' '+db+' '+query+' '+blatResult
print(' blat command to run: %s'% str(cmd))
os.system('blat -noHead -extendThroughN -out={} {} {} {}'.format(outFormat,db,query,blatResult))
return blatResult
def getFastaSeq(fasta,chrom,start,stop):
seq=fasta.fetch(reference=chrom, start=start,end=stop)
print(" fasta sequence for chrom %s from %s to %s\n" %(chrom,start,stop))
#print(seq)
def getCoordsFromBed (bedfile):
checkFile(file=bedfile)
bedDict = defaultdict()
with open(bedfile) as file:
for line in file.readlines():
bed=line.rstrip("\n").split("\t")
if len(bed) < 4:
sys.stdout.write(" ERROR PARSING BEDFILE %s " % str(bedfile))
sys.stdout.write(" File do ot have 4 columns or is not tab delimited\n")
sys.stdout.write(" Please check the format\n")
sys.exit()
#print(" bed array content ")
#print(bed)
bedDict[str(bed[3])] ={'chrom': bed[0],'start':bed[1],'stop':bed[2],'mapQ': bed[4],'strand':bed[5]}
numRecords=len(bedDict.keys())
sys.stdout.write(" Found %i records in Bed file %s " % (numRecords, bedfile))
return bedDict
def checkFile (file):
if os.path.isfile(file):
sys.stdout.write(" File %s found\n" % str(file))
else:
sys.stdout.write(" ERROR: Cannot find file %s \n" % str(file))
sys.exit()
def checkDir(directory):
if os.path.isdir(directory):
sys.stdout.write(" Directory %s found\n" % str(directory))
sys.stdout.write(" Cannot find directory %s \n" % str(directory))
if ( checkDir(directory=outputDir) ):
sys.stdout.write( " Output directory already exists, no need to create it\n")
else:
sys.stdout.write( " Creating output directory %s" % str(outputDir))
os.mkdir( outputDir, 0o750 )
outputTableFH=open(outputTable_file, 'w')
outputBlatFH=open(outputBlatFile, 'w')