#!/usr/bin/env python3 # coding: utf-8 import os.path from os import path from collections import defaultdict from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord from Bio.Alphabet import IUPAC from Bio import SeqIO import pysam import sys #### command line arguments #inputMarkersFlankingGenes = sys.argv[1] # table with mapped flanking ISBPs from genes marker5prime = sys.argv[1] # output of bedtools closest for the upstreams markers (5 first markers, in case the closest is not on the same chromosome) marker3prime = sys.argv[2] # output of bedtools closest for the downstreams markers (5 first markers, in case the closest is not on the same chromosome) outputDir = sys.argv[3] # output directory ... genomeQuery = sys.argv[4] # fasta file of the v1 genome (must be fai indexed) genomeTarget = sys.argv[5] # fasta file of the v2 genome (must be fai indexed) markerPosOnTarget = sys.argv[6] # bed file of new coords of markers on target reference genome #### generic data structure # dict with summary of blat alignements: target region, coverage, indels, missmatches, anchoring status blatResults = defaultdict() outputTable_file=outputDir+'/mappingSummary.csv' outputBlatFile=outputDir+'/allBlat.csv' mappingResults = defaultdict() def main(): ### check for input files and output directory checkFile(file=marker5prime) checkFile(file=marker3prime) ### open for reading the query and target reference file queryFasta = pysam.FastaFile(genomeQuery) targetFasta = pysam.FastaFile(genomeTarget) ### save coords of marker on target reference markersOnTarget_dict = getCoordsFromBed(bedfile=markerPosOnTarget) ### save list of flanking markers for each genes , and also their coordinates on the query genome flankingMarkersPerGenes,markersOnQuery_dict = getFlankFromBedtoolsClosest(upstream=marker5prime, downstream = marker3prime) ### read input file containing the flanking IBSP markers for each gene ##### variables numlines=0 index_result_dir=0 prefixResultDir=outputDir+'/temp/'+str(index_result_dir).zfill(6) ##### now loop over all genes in flankingMarkersPerGenes dict for gene in flankingMarkersPerGenes: numlines+=1 if numlines % 1000 == 0: index_result_dir+=1 prefixResultDir=outputDir+'/temp/'+str(index_result_dir).zfill(6) ### define the working dir for the current gene analysis workingDir=prefixResultDir+'/'+str(numlines).zfill(6) os.makedirs(workingDir, 0o750) ### deal with data output dir print("\n\n------------------------------------------") print(" Results of gene %s going into subdir %s" % (gene, str(workingDir))) # print(" gene data:") # print(flankingMarkersPerGenes[gene]) ################################################################################################################################################################## ### Generate all paris of flanking markers and check if we can anchor the gene on target genome ################################################################################################################################################################## # remember that for each gene, I have 5 closest markers saved into tuple, # I will try first pair and check if they are on the same chromosome or not ... markerPairs=generateMarkerPairs( array5prime=flankingMarkersPerGenes[gene]['5prime'], array3prime=flankingMarkersPerGenes[gene]['3prime']) ### initialize dict with all coordinates of markers on query and target seq markerCoords=defaultdict() pairUsed = 'NA' markerPairStatus = 0 ### check if markers are mapped on same target chromosome for pair in markerPairs: upstreamMarker,downsteamMarker = pair.split('|') print(" check pair with upstream marker {} and downstream marker {}".format(upstreamMarker,downsteamMarker)) if( upstreamMarker in markersOnTarget_dict) and (downsteamMarker in markersOnTarget_dict) and (markersOnTarget_dict[upstreamMarker]['chrom'] == markersOnTarget_dict[downsteamMarker]['chrom']): print(" Markers are on same chrom on target genome. this is good") markerCoords['upstream'] = { 'id': upstreamMarker, 'query': markersOnQuery_dict[upstreamMarker], 'target': markersOnTarget_dict[upstreamMarker] } markerCoords['downstream'] = { 'id': downsteamMarker, 'query' : markersOnQuery_dict[downsteamMarker], 'target': markersOnTarget_dict[downsteamMarker] } markerPairStatus = 1 pairUsed = pair break elif upstreamMarker == 'NA' and downsteamMarker in markersOnTarget_dict: """ Here, we do not have any marker upstream: we use the start of the target chromosome from the downstream marker as coordinates """ print(" Missing marker upstream of gene") print(" using the start of the target chromosome of the downstream marker") markerCoords['upstream'] = { 'id': 'start'+markersOnTarget_dict[downsteamMarker]['chrom'], 'query': markersOnQuery_dict[upstreamMarker], 'target': { 'strand': '+', 'mapQ':'NA', 'stop':1, 'start':0, 'chrom':markersOnTarget_dict[downsteamMarker]['chrom'] } } markerCoords['downstream'] = { 'id': downsteamMarker, 'query' : markersOnQuery_dict[downsteamMarker], 'target': markersOnTarget_dict[downsteamMarker] } markerPairStatus = 1 pairUsed = 'start'+markersOnTarget_dict[downsteamMarker]['chrom']+'|'+downsteamMarker break elif downsteamMarker == 'NA' and upstreamMarker in markersOnTarget_dict: """ Here, we do not have any marker downstream: we use the end of the target chromosome from the upstream marker as coordinates targetFasta.get_reference_length(geneDict['marker3pChrom']) """ print(" Missing marker downstream of gene") print(" using the end of the target chromosome of the upstream marker") markerCoords['upstream'] = { 'id': upstreamMarker, 'query': markersOnQuery_dict[upstreamMarker], 'target': markersOnTarget_dict[upstreamMarker] } markerCoords['downstream'] = { 'id': 'end'+markersOnTarget_dict[upstreamMarker]['chrom'], 'query' : markersOnQuery_dict[downsteamMarker], 'target': { 'strand': '+', 'mapQ':'NA', 'stop':targetFasta.get_reference_length(markersOnTarget_dict[upstreamMarker]['chrom']), 'start':targetFasta.get_reference_length(markersOnTarget_dict[upstreamMarker]['chrom']), 'chrom':markersOnTarget_dict[upstreamMarker]['chrom'] } } markerPairStatus = 1 pairUsed = 'end'+markersOnTarget_dict[upstreamMarker]['chrom']+'|'+upstreamMarker break else: print(" Markers are not on the same target chromosome: {} and {}.".format(markersOnTarget_dict[upstreamMarker]['chrom'],markersOnTarget_dict[downsteamMarker]['chrom'])) print(" trying with the next pair of markers") continue if markerPairStatus == 0: """ We could not identify a pair of flanking marker which map on the target genome This gene cannot be anchored we move on to the next gene """ print(" Could not identify a pair of marker to use to anchor the gene {}.".format(gene)) print(" going to next gene") flankingMarkersPerGenes[gene]['markerPairStatus'] = 'noAnchorsOnTargetGenome' flankingMarkersPerGenes[gene]['markerPairUsed'] = pairUsed continue else: """ We found a pair of markers on the target genome to use as anchor for the gene We can extract the genomic region between these markers and use it as a target for blat """ print(" Pair of markers found to anchor the gene on the target genome") print(" try now to extract the genomic sequences of query gene and target genome") flankingMarkersPerGenes[gene]['markerPairStatus'] = 'markerPairedOnTarget' flankingMarkersPerGenes[gene]['markerPairUsed'] = pairUsed print(" marker IDs of flanking markers: {} ".format(pair)) print(markerCoords) # print(" Generic gene info:") # print(flankingMarkersPerGenes[gene]) """ Extract fasta sequences of target and query genomic regions ============================================================================================================ """ # 1) deal with query sequence: extract the fasta seq to map queryGeneSeq=getFastaSeq(fasta=queryFasta, chrom=flankingMarkersPerGenes[gene]['chrom'], start=int(flankingMarkersPerGenes[gene]['start']), stop=int(flankingMarkersPerGenes[gene]['stop'])) querySeqRecord= SeqRecord( Seq(queryGeneSeq, IUPAC.ambiguous_dna), id=flankingMarkersPerGenes[gene]['name'], description='coords '+flankingMarkersPerGenes[gene]['chrom']+'_'+str(flankingMarkersPerGenes[gene]['start'])+'-'+str(flankingMarkersPerGenes[gene]['stop'])+', flanking markers '+ flankingMarkersPerGenes[gene]['markerPairUsed']) #print(querySeqRecord) queryFasta2blast = workingDir+'/query.fasta' SeqIO.write(querySeqRecord, queryFasta2blast, 'fasta') # 2) deal with target sequence: extract the fasta seq to use a db # fisrt check orientation: if int(markerCoords['upstream']['target']['start']) > int(markerCoords['downstream']['target']['stop']): # invert start and stop markerCoords['upstream']['target']['start'],markerCoords['downstream']['target']['stop'] = markerCoords['downstream']['target']['stop'] , markerCoords['upstream']['target']['start'] targetGenomeSeq=getFastaSeq(fasta=targetFasta, chrom=markerCoords['upstream']['target']['chrom'], start=int(markerCoords['upstream']['target']['start']), stop=int(markerCoords['downstream']['target']['stop'])) targetSeqRecord= SeqRecord( Seq(targetGenomeSeq, IUPAC.ambiguous_dna), id='target_'+markerCoords['upstream']['target']['chrom']+'_'+str(markerCoords['upstream']['target']['start'])+'_'+str(markerCoords['downstream']['target']['stop'])) targetFasta2blast = workingDir+'/target.fasta' SeqIO.write(targetSeqRecord, targetFasta2blast, 'fasta') #print(targetSeqRecord) """ run Blat of the genomic sequence on the target region on new assembly ============================================================================================================ """ blatResultFile=runBlat(db=targetFasta2blast, query=queryFasta2blast, blatResult=workingDir+'/blat.txt', outFormat='psl') """ check if the genomic align perfectly on the new reference ============================================================================================================ """ blatresult=checkPerfectHit(blatresult=blatResultFile, workingDir=workingDir, maxhit=1) if blatresult['blatStatus'] == 'unmapped': print(" No BLAT results found for gene {}".format(gene)) outputSummaryLine = createOutputLine(blat=blatresult, gene=flankingMarkersPerGenes[gene], markers=markerCoords ) else: print(" BLAT result found for gene {} with status {}".format(gene,blatresult['blatStatus'])) #print(blatresult) outputSummaryLine = createOutputLine(blat=blatresult, gene=flankingMarkersPerGenes[gene], markers=markerCoords ) if blatresult['blatStatus'] == 'fullPerfectMatch': os.remove(queryFasta2blast) os.remove(targetFasta2blast) #input() def createOutputLine(blat,gene,markers): print("output line") outputArray=[gene['name'],gene['markerPairUsed'],blat['blatStatus']] blatArray=[gene['name'],gene['markerPairUsed'],blat['blatStatus']] # loop over gene data content for key,value in sorted(gene.items()): #print(" key: {} value: {}".format(key,value)) if key == '3prime' or key == '5prime': continue outputArray.append(str(value)) # loop over marker info upstreamMarkerInfo = [markers['upstream']['id'], markers['upstream']['query']['chrom']+':'+str(markers['upstream']['query']['start'])+'-'+str(markers['upstream']['query']['stop']), markers['upstream']['target']['chrom']+':'+str(markers['upstream']['target']['start'])+'-'+str(markers['upstream']['target']['stop'])] downstreamMarkerInfo = [markers['downstream']['id'], markers['downstream']['query']['chrom']+':'+str(markers['downstream']['query']['start'])+'-'+str(markers['downstream']['query']['stop']), markers['downstream']['target']['chrom']+':'+str(markers['downstream']['target']['start'])+'-'+str(markers['downstream']['target']['stop'])] outputArray+=upstreamMarkerInfo outputArray+=downstreamMarkerInfo # aggregate blat results for key,value in sorted(blat.items()): #print(" key: {} value: {}".format(key,value)) blatArray.append(str(value)) print('\t'.join(outputArray)) print('\t'.join(blatArray)) outputTableFH.write('\t'.join(outputArray)+"\n") outputBlatFH.write('\t'.join(blatArray)+"\n") def generateMarkerPairs(array5prime, array3prime): """ Generate all pairs of markers flanking the gene to anchor This should be used when the first pair of flanking markers are not on the same target chromosome, Because when this hapen, we cannot anchor the gene on the target genome """ pairs=[] for up in array5prime: for down in array3prime: pairs.append(up+'|'+down) print(" all marker pairs: {}".format(pairs)) return pairs def getFlankFromBedtoolsClosest(upstream, downstream): print(" Storing flanking markers info from bedtools closest results") genesWithFlankISBP = defaultdict() markersOnQuery = defaultdict() with open(upstream) as upstreamresults: for line in upstreamresults.readlines(): (gchrom, gstart,gstop,gid,gscore,gstrand,mchrom,mstart,mstop,mid,mdistance) = line.rstrip("\n").split("\t") if mid == '.': mchrom,mstart,mstop,mid,mdistance = 'NA','NA','NA','NA','NA' if not gid in genesWithFlankISBP: genesWithFlankISBP[gid] = { 'name':gid, 'chrom':gchrom, 'start':gstart, 'stop':gstop, 'strand':gstrand, '5prime': [mid], '3prime':[] } else: genesWithFlankISBP[gid]['5prime'].append(mid) markersOnQuery[mid]={'start':mstart,'chrom':mchrom, 'stop':mstop} with open(downstream) as downstreamresults: for line in downstreamresults.readlines(): (gchrom, gstart,gstop,gid,gscore,gstrand,mchrom,mstart,mstop,mid,mdistance) = line.rstrip("\n").split("\t") if mid == '.': mchrom,mstart,mstop,mid,mdistance = 'NA','NA','NA','NA','NA' if gid in genesWithFlankISBP: genesWithFlankISBP[gid]['3prime'].append(mid) else: print(" Problem with gene {}: not already in the dict!!".format(gid)) markersOnQuery[mid]={'start':mstart,'chrom':mchrom, 'stop':mstop} return genesWithFlankISBP,markersOnQuery def checkPerfectHit(blatresult, workingDir, maxhit): print(" blatResults %s" % blatresult) hitIndex=0 blatResultDict = { 'blatStatus':'unmapped', 'cov':'NA', 'missmatches':'NA', 'indelBases':'NA', 'start':'NA', 'stop':'NA', 'strand':'NA', 'targetLength':'NA', 'targetName':'NA'} cov = '' missmatches = '' indelBases = '' if os.stat(blatresult).st_size == 0: sys.stdout.write(" Blat result file is empty!") else: sys.stdout.write(" Found Blat results to parse") with open(blatresult) as blat: for hit in blat.readlines(): hitIndex+=1 if hitIndex > maxhit: sys.stdout.write(" Tried more than {} hits: giving up for this gene".format(maxhit)) break print(" -> #%i Blat hit %s " % (hitIndex,hit)) pslData=hit.rstrip('\n').split('\t') """ calc the % coverage pc_cov = (num_matche + num_mm + num_N) / length(querySeq) * 100 add N count if we have some BLAT take into acount NN stretches with the '-extendThroughN' parameter. """ sumCov = int(pslData[0]) + int(pslData[1]) + int(pslData[3]) cov = ( sumCov / int(pslData[10])) * 100 print(" - Coverage: {}".format(cov)) """ get missmatches info """ missmatches = int(pslData[1]) pc_mm = (missmatches / sumCov) * 100 print(" - missmatches: {} ({} %)".format(missmatches, pc_mm)) """ get indels info based on the column 8 & 5: Number of bases inserted into target and query """ indelBases = int(pslData[7]) + int(pslData[5]) pc_indels = (indelBases / sumCov) * 100 print(" - indels: {} ({} %)".format(indelBases, pc_indels)) blatResultDict['cov'] = cov blatResultDict['missmatches'] = missmatches blatResultDict['indelBases'] = indelBases blatResultDict['start'] = pslData[15] blatResultDict['stop'] = pslData[16] blatResultDict['strand'] = pslData[8] blatResultDict['targetLength'] = pslData[14] blatResultDict['targetName'] = pslData[13] ##### decide if we remap or not the gene if int(float(cov)) < 100 or int(indelBases) > 0: print(" Gene {} has indel or is not fully covered: CANNOT ANCHOR IT\n".format(pslData[9])) blatResultDict['blatStatus'] = 'imperfectMatch' print(" we cannot anchor this gene\n") print(" adding mummer/nucmer alignment info to this gene\n") ### add more alignemnt info using mummer/nucmer # nucmer: perform the alignment nucmerPref=workingDir+'/nucmer' ref=workingDir+'/target.fasta' query=workingDir+'/query.fasta' os.system('nucmer -p {} {} {}'.format(nucmerPref,ref,query)) # mummerplot: generate the dot plot in png format mummerplotPref=workingDir+'/mummerplot' os.system('mummerplot --SNP -t png -p {} {}'.format(mummerplotPref,nucmerPref+'.delta')) ### add another blat format output for visualisation with ACT blatfile=workingDir+'/blast_m8.tab' os.system(' blat -extendThroughN -out=blast8 {} {} {}'.format(ref,query,blatfile)) run_act_file=workingDir+'/run_act.sh' os.system(' echo act query.fasta blast_m8.tab target.fasta > {}'.format(blatfile, run_act_file)) else: if missmatches == 0: print(" Gene {} is a perfect full match: anchoring can be done!\n".format(pslData[9])) blatResultDict['blatStatus'] = 'fullPerfectMatch' print(" we can anchor this gene") break else: print(" Gene {} has missmatches warning in anchoring!\n".format(pslData[9])) blatResultDict['blatStatus'] = 'fullMatchWithMissmatches' print(" we can anchor this gene") break return blatResultDict def runBlat(db,query,blatResult, outFormat): cmd='blat -noHead -extendThroughN -out='+outFormat+' '+db+' '+query+' '+blatResult print(' blat command to run: %s'% str(cmd)) os.system('blat -noHead -extendThroughN -out={} {} {} {}'.format(outFormat,db,query,blatResult)) return blatResult def getFastaSeq(fasta,chrom,start,stop): seq=fasta.fetch(reference=chrom, start=start,end=stop) print(" fasta sequence for chrom %s from %s to %s\n" %(chrom,start,stop)) #print(seq) return seq def getCoordsFromBed (bedfile): checkFile(file=bedfile) bedDict = defaultdict() with open(bedfile) as file: for line in file.readlines(): bed=line.rstrip("\n").split("\t") if len(bed) < 4: sys.stdout.write(" ERROR PARSING BEDFILE %s " % str(bedfile)) sys.stdout.write(" File do ot have 4 columns or is not tab delimited\n") sys.stdout.write(" Please check the format\n") sys.exit() #print(" bed array content ") #print(bed) bedDict[str(bed[3])] ={'chrom': bed[0],'start':bed[1],'stop':bed[2],'mapQ': bed[4],'strand':bed[5]} numRecords=len(bedDict.keys()) sys.stdout.write(" Found %i records in Bed file %s " % (numRecords, bedfile)) return bedDict def checkFile (file): if os.path.isfile(file): sys.stdout.write(" File %s found\n" % str(file)) else: sys.stdout.write(" ERROR: Cannot find file %s \n" % str(file)) sys.exit() def checkDir(directory): if os.path.isdir(directory): sys.stdout.write(" Directory %s found\n" % str(directory)) return 1 else: sys.stdout.write(" Cannot find directory %s \n" % str(directory)) return 0 if __name__== "__main__": if ( checkDir(directory=outputDir) ): sys.stdout.write( " Output directory already exists, no need to create it\n") else: sys.stdout.write( " Creating output directory %s" % str(outputDir)) os.mkdir( outputDir, 0o750 ) outputTableFH=open(outputTable_file, 'w') outputBlatFH=open(outputBlatFile, 'w') main()