microMappingPipeline.py

#!/usr/bin/env python3
# coding: utf-8
import os.path
from os import path
from collections import defaultdict
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio import SeqIO
import pysam
import sys

#### command line arguments
#inputMarkersFlankingGenes = sys.argv[1] # table with mapped flanking ISBPs from genes
marker5prime = sys.argv[1] # output of bedtools closest for the upstreams markers (5 first markers, in case the closest is not on the same chromosome)
marker3prime = sys.argv[2] # output of bedtools closest for the downstreams markers (5 first markers, in case the closest is not on the same chromosome)
outputDir = sys.argv[3] # output directory ...
genomeQuery = sys.argv[4] # fasta file of the v1 genome (must be fai indexed)
genomeTarget = sys.argv[5] # fasta file of the v2 genome (must be fai indexed)
markerPosOnTarget = sys.argv[6] # bed file of new coords of markers on target reference genome


#### generic data structure
# dict with summary of blat alignements: target region, coverage, indels, missmatches, anchoring status
blatResults = defaultdict()
outputTable_file=outputDir+'/mappingSummary.csv'
outputBlatFile=outputDir+'/allBlat.csv'
mappingResults = defaultdict()

def main():

	### check for input files and output directory
	checkFile(file=marker5prime)
	checkFile(file=marker3prime)

	### open for reading the query and target reference file
	queryFasta = pysam.FastaFile(genomeQuery)
	targetFasta = pysam.FastaFile(genomeTarget)

	### save coords of marker on target reference
	markersOnTarget_dict = getCoordsFromBed(bedfile=markerPosOnTarget)

	### save list of flanking markers for each genes , and also their coordinates on the query genome
	flankingMarkersPerGenes,markersOnQuery_dict  = getFlankFromBedtoolsClosest(upstream=marker5prime, downstream = marker3prime)

	### read input file containing the flanking IBSP markers for each gene
	##### variables
	numlines=0
	index_result_dir=0
	prefixResultDir=outputDir+'/temp/'+str(index_result_dir).zfill(6)

	##### now loop over all genes in flankingMarkersPerGenes dict
	for gene in flankingMarkersPerGenes:
		numlines+=1
		if numlines % 1000 == 0:
			index_result_dir+=1
			prefixResultDir=outputDir+'/temp/'+str(index_result_dir).zfill(6)

		### define the working dir for the current gene analysis
		workingDir=prefixResultDir+'/'+str(numlines).zfill(6)
		os.makedirs(workingDir, 0o750)
			
		### deal with data output dir
		print("\n\n------------------------------------------")
		print(" Results of gene %s going into subdir %s" % (gene, str(workingDir)))
		# print(" gene data:")
		# print(flankingMarkersPerGenes[gene])

		##################################################################################################################################################################
		### Generate all paris of flanking markers and check if we can anchor the gene on target genome
		##################################################################################################################################################################
		# remember that for each gene, I have 5 closest markers saved into tuple,
		# I will try first pair and check if they are on the same chromosome or not ...
		markerPairs=generateMarkerPairs(
			array5prime=flankingMarkersPerGenes[gene]['5prime'],
			array3prime=flankingMarkersPerGenes[gene]['3prime'])

		### initialize dict with all coordinates of markers on query and target seq
		markerCoords=defaultdict()
		pairUsed = 'NA'
		markerPairStatus = 0

		### check if markers are mapped on same target chromosome
		for pair in markerPairs:
			upstreamMarker,downsteamMarker = pair.split('|')
			print(" check pair with upstream marker {} and downstream marker {}".format(upstreamMarker,downsteamMarker))

			if( upstreamMarker in markersOnTarget_dict) and (downsteamMarker in markersOnTarget_dict) and (markersOnTarget_dict[upstreamMarker]['chrom'] == markersOnTarget_dict[downsteamMarker]['chrom']):
				print(" Markers are on same chrom on target genome. this is good")
				markerCoords['upstream'] = {
					'id': upstreamMarker,
					'query': markersOnQuery_dict[upstreamMarker],
					'target': markersOnTarget_dict[upstreamMarker]
				}
				markerCoords['downstream'] = {
					'id': downsteamMarker,
					'query' : markersOnQuery_dict[downsteamMarker],
					'target': markersOnTarget_dict[downsteamMarker]
				}
				markerPairStatus = 1
				pairUsed = pair
				break

			elif upstreamMarker == 'NA' and downsteamMarker in markersOnTarget_dict:
				"""
				Here, we do not have any marker upstream:
				we use the start of the target chromosome from the downstream marker as coordinates
				"""
				print(" Missing marker upstream of gene")
				print(" using the start of the target chromosome of the downstream marker")
				markerCoords['upstream'] = {
					'id': 'start'+markersOnTarget_dict[downsteamMarker]['chrom'],
					'query': markersOnQuery_dict[upstreamMarker],
					'target': {
						'strand': '+',
						'mapQ':'NA',
						'stop':1,
						'start':0,
						'chrom':markersOnTarget_dict[downsteamMarker]['chrom']
					}
				}
				markerCoords['downstream'] = {
					'id': downsteamMarker,
					'query' : markersOnQuery_dict[downsteamMarker],
					'target': markersOnTarget_dict[downsteamMarker]
				}
				markerPairStatus = 1
				pairUsed = 'start'+markersOnTarget_dict[downsteamMarker]['chrom']+'|'+downsteamMarker
				break

			elif downsteamMarker == 'NA' and upstreamMarker in markersOnTarget_dict:
				"""
				Here, we do not have any marker downstream:
				we use the end of the target chromosome from the upstream marker as coordinates
				targetFasta.get_reference_length(geneDict['marker3pChrom'])
				"""
				print(" Missing marker downstream of gene")
				print(" using the end of the target chromosome of the upstream marker")
				markerCoords['upstream'] = {
					'id': upstreamMarker,
					'query': markersOnQuery_dict[upstreamMarker],
					'target': markersOnTarget_dict[upstreamMarker]
				}
				markerCoords['downstream'] = {
					'id': 'end'+markersOnTarget_dict[upstreamMarker]['chrom'],
					'query' : markersOnQuery_dict[downsteamMarker],
					'target': {
						'strand': '+',
						'mapQ':'NA',
						'stop':targetFasta.get_reference_length(markersOnTarget_dict[upstreamMarker]['chrom']),
						'start':targetFasta.get_reference_length(markersOnTarget_dict[upstreamMarker]['chrom']),
						'chrom':markersOnTarget_dict[upstreamMarker]['chrom']
					}
				}
				markerPairStatus = 1
				pairUsed = 'end'+markersOnTarget_dict[upstreamMarker]['chrom']+'|'+upstreamMarker
				break

			else:
				print(" Markers are not on the same target chromosome: {} and {}.".format(markersOnTarget_dict[upstreamMarker]['chrom'],markersOnTarget_dict[downsteamMarker]['chrom']))
				print(" trying with the next pair of markers")
				continue

		if markerPairStatus == 0:
			"""
			We could not identify a pair of flanking marker which map on the target genome
			This gene cannot be anchored
			we move on to the next gene
			"""
			print(" Could not identify a pair of marker to use to anchor the gene {}.".format(gene))
			print(" going to next gene")
			flankingMarkersPerGenes[gene]['markerPairStatus'] = 'noAnchorsOnTargetGenome'
			flankingMarkersPerGenes[gene]['markerPairUsed'] = pairUsed
			continue
		else:
			"""
			We found a pair of markers on the target genome to use as anchor for the gene
			We can extract the genomic region between these markers and use it as a target for blat
			"""
			print(" Pair of markers found to anchor the gene on the target genome")
			print(" try now to extract the genomic sequences of query gene and target genome")
			flankingMarkersPerGenes[gene]['markerPairStatus'] = 'markerPairedOnTarget'
			flankingMarkersPerGenes[gene]['markerPairUsed'] = pairUsed
			print(" marker IDs of flanking markers: {} ".format(pair))
			print(markerCoords)
			# print(" Generic gene info:")
			# print(flankingMarkersPerGenes[gene])

			"""
			 Extract fasta sequences of target and query genomic regions
			 ============================================================================================================
			"""

			# 1) deal with query sequence: extract the fasta seq to map
			queryGeneSeq=getFastaSeq(fasta=queryFasta, 
				chrom=flankingMarkersPerGenes[gene]['chrom'], 
				start=int(flankingMarkersPerGenes[gene]['start']),
				stop=int(flankingMarkersPerGenes[gene]['stop']))
			querySeqRecord= SeqRecord(
								Seq(queryGeneSeq, IUPAC.ambiguous_dna), 
								id=flankingMarkersPerGenes[gene]['name'],
								description='coords '+flankingMarkersPerGenes[gene]['chrom']+'_'+str(flankingMarkersPerGenes[gene]['start'])+'-'+str(flankingMarkersPerGenes[gene]['stop'])+', flanking markers '+ flankingMarkersPerGenes[gene]['markerPairUsed'])
			#print(querySeqRecord)
			queryFasta2blast = workingDir+'/query.fasta'
			SeqIO.write(querySeqRecord, queryFasta2blast, 'fasta')

			# 2) deal with target sequence: extract the fasta seq to use a db
			# fisrt check orientation:
			if int(markerCoords['upstream']['target']['start']) > int(markerCoords['downstream']['target']['stop']):
				# invert start and stop
				markerCoords['upstream']['target']['start'],markerCoords['downstream']['target']['stop'] = markerCoords['downstream']['target']['stop'] , markerCoords['upstream']['target']['start']
				
			targetGenomeSeq=getFastaSeq(fasta=targetFasta, 
				chrom=markerCoords['upstream']['target']['chrom'], 
				start=int(markerCoords['upstream']['target']['start']), 
				stop=int(markerCoords['downstream']['target']['stop']))
			targetSeqRecord= SeqRecord(
								Seq(targetGenomeSeq, IUPAC.ambiguous_dna), 
								id='target_'+markerCoords['upstream']['target']['chrom']+'_'+str(markerCoords['upstream']['target']['start'])+'_'+str(markerCoords['downstream']['target']['stop']))
			targetFasta2blast = workingDir+'/target.fasta'
			SeqIO.write(targetSeqRecord, targetFasta2blast, 'fasta')
			#print(targetSeqRecord)

			"""
			run Blat of the genomic sequence on the target region on new assembly
			============================================================================================================
			"""
			blatResultFile=runBlat(db=targetFasta2blast, 
				query=queryFasta2blast, 
				blatResult=workingDir+'/blat.txt', 
				outFormat='psl')

			"""
			check if the genomic align perfectly on the new reference
			============================================================================================================
			"""
			blatresult=checkPerfectHit(blatresult=blatResultFile, 
				workingDir=workingDir, 
				maxhit=1)

			if blatresult['blatStatus'] == 'unmapped':
				print(" No BLAT results found for gene {}".format(gene))
				outputSummaryLine = createOutputLine(blat=blatresult, gene=flankingMarkersPerGenes[gene], markers=markerCoords )

			else:
				print(" BLAT result found for gene {} with status {}".format(gene,blatresult['blatStatus']))
				#print(blatresult)
				outputSummaryLine = createOutputLine(blat=blatresult, gene=flankingMarkersPerGenes[gene], markers=markerCoords )

				if blatresult['blatStatus'] == 'fullPerfectMatch':
					os.remove(queryFasta2blast)
					os.remove(targetFasta2blast)

			#input()

def createOutputLine(blat,gene,markers):
	print("output line")

	outputArray=[gene['name'],gene['markerPairUsed'],blat['blatStatus']]
	blatArray=[gene['name'],gene['markerPairUsed'],blat['blatStatus']]

	# loop over gene data content
	for key,value in sorted(gene.items()):
		#print(" key: {} value: {}".format(key,value))
		if key == '3prime' or key == '5prime':
			continue 
		outputArray.append(str(value))

	# loop over marker info
	upstreamMarkerInfo = [markers['upstream']['id'],
		markers['upstream']['query']['chrom']+':'+str(markers['upstream']['query']['start'])+'-'+str(markers['upstream']['query']['stop']),
		markers['upstream']['target']['chrom']+':'+str(markers['upstream']['target']['start'])+'-'+str(markers['upstream']['target']['stop'])]

	downstreamMarkerInfo = [markers['downstream']['id'],
		markers['downstream']['query']['chrom']+':'+str(markers['downstream']['query']['start'])+'-'+str(markers['downstream']['query']['stop']),
		markers['downstream']['target']['chrom']+':'+str(markers['downstream']['target']['start'])+'-'+str(markers['downstream']['target']['stop'])]
	outputArray+=upstreamMarkerInfo
	outputArray+=downstreamMarkerInfo

	# aggregate blat results
	for key,value in sorted(blat.items()):
		#print(" key: {} value: {}".format(key,value))
		blatArray.append(str(value))

	print('\t'.join(outputArray))
	print('\t'.join(blatArray))
	outputTableFH.write('\t'.join(outputArray)+"\n")
	outputBlatFH.write('\t'.join(blatArray)+"\n")

def generateMarkerPairs(array5prime, array3prime):
	"""
	Generate all pairs of markers flanking the gene to anchor
	This should be used when the first pair of flanking markers are not on the same target chromosome,
	Because when this hapen, we cannot anchor the gene on the target genome
	"""
	pairs=[]
	for up in array5prime:
		for down in array3prime:

			pairs.append(up+'|'+down)
	print(" all marker pairs: {}".format(pairs))
	return pairs

def getFlankFromBedtoolsClosest(upstream, downstream):
	print(" Storing flanking markers info from bedtools closest results")
	genesWithFlankISBP = defaultdict()
	markersOnQuery = defaultdict()

	with open(upstream) as upstreamresults:
		for line in upstreamresults.readlines():
			(gchrom, gstart,gstop,gid,gscore,gstrand,mchrom,mstart,mstop,mid,mdistance) = line.rstrip("\n").split("\t")

			if mid == '.':
				mchrom,mstart,mstop,mid,mdistance = 'NA','NA','NA','NA','NA'
			
			if not gid in genesWithFlankISBP:
				genesWithFlankISBP[gid] = {
						'name':gid,
						'chrom':gchrom,
						'start':gstart,
						'stop':gstop,
						'strand':gstrand,
						'5prime': [mid],
						'3prime':[]
						}
			else:
				genesWithFlankISBP[gid]['5prime'].append(mid)

			markersOnQuery[mid]={'start':mstart,'chrom':mchrom, 'stop':mstop}

	with open(downstream) as downstreamresults:
		for line in downstreamresults.readlines():
			(gchrom, gstart,gstop,gid,gscore,gstrand,mchrom,mstart,mstop,mid,mdistance) = line.rstrip("\n").split("\t")

			if mid == '.':
				mchrom,mstart,mstop,mid,mdistance = 'NA','NA','NA','NA','NA'
			
			if gid in genesWithFlankISBP:
				genesWithFlankISBP[gid]['3prime'].append(mid)
			else:
				print(" Problem with gene {}: not already in the dict!!".format(gid))

			markersOnQuery[mid]={'start':mstart,'chrom':mchrom, 'stop':mstop}

	return genesWithFlankISBP,markersOnQuery

def checkPerfectHit(blatresult, workingDir, maxhit):
	print(" blatResults %s" % blatresult)
	hitIndex=0
	blatResultDict = {
			'blatStatus':'unmapped',
			'cov':'NA',
			'missmatches':'NA',
			'indelBases':'NA',
			'start':'NA',
			'stop':'NA',
			'strand':'NA',
			'targetLength':'NA',
			'targetName':'NA'}
	cov = ''
	missmatches = ''
	indelBases = ''

	if os.stat(blatresult).st_size == 0:
		sys.stdout.write(" Blat result file is empty!")

	else:
		sys.stdout.write(" Found Blat results to parse")

		with open(blatresult) as blat:
			for hit in blat.readlines():
				hitIndex+=1

				if hitIndex > maxhit:
					sys.stdout.write(" Tried more than {} hits: giving up for this gene".format(maxhit))
					break
				print("    -> #%i Blat hit %s " % (hitIndex,hit))

				pslData=hit.rstrip('\n').split('\t')

				"""
				calc the % coverage
				pc_cov = (num_matche + num_mm + num_N) / length(querySeq) * 100
				add N count if we have some
				BLAT take into acount NN stretches with the '-extendThroughN' parameter.
				"""
				sumCov = int(pslData[0]) + int(pslData[1]) + int(pslData[3])
				cov = ( sumCov / int(pslData[10])) * 100 
				print("     - Coverage: {}".format(cov))

				"""
				get missmatches info
				"""
				missmatches = int(pslData[1])
				pc_mm = (missmatches / sumCov) * 100
				print("     - missmatches: {} ({} %)".format(missmatches, pc_mm))

				"""
				get indels info
				based on the column 8 & 5: Number of bases inserted into target and query
				"""
				indelBases = int(pslData[7]) + int(pslData[5])
				pc_indels = (indelBases / sumCov) * 100
				print("     - indels: {} ({} %)".format(indelBases, pc_indels))

				blatResultDict['cov'] = cov
				blatResultDict['missmatches'] = missmatches
				blatResultDict['indelBases'] = indelBases
				blatResultDict['start'] = pslData[15]
				blatResultDict['stop'] = pslData[16]
				blatResultDict['strand'] = pslData[8]
				blatResultDict['targetLength'] = pslData[14]
				blatResultDict['targetName'] = pslData[13]

				##### decide if we remap or not the gene
				if int(float(cov)) < 100 or int(indelBases) > 0:
					print(" Gene {} has indel or is not fully covered: CANNOT ANCHOR IT\n".format(pslData[9]))
					blatResultDict['blatStatus'] = 'imperfectMatch'

					print(" we cannot anchor this gene\n")
					print(" adding mummer/nucmer alignment info to this gene\n")

					### add more alignemnt info using mummer/nucmer
					# nucmer: perform the alignment
					nucmerPref=workingDir+'/nucmer'
					ref=workingDir+'/target.fasta'
					query=workingDir+'/query.fasta'
					os.system('nucmer -p {} {} {}'.format(nucmerPref,ref,query))

					# mummerplot: generate the dot plot in png format
					mummerplotPref=workingDir+'/mummerplot'
					os.system('mummerplot --SNP -t png -p {} {}'.format(mummerplotPref,nucmerPref+'.delta'))

					### add another blat format output for visualisation with ACT
					blatfile=workingDir+'/blast_m8.tab'
					os.system(' blat -extendThroughN -out=blast8 {} {} {}'.format(ref,query,blatfile))
					run_act_file=workingDir+'/run_act.sh'
					os.system(' echo act query.fasta blast_m8.tab target.fasta > {}'.format(blatfile, run_act_file))


				else:
					if missmatches == 0:
						print(" Gene {} is a perfect full match: anchoring can be done!\n".format(pslData[9]))
						blatResultDict['blatStatus'] = 'fullPerfectMatch'
						print(" we can anchor this gene")
						break
						
					else:
						print(" Gene {} has missmatches warning in anchoring!\n".format(pslData[9]))
						blatResultDict['blatStatus'] = 'fullMatchWithMissmatches'
						print(" we can anchor this gene")
						break
	return blatResultDict

def runBlat(db,query,blatResult, outFormat):
	cmd='blat -noHead -extendThroughN -out='+outFormat+' '+db+' '+query+' '+blatResult
	print(' blat command to run: %s'% str(cmd))
	os.system('blat -noHead -extendThroughN -out={} {} {} {}'.format(outFormat,db,query,blatResult))
	return blatResult

def getFastaSeq(fasta,chrom,start,stop):
	seq=fasta.fetch(reference=chrom, start=start,end=stop)
	print(" fasta sequence for chrom %s from %s to %s\n" %(chrom,start,stop))
	#print(seq)
	return seq

def getCoordsFromBed (bedfile):
	checkFile(file=bedfile)
	bedDict = defaultdict()
	with open(bedfile) as file:
		for line in file.readlines():
			bed=line.rstrip("\n").split("\t")
			if len(bed) < 4:
				sys.stdout.write(" ERROR PARSING BEDFILE %s " % str(bedfile)) 
				sys.stdout.write(" File do ot have 4 columns or is not tab delimited\n")
				sys.stdout.write(" Please check the format\n")
				sys.exit()
			#print(" bed array content ")
			#print(bed)
			bedDict[str(bed[3])] ={'chrom': bed[0],'start':bed[1],'stop':bed[2],'mapQ': bed[4],'strand':bed[5]}

	numRecords=len(bedDict.keys())
	sys.stdout.write(" Found %i records in Bed file %s " % (numRecords, bedfile))
	return bedDict

def checkFile (file):
	if os.path.isfile(file):
		sys.stdout.write(" File %s found\n" % str(file))
	else:
		sys.stdout.write(" ERROR: Cannot find file %s \n" % str(file))
		sys.exit()

def checkDir(directory):
	if os.path.isdir(directory):
		sys.stdout.write(" Directory %s found\n" % str(directory))
		return 1
	else:
		sys.stdout.write(" Cannot find directory %s \n" % str(directory))
		return 0


if __name__== "__main__":
	if ( checkDir(directory=outputDir) ):
		sys.stdout.write( " Output directory already exists, no need to create it\n")

	else:
		sys.stdout.write( " Creating output directory %s" % str(outputDir))
		os.mkdir( outputDir, 0o750 )

	outputTableFH=open(outputTable_file, 'w')
	outputBlatFH=open(outputBlatFile, 'w')

	main()