microMappingPipeline.py

#!/usr/bin/env python3
# coding: utf-8
import os.path
from os import path
from collections import defaultdict
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio import SeqIO
import pysam
import sys

#### command line arguments
inputMarkersFlankingGenes = sys.argv[1] # table with mapped flanking ISBPs from genes
outputDir = sys.argv[2] # output directory ...
genomeQuery = sys.argv[3] # fasta file of the v1 genome (must be fai indexed)
genomeTarget = sys.argv[4] # fasta file of the v2 genome (must be fai indexed)
markerPosOnTarget = sys.argv[5] # bed file of new coords of markers on target reference genome


#### generic data structure
# dict with summary of blat alignements: target region, coverage, indels, missmatches, anchoring status
blatResults = defaultdict()


def main():

	### check for input files and output directory
	checkFile(file=inputMarkersFlankingGenes)
	if ( checkDir(directory=outputDir) ):
		sys.stdout.write( " Output directory already exists, no need to create it\n")

	else:
		sys.stdout.write( " Creating output directory %s" % str(outputDir))
		os.mkdir( outputDir, 0o750 )

	### open for reading the query and target reference file
	queryFasta = pysam.FastaFile(genomeQuery)
	targetFasta = pysam.FastaFile(genomeTarget)

	### save coords of marker on target reference
	markersOnTarget_dict = getCoordsFromBed(bedfile=markerPosOnTarget)

	### read input file containing the flanking IBSP markers for each gene
	##### variables
	numlines=0
	index_result_dir=0
	prefixResultDir=outputDir+'/temp/'+str(index_result_dir).zfill(6)

	##### output files
	with open(inputMarkersFlankingGenes) as file:
		for line in file.readlines():
			# increment counters
			# because to avoid having a big result dir with +100k subdirs, 
			# the scripts creates subdirs for every 1k genes
			numlines+=1
			if numlines % 1000 == 0:
				index_result_dir+=1
				prefixResultDir=outputDir+'/temp/'+str(index_result_dir).zfill(6)

			# define the working dir for the current gene analysis
			workingDir=prefixResultDir+'/'+str(numlines).zfill(6)
			os.makedirs(workingDir, 0o750)
			
			# deal with data
			lineArray = line.rstrip("\n").split("\t")
			print("\n\n------------------------------------------")
			print(" Results of gene %s going into subdir %s" % (lineArray[0], str(workingDir)))
			print(" array of data: ")
			print(lineArray)

			# check if both anchors are mapped on the same chromosome
			geneDict={ 'geneId': lineArray[0],
						'geneChrom':lineArray[1],
						'geneStart':lineArray[2],
						'geneStop':lineArray[3],
						'geneScore':lineArray[4],
						'geneStrand':lineArray[5],
						'marker5pChrom':lineArray[6],
						'marker5pStart':lineArray[7],
						'marker5pStop':lineArray[8],
						'marker5pId':lineArray[9],
						'marker5pDistance':lineArray[10],
						'marker3pChrom':lineArray[11],
						'marker3pStart':lineArray[12],
						'marker3pStop':lineArray[13],
						'marker3pId':lineArray[14],
						'marker3pDistance':lineArray[15],
						'regionLengthOnQuery':lineArray[16]
					}
			print(" Gene directory:")
			print(geneDict)

			anchor5pTargetCoods_dict= {'chrom': 0, 'start':0, 'stop':0}
			anchor3pTargetCoods_dict= {}

			if geneDict['marker5pChrom'] == '.' or geneDict['marker3pChrom'] == '.':
				# deal with genes with one anchor missing (basecally start and end of chromosomes)
				sys.stderr.write(" current gene %s has missing anchor on one side \n" % geneDict['geneId'])

				if geneDict['marker5pId'] == '.':
					print(' Missing 5prime anchor: using genomic sequence from the start of chromosome')
					geneDict['marker5pChrom'] = geneDict['marker3pChrom'].replace('chr','Chr')
					geneDict['marker5pStart'] = 0
					geneDict['marker5pStop'] = 0
					geneDict['marker5pId'] = geneDict['marker5pChrom']
					anchor5pTargetCoods_dict.update(chrom= geneDict['marker5pChrom'], start= 1, stop=1)
					anchor3pTargetCoods_dict.update(markersOnTarget_dict[geneDict['marker3pId']])

				elif geneDict['marker3pId'] == '.':
					print(' Missing 3prime anchor: using genomic sequence to the end of chromosome')
					geneDict['marker3pChrom'] = geneDict['marker5pChrom'].replace('chr','Chr')
					geneDict['marker3pStart'] = targetFasta.get_reference_length(geneDict['marker3pChrom'])
					geneDict['marker3pStop'] = targetFasta.get_reference_length(geneDict['marker3pChrom'])
					geneDict['marker3pId'] = geneDict['marker3pChrom']
					anchor3pTargetCoods_dict.update(chrom= geneDict['marker3pChrom'], start=geneDict['marker3pStart'], stop=geneDict['marker3pStop'])
					anchor5pTargetCoods_dict.update(markersOnTarget_dict[geneDict['marker5pId']])
				else:
					print(" Problem with the anchors: canno find 5prime and 3prime anchors for the gene {}".format(geneDict['geneId']))
			else:
				# get mapping of the flanking anchors on the target genome
				anchor5pTargetCoods_dict.update(markersOnTarget_dict[geneDict['marker5pId']])
				anchor3pTargetCoods_dict.update(markersOnTarget_dict[geneDict['marker3pId']])

			print(" anchors 5prime:\n")
			print(anchor5pTargetCoods_dict)
			print(" anchors 3prime:\n")
			print(anchor3pTargetCoods_dict)

			if anchor5pTargetCoods_dict['chrom'] == anchor3pTargetCoods_dict['chrom']:
				print(" both anchors are mapped on the same chrom. we can proceed to sequence extraction and mapping\n")
				### deal with query sequence: extract the fasta seq to map
				queryGeneSeq=getFastaSeq(fasta=queryFasta, 
					chrom=geneDict['geneChrom'], 
					start=int(geneDict['geneStart']), 
					stop=int(geneDict['geneStop']))
				querySeqRecord= SeqRecord(
									Seq(queryGeneSeq, IUPAC.ambiguous_dna), 
									id=geneDict['geneId'],
									description='coords '+geneDict['geneChrom']+'_'+str(geneDict['geneStart'])+'-'+str(geneDict['geneStop'])+', flanking markers '+ geneDict['marker5pId']+'-'+geneDict['marker3pId'])
				#print(querySeqRecord)
				queryFasta2blast = workingDir+'/query.fasta'
				SeqIO.write(querySeqRecord, queryFasta2blast, 'fasta')

				### deal with target sequence: extract the fasta seq to use a db
				# fisrt check orientation:
				if anchor5pTargetCoods_dict['start'] > anchor3pTargetCoods_dict['stop']:
					# invert start and stop
					anchor5pTargetCoods_dict['start'],anchor3pTargetCoods_dict['stop'] = anchor3pTargetCoods_dict['stop'],anchor5pTargetCoods_dict['start']
					
				targetGenomeSeq=getFastaSeq(fasta=targetFasta, 
					chrom=anchor5pTargetCoods_dict['chrom'], 
					start=int(anchor5pTargetCoods_dict['start']), 
					stop=int(anchor3pTargetCoods_dict['stop']))
				targetSeqRecord= SeqRecord(
									Seq(targetGenomeSeq, IUPAC.ambiguous_dna), 
									id='target_'+anchor5pTargetCoods_dict['chrom']+'_'+str(anchor5pTargetCoods_dict['start'])+'_'+str(anchor3pTargetCoods_dict['stop']))
				targetFasta2blast = workingDir+'/target.fasta'
				SeqIO.write(targetSeqRecord, targetFasta2blast, 'fasta')

				################### run Blat of the genomic sequence on the target region on new assembly
				blatResultFile=runBlat(db=targetFasta2blast, 
					query=queryFasta2blast, 
					blatResult=workingDir+'/blat.txt', 
					outFormat='psl')

				############ check if the genomic align perfectly on the new reference
				checkPerfectHit(blatresult=blatResultFile, 
					workingDir=workingDir, 
					maxhit=1)

				os.remove(queryFasta2blast)
				os.remove(targetFasta2blast)


			else:
				# TODO
				sys.stderr.write(" 5prime anchor %s and 3prime anchor %s are not on the same chromsome\n" % (geneDict['marker5pId'],geneDict['marker3pId']))
				sys.stderr.write(" Cannot anchoring gene %s \n" % geneDict['geneId'])
				

			# #input()
			# if numlines == 150:
			# 	break

def checkPerfectHit(blatresult, workingDir, maxhit):
	print(" blatResults %s" % blatresult)
	hitIndex=0
	with open(blatresult) as blat:
		for hit in blat.readlines():
			hitIndex+=1

			if hitIndex > maxhit:
				sys.stderr.write(" Tried more than {} hits: giving up for this gene".format(maxhit))
				break
			print("    -> #%i Blat hit %s " % (hitIndex,hit))
				
			if parsePSL(pslRecord=hit) :
				print(" we can anchor this gene")
				break
				
			else:
				print(" we cannot anchor this gene\n")
				print(" adding mummer/nucmer alignment info to this gene\n")

				### add more alignemnt info using mummer/nucmer
				# nucmer: perform the alignment
				nucmerPref=workingDir+'/nucmer'
				ref=workingDir+'/target.fasta'
				query=workingDir+'/query.fasta'
				os.system('nucmer -p {} {} {}'.format(nucmerPref,ref,query))

				# mummerplot: generate the dot plot in png format
				mummerplotPref=workingDir+'/mummerplot'
				os.system('mummerplot --SNP -t png -p {} {}'.format(mummerplotPref,nucmerPref+'.delta'))

				# dnadiff: show potential snps and gaps
				dnadiffPref=workingDir+'/dnadiff'
				os.system('dnadiff -p {} -d {}'.format(dnadiffPref,nucmerPref+'.delta'))

				### add another blat format output for visualisation with ACT
				blatfile=workingDir+'/blast_m8.tab'
				os.system(' blat -extendThroughN -out=blast8 {} {} {}'.format(ref,query,blatfile))
				run_act_file=workingDir+'/run_act.sh'
				os.system(' echo act query.fasta blast_m8.tab target.fasta > {}'.format(blatfile, run_act_file))
				

def parsePSL(pslRecord):
	cov = ''
	missmatches = ''
	indels = ''
	pslData=pslRecord.rstrip('\n').split('\t')

	##### calc the % coverage
	# pc_cov = (num_matche + num_mm + num_N) / length(querySeq) * 100
	# add N count if we have some
	# BLAT take into acount NN stretches with the '-extendThroughN' parameter.
	sumCov = int(pslData[0]) + int(pslData[1]) + int(pslData[3])
	pc_cov = ( sumCov / int(pslData[10])) * 100 
	print("     - Coverage: {}".format(pc_cov))

	##### get missmatches info
	missmatches = int(pslData[1])
	pc_mm = (missmatches / sumCov) * 100
	print("     - missmatches: {} ({} %)".format(missmatches, pc_mm))

	##### get indels info
	# based on the column 8: Number of bases inserted into target
	indelBases = int(pslData[7])
	pc_indels = (indelBases / sumCov) * 100
	print("     - indels: {} ({} %)".format(indelBases, pc_indels))

	##### decide if we remap or not the gene
	if int(float(pc_cov)) < 100 or int(indelBases) > 0:
		print(" Gene {} has indel or is not fully covered: CANNOT ANCHOR IT\n".format(pslData[9]))
		return 0
	else:
		if missmatches == 0:
			print(" Gene {} is a perfect full match: anchoring can be done!\n".format(pslData[9]))
			return 1
		else:
			print(" Gene {} has missmatches warning in anchoring!\n".format(pslData[9]))
			return 1

def runBlat(db,query,blatResult, outFormat):
	cmd='blat -noHead -extendThroughN -out='+outFormat+' '+db+' '+query+' '+blatResult
	print(' blat command to run: %s'% str(cmd))
	os.system('blat -noHead -extendThroughN -out={} {} {} {}'.format(outFormat,db,query,blatResult))
	return blatResult

def getFastaSeq(fasta,chrom,start,stop):
	seq=fasta.fetch(reference=chrom, start=start,end=stop)
	print(" fasta sequence for chrom %s from %s to %s\n" %(chrom,start,stop))
	#print(seq)
	return seq

def getCoordsFromBed (bedfile):
	checkFile(file=bedfile)
	bedDict = defaultdict()
	with open(bedfile) as file:
		for line in file.readlines():
			bed=line.rstrip("\n").split("\t")
			if len(bed) < 4:
				sys.stderr.write(" ERROR PARSING BEDFILE %s " % str(bedfile)) 
				sys.stderr.write(" File do ot have 4 columns or is not tab delimited\n")
				sys.stderr.write(" Please check the format\n")
				sys.exit()
			#print(" bed array content ")
			#print(bed)
			bedDict[str(bed[3])] ={'chrom': bed[0],'start':bed[1],'stop':bed[2],'mapQ': bed[4],'strand':bed[5]}

	numRecords=len(bedDict.keys())
	sys.stdout.write(" Found %i records in Bed file %s " % (numRecords, bedfile))
	return bedDict

def checkFile (file):
	if os.path.isfile(file):
		sys.stdout.write(" File %s found\n" % str(file))
	else:
		sys.stderr.write(" ERROR: Cannot find file %s \n" % str(file))
		sys.exit()

def checkDir(directory):
	if os.path.isdir(directory):
		sys.stdout.write(" Directory %s found\n" % str(directory))
		return 1
	else:
		sys.stderr.write(" Cannot find directory %s \n" % str(directory))
		return 0


if __name__== "__main__":
	main()