Skip to content
Snippets Groups Projects
microMappingPipeline.py 11.8 KiB
Newer Older
Helene Rimbert's avatar
Helene Rimbert committed
#!/usr/bin/env python3
# coding: utf-8
Helene Rimbert's avatar
Helene Rimbert committed
import os.path
from os import path
from collections import defaultdict
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio import SeqIO
Helene Rimbert's avatar
Helene Rimbert committed
import pysam
import sys

#### command line arguments
Helene Rimbert's avatar
Helene Rimbert committed
inputMarkersFlankingGenes = sys.argv[1] # table with mapped flanking ISBPs from genes
outputDir = sys.argv[2] # output directory ...
genomeQuery = sys.argv[3] # fasta file of the v1 genome (must be fai indexed)
genomeTarget = sys.argv[4] # fasta file of the v2 genome (must be fai indexed)
markerPosOnTarget = sys.argv[5] # bed file of new coords of markers on target reference genome

#### generic data structure
# dict with summary of blat alignements: target region, coverage, indels, missmatches, anchoring status
blatResults = defaultdict()


Helene Rimbert's avatar
Helene Rimbert committed
def main():

	### check for input files and output directory
	checkFile(file=inputMarkersFlankingGenes)
	if ( checkDir(directory=outputDir) ):
		sys.stdout.write( " Output directory already exists, no need to create it\n")

	else:
Helene Rimbert's avatar
Helene Rimbert committed
		sys.stdout.write( " Creating output directory %s" % str(outputDir))
		os.mkdir( outputDir, 0o750 )

	### open for reading the query and target reference file
	queryFasta = pysam.FastaFile(genomeQuery)
	targetFasta = pysam.FastaFile(genomeTarget)

	### save coords of marker on target reference
	markersOnTarget_dict = getCoordsFromBed(bedfile=markerPosOnTarget)
Helene Rimbert's avatar
Helene Rimbert committed

	### read input file containing the flanking IBSP markers for each gene
	##### variables
	numlines=0
	index_result_dir=0
	prefixResultDir=outputDir+'/temp/'+str(index_result_dir).zfill(6)

	##### output files
	with open(inputMarkersFlankingGenes) as file:
		for line in file.readlines():
			# increment counters
			# because to avoid having a big result dir with +100k subdirs, 
			# the scripts creates subdirs for every 1k genes
			numlines+=1
			if numlines % 1000 == 0:
				index_result_dir+=1
				prefixResultDir=outputDir+'/temp/'+str(index_result_dir).zfill(6)

			# define the working dir for the current gene analysis
			workingDir=prefixResultDir+'/'+str(numlines).zfill(6)
			os.makedirs(workingDir, 0o750)
			
			# deal with data
			lineArray = line.rstrip("\n").split("\t")
			print("\n\n------------------------------------------")
			print(" Results of gene %s going into subdir %s" % (lineArray[0], str(workingDir)))
			print(" array of data: ")
			print(lineArray)

			# check if both anchors are mapped on the same chromosome
			geneDict={ 'geneId': lineArray[0],
						'geneChrom':lineArray[1],
						'geneStart':lineArray[2],
						'geneStop':lineArray[3],
						'geneScore':lineArray[4],
						'geneStrand':lineArray[5],
						'marker5pChrom':lineArray[6],
						'marker5pStart':lineArray[7],
						'marker5pStop':lineArray[8],
						'marker5pId':lineArray[9],
						'marker5pDistance':lineArray[10],
						'marker3pChrom':lineArray[11],
						'marker3pStart':lineArray[12],
						'marker3pStop':lineArray[13],
						'marker3pId':lineArray[14],
						'marker3pDistance':lineArray[15],
						'regionLengthOnQuery':lineArray[16]
					}
			print(" Gene directory:")
			print(geneDict)

			anchor5pTargetCoods_dict= {'chrom': 0, 'start':0, 'stop':0}
			anchor3pTargetCoods_dict= {}

			if geneDict['marker5pChrom'] == '.' or geneDict['marker3pChrom'] == '.':
				# deal with genes with one anchor missing (basecally start and end of chromosomes)
				sys.stderr.write(" current gene %s has missing anchor on one side \n" % geneDict['geneId'])

				if geneDict['marker5pId'] == '.':
					print(' Missing 5prime anchor: using genomic sequence from the start of chromosome')
					geneDict['marker5pChrom'] = geneDict['marker3pChrom'].replace('chr','Chr')
					geneDict['marker5pStart'] = 0
					geneDict['marker5pStop'] = 0
					geneDict['marker5pId'] = geneDict['marker5pChrom']
					anchor5pTargetCoods_dict.update(chrom= geneDict['marker5pChrom'], start= 1, stop=1)
					anchor3pTargetCoods_dict.update(markersOnTarget_dict[geneDict['marker3pId']])

				elif geneDict['marker3pId'] == '.':
					print(' Missing 3prime anchor: using genomic sequence to the end of chromosome')
					geneDict['marker3pChrom'] = geneDict['marker5pChrom'].replace('chr','Chr')
					geneDict['marker3pStart'] = targetFasta.get_reference_length(geneDict['marker3pChrom'])
					geneDict['marker3pStop'] = targetFasta.get_reference_length(geneDict['marker3pChrom'])
					geneDict['marker3pId'] = geneDict['marker3pChrom']
					anchor3pTargetCoods_dict.update(chrom= geneDict['marker3pChrom'], start=geneDict['marker3pStart'], stop=geneDict['marker3pStop'])
					anchor5pTargetCoods_dict.update(markersOnTarget_dict[geneDict['marker5pId']])
				else:
					print(" Problem with the anchors: canno find 5prime and 3prime anchors for the gene {}".format(geneDict['geneId']))
			else:
				# get mapping of the flanking anchors on the target genome
				anchor5pTargetCoods_dict.update(markersOnTarget_dict[geneDict['marker5pId']])
				anchor3pTargetCoods_dict.update(markersOnTarget_dict[geneDict['marker3pId']])

			print(" anchors 5prime:\n")
			print(anchor5pTargetCoods_dict)
			print(" anchors 3prime:\n")
			print(anchor3pTargetCoods_dict)

			if anchor5pTargetCoods_dict['chrom'] == anchor3pTargetCoods_dict['chrom']:
				print(" both anchors are mapped on the same chrom. we can proceed to sequence extraction and mapping\n")
				### deal with query sequence: extract the fasta seq to map
				queryGeneSeq=getFastaSeq(fasta=queryFasta, 
					chrom=geneDict['geneChrom'], 
					start=int(geneDict['geneStart']), 
					stop=int(geneDict['geneStop']))
				querySeqRecord= SeqRecord(
									Seq(queryGeneSeq, IUPAC.ambiguous_dna), 
									id=geneDict['geneId'],
									description='coords '+geneDict['geneChrom']+'_'+str(geneDict['geneStart'])+'-'+str(geneDict['geneStop'])+', flanking markers '+ geneDict['marker5pId']+'-'+geneDict['marker3pId'])
				#print(querySeqRecord)
				queryFasta2blast = workingDir+'/query.fasta'
				SeqIO.write(querySeqRecord, queryFasta2blast, 'fasta')

				### deal with target sequence: extract the fasta seq to use a db
				# fisrt check orientation:
				if anchor5pTargetCoods_dict['start'] > anchor3pTargetCoods_dict['stop']:
					# invert start and stop
					anchor5pTargetCoods_dict['start'],anchor3pTargetCoods_dict['stop'] = anchor3pTargetCoods_dict['stop'],anchor5pTargetCoods_dict['start']
					
				targetGenomeSeq=getFastaSeq(fasta=targetFasta, 
					chrom=anchor5pTargetCoods_dict['chrom'], 
					start=int(anchor5pTargetCoods_dict['start']), 
					stop=int(anchor3pTargetCoods_dict['stop']))
				targetSeqRecord= SeqRecord(
									Seq(targetGenomeSeq, IUPAC.ambiguous_dna), 
									id='target_'+anchor5pTargetCoods_dict['chrom']+'_'+str(anchor5pTargetCoods_dict['start'])+'_'+str(anchor3pTargetCoods_dict['stop']))
				targetFasta2blast = workingDir+'/target.fasta'
				SeqIO.write(targetSeqRecord, targetFasta2blast, 'fasta')

				################### run Blat of the genomic sequence on the target region on new assembly
				blatResultFile=runBlat(db=targetFasta2blast, 
					query=queryFasta2blast, 
					blatResult=workingDir+'/blat.txt', 
					outFormat='psl')

				############ check if the genomic align perfectly on the new reference
				checkPerfectHit(blatresult=blatResultFile, 
					workingDir=workingDir, 
					maxhit=1)

				os.remove(queryFasta2blast)
				os.remove(targetFasta2blast)
				sys.stderr.write(" 5prime anchor %s and 3prime anchor %s are not on the same chromsome\n" % (geneDict['marker5pId'],geneDict['marker3pId']))
				sys.stderr.write(" Cannot anchoring gene %s \n" % geneDict['geneId'])
				


			# #input()
			# if numlines == 150:
			# 	break

def checkPerfectHit(blatresult, workingDir, maxhit):
	print(" blatResults %s" % blatresult)
	hitIndex=0
	with open(blatresult) as blat:
		for hit in blat.readlines():
			hitIndex+=1
			if hitIndex > maxhit:
				sys.stderr.write(" Tried more than {} hits: giving up for this gene".format(maxhit))
				break
			print("    -> #%i Blat hit %s " % (hitIndex,hit))
				
			if parsePSL(pslRecord=hit) :
				print(" we can anchor this gene")
				break
				
			else:
				print(" we cannot anchor this gene\n")
				print(" adding mummer/nucmer alignment info to this gene\n")

				### add more alignemnt info using mummer/nucmer
				# nucmer: perform the alignment
				nucmerPref=workingDir+'/nucmer'
				ref=workingDir+'/target.fasta'
				query=workingDir+'/query.fasta'
				os.system('nucmer -p {} {} {}'.format(nucmerPref,ref,query))

				# mummerplot: generate the dot plot in png format
				mummerplotPref=workingDir+'/mummerplot'
				os.system('mummerplot --SNP -t png -p {} {}'.format(mummerplotPref,nucmerPref+'.delta'))

				# dnadiff: show potential snps and gaps
				dnadiffPref=workingDir+'/dnadiff'
				os.system('dnadiff -p {} -d {}'.format(dnadiffPref,nucmerPref+'.delta'))

				### add another blat format output for visualisation with ACT
				blatfile=workingDir+'/blast_m8.tab'
				os.system(' blat -extendThroughN -out=blast8 {} {} {}'.format(ref,query,blatfile))
				run_act_file=workingDir+'/run_act.sh'
				os.system(' echo act query.fasta blast_m8.tab target.fasta > {}'.format(blatfile, run_act_file))
				



def parsePSL(pslRecord):
	cov = ''
	missmatches = ''
	indels = ''
	pslData=pslRecord.rstrip('\n').split('\t')

	##### calc the % coverage
	# pc_cov = (num_matche + num_mm + num_N) / length(querySeq) * 100
	# add N count if we have some
	# BLAT take into acount NN stretches with the '-extendThroughN' parameter.
	sumCov = int(pslData[0]) + int(pslData[1]) + int(pslData[3])
	pc_cov = ( sumCov / int(pslData[10])) * 100 
	print("     - Coverage: {}".format(pc_cov))

	##### get missmatches info
	missmatches = int(pslData[1])
	pc_mm = (missmatches / sumCov) * 100
	print("     - missmatches: {} ({} %)".format(missmatches, pc_mm))

	##### get indels info
	# based on the column 8: Number of bases inserted into target
	indelBases = int(pslData[7])
	pc_indels = (indelBases / sumCov) * 100
	print("     - indels: {} ({} %)".format(indelBases, pc_indels))

	##### decide if we remap or not the gene
	if int(float(pc_cov)) < 100 or int(indelBases) > 0:
		print(" Gene {} has indel or is not fully covered: CANNOT ANCHOR IT\n".format(pslData[9]))
		return 0
	else:
		if missmatches == 0:
			print(" Gene {} is a perfect full match: anchoring can be done!\n".format(pslData[9]))
			return 1
		else:
			print(" Gene {} has missmatches warning in anchoring!\n".format(pslData[9]))
			return 1
def runBlat(db,query,blatResult, outFormat):
	cmd='blat -noHead -extendThroughN -out='+outFormat+' '+db+' '+query+' '+blatResult
	print(' blat command to run: %s'% str(cmd))
	os.system('blat -noHead -extendThroughN -out={} {} {} {}'.format(outFormat,db,query,blatResult))
	return blatResult

def getFastaSeq(fasta,chrom,start,stop):
	seq=fasta.fetch(reference=chrom, start=start,end=stop)
	print(" fasta sequence for chrom %s from %s to %s\n" %(chrom,start,stop))
	#print(seq)
def getCoordsFromBed (bedfile):
	checkFile(file=bedfile)
	bedDict = defaultdict()
	with open(bedfile) as file:
		for line in file.readlines():
			bed=line.rstrip("\n").split("\t")
			if len(bed) < 4:
				sys.stderr.write(" ERROR PARSING BEDFILE %s " % str(bedfile)) 
				sys.stderr.write(" File do ot have 4 columns or is not tab delimited\n")
				sys.stderr.write(" Please check the format\n")
				sys.exit()
			#print(" bed array content ")
			#print(bed)
			bedDict[str(bed[3])] ={'chrom': bed[0],'start':bed[1],'stop':bed[2],'mapQ': bed[4],'strand':bed[5]}
	numRecords=len(bedDict.keys())
	sys.stdout.write(" Found %i records in Bed file %s " % (numRecords, bedfile))
	return bedDict
Helene Rimbert's avatar
Helene Rimbert committed

def checkFile (file):
	if os.path.isfile(file):
		sys.stdout.write(" File %s found\n" % str(file))
	else:
		sys.stderr.write(" ERROR: Cannot find file %s \n" % str(file))
		sys.exit()

def checkDir(directory):
	if os.path.isdir(directory):
		sys.stdout.write(" Directory %s found\n" % str(directory))
Helene Rimbert's avatar
Helene Rimbert committed
	else:
		sys.stderr.write(" Cannot find directory %s \n" % str(directory))
		return 0
Helene Rimbert's avatar
Helene Rimbert committed


if __name__== "__main__":
	main()