microMappingPipeline.py

#!/usr/bin/env python3
# coding: utf-8
import os.path
from os import path
from collections import defaultdict
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio import SeqIO
import pysam
import sys

# command line arguments
inputMarkersFlankingGenes = sys.argv[1] # table with mapped flanking ISBPs from genes
outputDir = sys.argv[2] # output directory ...
genomeQuery = sys.argv[3] # fasta file of the v1 genome (must be fai indexed)
genomeTarget = sys.argv[4] # fasta file of the v2 genome (must bie fai indexed)
markerPosOnTarget = sys.argv[5] # bed file of new coords of markers on target reference genome

def main():

	### check for input files and output directory
	checkFile(file=inputMarkersFlankingGenes)
	if ( checkDir(directory=outputDir) ):
		sys.stdout.write( " Output directory already exists, no need to create it\n")

	else:
		sys.stdout.write( " Creating output directory %s" % str(outputDir))
		os.mkdir( outputDir, 0o750 )

	### open for reading the query and target reference file
	queryFasta = pysam.FastaFile(genomeQuery)
	targetFasta = pysam.FastaFile(genomeTarget)

	### save coords of marker on target reference
	markersOnTarget_dict = getCoordsFromBed(bedfile=markerPosOnTarget)

	### read input file containing the flanking IBSP markers for each gene
	##### variables
	numlines=0
	index_result_dir=0
	prefixResultDir=outputDir+'/temp/'+str(index_result_dir).zfill(6)

	##### output files
	with open(inputMarkersFlankingGenes) as file:
		for line in file.readlines():
			# increment counters
			# because to avoid having a big result dir with +100k subdirs, 
			# the scripts creates subdirs for every 1k genes
			numlines+=1
			if numlines % 1000 == 0:
				index_result_dir+=1
				prefixResultDir=outputDir+'/'+str(index_result_dir).zfill(6)

			# define the working dir for the current gene analysis
			workingDir=prefixResultDir+'/'+str(numlines).zfill(6)
			os.makedirs(workingDir, 0o750)
			
			# deal with data
			lineArray = line.rstrip("\n").split("\t")
			print("\n\n------------------------------------------")
			print(" Results of gene %s going into subdir %s" % (lineArray[0], str(workingDir)))
			print(" array of data: ")
			print(lineArray)

			# check if both anchors are mapped on the same chromosome
			geneDict={ 'geneId': lineArray[0],
						'geneChrom':lineArray[1],
						'geneStart':lineArray[2],
						'geneStop':lineArray[3],
						'geneScore':lineArray[4],
						'geneStrand':lineArray[5],
						'marker5pChrom':lineArray[6],
						'marker5pStart':lineArray[7],
						'marker5pStop':lineArray[8],
						'marker5pId':lineArray[9],
						'marker5pDistance':lineArray[10],
						'marker3pChrom':lineArray[11],
						'marker3pStart':lineArray[12],
						'marker3pStop':lineArray[13],
						'marker3pId':lineArray[14],
						'marker3pDistance':lineArray[15],
						'regionLengthOnQuery':lineArray[16]
					}
			print(" Gene directory:")
			print(geneDict)

			if geneDict['marker5pChrom'] == '.' or geneDict['marker3pChrom'] == '.':
				# deal afterward with genes with one anchor missing (basecally start and end of chromosomes)
				# TODO
				sys.stderr.write(" current gene %s has missing anchor on one side \n" % geneDict['geneId'])
				continue


			# get mapping of the flanking anchors on the target genome
			anchor5pTargetCoods_dict=markersOnTarget_dict[geneDict['marker5pId']]
			anchor3pTargetCoods_dict=markersOnTarget_dict[geneDict['marker3pId']]
			print(" anchors 5prime:\n")
			print(anchor5pTargetCoods_dict)
			print(" anchors 3prime:\n")
			print(anchor3pTargetCoods_dict)

			if anchor5pTargetCoods_dict['chrom'] == anchor3pTargetCoods_dict['chrom']:
				print(" both anchors are mapped on the same chrom. we can proceed to sequence extraction and mapping\n")
				### deal with query sequence
				queryGenomeSeq=getFastaSeq(fasta=queryFasta, chrom=geneDict['marker5pChrom'], start=int(geneDict['marker5pStart']), stop=int(geneDict['marker3pStart']))
				querySeqRecord= SeqRecord(
									Seq(queryGenomeSeq, IUPAC.ambiguous_dna), 
									id='query_'+str(geneDict['marker5pChrom'])+'_'+str(geneDict['marker5pStart'])+'-'+str(geneDict['marker3pStart']),
									description='target gene '+geneDict['geneId']+', flanking markers '+ geneDict['marker5pId']+'-'+geneDict['marker3pId'])
				print(querySeqRecord)
				queryFasta2blast = workingDir+'/query.fasta'
				SeqIO.write(querySeqRecord, queryFasta2blast, 'fasta')

				### deal with target sequence
				targetGenomeSeq=getFastaSeq(fasta=targetFasta, chrom=anchor5pTargetCoods_dict['chrom'], start=int(anchor5pTargetCoods_dict['start']), stop=int(anchor3pTargetCoods_dict['stop']))


			else:
				sys.stderr.write(" 5prime anchor %s and 3prime anchor %s are not on the same chromsome\n" % (geneDict['marker5pId'],geneDict['marker3pId']))
				sys.stderr.write(" Cannot anchoring gene %s \n" % geneDict['geneId'])
				continue


			input()

def getFastaSeq(fasta,chrom,start,stop):
	seq=fasta.fetch(reference=chrom, start=start,end=stop)
	print(" fasta sequence for chrom %s from %s to %s:\n" %(chrom,start,stop))
	print(seq)
	return seq

def getCoordsFromBed (bedfile):
	checkFile(file=bedfile)
	bedDict = defaultdict()
	with open(bedfile) as file:
		for line in file.readlines():
			bed=line.rstrip("\n").split("\t")
			if len(bed) < 4:
				sys.stderr.write(" ERROR PARSING BEDFILE %s " % str(bedfile)) 
				sys.stderr.write(" File do ot have 4 columns or is not tab delimited\n")
				sys.stderr.write(" Please check the format\n")
				sys.exit()
			#print(" bed array content ")
			#print(bed)
			bedDict[str(bed[3])] ={'chrom': bed[0],'start':bed[1],'stop':bed[2],'mapQ': bed[4],'strand':bed[5]}

	numRecords=len(bedDict.keys())
	sys.stdout.write(" Found %i records in Bed file %s " % (numRecords, bedfile))
	return bedDict

def checkFile (file):
	if os.path.isfile(file):
		sys.stdout.write(" File %s found\n" % str(file))
	else:
		sys.stderr.write(" ERROR: Cannot find file %s \n" % str(file))
		sys.exit()

def checkDir(directory):
	if os.path.isdir(directory):
		sys.stdout.write(" Directory %s found\n" % str(directory))
		return 1
	else:
		sys.stderr.write(" Cannot find directory %s \n" % str(directory))
		return 0


if __name__== "__main__":
	main()