Skip to content
Snippets Groups Projects
microMappingPipeline.py 6.09 KiB
Newer Older
Helene Rimbert's avatar
Helene Rimbert committed
#!/usr/bin/env python3
# coding: utf-8
Helene Rimbert's avatar
Helene Rimbert committed
import os.path
from os import path
from collections import defaultdict
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio import SeqIO
Helene Rimbert's avatar
Helene Rimbert committed
import pysam
import sys

# command line arguments
inputMarkersFlankingGenes = sys.argv[1] # table with mapped flanking ISBPs from genes
outputDir = sys.argv[2] # output directory ...
genomeQuery = sys.argv[3] # fasta file of the v1 genome (must be fai indexed)
genomeTarget = sys.argv[4] # fasta file of the v2 genome (must bie fai indexed)
markerPosOnTarget = sys.argv[5] # bed file of new coords of markers on target reference genome
Helene Rimbert's avatar
Helene Rimbert committed

def main():

	### check for input files and output directory
	checkFile(file=inputMarkersFlankingGenes)
	if ( checkDir(directory=outputDir) ):
		sys.stdout.write( " Output directory already exists, no need to create it\n")

	else:
Helene Rimbert's avatar
Helene Rimbert committed
		sys.stdout.write( " Creating output directory %s" % str(outputDir))
		os.mkdir( outputDir, 0o750 )

	### open for reading the query and target reference file
	queryFasta = pysam.FastaFile(genomeQuery)
	targetFasta = pysam.FastaFile(genomeTarget)

	### save coords of marker on target reference
	markersOnTarget_dict = getCoordsFromBed(bedfile=markerPosOnTarget)
Helene Rimbert's avatar
Helene Rimbert committed

	### read input file containing the flanking IBSP markers for each gene
	##### variables
	numlines=0
	index_result_dir=0
	prefixResultDir=outputDir+'/temp/'+str(index_result_dir).zfill(6)

	##### output files
	with open(inputMarkersFlankingGenes) as file:
		for line in file.readlines():
			# increment counters
			# because to avoid having a big result dir with +100k subdirs, 
			# the scripts creates subdirs for every 1k genes
			numlines+=1
			if numlines % 1000 == 0:
				index_result_dir+=1
				prefixResultDir=outputDir+'/'+str(index_result_dir).zfill(6)

			# define the working dir for the current gene analysis
			workingDir=prefixResultDir+'/'+str(numlines).zfill(6)
			os.makedirs(workingDir, 0o750)
			
			# deal with data
			lineArray = line.rstrip("\n").split("\t")
			print("\n\n------------------------------------------")
			print(" Results of gene %s going into subdir %s" % (lineArray[0], str(workingDir)))
			print(" array of data: ")
			print(lineArray)

			# check if both anchors are mapped on the same chromosome
			geneDict={ 'geneId': lineArray[0],
						'geneChrom':lineArray[1],
						'geneStart':lineArray[2],
						'geneStop':lineArray[3],
						'geneScore':lineArray[4],
						'geneStrand':lineArray[5],
						'marker5pChrom':lineArray[6],
						'marker5pStart':lineArray[7],
						'marker5pStop':lineArray[8],
						'marker5pId':lineArray[9],
						'marker5pDistance':lineArray[10],
						'marker3pChrom':lineArray[11],
						'marker3pStart':lineArray[12],
						'marker3pStop':lineArray[13],
						'marker3pId':lineArray[14],
						'marker3pDistance':lineArray[15],
						'regionLengthOnQuery':lineArray[16]
					}
			print(" Gene directory:")
			print(geneDict)

			if geneDict['marker5pChrom'] == '.' or geneDict['marker3pChrom'] == '.':
				# deal afterward with genes with one anchor missing (basecally start and end of chromosomes)
				# TODO
				sys.stderr.write(" current gene %s has missing anchor on one side \n" % geneDict['geneId'])
				continue


			# get mapping of the flanking anchors on the target genome
			anchor5pTargetCoods_dict=markersOnTarget_dict[geneDict['marker5pId']]
			anchor3pTargetCoods_dict=markersOnTarget_dict[geneDict['marker3pId']]
			print(" anchors 5prime:\n")
			print(anchor5pTargetCoods_dict)
			print(" anchors 3prime:\n")
			print(anchor3pTargetCoods_dict)

			if anchor5pTargetCoods_dict['chrom'] == anchor3pTargetCoods_dict['chrom']:
				print(" both anchors are mapped on the same chrom. we can proceed to sequence extraction and mapping\n")
				### deal with query sequence
				queryGenomeSeq=getFastaSeq(fasta=queryFasta, chrom=geneDict['marker5pChrom'], start=int(geneDict['marker5pStart']), stop=int(geneDict['marker3pStart']))
				querySeqRecord= SeqRecord(
									Seq(queryGenomeSeq, IUPAC.ambiguous_dna), 
									id='query_'+str(geneDict['marker5pChrom'])+'_'+str(geneDict['marker5pStart'])+'-'+str(geneDict['marker3pStart']),
									description='target gene '+geneDict['geneId']+', flanking markers '+ geneDict['marker5pId']+'-'+geneDict['marker3pId'])
				print(querySeqRecord)
				queryFasta2blast = workingDir+'/query.fasta'
				SeqIO.write(querySeqRecord, queryFasta2blast, 'fasta')

				### deal with target sequence
				targetGenomeSeq=getFastaSeq(fasta=targetFasta, chrom=anchor5pTargetCoods_dict['chrom'], start=int(anchor5pTargetCoods_dict['start']), stop=int(anchor3pTargetCoods_dict['stop']))


			else:
				sys.stderr.write(" 5prime anchor %s and 3prime anchor %s are not on the same chromsome\n" % (geneDict['marker5pId'],geneDict['marker3pId']))
				sys.stderr.write(" Cannot anchoring gene %s \n" % geneDict['geneId'])
				continue


			input()

def getFastaSeq(fasta,chrom,start,stop):
	seq=fasta.fetch(reference=chrom, start=start,end=stop)
	print(" fasta sequence for chrom %s from %s to %s:\n" %(chrom,start,stop))
	print(seq)
	return seq
def getCoordsFromBed (bedfile):
	checkFile(file=bedfile)
	bedDict = defaultdict()
	with open(bedfile) as file:
		for line in file.readlines():
			bed=line.rstrip("\n").split("\t")
			if len(bed) < 4:
				sys.stderr.write(" ERROR PARSING BEDFILE %s " % str(bedfile)) 
				sys.stderr.write(" File do ot have 4 columns or is not tab delimited\n")
				sys.stderr.write(" Please check the format\n")
				sys.exit()
			#print(" bed array content ")
			#print(bed)
			bedDict[str(bed[3])] ={'chrom': bed[0],'start':bed[1],'stop':bed[2],'mapQ': bed[4],'strand':bed[5]}
	numRecords=len(bedDict.keys())
	sys.stdout.write(" Found %i records in Bed file %s " % (numRecords, bedfile))
	return bedDict
Helene Rimbert's avatar
Helene Rimbert committed

def checkFile (file):
	if os.path.isfile(file):
		sys.stdout.write(" File %s found\n" % str(file))
	else:
		sys.stderr.write(" ERROR: Cannot find file %s \n" % str(file))
		sys.exit()

def checkDir(directory):
	if os.path.isdir(directory):
		sys.stdout.write(" Directory %s found\n" % str(directory))
Helene Rimbert's avatar
Helene Rimbert committed
	else:
		sys.stderr.write(" Cannot find directory %s \n" % str(directory))
		return 0
Helene Rimbert's avatar
Helene Rimbert committed


if __name__== "__main__":
	main()