#!/usr/bin/env python3 # coding: utf-8 import os.path from os import path from collections import defaultdict from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord from Bio.Alphabet import IUPAC from Bio import SeqIO import pysam import sys # command line arguments inputMarkersFlankingGenes = sys.argv[1] # table with mapped flanking ISBPs from genes outputDir = sys.argv[2] # output directory ... genomeQuery = sys.argv[3] # fasta file of the v1 genome (must be fai indexed) genomeTarget = sys.argv[4] # fasta file of the v2 genome (must bie fai indexed) markerPosOnTarget = sys.argv[5] # bed file of new coords of markers on target reference genome def main(): ### check for input files and output directory checkFile(file=inputMarkersFlankingGenes) if ( checkDir(directory=outputDir) ): sys.stdout.write( " Output directory already exists, no need to create it\n") else: sys.stdout.write( " Creating output directory %s" % str(outputDir)) os.mkdir( outputDir, 0o750 ) ### open for reading the query and target reference file queryFasta = pysam.FastaFile(genomeQuery) targetFasta = pysam.FastaFile(genomeTarget) ### save coords of marker on target reference markersOnTarget_dict = getCoordsFromBed(bedfile=markerPosOnTarget) ### read input file containing the flanking IBSP markers for each gene ##### variables numlines=0 index_result_dir=0 prefixResultDir=outputDir+'/temp/'+str(index_result_dir).zfill(6) ##### output files with open(inputMarkersFlankingGenes) as file: for line in file.readlines(): # increment counters # because to avoid having a big result dir with +100k subdirs, # the scripts creates subdirs for every 1k genes numlines+=1 if numlines % 1000 == 0: index_result_dir+=1 prefixResultDir=outputDir+'/'+str(index_result_dir).zfill(6) # define the working dir for the current gene analysis workingDir=prefixResultDir+'/'+str(numlines).zfill(6) os.makedirs(workingDir, 0o750) # deal with data lineArray = line.rstrip("\n").split("\t") print("\n\n------------------------------------------") print(" Results of gene %s going into subdir %s" % (lineArray[0], str(workingDir))) print(" array of data: ") print(lineArray) # check if both anchors are mapped on the same chromosome geneDict={ 'geneId': lineArray[0], 'geneChrom':lineArray[1], 'geneStart':lineArray[2], 'geneStop':lineArray[3], 'geneScore':lineArray[4], 'geneStrand':lineArray[5], 'marker5pChrom':lineArray[6], 'marker5pStart':lineArray[7], 'marker5pStop':lineArray[8], 'marker5pId':lineArray[9], 'marker5pDistance':lineArray[10], 'marker3pChrom':lineArray[11], 'marker3pStart':lineArray[12], 'marker3pStop':lineArray[13], 'marker3pId':lineArray[14], 'marker3pDistance':lineArray[15], 'regionLengthOnQuery':lineArray[16] } print(" Gene directory:") print(geneDict) if geneDict['marker5pChrom'] == '.' or geneDict['marker3pChrom'] == '.': # deal afterward with genes with one anchor missing (basecally start and end of chromosomes) # TODO sys.stderr.write(" current gene %s has missing anchor on one side \n" % geneDict['geneId']) continue # get mapping of the flanking anchors on the target genome anchor5pTargetCoods_dict=markersOnTarget_dict[geneDict['marker5pId']] anchor3pTargetCoods_dict=markersOnTarget_dict[geneDict['marker3pId']] print(" anchors 5prime:\n") print(anchor5pTargetCoods_dict) print(" anchors 3prime:\n") print(anchor3pTargetCoods_dict) if anchor5pTargetCoods_dict['chrom'] == anchor3pTargetCoods_dict['chrom']: print(" both anchors are mapped on the same chrom. we can proceed to sequence extraction and mapping\n") ### deal with query sequence queryGenomeSeq=getFastaSeq(fasta=queryFasta, chrom=geneDict['marker5pChrom'], start=int(geneDict['marker5pStart']), stop=int(geneDict['marker3pStart'])) querySeqRecord= SeqRecord( Seq(queryGenomeSeq, IUPAC.ambiguous_dna), id='query_'+str(geneDict['marker5pChrom'])+'_'+str(geneDict['marker5pStart'])+'-'+str(geneDict['marker3pStart']), description='target gene '+geneDict['geneId']+', flanking markers '+ geneDict['marker5pId']+'-'+geneDict['marker3pId']) print(querySeqRecord) queryFasta2blast = workingDir+'/query.fasta' SeqIO.write(querySeqRecord, queryFasta2blast, 'fasta') ### deal with target sequence targetGenomeSeq=getFastaSeq(fasta=targetFasta, chrom=anchor5pTargetCoods_dict['chrom'], start=int(anchor5pTargetCoods_dict['start']), stop=int(anchor3pTargetCoods_dict['stop'])) else: sys.stderr.write(" 5prime anchor %s and 3prime anchor %s are not on the same chromsome\n" % (geneDict['marker5pId'],geneDict['marker3pId'])) sys.stderr.write(" Cannot anchoring gene %s \n" % geneDict['geneId']) continue input() def getFastaSeq(fasta,chrom,start,stop): seq=fasta.fetch(reference=chrom, start=start,end=stop) print(" fasta sequence for chrom %s from %s to %s:\n" %(chrom,start,stop)) print(seq) return seq def getCoordsFromBed (bedfile): checkFile(file=bedfile) bedDict = defaultdict() with open(bedfile) as file: for line in file.readlines(): bed=line.rstrip("\n").split("\t") if len(bed) < 4: sys.stderr.write(" ERROR PARSING BEDFILE %s " % str(bedfile)) sys.stderr.write(" File do ot have 4 columns or is not tab delimited\n") sys.stderr.write(" Please check the format\n") sys.exit() #print(" bed array content ") #print(bed) bedDict[str(bed[3])] ={'chrom': bed[0],'start':bed[1],'stop':bed[2],'mapQ': bed[4],'strand':bed[5]} numRecords=len(bedDict.keys()) sys.stdout.write(" Found %i records in Bed file %s " % (numRecords, bedfile)) return bedDict def checkFile (file): if os.path.isfile(file): sys.stdout.write(" File %s found\n" % str(file)) else: sys.stderr.write(" ERROR: Cannot find file %s \n" % str(file)) sys.exit() def checkDir(directory): if os.path.isdir(directory): sys.stdout.write(" Directory %s found\n" % str(directory)) return 1 else: sys.stderr.write(" Cannot find directory %s \n" % str(directory)) return 0 if __name__== "__main__": main()