Newer
Older
from collections import defaultdict
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio import SeqIO
import pysam
import sys
# command line arguments
inputMarkersFlankingGenes = sys.argv[1] # table with mapped flanking ISBPs from genes
outputDir = sys.argv[2] # output directory ...
genomeQuery = sys.argv[3] # fasta file of the v1 genome (must be fai indexed)
genomeTarget = sys.argv[4] # fasta file of the v2 genome (must bie fai indexed)
markerPosOnTarget = sys.argv[5] # bed file of new coords of markers on target reference genome
def main():
### check for input files and output directory
checkFile(file=inputMarkersFlankingGenes)
if ( checkDir(directory=outputDir) ):
sys.stdout.write( " Output directory already exists, no need to create it\n")
else:
sys.stdout.write( " Creating output directory %s" % str(outputDir))
os.mkdir( outputDir, 0o750 )
### open for reading the query and target reference file
queryFasta = pysam.FastaFile(genomeQuery)
targetFasta = pysam.FastaFile(genomeTarget)
### save coords of marker on target reference
markersOnTarget_dict = getCoordsFromBed(bedfile=markerPosOnTarget)
### read input file containing the flanking IBSP markers for each gene
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
##### variables
numlines=0
index_result_dir=0
prefixResultDir=outputDir+'/temp/'+str(index_result_dir).zfill(6)
##### output files
with open(inputMarkersFlankingGenes) as file:
for line in file.readlines():
# increment counters
# because to avoid having a big result dir with +100k subdirs,
# the scripts creates subdirs for every 1k genes
numlines+=1
if numlines % 1000 == 0:
index_result_dir+=1
prefixResultDir=outputDir+'/'+str(index_result_dir).zfill(6)
# define the working dir for the current gene analysis
workingDir=prefixResultDir+'/'+str(numlines).zfill(6)
os.makedirs(workingDir, 0o750)
# deal with data
lineArray = line.rstrip("\n").split("\t")
print("\n\n------------------------------------------")
print(" Results of gene %s going into subdir %s" % (lineArray[0], str(workingDir)))
print(" array of data: ")
print(lineArray)
# check if both anchors are mapped on the same chromosome
geneDict={ 'geneId': lineArray[0],
'geneChrom':lineArray[1],
'geneStart':lineArray[2],
'geneStop':lineArray[3],
'geneScore':lineArray[4],
'geneStrand':lineArray[5],
'marker5pChrom':lineArray[6],
'marker5pStart':lineArray[7],
'marker5pStop':lineArray[8],
'marker5pId':lineArray[9],
'marker5pDistance':lineArray[10],
'marker3pChrom':lineArray[11],
'marker3pStart':lineArray[12],
'marker3pStop':lineArray[13],
'marker3pId':lineArray[14],
'marker3pDistance':lineArray[15],
'regionLengthOnQuery':lineArray[16]
}
print(" Gene directory:")
print(geneDict)
if geneDict['marker5pChrom'] == '.' or geneDict['marker3pChrom'] == '.':
# deal afterward with genes with one anchor missing (basecally start and end of chromosomes)
# TODO
sys.stderr.write(" current gene %s has missing anchor on one side \n" % geneDict['geneId'])
continue
# get mapping of the flanking anchors on the target genome
anchor5pTargetCoods_dict=markersOnTarget_dict[geneDict['marker5pId']]
anchor3pTargetCoods_dict=markersOnTarget_dict[geneDict['marker3pId']]
print(" anchors 5prime:\n")
print(anchor5pTargetCoods_dict)
print(" anchors 3prime:\n")
print(anchor3pTargetCoods_dict)
if anchor5pTargetCoods_dict['chrom'] == anchor3pTargetCoods_dict['chrom']:
print(" both anchors are mapped on the same chrom. we can proceed to sequence extraction and mapping\n")
### deal with query sequence
queryGenomeSeq=getFastaSeq(fasta=queryFasta, chrom=geneDict['marker5pChrom'], start=int(geneDict['marker5pStart']), stop=int(geneDict['marker3pStart']))
querySeqRecord= SeqRecord(
Seq(queryGenomeSeq, IUPAC.ambiguous_dna),
id='query_'+str(geneDict['marker5pChrom'])+'_'+str(geneDict['marker5pStart'])+'-'+str(geneDict['marker3pStart']),
description='target gene '+geneDict['geneId']+', flanking markers '+ geneDict['marker5pId']+'-'+geneDict['marker3pId'])
print(querySeqRecord)
queryFasta2blast = workingDir+'/query.fasta'
SeqIO.write(querySeqRecord, queryFasta2blast, 'fasta')
### deal with target sequence
targetGenomeSeq=getFastaSeq(fasta=targetFasta, chrom=anchor5pTargetCoods_dict['chrom'], start=int(anchor5pTargetCoods_dict['start']), stop=int(anchor3pTargetCoods_dict['stop']))
else:
sys.stderr.write(" 5prime anchor %s and 3prime anchor %s are not on the same chromsome\n" % (geneDict['marker5pId'],geneDict['marker3pId']))
sys.stderr.write(" Cannot anchoring gene %s \n" % geneDict['geneId'])
continue
input()
def getFastaSeq(fasta,chrom,start,stop):
seq=fasta.fetch(reference=chrom, start=start,end=stop)
print(" fasta sequence for chrom %s from %s to %s:\n" %(chrom,start,stop))
print(seq)
return seq
def getCoordsFromBed (bedfile):
checkFile(file=bedfile)
bedDict = defaultdict()
with open(bedfile) as file:
for line in file.readlines():
bed=line.rstrip("\n").split("\t")
if len(bed) < 4:
sys.stderr.write(" ERROR PARSING BEDFILE %s " % str(bedfile))
sys.stderr.write(" File do ot have 4 columns or is not tab delimited\n")
sys.stderr.write(" Please check the format\n")
sys.exit()
#print(" bed array content ")
#print(bed)
bedDict[str(bed[3])] ={'chrom': bed[0],'start':bed[1],'stop':bed[2],'mapQ': bed[4],'strand':bed[5]}
numRecords=len(bedDict.keys())
sys.stdout.write(" Found %i records in Bed file %s " % (numRecords, bedfile))
return bedDict
def checkFile (file):
if os.path.isfile(file):
sys.stdout.write(" File %s found\n" % str(file))
else:
sys.stderr.write(" ERROR: Cannot find file %s \n" % str(file))
sys.exit()
def checkDir(directory):
if os.path.isdir(directory):
sys.stdout.write(" Directory %s found\n" % str(directory))
sys.stderr.write(" Cannot find directory %s \n" % str(directory))
return 0