From 16d31b3b8e84d59dd3260533fc4a46b9ecc1dffe Mon Sep 17 00:00:00 2001
From: Helene Rimbert <helene.rimbert@inra.fr>
Date: Tue, 17 Dec 2019 14:31:37 +0100
Subject: [PATCH] new snakemake rule geneAnchoring.smk

---
 bin/filterBam.py            |   2 +-
 bin/microMappingPipeline.py | 133 +++++++++++++++++++++++++++++++++++-
 rules/bedtoolsClosest.smk   |   4 +-
 rules/geneAnchoring.smk     |  15 ++++
 rules/preprocessISBP.smk    |  10 ++-
 5 files changed, 156 insertions(+), 8 deletions(-)
 create mode 100644 rules/geneAnchoring.smk

diff --git a/bin/filterBam.py b/bin/filterBam.py
index b713b21..9f282db 100755
--- a/bin/filterBam.py
+++ b/bin/filterBam.py
@@ -31,7 +31,7 @@ for read in samfile:
 		numFilteredReads +=1
 
 	else:
-		sys.stdout.write(" mapping not ok for read %s: qlength=%s cigarLength=%s mapQ=%s numMissmatches=%s\n" % (str(read.query_name),str(readLength), str(cigarLength), str(mapq), str(numMissmatches)))
+		sys.stderr.write(" mapping not ok for read %s: qlength=%s cigarLength=%s mapQ=%s numMissmatches=%s\n" % (str(read.query_name),str(readLength), str(cigarLength), str(mapq), str(numMissmatches)))
 
 		#some log
 		# sys.stdout.write (" read: %s\n" % str(read))
diff --git a/bin/microMappingPipeline.py b/bin/microMappingPipeline.py
index fed7c75..46ab754 100755
--- a/bin/microMappingPipeline.py
+++ b/bin/microMappingPipeline.py
@@ -1,6 +1,12 @@
 #!/usr/bin/env python3
+# coding: utf-8
 import os.path
 from os import path
+from collections import defaultdict
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Bio.Alphabet import IUPAC
+from Bio import SeqIO
 import pysam
 import sys
 
@@ -9,20 +15,139 @@ inputMarkersFlankingGenes = sys.argv[1] # table with mapped flanking ISBPs from
 outputDir = sys.argv[2] # output directory ...
 genomeQuery = sys.argv[3] # fasta file of the v1 genome (must be fai indexed)
 genomeTarget = sys.argv[4] # fasta file of the v2 genome (must be fai indexed)
+markerPosOnTarget = sys.argv[5] # bed file of new coords of markers on target reference genome
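+# expected invocation (argument order as documented above; file names are placeholders):
+#   microMappingPipeline.py <markersFlankingGenes.txt> <outputDir> <query.fasta> <target.fasta> <markersOnTarget.bed>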
 
 def main():
 
 	### check for input files and output directory
 	checkFile(file=inputMarkersFlankingGenes)
-	if ( not checkDir(directory=outputDir) ):
+	if ( checkDir(directory=outputDir) ):
+		sys.stdout.write( " Output directory already exists, no need to create it\n")
+
+	else:
 		sys.stdout.write( " Creating output directory %s" % str(outputDir))
 		os.mkdir( outputDir, 0o750 )
 
-	#open for reading the reference file
+	### open for reading the query and target reference file
+	queryFasta = pysam.FastaFile(genomeQuery)
+	targetFasta = pysam.FastaFile(genomeTarget)
+
+	### save coords of marker on target reference
+	markersOnTarget_dict = getCoordsFromBed(bedfile=markerPosOnTarget)
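+	# markersOnTarget_dict maps each marker ID to its coords on the target: {'chrom','start','stop','mapQ','strand'}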
 
 	### read input file containing the flanking IBSP markers for each gene
+	##### variables
+	numlines=0
+	index_result_dir=0
+	prefixResultDir=outputDir+'/temp/'+str(index_result_dir).zfill(6)
+
+	##### output files
+	with open(inputMarkersFlankingGenes) as file:
+		for line in file.readlines():
+			# increment counters
+			# to avoid one huge result dir with 100k+ subdirs,
+			# the script creates a new subdir for every 1000 genes
+			numlines+=1
+			if numlines % 1000 == 0:
+				index_result_dir+=1
+				prefixResultDir=outputDir+'/'+str(index_result_dir).zfill(6)
+
+			# define the working dir for the current gene analysis
+			workingDir=prefixResultDir+'/'+str(numlines).zfill(6)
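+			# e.g. the 1500th input line gets <outputDir>/000001/001500 (both parts zero-padded to 6 digits)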
+			os.makedirs(workingDir, 0o750)
+			
+			# deal with data
+			lineArray = line.rstrip("\n").split("\t")
+			print("\n\n------------------------------------------")
+			print(" Results of gene %s going into subdir %s" % (lineArray[0], str(workingDir)))
+			print(" array of data: ")
+			print(lineArray)
+
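+			# each line carries 17 tab-separated fields: 6 gene fields, 5 for the closest 5prime marker
+			# (chrom/start/stop/id/distance), 5 for the closest 3prime marker, and the region length on the query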
+			# check if both anchors are mapped on the same chromosome
+			geneDict={ 'geneId': lineArray[0],
+						'geneChrom':lineArray[1],
+						'geneStart':lineArray[2],
+						'geneStop':lineArray[3],
+						'geneScore':lineArray[4],
+						'geneStrand':lineArray[5],
+						'marker5pChrom':lineArray[6],
+						'marker5pStart':lineArray[7],
+						'marker5pStop':lineArray[8],
+						'marker5pId':lineArray[9],
+						'marker5pDistance':lineArray[10],
+						'marker3pChrom':lineArray[11],
+						'marker3pStart':lineArray[12],
+						'marker3pStop':lineArray[13],
+						'marker3pId':lineArray[14],
+						'marker3pDistance':lineArray[15],
+						'regionLengthOnQuery':lineArray[16]
+					}
+			print(" Gene dictionary:")
+			print(geneDict)
+
+			if geneDict['marker5pChrom'] == '.' or geneDict['marker3pChrom'] == '.':
+				# genes with a missing anchor on one side (basically at the start or end of a chromosome) are handled later
+				# TODO
+				sys.stderr.write(" current gene %s is missing an anchor on one side\n" % geneDict['geneId'])
+				continue
+
+
+			# get mapping of the flanking anchors on the target genome
+			anchor5pTargetCoods_dict=markersOnTarget_dict[geneDict['marker5pId']]
+			anchor3pTargetCoods_dict=markersOnTarget_dict[geneDict['marker3pId']]
+			print(" anchors 5prime:\n")
+			print(anchor5pTargetCoods_dict)
+			print(" anchors 3prime:\n")
+			print(anchor3pTargetCoods_dict)
+
+			if anchor5pTargetCoods_dict['chrom'] == anchor3pTargetCoods_dict['chrom']:
+				print(" both anchors are mapped on the same chrom. we can proceed to sequence extraction and mapping\n")
+				### deal with query sequence
+				queryGenomeSeq=getFastaSeq(fasta=queryFasta, chrom=geneDict['marker5pChrom'], start=int(geneDict['marker5pStart']), stop=int(geneDict['marker3pStart']))
+				querySeqRecord= SeqRecord(
+									Seq(queryGenomeSeq, IUPAC.ambiguous_dna), 
+									id='query_'+str(geneDict['marker5pChrom'])+'_'+str(geneDict['marker5pStart'])+'-'+str(geneDict['marker3pStart']),
+									description='target gene '+geneDict['geneId']+', flanking markers '+ geneDict['marker5pId']+'-'+geneDict['marker3pId'])
+				print(querySeqRecord)
+				queryFasta2blast = workingDir+'/query.fasta'
+				SeqIO.write(querySeqRecord, queryFasta2blast, 'fasta')
+
+				### deal with target sequence
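+				# the target slice runs from the 5prime anchor's start to the 3prime anchor's stop on the new reference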
+				targetGenomeSeq=getFastaSeq(fasta=targetFasta, chrom=anchor5pTargetCoods_dict['chrom'], start=int(anchor5pTargetCoods_dict['start']), stop=int(anchor3pTargetCoods_dict['stop']))
+
+
+			else:
+				sys.stderr.write(" 5prime anchor %s and 3prime anchor %s are not on the same chromosome\n" % (geneDict['marker5pId'],geneDict['marker3pId']))
+				sys.stderr.write(" Cannot anchor gene %s\n" % geneDict['geneId'])
+				continue
+
+
+			input()
+
+def getFastaSeq(fasta,chrom,start,stop):
+	seq=fasta.fetch(reference=chrom, start=start,end=stop)
+	print(" fasta sequence for chrom %s from %s to %s:\n" %(chrom,start,stop))
+	print(seq)
+	return seq
 
+def getCoordsFromBed (bedfile):
+	checkFile(file=bedfile)
+	bedDict = defaultdict()
+	with open(bedfile) as file:
+		for line in file.readlines():
+			bed=line.rstrip("\n").split("\t")
+			if len(bed) < 6:
+				sys.stderr.write(" ERROR PARSING BEDFILE %s\n" % str(bedfile))
+				sys.stderr.write(" File does not have 6 columns or is not tab delimited\n")
+				sys.stderr.write(" Please check the format\n")
+				sys.exit()
+			#print(" bed array content ")
+			#print(bed)
+			bedDict[str(bed[3])] ={'chrom': bed[0],'start':bed[1],'stop':bed[2],'mapQ': bed[4],'strand':bed[5]}
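+			# e.g. a made-up tab-separated line "chr1A 100 250 isbp_42 60 +" yields:
+			# bedDict['isbp_42'] = {'chrom':'chr1A','start':'100','stop':'250','mapQ':'60','strand':'+'}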
 
+	numRecords=len(bedDict.keys())
+	sys.stdout.write(" Found %i records in Bed file %s\n" % (numRecords, bedfile))
+	return bedDict
 
 def checkFile (file):
 	if os.path.isfile(file):
@@ -34,8 +159,10 @@ def checkFile (file):
 def checkDir(directory):
 	if os.path.isdir(directory):
 		sys.stdout.write(" Directory %s found\n" % str(directory))
+		return 1
 	else:
-		sys.stderr.write(" ERROR: Cannot find directory %s \n" % str(directory))
+		sys.stderr.write(" Cannot find directory %s \n" % str(directory))
+		return 0
 
 
 if __name__== "__main__":
diff --git a/rules/bedtoolsClosest.smk b/rules/bedtoolsClosest.smk
index 1f89433..922a723 100644
--- a/rules/bedtoolsClosest.smk
+++ b/rules/bedtoolsClosest.smk
@@ -11,7 +11,7 @@ rule selectMappedISBP:
 rule upstreamClosest:
 	message: " Collect closest marker upstream of genes"
 	input: annot=config['results']+"/1.features.bed", markers=config['results']+'/2.mappedISBPs_coordsOnQuery.bed'
-	output: config['results']+'/2.closestbedUpstream.txt'
+	output: temp(config['results']+'/2.closestbedUpstream.txt')
 	log: config['results']+"/2.closestbedUpstream.log"
 	shell:
 		"""
@@ -21,7 +21,7 @@ rule upstreamClosest:
 rule downstreamClosest:
 	message: " Collect closest marker downstream of genes"
 	input: annot=config['results']+"/1.features.bed", markers=config['results']+'/2.mappedISBPs_coordsOnQuery.bed'
-	output: config['results']+'/2.downstreamClosest.txt'
+	output: temp(config['results']+'/2.downstreamClosest.txt')
 	log: config['results']+"/2.downstreamClosest.log"
 	shell:
 		"""
diff --git a/rules/geneAnchoring.smk b/rules/geneAnchoring.smk
new file mode 100644
index 0000000..0847122
--- /dev/null
+++ b/rules/geneAnchoring.smk
@@ -0,0 +1,15 @@
+rule mapHomologousRegions:
+	message: " mapping homologous regions of both references using ISBPs markers as anchors"
+	input: closestMarkers=config['results']+'/2.mergedClosestMarkers.txt',
+		refQuery=config['queryFasta'],
+		faiQuery=config['queryFasta']+'.fai',
+		refTarget=config['targetFasta'],
+		faiTarget=config['targetFasta']+'.fai',
+		markersOnTarget=config['results']+"/1.filteredISBPs.sorted.bed",
+		markersIds=config['results']+"/1.filteredISBPs.ids"
+	output: directory(config['results']+'/3.mapping')
+	log: config['results']+'/3.mapping.log'
+	shell: 
+		"""
+		bin/microMappingPipeline.py {input.closestMarkers} {output} {input.refQuery} {input.refTarget} {input.markersOnTarget} {input.markersIds} &> {log}
+		"""
\ No newline at end of file
diff --git a/rules/preprocessISBP.smk b/rules/preprocessISBP.smk
index 7b343b6..3db0ce3 100644
--- a/rules/preprocessISBP.smk
+++ b/rules/preprocessISBP.smk
@@ -1,7 +1,7 @@
 rule filterBam:
 	message: "Filtering BAM file of ISBPs"
 	input: config['isbpBam']
-	output: config['results']+'/1.filteredISBPs.bam'
+	output: temp(config['results']+'/1.filteredISBPs.bam')
 	log: config['results']+'/1.filterBam.log'
 	shell: "bin/filterBam.py {input} {output} 1> {log}"
 
@@ -17,4 +17,10 @@ rule sortBed:
 	input: config['results']+"/1.filteredISBPs.bed"
 	output: config['results']+"/1.filteredISBPs.sorted.bed"
 	log: config['results']+'/1.sortBed.log'
-	shell: "sort -k1,1 -k2,2n {input} 1> {output} 2> {log}"
\ No newline at end of file
+	shell: "sort -k1,1 -k2,2n {input} 1> {output} 2> {log}"
+
+rule dumpISBPsID:
+	message: "Dump ISBPs IDs"
+	input: config['results']+"/1.filteredISBPs.sorted.bed"
+	output: temp(config['results']+"/1.filteredISBPs.ids")
+	shell: " cut -f 4 {input} > {output}"
\ No newline at end of file
-- 
GitLab