Skip to content
Snippets Groups Projects
Commit 16d31b3b authored by Helene Rimbert's avatar Helene Rimbert
Browse files

new snakemake rule geneAnchoring.smk

parent 479d76b6
No related branches found
No related tags found
No related merge requests found
......@@ -31,7 +31,7 @@ for read in samfile:
numFilteredReads +=1
else:
sys.stdout.write(" mapping not ok for read %s: qlength=%s cigarLength=%s mapQ=%s numMissmatches=%s\n" % (str(read.query_name),str(readLength), str(cigarLength), str(mapq), str(numMissmatches)))
sys.stderr.write(" mapping not ok for read %s: qlength=%s cigarLength=%s mapQ=%s numMissmatches=%s\n" % (str(read.query_name),str(readLength), str(cigarLength), str(mapq), str(numMissmatches)))
#some log
# sys.stdout.write (" read: %s\n" % str(read))
......
#!/usr/bin/env python3
# coding: utf-8
import os.path
from os import path
from collections import defaultdict
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio import SeqIO
import pysam
import sys
......@@ -9,20 +15,139 @@ inputMarkersFlankingGenes = sys.argv[1] # table with mapped flanking ISBPs from
outputDir = sys.argv[2] # output directory ...
genomeQuery = sys.argv[3] # fasta file of the v1 genome (must be fai indexed)
genomeTarget = sys.argv[4] # fasta file of the v2 genome (must bie fai indexed)
markerPosOnTarget = sys.argv[5] # bed file of new coords of markers on target reference genome
def main():
### check for input files and output directory
checkFile(file=inputMarkersFlankingGenes)
if ( not checkDir(directory=outputDir) ):
if ( checkDir(directory=outputDir) ):
sys.stdout.write( " Output directory already exists, no need to create it\n")
else:
sys.stdout.write( " Creating output directory %s" % str(outputDir))
os.mkdir( outputDir, 0o750 )
#open for reading the reference file
### open for reading the query and target reference file
queryFasta = pysam.FastaFile(genomeQuery)
targetFasta = pysam.FastaFile(genomeTarget)
### save coords of marker on target reference
markersOnTarget_dict = getCoordsFromBed(bedfile=markerPosOnTarget)
### read input file containing the flanking IBSP markers for each gene
##### variables
numlines=0
index_result_dir=0
prefixResultDir=outputDir+'/temp/'+str(index_result_dir).zfill(6)
##### output files
with open(inputMarkersFlankingGenes) as file:
for line in file.readlines():
# increment counters
# because to avoid having a big result dir with +100k subdirs,
# the scripts creates subdirs for every 1k genes
numlines+=1
if numlines % 1000 == 0:
index_result_dir+=1
prefixResultDir=outputDir+'/'+str(index_result_dir).zfill(6)
# define the working dir for the current gene analysis
workingDir=prefixResultDir+'/'+str(numlines).zfill(6)
os.makedirs(workingDir, 0o750)
# deal with data
lineArray = line.rstrip("\n").split("\t")
print("\n\n------------------------------------------")
print(" Results of gene %s going into subdir %s" % (lineArray[0], str(workingDir)))
print(" array of data: ")
print(lineArray)
# check if both anchors are mapped on the same chromosome
geneDict={ 'geneId': lineArray[0],
'geneChrom':lineArray[1],
'geneStart':lineArray[2],
'geneStop':lineArray[3],
'geneScore':lineArray[4],
'geneStrand':lineArray[5],
'marker5pChrom':lineArray[6],
'marker5pStart':lineArray[7],
'marker5pStop':lineArray[8],
'marker5pId':lineArray[9],
'marker5pDistance':lineArray[10],
'marker3pChrom':lineArray[11],
'marker3pStart':lineArray[12],
'marker3pStop':lineArray[13],
'marker3pId':lineArray[14],
'marker3pDistance':lineArray[15],
'regionLengthOnQuery':lineArray[16]
}
print(" Gene directory:")
print(geneDict)
if geneDict['marker5pChrom'] == '.' or geneDict['marker3pChrom'] == '.':
# deal afterward with genes with one anchor missing (basecally start and end of chromosomes)
# TODO
sys.stderr.write(" current gene %s has missing anchor on one side \n" % geneDict['geneId'])
continue
# get mapping of the flanking anchors on the target genome
anchor5pTargetCoods_dict=markersOnTarget_dict[geneDict['marker5pId']]
anchor3pTargetCoods_dict=markersOnTarget_dict[geneDict['marker3pId']]
print(" anchors 5prime:\n")
print(anchor5pTargetCoods_dict)
print(" anchors 3prime:\n")
print(anchor3pTargetCoods_dict)
if anchor5pTargetCoods_dict['chrom'] == anchor3pTargetCoods_dict['chrom']:
print(" both anchors are mapped on the same chrom. we can proceed to sequence extraction and mapping\n")
### deal with query sequence
queryGenomeSeq=getFastaSeq(fasta=queryFasta, chrom=geneDict['marker5pChrom'], start=int(geneDict['marker5pStart']), stop=int(geneDict['marker3pStart']))
querySeqRecord= SeqRecord(
Seq(queryGenomeSeq, IUPAC.ambiguous_dna),
id='query_'+str(geneDict['marker5pChrom'])+'_'+str(geneDict['marker5pStart'])+'-'+str(geneDict['marker3pStart']),
description='target gene '+geneDict['geneId']+', flanking markers '+ geneDict['marker5pId']+'-'+geneDict['marker3pId'])
print(querySeqRecord)
queryFasta2blast = workingDir+'/query.fasta'
SeqIO.write(querySeqRecord, queryFasta2blast, 'fasta')
### deal with target sequence
targetGenomeSeq=getFastaSeq(fasta=targetFasta, chrom=anchor5pTargetCoods_dict['chrom'], start=int(anchor5pTargetCoods_dict['start']), stop=int(anchor3pTargetCoods_dict['stop']))
else:
sys.stderr.write(" 5prime anchor %s and 3prime anchor %s are not on the same chromsome\n" % (geneDict['marker5pId'],geneDict['marker3pId']))
sys.stderr.write(" Cannot anchoring gene %s \n" % geneDict['geneId'])
continue
input()
def getFastaSeq(fasta,chrom,start,stop):
seq=fasta.fetch(reference=chrom, start=start,end=stop)
print(" fasta sequence for chrom %s from %s to %s:\n" %(chrom,start,stop))
print(seq)
return seq
def getCoordsFromBed (bedfile):
checkFile(file=bedfile)
bedDict = defaultdict()
with open(bedfile) as file:
for line in file.readlines():
bed=line.rstrip("\n").split("\t")
if len(bed) < 4:
sys.stderr.write(" ERROR PARSING BEDFILE %s " % str(bedfile))
sys.stderr.write(" File do ot have 4 columns or is not tab delimited\n")
sys.stderr.write(" Please check the format\n")
sys.exit()
#print(" bed array content ")
#print(bed)
bedDict[str(bed[3])] ={'chrom': bed[0],'start':bed[1],'stop':bed[2],'mapQ': bed[4],'strand':bed[5]}
numRecords=len(bedDict.keys())
sys.stdout.write(" Found %i records in Bed file %s " % (numRecords, bedfile))
return bedDict
def checkFile (file):
if os.path.isfile(file):
......@@ -34,8 +159,10 @@ def checkFile (file):
def checkDir(directory):
if os.path.isdir(directory):
sys.stdout.write(" Directory %s found\n" % str(directory))
return 1
else:
sys.stderr.write(" ERROR: Cannot find directory %s \n" % str(directory))
sys.stderr.write(" Cannot find directory %s \n" % str(directory))
return 0
if __name__== "__main__":
......
......@@ -11,7 +11,7 @@ rule selectMappedISBP:
rule upstreamClosest:
message: " Collect closest marker upstream of genes"
input: annot=config['results']+"/1.features.bed", markers=config['results']+'/2.mappedISBPs_coordsOnQuery.bed'
output: config['results']+'/2.closestbedUpstream.txt'
output: temp(config['results']+'/2.closestbedUpstream.txt')
log: config['results']+"/2.closestbedUpstream.log"
shell:
"""
......@@ -21,7 +21,7 @@ rule upstreamClosest:
rule downstreamClosest:
message: " Collect Downstream marker upstream of genes"
input: annot=config['results']+"/1.features.bed", markers=config['results']+'/2.mappedISBPs_coordsOnQuery.bed'
output: config['results']+'/2.downstreamClosest.txt'
output: temp(config['results']+'/2.downstreamClosest.txt')
log: config['results']+"/2.downstreamClosest.log"
shell:
"""
......
rule mapHomologousRegions:
message: " mapping homologous regions of both references using ISBPs markers as anchors"
input: closestMarkers=config['results']+'/2.mergedClosestMarkers.txt',
refQuery=config['queryFasta'],
faiQuery=config['queryFasta']+'.fai',
refTarget=config['targetFasta'],
faiTarget=config['targetFasta']+'.fai',
markersOnTarget=config['results']+"/1.filteredISBPs.sorted.bed",
markersIds=config['results']+"/1.filteredISBPs.ids"
output: directory(config['results']+'/3.mapping')
log: config['results']+'/3.mapping.log'
shell:
"""
bin/microMappingPipeline.py {input.closestMarkers} {output} {input.refQuery} {input.refTarget} {input.markersOnTarget} {markersIds} &> {log}
"""
\ No newline at end of file
rule filterBam:
message: "Filtering BAM file of ISBPs"
input: config['isbpBam']
output: config['results']+'/1.filteredISBPs.bam'
output: temp(config['results']+'/1.filteredISBPs.bam')
log: config['results']+'/1.filterBam.log'
shell: "bin/filterBam.py {input} {output} 1> {log}"
......@@ -17,4 +17,10 @@ rule sortBed:
input: config['results']+"/1.filteredISBPs.bed"
output: config['results']+"/1.filteredISBPs.sorted.bed"
log: config['results']+'/1.sortBed.log'
shell: "sort -k1,1 -k2,2n {input} 1> {output} 2> {log}"
\ No newline at end of file
shell: "sort -k1,1 -k2,2n {input} 1> {output} 2> {log}"
rule dumpISBPsID:
message: "Dump ISBPs IDs"
input: config['results']+"/1.filteredISBPs.sorted.bed"
output: temp(config['results']+"/1.filteredISBPs.ids")
shell: " cut -f 4 {input} > {output}"
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment