From 16d31b3b8e84d59dd3260533fc4a46b9ecc1dffe Mon Sep 17 00:00:00 2001 From: Helene Rimbert <helene.rimbert@inra.fr> Date: Tue, 17 Dec 2019 14:31:37 +0100 Subject: [PATCH] new snakemake rule geneAnchoring.smk --- bin/filterBam.py | 2 +- bin/microMappingPipeline.py | 133 +++++++++++++++++++++++++++++++++++- rules/bedtoolsClosest.smk | 4 +- rules/geneAnchoring.smk | 15 ++++ rules/preprocessISBP.smk | 10 ++- 5 files changed, 156 insertions(+), 8 deletions(-) create mode 100644 rules/geneAnchoring.smk diff --git a/bin/filterBam.py b/bin/filterBam.py index b713b21..9f282db 100755 --- a/bin/filterBam.py +++ b/bin/filterBam.py @@ -31,7 +31,7 @@ for read in samfile: numFilteredReads +=1 else: - sys.stdout.write(" mapping not ok for read %s: qlength=%s cigarLength=%s mapQ=%s numMissmatches=%s\n" % (str(read.query_name),str(readLength), str(cigarLength), str(mapq), str(numMissmatches))) + sys.stderr.write(" mapping not ok for read %s: qlength=%s cigarLength=%s mapQ=%s numMissmatches=%s\n" % (str(read.query_name),str(readLength), str(cigarLength), str(mapq), str(numMissmatches))) #some log # sys.stdout.write (" read: %s\n" % str(read)) diff --git a/bin/microMappingPipeline.py b/bin/microMappingPipeline.py index fed7c75..46ab754 100755 --- a/bin/microMappingPipeline.py +++ b/bin/microMappingPipeline.py @@ -1,6 +1,12 @@ #!/usr/bin/env python3 +# coding: utf-8 import os.path from os import path +from collections import defaultdict +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +from Bio.Alphabet import IUPAC +from Bio import SeqIO import pysam import sys @@ -9,20 +15,139 @@ inputMarkersFlankingGenes = sys.argv[1] # table with mapped flanking ISBPs from outputDir = sys.argv[2] # output directory ... genomeQuery = sys.argv[3] # fasta file of the v1 genome (must be fai indexed) genomeTarget = sys.argv[4] # fasta file of the v2 genome (must bie fai indexed) +markerPosOnTarget = sys.argv[5] # bed file of new coords of markers on target reference genome def main(): ### check for input files and output directory checkFile(file=inputMarkersFlankingGenes) - if ( not checkDir(directory=outputDir) ): + if ( checkDir(directory=outputDir) ): + sys.stdout.write( " Output directory already exists, no need to create it\n") + + else: sys.stdout.write( " Creating output directory %s" % str(outputDir)) os.mkdir( outputDir, 0o750 ) - #open for reading the reference file + ### open for reading the query and target reference file + queryFasta = pysam.FastaFile(genomeQuery) + targetFasta = pysam.FastaFile(genomeTarget) + + ### save coords of marker on target reference + markersOnTarget_dict = getCoordsFromBed(bedfile=markerPosOnTarget) ### read input file containing the flanking IBSP markers for each gene + ##### variables + numlines=0 + index_result_dir=0 + prefixResultDir=outputDir+'/temp/'+str(index_result_dir).zfill(6) + + ##### output files + with open(inputMarkersFlankingGenes) as file: + for line in file.readlines(): + # increment counters + # because to avoid having a big result dir with +100k subdirs, + # the scripts creates subdirs for every 1k genes + numlines+=1 + if numlines % 1000 == 0: + index_result_dir+=1 + prefixResultDir=outputDir+'/'+str(index_result_dir).zfill(6) + + # define the working dir for the current gene analysis + workingDir=prefixResultDir+'/'+str(numlines).zfill(6) + os.makedirs(workingDir, 0o750) + + # deal with data + lineArray = line.rstrip("\n").split("\t") + print("\n\n------------------------------------------") + print(" Results of gene %s going into subdir %s" % (lineArray[0], str(workingDir))) + print(" array of data: ") + print(lineArray) + + # check if both anchors are mapped on the same chromosome + geneDict={ 'geneId': lineArray[0], + 'geneChrom':lineArray[1], + 'geneStart':lineArray[2], + 'geneStop':lineArray[3], + 'geneScore':lineArray[4], + 'geneStrand':lineArray[5], + 'marker5pChrom':lineArray[6], + 'marker5pStart':lineArray[7], + 'marker5pStop':lineArray[8], + 'marker5pId':lineArray[9], + 'marker5pDistance':lineArray[10], + 'marker3pChrom':lineArray[11], + 'marker3pStart':lineArray[12], + 'marker3pStop':lineArray[13], + 'marker3pId':lineArray[14], + 'marker3pDistance':lineArray[15], + 'regionLengthOnQuery':lineArray[16] + } + print(" Gene directory:") + print(geneDict) + + if geneDict['marker5pChrom'] == '.' or geneDict['marker3pChrom'] == '.': + # deal afterward with genes with one anchor missing (basecally start and end of chromosomes) + # TODO + sys.stderr.write(" current gene %s has missing anchor on one side \n" % geneDict['geneId']) + continue + + + # get mapping of the flanking anchors on the target genome + anchor5pTargetCoods_dict=markersOnTarget_dict[geneDict['marker5pId']] + anchor3pTargetCoods_dict=markersOnTarget_dict[geneDict['marker3pId']] + print(" anchors 5prime:\n") + print(anchor5pTargetCoods_dict) + print(" anchors 3prime:\n") + print(anchor3pTargetCoods_dict) + + if anchor5pTargetCoods_dict['chrom'] == anchor3pTargetCoods_dict['chrom']: + print(" both anchors are mapped on the same chrom. we can proceed to sequence extraction and mapping\n") + ### deal with query sequence + queryGenomeSeq=getFastaSeq(fasta=queryFasta, chrom=geneDict['marker5pChrom'], start=int(geneDict['marker5pStart']), stop=int(geneDict['marker3pStart'])) + querySeqRecord= SeqRecord( + Seq(queryGenomeSeq, IUPAC.ambiguous_dna), + id='query_'+str(geneDict['marker5pChrom'])+'_'+str(geneDict['marker5pStart'])+'-'+str(geneDict['marker3pStart']), + description='target gene '+geneDict['geneId']+', flanking markers '+ geneDict['marker5pId']+'-'+geneDict['marker3pId']) + print(querySeqRecord) + queryFasta2blast = workingDir+'/query.fasta' + SeqIO.write(querySeqRecord, queryFasta2blast, 'fasta') + + ### deal with target sequence + targetGenomeSeq=getFastaSeq(fasta=targetFasta, chrom=anchor5pTargetCoods_dict['chrom'], start=int(anchor5pTargetCoods_dict['start']), stop=int(anchor3pTargetCoods_dict['stop'])) + + + else: + sys.stderr.write(" 5prime anchor %s and 3prime anchor %s are not on the same chromsome\n" % (geneDict['marker5pId'],geneDict['marker3pId'])) + sys.stderr.write(" Cannot anchoring gene %s \n" % geneDict['geneId']) + continue + + + input() + +def getFastaSeq(fasta,chrom,start,stop): + seq=fasta.fetch(reference=chrom, start=start,end=stop) + print(" fasta sequence for chrom %s from %s to %s:\n" %(chrom,start,stop)) + print(seq) + return seq +def getCoordsFromBed (bedfile): + checkFile(file=bedfile) + bedDict = defaultdict() + with open(bedfile) as file: + for line in file.readlines(): + bed=line.rstrip("\n").split("\t") + if len(bed) < 4: + sys.stderr.write(" ERROR PARSING BEDFILE %s " % str(bedfile)) + sys.stderr.write(" File do ot have 4 columns or is not tab delimited\n") + sys.stderr.write(" Please check the format\n") + sys.exit() + #print(" bed array content ") + #print(bed) + bedDict[str(bed[3])] ={'chrom': bed[0],'start':bed[1],'stop':bed[2],'mapQ': bed[4],'strand':bed[5]} + numRecords=len(bedDict.keys()) + sys.stdout.write(" Found %i records in Bed file %s " % (numRecords, bedfile)) + return bedDict def checkFile (file): if os.path.isfile(file): @@ -34,8 +159,10 @@ def checkFile (file): def checkDir(directory): if os.path.isdir(directory): sys.stdout.write(" Directory %s found\n" % str(directory)) + return 1 else: - sys.stderr.write(" ERROR: Cannot find directory %s \n" % str(directory)) + sys.stderr.write(" Cannot find directory %s \n" % str(directory)) + return 0 if __name__== "__main__": diff --git a/rules/bedtoolsClosest.smk b/rules/bedtoolsClosest.smk index 1f89433..922a723 100644 --- a/rules/bedtoolsClosest.smk +++ b/rules/bedtoolsClosest.smk @@ -11,7 +11,7 @@ rule selectMappedISBP: rule upstreamClosest: message: " Collect closest marker upstream of genes" input: annot=config['results']+"/1.features.bed", markers=config['results']+'/2.mappedISBPs_coordsOnQuery.bed' - output: config['results']+'/2.closestbedUpstream.txt' + output: temp(config['results']+'/2.closestbedUpstream.txt') log: config['results']+"/2.closestbedUpstream.log" shell: """ @@ -21,7 +21,7 @@ rule upstreamClosest: rule downstreamClosest: message: " Collect Downstream marker upstream of genes" input: annot=config['results']+"/1.features.bed", markers=config['results']+'/2.mappedISBPs_coordsOnQuery.bed' - output: config['results']+'/2.downstreamClosest.txt' + output: temp(config['results']+'/2.downstreamClosest.txt') log: config['results']+"/2.downstreamClosest.log" shell: """ diff --git a/rules/geneAnchoring.smk b/rules/geneAnchoring.smk new file mode 100644 index 0000000..0847122 --- /dev/null +++ b/rules/geneAnchoring.smk @@ -0,0 +1,15 @@ +rule mapHomologousRegions: + message: " mapping homologous regions of both references using ISBPs markers as anchors" + input: closestMarkers=config['results']+'/2.mergedClosestMarkers.txt', + refQuery=config['queryFasta'], + faiQuery=config['queryFasta']+'.fai', + refTarget=config['targetFasta'], + faiTarget=config['targetFasta']+'.fai', + markersOnTarget=config['results']+"/1.filteredISBPs.sorted.bed", + markersIds=config['results']+"/1.filteredISBPs.ids" + output: directory(config['results']+'/3.mapping') + log: config['results']+'/3.mapping.log' + shell: + """ + bin/microMappingPipeline.py {input.closestMarkers} {output} {input.refQuery} {input.refTarget} {input.markersOnTarget} {markersIds} &> {log} + """ \ No newline at end of file diff --git a/rules/preprocessISBP.smk b/rules/preprocessISBP.smk index 7b343b6..3db0ce3 100644 --- a/rules/preprocessISBP.smk +++ b/rules/preprocessISBP.smk @@ -1,7 +1,7 @@ rule filterBam: message: "Filtering BAM file of ISBPs" input: config['isbpBam'] - output: config['results']+'/1.filteredISBPs.bam' + output: temp(config['results']+'/1.filteredISBPs.bam') log: config['results']+'/1.filterBam.log' shell: "bin/filterBam.py {input} {output} 1> {log}" @@ -17,4 +17,10 @@ rule sortBed: input: config['results']+"/1.filteredISBPs.bed" output: config['results']+"/1.filteredISBPs.sorted.bed" log: config['results']+'/1.sortBed.log' - shell: "sort -k1,1 -k2,2n {input} 1> {output} 2> {log}" \ No newline at end of file + shell: "sort -k1,1 -k2,2n {input} 1> {output} 2> {log}" + +rule dumpISBPsID: + message: "Dump ISBPs IDs" + input: config['results']+"/1.filteredISBPs.sorted.bed" + output: temp(config['results']+"/1.filteredISBPs.ids") + shell: " cut -f 4 {input} > {output}" \ No newline at end of file -- GitLab