rule mapHomologousRegions: message: " mapping homologous regions of both references using ISBPs markers as anchors for chromosome {wildcards.chrom}" input: #closestMarkers=config['results']+'/2.mergedClosestMarkers.txt', marker5prime=config['results']+'/2.closestbed_split/{chrom}.upstream.txt', marker3prime=config['results']+'/2.closestbed_split/{chrom}.downstream.txt', refQuery=config['queryFasta'], faiQuery=config['queryFasta']+'.fai', refTarget=config['targetFasta'], faiTarget=config['targetFasta']+'.fai', markersOnTarget=config['results']+"/1.filteredISBPs.sorted.{chrom}.bed" output: allblat=config['results']+'/3.mapping/{chrom}/allBlat.csv', summary=config['results']+'/3.mapping/{chrom}/mappingSummary.csv' params: dir=directory(config['results']+'/3.mapping/{chrom}') log: config['results']+'/3.mapping/{chrom}/microMappingPipeline.log' shell: """ bin/microMappingPipeline.py {input.marker5prime} {input.marker3prime} {params.dir} {input.refQuery} {input.refTarget} {input.markersOnTarget} &> {log} """ rule recalcBlatMapped: message: " Recalc the coordinates of genes mapped with the Blat pipeline for chromosome {wildcards.chrom}" input: allBlat=config['results']+'/3.mapping/{chrom}/allBlat.csv', summary=config['results']+'/3.mapping/{chrom}/mappingSummary.csv', refQuery=config['queryFasta'], faiQuery=config['queryFasta']+'.fai', refTarget=config['targetFasta'], faiTarget=config['targetFasta']+'.fai', refGff=config['annotationQuery'] output: gff=temp(config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget.gff3') log: config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget.log' shell: """ bin/recalcCoordsFromBlat.py -b {input.allBlat} -s {input.summary} -o {output.gff} -q {input.refQuery} -t {input.refTarget} -g {input.refGff} &> {log} """ rule gtCleanBlatGff: message: " Clean the gff file recalculated based on Blat fine mapping for chromosome {wildcards.chrom}" input: config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget.gff3' output: config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget-clean.gff3' log: config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget-clean.log' shell: """ gt gff3 -fixregionboundaries -sort -tidy -retainids {input} 1> {output} 2> {log} """ rule gmapRescue: message: " Rescue anchoring of failed genes with gmap for chrom {wildcards.chrom}" input: blat=config['results']+'/3.mapping/{chrom}/allBlat.csv', gmapindex=config['results']+"/target_gmapindex" output: txt=config['results']+'/4.gmapRescue/{chrom}.txt', targetgff=config['results']+'/4.gmapRescue/{chrom}.target.gff3', wholegenomegff=config['results']+'/4.gmapRescue/{chrom}.wholeGenome.gff3' params: blastdb=config['blastdb'], anchoringDir=config['results']+'/3.mapping/{chrom}/', gmapIndexDir=config['results'] log: config['results']+'/4.gmapRescue/{chrom}.log' shell: """ bin/gmapRescue.sh {params.anchoringDir} {params.blastdb} {input.gmapindex} {params.gmapIndexDir} {output.targetgff} {output.wholegenomegff} 1> {output.txt} 2> {log} """ rule recalcGmapRescue: message: "Recalc the coordinates of the GMAP on target GFF3 files for chromosome {wildcards.chrom}" input: config['results']+'/4.gmapRescue/{chrom}.target.gff3', output: config['results']+'/4.recalcGmap/{chrom}/recalc.gff3' params: anchoringDir=config['results']+'/3.mapping/{chrom}', bindir=os.getcwd() log: config['results']+'/4.recalcGmap/{chrom}/recalc.log' shell: """ bin/recalcGmapRescue.sh {params.anchoringDir} {input} {output} {params.bindir} &> {log} """ rule saveGmapWG: message: "Save the GMAP results of genes mapped on Whole Genome" input: gff=config['results']+'/4.gmapRescue/{chrom}.wholeGenome.gff3', map=config['chromMapID'] output: gff=config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome.gff3', gfferror=config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome_differentChrom.gff3' log: config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome.log' shell: """ #cp {input} {output} &> {log} bin/reformatGmapWGGFF.py -i {input.gff} -m {input.map} -o {output.gff} -e {output.gfferror} &> {log} """ rule mergeFinalGff3: message: " Merge Final GFF3 files: blat.gff3, rescue.gff3 and wholeGenome.gff3" input: blat=config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget-clean.gff3', rescue=config['results']+'/4.recalcGmap/{chrom}/recalc.gff3', wg=config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome.gff3' output: annot=config['results']+'/5.FINAL/{chrom}/annotation.gff3' log: annot=config['results']+'/5.FINAL/{chrom}/annotation.log' shell: """ gt gff3 -sort -fixregionboundaries -tidy -retainids {input.blat} {input.rescue} {input.wg} 1> {output.annot} 2> {log.annot} """ rule checkMissing: message: "" input: gff=config['results']+'/5.FINAL/{chrom}/annotation.gff3', ref=config['results']+"/1.features/{chrom}.bed" output: config['results']+'/5.FINAL/{chrom}/missing.txt' log: config['results']+'/5.FINAL/{chrom}/missing.log' shell: """ bin/checkFinalAnnot.py -i {input.gff} -a {input.ref} -m {output} &> {log} """ rule concatAllChromResults: message: " concat all per chromosome results" input: annot=expand(config['results']+'/5.FINAL/{chrom}/annotation.gff3',chrom=config['chromosomes']), differentChrom=expand(config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome_differentChrom.gff3',chrom=config['chromosomes']), missing=expand(config['results']+'/5.FINAL/{chrom}/missing.txt', chrom=config['chromosomes']) output: annot=temp(config['finalPrefix']+'tmp.gff3'), diffChrom=config['finalPrefix']+'_gmap_differentChrom.gff3', missing=config['finalPrefix']+'_missing.txt' log: config['finalPrefix']+'.log' shell: """ gt gff3 -fixregionboundaries -sort -tidy -retainids {input.annot} 1> {output.annot} 2> {log} gt gff3 -fixregionboundaries -sort -tidy -retainids {input.differentChrom} 1> {output.diffChrom} 2>> {log} cat {input.missing} 1> {output.missing} 2>> {log} """ rule renameGeneIds: message: " set final gene IDs for the new annotation" input: gff=config['finalPrefix']+'tmp.gff3', map=config['chromMapID'] output: gffhc=config['finalPrefix']+'_HC.gff3', gfflc=config['finalPrefix']+'_LC.gff3', correspondance=config['finalPrefix']+'_IDMapping.csv', overlap=config['finalPrefix']+'_overlap.bed' params: prefix=config['gff_prefix'], version=config['gff_version'], source=config['gff_source'] log: config['finalPrefix']+'_IDMapping.log' shell: """ bin/renameGffID.py -i {input.gff} -m {input.map} -o {output.gffhc} -l {output.gfflc} -c {output.correspondance} -p {params.prefix} -v {params.version} -s {params.source} -x {output.overlap} &> {log} """ rule generateFastaSequencesHC: message: "generate fasta sequences using gffread for HC genes" input: gff=config['finalPrefix']+'_HC.gff3', fastaref=config['targetFasta'] output: mrna=config['finalPrefix']+'_HC.transcripts.fasta', cds=config['finalPrefix']+'_HC.cds.fasta', pep=config['finalPrefix']+'_HC.proteins.fasta' log: config['finalPrefix']+'_HC.gffread.log' shell: """ gffread {input.gff} -w {output.mrna} -x {output.cds} -y {output.pep} -g {input.fastaref} -W -O &> {log} """ rule generateFastaSequencesLC: message: "generate fasta sequences using gffread for LC genes" input: gff=config['finalPrefix']+'_LC.gff3', fastaref=config['targetFasta'] output: mrna=config['finalPrefix']+'_LC.transcripts.fasta', cds=config['finalPrefix']+'_LC.cds.fasta', pep=config['finalPrefix']+'_LC.proteins.fasta' log: config['finalPrefix']+'_LC.gffread.log' shell: """ gffread {input.gff} -w {output.mrna} -x {output.cds} -y {output.pep} -g {input.fastaref} -W -O &> {log} """ rule concatblatSummary: message: " Concat all Blat summary" input:blat=expand(config['results']+'/3.mapping/{chrom}/allBlat.csv', chrom=config['chromosomes']) output: config['finalPrefix']+'_blatSummary.csv' shell: """ cat {input.blat} |cut -f -3,5- 1> {output} """ rule concatAnchoringSummary: message: " Concat all Anchoring summary" input:anchoring=expand(config['results']+'/3.mapping/{chrom}/mappingSummary.csv', chrom=config['chromosomes']) output: config['finalPrefix']+'_anchoringSummary.csv' shell: """ cat {input.anchoring} |cut -f -4,5,8- 1> {output} """