Skip to content
Snippets Groups Projects
geneAnchoring.smk 8.59 KiB
Newer Older
rule mapHomologousRegions:
Helene Rimbert's avatar
Helene Rimbert committed
	message: " mapping homologous regions of both references using ISBPs markers as anchors for chromosome {wildcards.chrom}"
	input: 
		#closestMarkers=config['results']+'/2.mergedClosestMarkers.txt',
		marker5prime=config['results']+'/2.closestbed/{chrom}/upstream.txt',
		marker3prime=config['results']+'/2.closestbed/{chrom}/downstream.txt',
		refQuery=config['queryFasta'],
		faiQuery=config['queryFasta']+'.fai',
		refTarget=config['targetFasta'],
		faiTarget=config['targetFasta']+'.fai',
		markersOnTarget=config['results']+"/1.filteredISBPs/{chrom}/sorted.bed"
Helene Rimbert's avatar
Helene Rimbert committed
	output: 
		allblat=config['results']+'/3.mapping/{chrom}/allBlat.csv',
		summary=config['results']+'/3.mapping/{chrom}/mappingSummary.csv',
		mapping=temp(directory(config['results']+'/3.mapping/{chrom}/temp'))
Helene Rimbert's avatar
Helene Rimbert committed
	params: dir=directory(config['results']+'/3.mapping/{chrom}')
	log: config['results']+'/3.mapping/{chrom}/microMappingPipeline.log'
Helene Rimbert's avatar
Helene Rimbert committed
		bin/microMappingPipeline.py {input.marker5prime} {input.marker3prime} {params.dir} {input.refQuery} {input.refTarget} {input.markersOnTarget} &> {log}
Helene Rimbert's avatar
Helene Rimbert committed
rule recalcBlatMapped:
	message: " Recalc the coordinates of genes mapped with the Blat pipeline for chromosome {wildcards.chrom}"
	input: 
		allBlat=config['results']+'/3.mapping/{chrom}/allBlat.csv',
		summary=config['results']+'/3.mapping/{chrom}/mappingSummary.csv',
		mapping=config['results']+'/3.mapping/{chrom}/temp',
Helene Rimbert's avatar
Helene Rimbert committed
		refQuery=config['queryFasta'],
		faiQuery=config['queryFasta']+'.fai',
		refTarget=config['targetFasta'],
		faiTarget=config['targetFasta']+'.fai',
		refGff=config['annotationQuery']
	output: gff=temp(config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget.gff3')
	log: config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget.log'
	shell: 
		"""
		bin/recalcCoordsFromBlat.py -b {input.allBlat} -s {input.summary} -o {output.gff} -q {input.refQuery} -t {input.refTarget} -g {input.refGff} &> {log}
		"""
rule gtCleanBlatGff:
	message: " Clean the gff file recalculated based on Blat fine mapping for chromosome {wildcards.chrom}"
	input: gff=config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget.gff3',
		mapping=config['results']+'/3.mapping/{chrom}/temp'
	output: temp(config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget-clean.gff3')
Helene Rimbert's avatar
Helene Rimbert committed
	log: config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget-clean.log'
	shell: 
		gt gff3 -fixregionboundaries -sort -tidy -retainids {input.gff} 1> {output} 2> {log}
		"""

rule gmapRescue:
	message: " Rescue anchoring of failed genes with gmap for chrom {wildcards.chrom}"
	input: blat=config['results']+'/3.mapping/{chrom}/allBlat.csv', 
		wholeGenomeFasta=config['targetFasta'],
		mapping=config['results']+'/3.mapping/{chrom}/temp'
Helene Rimbert's avatar
Helene Rimbert committed
	output: txt=config['results']+'/4.gmapRescue/{chrom}.txt',
		targetgff=temp(config['results']+'/4.gmapRescue/{chrom}.target.gff3'),
		wholegenomegff=temp(config['results']+'/4.gmapRescue/{chrom}.wholeGenome.gff3')
	params: blastdb=config['blastdb'], anchoringDir=config['results']+'/3.mapping/{chrom}/', gmapIndexDir=config['results'], gmapindex="target_gmapindex"
	log: config['results']+'/4.gmapRescue/{chrom}.log'
		bin/gmapRescue.sh {params.anchoringDir} {params.blastdb} {params.gmapindex} {params.gmapIndexDir} {output.targetgff} {output.wholegenomegff} {input.wholeGenomeFasta} 1> {output.txt} 2> {log}

rule recalcGmapRescue:
	message: "Recalc the coordinates of the GMAP on target GFF3 files for chromosome {wildcards.chrom}"
	input: gff=config['results']+'/4.gmapRescue/{chrom}.target.gff3',
		mapping=config['results']+'/3.mapping/{chrom}/temp'
	output: temp(config['results']+'/4.recalcGmap/{chrom}/recalc.gff3')
Helene Rimbert's avatar
Helene Rimbert committed
	params: anchoringDir=config['results']+'/3.mapping/{chrom}',
		bindir=os.getcwd()
	log: config['results']+'/4.recalcGmap/{chrom}/recalc.log'
	shell:
		"""
		bin/convertGmapCoords.py -G {input.gff} -m 'GMAP_ON_TARGET' -o {output} &> {log}
Helene Rimbert's avatar
Helene Rimbert committed
		"""

rule saveGmapWG:
	message: "Save the GMAP results of genes mapped on Whole Genome"
	input: gff=config['results']+'/4.gmapRescue/{chrom}.wholeGenome.gff3',
		map=config['chromMapID'],
		mapping=config['results']+'/3.mapping/{chrom}/temp'
	output: gff=temp(config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome.gff3'),
		gfferror=temp(config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome_differentChrom.gff3')
Helene Rimbert's avatar
Helene Rimbert committed
	log: config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome.log'
	shell:
		"""
		#cp {input} {output} &> {log}
		bin/reformatGmapWGGFF.py -i {input.gff} -m {input.map} -o {output.gff} -e {output.gfferror} &> {log}

Helene Rimbert's avatar
Helene Rimbert committed
		"""

rule mergeFinalGff3:
	message: " Merge Final GFF3 files: blat.gff3, rescue.gff3 and wholeGenome.gff3"
	input: blat=config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget-clean.gff3',
		rescue=config['results']+'/4.recalcGmap/{chrom}/recalc.gff3', 
		wg=config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome.gff3',
		mapping=config['results']+'/3.mapping/{chrom}/temp'
	output: annot=temp(config['results']+'/5.FINAL/{chrom}/annotation.gff3')
	log: annot=config['results']+'/5.FINAL/{chrom}/annotation.log'
Helene Rimbert's avatar
Helene Rimbert committed
	shell:
		"""
		gt gff3 -sort -fixregionboundaries -tidy -retainids {input.blat} {input.rescue} {input.wg} 1> {output.annot} 2> {log.annot}
Helene Rimbert's avatar
Helene Rimbert committed
		"""
rule checkMissing:
	message: ""
	input: gff=config['results']+'/5.FINAL/{chrom}/annotation.gff3',
		ref=config['results']+"/1.features/{chrom}.bed"
	output: temp(config['results']+'/5.FINAL/{chrom}/missing.txt')
Helene Rimbert's avatar
Helene Rimbert committed
	log: config['results']+'/5.FINAL/{chrom}/missing.log'
	shell:
		"""
		bin/checkFinalAnnot.py -i {input.gff} -a {input.ref} -m {output} &> {log}
Helene Rimbert's avatar
Helene Rimbert committed
		"""

rule concatAllChromResults:
	message: " concat all per chromosome results"
	input: annot=expand(config['results']+'/5.FINAL/{chrom}/annotation.gff3',chrom=config['chromosomes']),
		differentChrom=expand(config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome_differentChrom.gff3',chrom=config['chromosomes']),
Helene Rimbert's avatar
Helene Rimbert committed
		missing=expand(config['results']+'/5.FINAL/{chrom}/missing.txt', chrom=config['chromosomes'])
	output: annot=temp(config['finalPrefix']+'tmp.gff3'),
		diffChrom=config['finalPrefix']+'_gmap_differentChrom.gff3',
Helene Rimbert's avatar
Helene Rimbert committed
		missing=config['finalPrefix']+'_missing.txt'
	log: config['finalPrefix']+'.log'
	shell:
		"""
		gt gff3 -fixregionboundaries -sort -tidy -retainids {input.annot} 1> {output.annot} 2> {log}
		gt gff3 -fixregionboundaries -sort -tidy -retainids {input.differentChrom} 1> {output.diffChrom} 2>> {log}
Helene Rimbert's avatar
Helene Rimbert committed
		cat {input.missing} 1> {output.missing} 2>> {log}
		"""

rule renameGeneIds:
	message: " set final gene IDs for the new annotation"
	input: gff=config['finalPrefix']+'tmp.gff3',
		map=config['chromMapID']
	output: gffhc=config['finalPrefix']+'_HC.gff3',
		gfflc=config['finalPrefix']+'_LC.gff3',
		correspondance=config['finalPrefix']+'_IDMapping.csv',
		overlap=config['finalPrefix']+'_overlap.bed'
	params: prefix=config['gff_prefix'], version=config['gff_version'], source=config['gff_source']
Helene Rimbert's avatar
Helene Rimbert committed
	log: config['finalPrefix']+'_IDMapping.log'
	shell: 
		"""
		bin/renameGffID.py -i {input.gff} -m {input.map} -o {output.gffhc} -l {output.gfflc} -c {output.correspondance} -p {params.prefix} -v {params.version} -s {params.source} -x {output.overlap} &> {log}
		"""

rule generateFastaSequencesHC:
	message: "generate fasta sequences using gffread for HC genes"
	input: gff=config['finalPrefix']+'_HC.gff3',
		fastaref=config['targetFasta']
	output: mrna=config['finalPrefix']+'_HC.transcripts.fasta',
		cds=config['finalPrefix']+'_HC.cds.fasta',
		pep=config['finalPrefix']+'_HC.proteins.fasta'
	log: config['finalPrefix']+'_HC.gffread.log'
	shell:
		"""
		gffread {input.gff} -w {output.mrna} -x {output.cds} -y {output.pep} -g {input.fastaref} -W -O &> {log}
		"""
rule generateFastaSequencesLC:
	message: "generate fasta sequences using gffread for LC genes"
	input: 
		gff=config['finalPrefix']+'_LC.gff3',
		fastaref=config['targetFasta']
	output: mrna=config['finalPrefix']+'_LC.transcripts.fasta',
		cds=config['finalPrefix']+'_LC.cds.fasta',
		pep=config['finalPrefix']+'_LC.proteins.fasta'
	log: config['finalPrefix']+'_LC.gffread.log'
Helene Rimbert's avatar
Helene Rimbert committed
		gffread {input.gff} -w {output.mrna} -x {output.cds} -y {output.pep} -g {input.fastaref} -W -O &> {log}
		"""

rule concatblatSummary:
	message: " Concat all Blat summary"
	input:blat=expand(config['results']+'/3.mapping/{chrom}/allBlat.csv', chrom=config['chromosomes'])
	output: config['finalPrefix']+'_blatSummary.csv'
	shell:
		"""
		cat {input.blat} |cut -f -3,5- 1> {output}
		"""

rule concatAnchoringSummary:
	message: " Concat all Anchoring summary"
	input:anchoring=expand(config['results']+'/3.mapping/{chrom}/mappingSummary.csv', chrom=config['chromosomes'])
	output: config['finalPrefix']+'_anchoringSummary.csv'
	shell:
		"""
		cat {input.anchoring} |cut -f -4,5,8- 1> {output}
		"""