Skip to content
Snippets Groups Projects
geneAnchoring.smk 7.93 KiB
Newer Older
rule mapHomologousRegions:
Helene Rimbert's avatar
Helene Rimbert committed
	message: " mapping homologous regions of both references using ISBPs markers as anchors for chromosome {wildcards.chrom}"
	input: 
		#closestMarkers=config['results']+'/2.mergedClosestMarkers.txt',
		marker5prime=config['results']+'/2.closestbed_split/{chrom}.upstream.txt',
		marker3prime=config['results']+'/2.closestbed_split/{chrom}.downstream.txt',
		refQuery=config['queryFasta'],
		faiQuery=config['queryFasta']+'.fai',
		refTarget=config['targetFasta'],
		faiTarget=config['targetFasta']+'.fai',
		markersOnTarget=config['results']+"/1.filteredISBPs.sorted.{chrom}.bed"
Helene Rimbert's avatar
Helene Rimbert committed
	output: 
		allblat=config['results']+'/3.mapping/{chrom}/allBlat.csv',
		summary=config['results']+'/3.mapping/{chrom}/mappingSummary.csv'
	params: dir=directory(config['results']+'/3.mapping/{chrom}')
	log: config['results']+'/3.mapping/{chrom}/microMappingPipeline.log'
Helene Rimbert's avatar
Helene Rimbert committed
		bin/microMappingPipeline.py {input.marker5prime} {input.marker3prime} {params.dir} {input.refQuery} {input.refTarget} {input.markersOnTarget} &> {log}
Helene Rimbert's avatar
Helene Rimbert committed
rule recalcBlatMapped:
	message: " Recalc the coordinates of genes mapped with the Blat pipeline for chromosome {wildcards.chrom}"
	input: 
		allBlat=config['results']+'/3.mapping/{chrom}/allBlat.csv',
		summary=config['results']+'/3.mapping/{chrom}/mappingSummary.csv',
		refQuery=config['queryFasta'],
		faiQuery=config['queryFasta']+'.fai',
		refTarget=config['targetFasta'],
		faiTarget=config['targetFasta']+'.fai',
		refGff=config['annotationQuery']
	output: gff=temp(config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget.gff3')
	log: config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget.log'
	shell: 
		"""
		bin/recalcCoordsFromBlat.py -b {input.allBlat} -s {input.summary} -o {output.gff} -q {input.refQuery} -t {input.refTarget} -g {input.refGff} &> {log}
		"""
rule gtCleanBlatGff:
	message: " Clean the gff file recalculated based on Blat fine mapping for chromosome {wildcards.chrom}"
	input: config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget.gff3'
	output: config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget-clean.gff3'
	log: config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget-clean.log'
	shell: 
Helene Rimbert's avatar
Helene Rimbert committed
		gt gff3 -sort -tidy -retainids {input} 1> {output} 2> {log}
		"""

rule gmapRescue:
	message: " Rescue anchoring of failed genes with gmap for chrom {wildcards.chrom}"
Helene Rimbert's avatar
Helene Rimbert committed
	input: blat=config['results']+'/3.mapping/{chrom}/allBlat.csv', gmapindex=config['results']+"/target_gmapindex"
	output: txt=config['results']+'/4.gmapRescue/{chrom}.txt',
		targetgff=config['results']+'/4.gmapRescue/{chrom}.target.gff3',
		wholegenomegff=config['results']+'/4.gmapRescue/{chrom}.wholeGenome.gff3'
	params: blastdb=config['blastdb'], anchoringDir=config['results']+'/3.mapping/{chrom}/', gmapIndexDir=config['results']
	log: config['results']+'/4.gmapRescue/{chrom}.log'
Helene Rimbert's avatar
Helene Rimbert committed
		bin/gmapRescue.sh {params.anchoringDir} {params.blastdb} {input.gmapindex} {params.gmapIndexDir} {output.targetgff} {output.wholegenomegff} 1> {output.txt} 2> {log}

rule recalcGmapRescue:
	message: "Recalc the coordinates of the GMAP on target GFF3 files for chromosome {wildcards.chrom}"
Helene Rimbert's avatar
Helene Rimbert committed
	input: config['results']+'/4.gmapRescue/{chrom}.target.gff3',
	output: config['results']+'/4.recalcGmap/{chrom}/recalc.gff3'
	params: anchoringDir=config['results']+'/3.mapping/{chrom}',
		bindir=os.getcwd()
	log: config['results']+'/4.recalcGmap/{chrom}/recalc.log'
	shell:
		"""
		bin/recalcGmapRescue.sh {params.anchoringDir} {input} {output} {params.bindir} &> {log}
		"""

rule saveGmapWG:
	message: "Save the GMAP results of genes mapped on Whole Genome"
	input: config['results']+'/4.gmapRescue/{chrom}.wholeGenome.gff3'
	output: config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome.gff3'
	log: config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome.log'
	shell:
		"""
		cp {input} {output} &> {log}
		"""

rule mergeFinalGff3:
	message: " Merge Final GFF3 files: blat.gff3, rescue.gff3 and wholeGenome.gff3"
	input: blat=config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget-clean.gff3',
		rescue=config['results']+'/4.recalcGmap/{chrom}/recalc.gff3', 
		wg=config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome.gff3'
	output: annot=config['results']+'/5.FINAL/{chrom}/annotation.gff3', gmapWGOnly=config['results']+'/5.FINAL/{chrom}/missing-gmap.gff3'
	log: annot=config['results']+'/5.FINAL/{chrom}/annotation.log', gmap=config['results']+'/5.FINAL/{chrom}/missing-gmap.log'
	shell:
		"""
		gt gff3 -sort -tidy -retainids {input.blat} {input.rescue} 1> {output.annot} 2> {log.annot}
		gt gff3 -sort -tidy -retainids {input.wg} 1> {output.gmapWGOnly} 2> {log.gmap}
		"""
rule checkMissing:
	message: ""
	input: gff=config['results']+'/5.FINAL/{chrom}/annotation.gff3',
		gmap=config['results']+'/5.FINAL/{chrom}/missing-gmap.gff3',
		ref=config['results']+"/1.features/{chrom}.bed"
	output: config['results']+'/5.FINAL/{chrom}/missing.txt'
	log: config['results']+'/5.FINAL/{chrom}/missing.log'
	shell:
		"""
		bin/checkFinalAnnot.py -i {input.gff} -a {input.ref} -g {input.gmap} -m {output} &> {log}
		"""

rule concatAllChromResults:
	message: " concat all per chromosome results"
	input: annot=expand(config['results']+'/5.FINAL/{chrom}/annotation.gff3',chrom=config['chromosomes']),
		rescue=expand(config['results']+'/5.FINAL/{chrom}/missing-gmap.gff3',chrom=config['chromosomes']),
		missing=expand(config['results']+'/5.FINAL/{chrom}/missing.txt', chrom=config['chromosomes'])
	output: annot=temp(config['finalPrefix']+'tmp.gff3'),
		rescue=config['finalPrefix']+'_rescue.gff3',
		missing=config['finalPrefix']+'_missing.txt'
	log: config['finalPrefix']+'.log'
	shell:
		"""
		gt gff3 -sort -tidy -retainids {input.annot} 1> {output.annot} 2> {log}
		gt gff3 -sort -tidy -retainids {input.rescue} 1> {output.rescue} 2> {log}
		cat {input.missing} 1> {output.missing} 2>> {log}
		"""

rule renameGeneIds:
	message: " set final gene IDs for the new annotation"
	input: gff=config['finalPrefix']+'tmp.gff3',
		map=config['chromMapID']
	output: gffhc=config['finalPrefix']+'_HC.gff3',
		gfflc=config['finalPrefix']+'_LC.gff3',
		correspondance=config['finalPrefix']+'_IDMapping.csv',
		overlap=config['finalPrefix']+'_overlap.bed'
	params: prefix='TraesCS', version='03G', source='IWGSC_v2.1'
	log: config['finalPrefix']+'_IDMapping.log'
	shell: 
		"""
		bin/renameGffID.py -i {input.gff} -m {input.map} -o {output.gffhc} -l {output.gfflc} -c {output.correspondance} -p {params.prefix} -v {params.version} -s {params.source} -x {output.overlap} &> {log}
		"""

rule generateFastaSequencesHC:
	message: "generate fasta sequences using gffread for HC genes"
	input: gff=config['finalPrefix']+'_HC.gff3',
		fastaref=config['targetFasta']
	output: mrna=config['finalPrefix']+'_HC.transcripts.fasta',
		cds=config['finalPrefix']+'_HC.cds.fasta',
		pep=config['finalPrefix']+'_HC.proteins.fasta'
	log: config['finalPrefix']+'_HC.gffread.log'
	shell:
		"""
		gffread {input.gff} -w {output.mrna} -x {output.cds} -y {output.pep} -g {input.fastaref} -W -O &> {log}
		"""
rule generateFastaSequencesLC:
	message: "generate fasta sequences using gffread for LC genes"
	input: 
		gff=config['finalPrefix']+'_LC.gff3',
		fastaref=config['targetFasta']
	output: mrna=config['finalPrefix']+'_LC.transcripts.fasta',
		cds=config['finalPrefix']+'_LC.cds.fasta',
		pep=config['finalPrefix']+'_LC.proteins.fasta'
	log: config['finalPrefix']+'_LC.gffread.log'
Helene Rimbert's avatar
Helene Rimbert committed
		gffread {input.gff} -w {output.mrna} -x {output.cds} -y {output.pep} -g {input.fastaref} -W -O &> {log}
		"""
rule generateFastaSequencesRescue:
	message: "generate fasta sequences using gffread for rescue genes"
	input: 
		gff=config['finalPrefix']+'_rescue.gff3',
		fastaref=config['targetFasta']
	output: mrna=config['finalPrefix']+'_rescue.transcripts.fasta',
		cds=config['finalPrefix']+'_rescue.cds.fasta',
		pep=config['finalPrefix']+'_rescue.proteins.fasta'
	log: config['finalPrefix']+'_rescue.gffread.log'
	shell:
Helene Rimbert's avatar
Helene Rimbert committed
		gffread {input.gff} -w {output.mrna} -x {output.cds} -y {output.pep} -g {input.fastaref} -W -O &> {log}
		"""