diff --git a/Snakefile b/Snakefile index 3271b8b9568d36a6d2e6dc9cad39060c35b94d37..0fd66859fa1e927ca9d183d1ebf4d56458ec5762 100644 --- a/Snakefile +++ b/Snakefile @@ -30,7 +30,7 @@ rule grepGffFeature: rule splitGffPerChrom: message: "Split Gff Features per chromosome: current is {wildcards.chrom}" input: config['results']+"/1.features.bed" - output: config['results']+"/1.features/{chrom}.bed" + output: temp(config['results']+"/1.features/{chrom}.bed") log: config['results']+"/1.features/{chrom}.fgrep.log" params: "chr{chrom}" shell: diff --git a/bin/gmapRescue.sh b/bin/gmapRescue.sh index 38cb82f10958d0b66a74d9c317ad816f15a3759a..86200010621579e5cbb85701b904f6eda9e29eac 100755 --- a/bin/gmapRescue.sh +++ b/bin/gmapRescue.sh @@ -1,14 +1,4 @@ #!/bin/bash -##SBATCH --array=0-7 -##SBATCH -n1 -##SBATCH --mail-user=helene.rimbert@inrae.fr -##SBATCH --mail-type=ALL - -# for use with slurm -#chroms=($(echo TraesCS{1..7} TraesCSU)) -#chrom=${chroms[$SLURM_ARRAY_TASK_ID]} - -# get the list of query and target file for gmap inputDir=$1 blastdb=$2 @@ -16,21 +6,18 @@ gmapdb=$3 gmapdbdir=$4 outputRescueGff=$5 outputRescueWholeGenomeGff=$6 +wholegenomeFasta=$7 querylist=($(find $inputDir -type f -name 'query.fasta')) targetlist=($(find $inputDir -type f -name 'target.fasta')) numquery=${#querylist[@]} numtarget=${#targetlist[@]} -#blastdb='/banks/blast/IWGSCv1.1_repr_mrna' echo "STARTING AT `date` FOR DATASET $inputDir" echo "found $numquery query fasta file" echo "found $numtarget target fasta file" -echo "Check for genes with invilid isbp pairs" -fgrep -h 'Could not identify a pair of marker to use to anchor' *.out |cut -d ' ' -f 15|cut -d '.' -f 1|sort > 3_mapping_wrongISBP_pair.txt - for i in "${querylist[@]}"; do rootdir=`dirname $i` @@ -40,19 +27,7 @@ do echo " gene is $geneid" echo " datadir is $rootdir" - echo "1. creating gmap db for dataset $rootdir" - echo "================================================" - gmap_build -D $rootdir -d query $rootdir'/query.fasta' & - pid=$! - - if [ -f "$rootdir/GMAPRESCUE" ] - then - gmap_build -D $rootdir -d target $rootdir'/target.fasta' & - pid2=$! - fi - wait $pid $pid2 - - echo "2. extract all mrnas" + echo "1. extract all mrnas" echo "================================================" # on dump d'abord tous les IDs de la banque blast ('-outfmt %i)'), # puis on ne garde que les mrna du gène en cours de traitement, @@ -60,7 +35,7 @@ do blastdbcmd -db $blastdb -entry all -outfmt %i |fgrep -w $geneid |blastdbcmd -db $blastdb -entry_batch - > $rootdir'/query.mrna.fasta' - echo "3. run gmap on target and query" + echo "2. run gmap on target and query" echo "================================================" targetindex='target' targetindexdir=$rootdir @@ -70,48 +45,49 @@ do # check if we use as target the sub sequence or the entire genome if [ -f "$rootdir/UNMAPPED" ] then - echo " Gene $geneid has status UNMAPPED: mapping on the whole genome target" + echo " Gene $geneid has status UNMAPPED: mapping on the whole genome" targetindex=$gmapdb targetindexdir=$gmapdbdir targetoutputGff=$rootdir'/gmap.wholeGenome.gff3' gmapexe='gmapl' + eval $gmapexe --min-identity 0.90 --min-trimmed-coverage 0.80 --ordered -n 1 -d $targetindex -D $targetindexdir -f 2 $rootdir'/query.mrna.fasta' 1> $rootdir'/gmap.wholeGenome.gff3' + else + echo "running gmap on target" + eval gmap --ordered -n 1 -g $rootdir'/target.fasta' -f 2 $rootdir'/query.mrna.fasta' > $rootdir'/gmap.target.gff3' fi - eval $gmapexe -n 1 -d $targetindex -D $targetindexdir -f 2 $rootdir'/query.mrna.fasta' > $targetoutputGff & + eval gmap --ordered -n 1 -g $rootdir'/query.fasta' -f 2 $rootdir'/query.mrna.fasta' > $rootdir'/gmap.query.gff3' + + echo "3. indexing the query and target genomic" + echo "================================================" + samtools faidx $rootdir'/query.fasta' & pid=$! - eval gmap -n 1 -d query -D $rootdir -f 2 $rootdir'/query.mrna.fasta' > $rootdir'/gmap.query.gff3' & + samtools faidx $rootdir'/target.fasta' & pid2=$! wait $pid $pid2 - rm -rf $rootdir'/query' $rootdir'/target' - - echo "4. indexing the query and target genomic" + echo "4. running gffread to extract the pep sequence" echo "================================================" - samtools faidx $rootdir'/query.fasta' - samtools faidx $rootdir'/target.fasta' - - - - if [ -f "$rootdir/GMAPRESCUE" ] + if [ -f "$rootdir/UNMAPPED" ] then - echo "5. running gffread to extract the pep sequence" - echo "================================================" + gffread -x $rootdir'/target.cds.fasta' -y $rootdir'/target.pep.faa' -g $wholegenomeFasta $rootdir'/gmap.wholeGenome.gff3' + else gffread -x $rootdir'/target.cds.fasta' -y $rootdir'/target.pep.faa' -g $rootdir'/target.fasta' $rootdir'/gmap.target.gff3' - gffread -x $rootdir'/query.cds.fasta' -y $rootdir'/query.pep.faa' -g $rootdir'/query.fasta' $rootdir'/gmap.query.gff3' + fi + gffread -x $rootdir'/query.cds.fasta' -y $rootdir'/query.pep.faa' -g $rootdir'/query.fasta' $rootdir'/gmap.query.gff3' - echo "6. running diff to check if the pep sequences are different or not" - diffresult=$(diff $rootdir'/target.pep.faa' $rootdir'/query.pep.faa') - if [ "$diffresult" != "" ] - then - echo "RESULTS pep of gene $geneid has changed" - echo $diffresults > $rootdir/CDS_CHANGED - else - echo "RESULTS pep of gene $geneid stays the same" - touch $rootdir/CDS_OK - fi + echo "5. running diff to check if the pep sequences are different or not" + echo "================================================" + diffresult=$(diff $rootdir'/target.pep.faa' $rootdir'/query.pep.faa') + if [ "$diffresult" != "" ] + then + echo "RESULTS pep of gene $geneid has changed" + echo $diffresults > $rootdir/CDS_CHANGED + awk 'BEGIN{OFS="\t"}{if ($3 ~ /gene/){print $0";cds=CDS_CHANGED"} else {print $0}}' $rootdir'/gmap.wholeGenome.gff3' > $rootdir/'tmp.gff3' && mv $rootdir/'tmp.gff3' $rootdir'/gmap.wholeGenome.gff3' else - echo " rescue done on the whole genome: we do not check for pep sequences" - touch $rootdir/CDS_UNKNOWN + echo "RESULTS pep of gene $geneid stays the same" + touch $rootdir/CDS_OK + awk 'BEGIN{OFS="\t"}{if ($3 ~ /gene/){print $0";cds=CDS_OK"} else {print $0}}' $rootdir'/gmap.wholeGenome.gff3' > $rootdir/'tmp.gff3' && mv $rootdir/'tmp.gff3' $rootdir'/gmap.wholeGenome.gff3' fi done diff --git a/bin/microMappingPipeline.py b/bin/microMappingPipeline.py index f9411635f9784d238c092bb5a83c9aee07915a22..33c24f9918c7a0c6f3bd1c154df96846bfbb2ecb 100755 --- a/bin/microMappingPipeline.py +++ b/bin/microMappingPipeline.py @@ -464,8 +464,10 @@ def checkPerfectHit(blatresult, workingDir, maxhit): def runBlat(db,query,blatResult, outFormat): cmd='blat -noHead -extendThroughN -out='+outFormat+' '+db+' '+query+' '+blatResult + #cmd='blastn -outfmt "6 std qcovs slen sframe" -perc_identity 95 -qcov_hsp_perc 80 -query '+query+' -subject '+db print(' blat command to run: %s'% str(cmd)) os.system('blat -noHead -extendThroughN -out={} {} {} {}'.format(outFormat,db,query,blatResult)) + #os.system('blastn -outfmt "6 std qcovs slen sframe" -perc_identity 95 -qcov_hsp_perc 80 -query {} -subject {} -out {}'.format(query, db, blatResult)) return blatResult def getFastaSeq(fasta,chrom,start,stop): diff --git a/config.yaml b/config.yaml index 7f1703ce656bf341f60a92678088907f6fa1fd87..b1b9ce31c6a9e1d4347332057fc528ad3df66668 100644 --- a/config.yaml +++ b/config.yaml @@ -25,8 +25,8 @@ mapq: 30 mismatches: 0 ##### OUTPUT directory -results: 'results_200615' -finalPrefix: 'IWGSC_refseqv2.1_annotv2.0_200617' +results: 'results_2008' +finalPrefix: 'IWGSC_refseqv2.1_annotv2.0_200819' # this file contains two columns: the first is the chromosome name as it appears in the genome.fasta of the new reference, # and the second the chromosome name as it will appear in the new gene Names chromMapID: 'data/chromosomeMappingID.csv' @@ -35,4 +35,4 @@ chromMapID: 'data/chromosomeMappingID.csv' # used in rule renameGeneIds (rules/geneAnchoring.smk) gff_prefix: 'TraesCS' gff_version: '03G' -gff_source: 'IWGSC_v2.1' \ No newline at end of file +gff_source: 'IWGSC_v2.1' diff --git a/rules/bedtoolsClosest.smk b/rules/bedtoolsClosest.smk index 49466cadc9071b8c4d922c23d80759d6529d5392..3dcf9a2ab7432642f1e8e16da9f21647cfe58509 100644 --- a/rules/bedtoolsClosest.smk +++ b/rules/bedtoolsClosest.smk @@ -1,8 +1,8 @@ rule selectMappedISBP: message: " Select only mapped ISBPS on new refseq" - input: mapped=config['results']+'/1.filteredISBPs.sorted.bed', original=config['isbpBed'] - output: config['results']+'/2.mappedISBPs_coordsOnQuery.bed' - log: config['results']+'/2.mappedISBPs_coordsOnQuery.log' + input: mapped=config['results']+'/1.filteredISBPs/sorted.bed', original=config['isbpBed'] + output: temp(config['results']+'/2.mappedISBPs/coordsOnQuery.bed') + log: config['results']+'/2.mappedISBPs/coordsOnQuery.log' shell: """ cut -f 4 {input.mapped}|fgrep -wf - {input.original} |sort -k1,1 -k2,2n 1> {output} 2> {log} @@ -10,9 +10,9 @@ rule selectMappedISBP: rule keepMappedOnSameChrom: message: " Select Mapped ISBPs on same chromosome (or on unknown chromosome)" - input: isbpOnQuery=config['results']+'/2.mappedISBPs_coordsOnQuery.bed', isbpOnTarget=config['results']+"/1.filteredISBPs.sorted.bed" - output: config['results']+'/2.isbpMappedOnSameChrom.bed' - log: config['results']+'/2.isbpMappedOnSameChrom.log' + input: isbpOnQuery=config['results']+'/2.mappedISBPs/coordsOnQuery.bed', isbpOnTarget=config['results']+"/1.filteredISBPs/sorted.bed" + output: temp(config['results']+'/2.mappedISBPs/OnSameChrom.bed') + log: config['results']+'/2.mappedISBPs/OnSameChrom.log' shell: """ join -1 4 -2 4 <(sort -k4,4 {input.isbpOnQuery}) <(sort -k4,4 {input.isbpOnTarget})|sed "s/ /\t/g"|awk "{{if (tolower(\$2) == tolower(\$5)) {{print \$2,\$3,\$4,\$1}} }}" |sort -k1,1 -k2,2n |sed "s/ /\t/g"> {output} @@ -22,9 +22,9 @@ rule keepMappedOnSameChrom: rule upstreamClosest: message: " Collect closest marker upstream of genes" - input: annot=config['results']+"/1.features.bed", markers=config['results']+'/2.isbpMappedOnSameChrom.bed' - output: temp(config['results']+'/2.closestbedUpstream.txt') - log: config['results']+"/2.closestbedUpstream.log" + input: annot=config['results']+"/1.features.bed", markers=config['results']+'/2.mappedISBPs/OnSameChrom.bed' + output: temp(config['results']+'/2.closestbed/Upstream.txt') + log: config['results']+"/2.closestbed/Upstream.log" shell: """ bedtools closest -k 5 -id -D ref -io -a {input.annot} -b {input.markers} 1> {output} 2> {log} @@ -32,9 +32,9 @@ rule upstreamClosest: rule downstreamClosest: message: " Collect Downstream marker downstream of genes" - input: annot=config['results']+"/1.features.bed", markers=config['results']+'/2.isbpMappedOnSameChrom.bed' - output: temp(config['results']+'/2.downstreamClosest.txt') - log: config['results']+"/2.downstreamClosest.log" + input: annot=config['results']+"/1.features.bed", markers=config['results']+'/2.mappedISBPs/OnSameChrom.bed' + output: temp(config['results']+'/2.closestbed/downstream.txt') + log: config['results']+"/2.closestbed/downstream.log" shell: """ bedtools closest -k 5 -iu -D ref -io -a {input.annot} -b {input.markers} 1> {output} 2> {log} @@ -43,13 +43,13 @@ rule downstreamClosest: rule splitPerChrom: message: "Split data per chromosome" input: - upstream=config['results']+'/2.closestbedUpstream.txt', - downstream=config['results']+'/2.downstreamClosest.txt' + upstream=config['results']+'/2.closestbed/Upstream.txt', + downstream=config['results']+'/2.closestbed/downstream.txt' output: - splitUp=config['results']+'/2.closestbed_split/{chrom}.upstream.txt', - splitDown=config['results']+'/2.closestbed_split/{chrom}.downstream.txt' + temp(splitUp=config['results']+'/2.closestbed/{chrom}/upstream.txt'), + splitDown=config['results']+'/2.closestbed/{chrom}/downstream.txt' params: chromPref='TraesCS{chrom}' - log: config['results']+'/2.closestbed_split/{chrom}.split.log' + log: config['results']+'/2.closestbed/{chrom}/split.log' shell: """ fgrep {params.chromPref} {input.upstream} 1> {output.splitUp} 2> {log} diff --git a/rules/geneAnchoring.smk b/rules/geneAnchoring.smk index 816c2bff6f280d81f0423cb53472f65afe4037dc..ea0d2ff0671d06d582bad2df1d108130339cdf98 100644 --- a/rules/geneAnchoring.smk +++ b/rules/geneAnchoring.smk @@ -2,13 +2,13 @@ rule mapHomologousRegions: message: " mapping homologous regions of both references using ISBPs markers as anchors for chromosome {wildcards.chrom}" input: #closestMarkers=config['results']+'/2.mergedClosestMarkers.txt', - marker5prime=config['results']+'/2.closestbed_split/{chrom}.upstream.txt', - marker3prime=config['results']+'/2.closestbed_split/{chrom}.downstream.txt', + marker5prime=config['results']+'/2.closestbed/{chrom}/upstream.txt', + marker3prime=config['results']+'/2.closestbed/{chrom}/downstream.txt', refQuery=config['queryFasta'], faiQuery=config['queryFasta']+'.fai', refTarget=config['targetFasta'], faiTarget=config['targetFasta']+'.fai', - markersOnTarget=config['results']+"/1.filteredISBPs.sorted.{chrom}.bed" + markersOnTarget=config['results']+"/1.filteredISBPs/{chrom}/sorted.bed" output: allblat=config['results']+'/3.mapping/{chrom}/allBlat.csv', summary=config['results']+'/3.mapping/{chrom}/mappingSummary.csv' @@ -38,7 +38,7 @@ rule recalcBlatMapped: rule gtCleanBlatGff: message: " Clean the gff file recalculated based on Blat fine mapping for chromosome {wildcards.chrom}" input: config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget.gff3' - output: config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget-clean.gff3' + output: temp(config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget-clean.gff3') log: config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget-clean.log' shell: """ @@ -47,21 +47,21 @@ rule gtCleanBlatGff: rule gmapRescue: message: " Rescue anchoring of failed genes with gmap for chrom {wildcards.chrom}" - input: blat=config['results']+'/3.mapping/{chrom}/allBlat.csv', gmapindex=config['results']+"/target_gmapindex" + input: blat=config['results']+'/3.mapping/{chrom}/allBlat.csv', wholeGenomeFasta=config['targetFasta'] output: txt=config['results']+'/4.gmapRescue/{chrom}.txt', - targetgff=config['results']+'/4.gmapRescue/{chrom}.target.gff3', - wholegenomegff=config['results']+'/4.gmapRescue/{chrom}.wholeGenome.gff3' - params: blastdb=config['blastdb'], anchoringDir=config['results']+'/3.mapping/{chrom}/', gmapIndexDir=config['results'] + targetgff=temp(config['results']+'/4.gmapRescue/{chrom}.target.gff3'), + wholegenomegff=temp(config['results']+'/4.gmapRescue/{chrom}.wholeGenome.gff3') + params: blastdb=config['blastdb'], anchoringDir=config['results']+'/3.mapping/{chrom}/', gmapIndexDir=config['results'], gmapindex="target_gmapindex" log: config['results']+'/4.gmapRescue/{chrom}.log' shell: """ - bin/gmapRescue.sh {params.anchoringDir} {params.blastdb} {input.gmapindex} {params.gmapIndexDir} {output.targetgff} {output.wholegenomegff} 1> {output.txt} 2> {log} + bin/gmapRescue.sh {params.anchoringDir} {params.blastdb} {params.gmapindex} {params.gmapIndexDir} {output.targetgff} {output.wholegenomegff} {input.wholeGenomeFasta} 1> {output.txt} 2> {log} """ rule recalcGmapRescue: message: "Recalc the coordinates of the GMAP on target GFF3 files for chromosome {wildcards.chrom}" input: config['results']+'/4.gmapRescue/{chrom}.target.gff3', - output: config['results']+'/4.recalcGmap/{chrom}/recalc.gff3' + output: temp(config['results']+'/4.recalcGmap/{chrom}/recalc.gff3') params: anchoringDir=config['results']+'/3.mapping/{chrom}', bindir=os.getcwd() log: config['results']+'/4.recalcGmap/{chrom}/recalc.log' @@ -74,8 +74,8 @@ rule saveGmapWG: message: "Save the GMAP results of genes mapped on Whole Genome" input: gff=config['results']+'/4.gmapRescue/{chrom}.wholeGenome.gff3', map=config['chromMapID'] - output: gff=config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome.gff3', - gfferror=config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome_differentChrom.gff3' + output: gff=temp(config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome.gff3'), + gfferror=tmp(config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome_differentChrom.gff3') log: config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome.log' shell: """ @@ -89,7 +89,7 @@ rule mergeFinalGff3: input: blat=config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget-clean.gff3', rescue=config['results']+'/4.recalcGmap/{chrom}/recalc.gff3', wg=config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome.gff3' - output: annot=config['results']+'/5.FINAL/{chrom}/annotation.gff3' + output: annot=temp(config['results']+'/5.FINAL/{chrom}/annotation.gff3') log: annot=config['results']+'/5.FINAL/{chrom}/annotation.log' shell: """ @@ -99,7 +99,7 @@ rule checkMissing: message: "" input: gff=config['results']+'/5.FINAL/{chrom}/annotation.gff3', ref=config['results']+"/1.features/{chrom}.bed" - output: config['results']+'/5.FINAL/{chrom}/missing.txt' + output: temp(config['results']+'/5.FINAL/{chrom}/missing.txt') log: config['results']+'/5.FINAL/{chrom}/missing.log' shell: """ diff --git a/rules/preprocessISBP.smk b/rules/preprocessISBP.smk index c4542f124140741f31282a93e29d350cd27bef24..83493d6437f828682c22ccb66e38d27d86d81577 100644 --- a/rules/preprocessISBP.smk +++ b/rules/preprocessISBP.smk @@ -16,21 +16,21 @@ rule bam2bed: rule sortBed: message: "Sort ISBPs bed file" input: config['results']+"/1.filteredISBPs.bed" - output: config['results']+"/1.filteredISBPs.sorted.bed" + output: temp(config['results']+"/1.filteredISBPs/sorted.bed") log: config['results']+'/1.sortBed.log' shell: "sort -k1,1 -k2,2n {input} 1> {output} 2> {log}" rule dumpISBPsID: message: "Dump ISBPs IDs" - input: config['results']+"/1.filteredISBPs.sorted.bed" + input: config['results']+"/1.filteredISBPs/sorted.bed" output: temp(config['results']+"/1.filteredISBPs.ids") shell: " cut -f 4 {input} > {output}" rule splitISBP: message: "Split isbps per chromosome" input: config['results']+"/1.filteredISBPs.sorted.bed" - output: config['results']+"/1.filteredISBPs.sorted.{chrom}.bed" - log: config['results']+"/1.filteredISBPs.splitPerChrom{chrom}.log" + output: temp(config['results']+"/1.filteredISBPs/{chrom}/sorted.bed") + log: config['results']+"/1.filteredISBPs/{chrom}/sorted.log" params: 'Chr{chrom}' shell: """