diff --git a/bin/gmapRescue.sh b/bin/gmapRescue.sh index 35bc35312b458307f37b01e1786293c9a0f99c02..a449cf8613ebd8b88064960c69a75d31abbaf70d 100755 --- a/bin/gmapRescue.sh +++ b/bin/gmapRescue.sh @@ -48,12 +48,12 @@ do targetindexdir=$gmapdbdir targetgff=$rootdir'/gmap.wholeGenome.gff3' gmapexe='gmapl' - eval $gmapexe --min-identity 0.90 --min-trimmed-coverage 0.80 --ordered -n 1 -d $targetindex -D $targetindexdir -f 2 $rootdir'/query.mrna.fasta' 1> $targetgff 2> $rootdir'/gmap.wg.log' + eval $gmapexe --min-identity 0.80 --min-trimmed-coverage 0.50 --ordered -n 1 -d $targetindex -D $targetindexdir -f 2 $rootdir'/query.mrna.fasta' 1> $targetgff 2> $rootdir'/gmap.wg.log' && rm $rootdir'/query.mrna.fasta' else echo "running gmap on target" - eval gmap --ordered -n 1 -g $rootdir'/target.fasta' -f 2 $rootdir'/query.mrna.fasta' 1> $targetgff 2> $rootdir'/gmap.target.log' + eval gmap --ordered --min-identity 0.80 --min-trimmed-coverage 0.50 -n 1 -g $rootdir'/target.fasta' -f 2 $rootdir'/query.mrna.fasta' 1> $targetgff 2> $rootdir'/gmap.target.log' && rm $rootdir'/query.mrna.fasta' fi - eval gmap --ordered -n 1 -g $rootdir'/query.fasta' -f 2 $rootdir'/query.mrna.fasta' 1> $rootdir'/gmap.query.gff3' + eval gmap --ordered --min-identity 0.80 --min-trimmed-coverage 0.50 -n 1 -g $rootdir'/query.fasta' -f 2 $rootdir'/query.mrna.fasta' 1> $rootdir'/gmap.query.gff3' && rm $rootdir'/query.mrna.fasta' echo "3. indexing the query and target genomic" samtools faidx $rootdir'/query.fasta' & @@ -65,7 +65,7 @@ do echo "4. running gffread to extract the pep sequence" if [ -f "$rootdir/UNMAPPED" ] then - gffread -x $rootdir'/target.cds.fasta' -y $rootdir'/target.pep.faa' -g $wholegenomeFasta $targetgff + gffread -x $rootdir'/target.cds.fasta' -y $rootdir'/target.pep.faa' -g $wholegenomeFasta $targetgff && rm $rootdir'/target.fasta' else gffread -x $rootdir'/target.cds.fasta' -y $rootdir'/target.pep.faa' -g $rootdir'/target.fasta' $targetgff && rm $rootdir'/target.fasta' fi @@ -77,13 +77,14 @@ do then echo "RESULTS pep of gene $geneid has changed" echo $diffresults > $rootdir/CDS_CHANGED - awk 'BEGIN{OFS="\t"}{if ($3 ~ /gene/){print $0";cds=CDS_CHANGED"} else {print $0}}' $targetgff > $rootdir/'tmp.gff3' && mv $rootdir/'tmp.gff3' $targetgff + awk 'BEGIN{OFS="\t"}{if ($3 ~ /gene/){print $0";cds=CDS_CHANGED"} else {print $0}}' $targetgff > $rootdir/'tmp.gff3' && mv $rootdir/'tmp.gff3' $targetgffi else echo "RESULTS pep of gene $geneid stays the same" touch $rootdir/CDS_OK awk 'BEGIN{OFS="\t"}{if ($3 ~ /gene/){print $0";cds=CDS_OK"} else {print $0}}' $targetgff > $rootdir/'tmp.gff3' && mv $rootdir/'tmp.gff3' $targetgff fi + rm $rootdir'/target.pep.faa' $rootdir'/query.pep.faa' done diff --git a/bin/microMappingPipeline.py b/bin/microMappingPipeline.py index cdc4ceb8e3bb551446e984c489bade362e11e45f..a0502e86c7e51606b4f685472529ee363c40ad9f 100755 --- a/bin/microMappingPipeline.py +++ b/bin/microMappingPipeline.py @@ -509,7 +509,7 @@ def getCoordsFromBed (bedfile): sys.exit() #print(" bed array content ") #print(bed) - bedDict[str(bed[3])] ={'chrom': bed[0],'start':bed[1],'stop':bed[2],'mapQ': bed[4],'strand':bed[5]} + bedDict[str(bed[3])] ={'chrom': bed[0],'start':bed[1],'stop':bed[2]} numRecords=len(bedDict.keys()) sys.stdout.write(" Found %i records in Bed file %s " % (numRecords, bedfile)) @@ -542,4 +542,4 @@ if __name__== "__main__": outputTableFH=open(outputTable_file, 'w') outputBlatFH=open(outputBlatFile, 'w') - main() \ No newline at end of file + main() diff --git a/bin/recalcCoordsFromBlat.py b/bin/recalcCoordsFromBlat.py index d0b9e8c9eaddfe92cd1e79733d444c0e4c6e5336..21e716a2a2cb95105f789d49fdf6ce1de9f3f89f 100755 --- a/bin/recalcCoordsFromBlat.py +++ b/bin/recalcCoordsFromBlat.py @@ -238,8 +238,8 @@ class recalcCoords (object): if seq1 != seq2: sys.stderr.write(" error with feature {} :".format(gffInfoDict['ID'])) sys.stderr.write( "the 2 sequences are not the same: warning added into the gff file ! \n" ) - sys.stderr.write(' Query: {}'.format(seq1)) - sys.stderr.write(' Target: {}'.format(seq2)) + #sys.stderr.write(' Query: {}'.format(seq1)) + #sys.stderr.write(' Target: {}'.format(seq2)) transfertStatus=1 else: diff --git a/bin/reformatGmapWGGFF.py b/bin/reformatGmapWGGFF.py index 6b4cf00edc582af1705973a7da49d40f1296bdef..e474732cce6916fd93c8c98f47c4fa0c06910380 100755 --- a/bin/reformatGmapWGGFF.py +++ b/bin/reformatGmapWGGFF.py @@ -37,7 +37,7 @@ class reformatGmapGff (object): self.writeOutputGff() self.outputFH.close() self.errorFH.close() - + def writeOutputGff(self): for gene in self.knownGenes: print('###', file=self.outputFH) @@ -53,7 +53,7 @@ class reformatGmapGff (object): print('\t'.join(map(str, gff_gene)), file=self.outputFH) fake_gene_feature = 1 - + for isoform in self.geneIsoforms[gene]: pprint(" working with gene {} isoform {}".format(gene, isoform)) for feature in self.mrnaFeatures[isoform]: @@ -73,7 +73,7 @@ class reformatGmapGff (object): print("\t".join(map(str, feature)), file=self.errorFH) else: print("\t".join(map(str, feature)), file=self.errorFH) - + def reformatGff(self): lastmrna='' featureindex={'CDS':1, 'exon':1} @@ -89,7 +89,7 @@ class reformatGmapGff (object): # get only the gene name because here the ID corresponds to the mrna isoform which has been mapped geneId=featureId.split('.')[0] lastmrna='' - + if geneId not in self.knownGenes: self.knownGenes.append(geneId) self.geneIsoforms[geneId]=[] @@ -110,6 +110,7 @@ class reformatGmapGff (object): featureindex={'CDS':1, 'exon':1} self.knownMrna.append(mrna) parent=mrna.split('.')[0] + self.geneIsoforms[parent].append(mrna) gff_array[8] = self.setGffAttributes(attributes=gff_array[8], new={'Parent': parent, 'ID':mrna, 'Name':mrna}) self.mrnaFeatures[mrna]=[gff_array] @@ -123,9 +124,9 @@ class reformatGmapGff (object): self.geneFeatures[parent]['stop'] = gff_array[4] sys.stderr.write(" Update stop of gene {}\n".format(parent)) sys.stderr.write(" New gene length: {}\n".format(int(gff_array[4])-int(gff_array[3])+1)) - + else: - + featureId=lastmrna+'.'+featuretype.lower()+str(featureindex[featuretype]) featureParent=lastmrna gff_array[8] = self.setGffAttributes(attributes=gff_array[8], new={'ID':featureId, 'Parent': featureParent, 'Name':featureId}) @@ -136,7 +137,7 @@ class reformatGmapGff (object): numgenes=len(self.knownGenes) nummrna=len(self.knownMrna) sys.stderr.write(" Found {} genes for {} mRNA in {} GFF3 file.\n".format(numgenes, nummrna, self.options.output)) - + def setGffAttributes(self, attributes, new): attr_array=attributes.rstrip('\n').split(';') attr_dict=defaultdict() @@ -145,7 +146,7 @@ class reformatGmapGff (object): (key, val) = attr.split('=') attr_dict[key] = val ordered_keys.append(key) - + for newAttr in new.keys(): attr_dict[newAttr] = new[newAttr] if newAttr not in ordered_keys: @@ -154,7 +155,7 @@ class reformatGmapGff (object): output_array=[] for attr in ordered_keys: output_array.append(str(attr)+'='+str(attr_dict[attr])) - + return ';'.join(map(str, output_array)) diff --git a/bin/renameGffID.py b/bin/renameGffID.py index 45cc17f7288a4a84b3a5ff3ac2ee1bf139294f0a..4d2ec3af6aa38e016a3bd1fd24938904f61d31dd 100755 --- a/bin/renameGffID.py +++ b/bin/renameGffID.py @@ -71,7 +71,7 @@ class renameIDs (object): newGffLine[1]=self.options.source #pprint("New gff record for gene {}".format(newGffLine)) - + ## print gff output if self.regexLC.search(newGeneId): print('###', file=self.outputGFFFHLC) @@ -86,7 +86,7 @@ class renameIDs (object): mrnaIndex+=1 mrnaId=newGeneId+'.'+str(mrnaIndex) subfeaturesIndex ={'CDS':0, 'exon':0, 'utr5p':0, 'utr3p':0, 'ncss5p':0} - + #print(" old mrna {} changed to {}\n".format(oldId, mrnaId)) # newGffLine is an array, easier to print @@ -99,12 +99,12 @@ class renameIDs (object): #pprint("New gff record for mrna {}".format(newGffLine)) if self.regexLC.search(mrnaId): - print('###', file=self.outputGFFFHLC) + #print('###', file=self.outputGFFFHLC) print('\t'.join(map(str, newGffLine)), file=self.outputGFFFHLC) else: - print('###', file=self.outputGFFFH) + #print('###', file=self.outputGFFFH) print('\t'.join(map(str, newGffLine)), file=self.outputGFFFH) - + elif featureType in ['CDS', 'exon', 'five_prime_UTR', 'three_prime_UTR', 'non_canonical_five_prime_splice_site']: oldId=self.getFeatureAttribute(gff=line, attribute='ID') # shorten too long feature type for alias in ID @@ -129,15 +129,15 @@ class renameIDs (object): #pprint("New gff record for mrna {}".format(newGffLine)) if self.regexLC.search(mrnaId): - print('###', file=self.outputGFFFHLC) + #print('###', file=self.outputGFFFHLC) print('\t'.join(map(str, newGffLine)), file=self.outputGFFFHLC) else: - print('###', file=self.outputGFFFH) + #print('###', file=self.outputGFFFH) print('\t'.join(map(str, newGffLine)), file=self.outputGFFFH) - + else: - continue - + continue + def setGffAttribute(self, dict, gff): gff_array=gff.rstrip('\n').split('\t') gff_attributes=gff_array[8].split(';') @@ -148,7 +148,7 @@ class renameIDs (object): (key,val) = attr.split('=') attributes_dict[key]=val attribute_keys_list.append(key) - + # update the desired attribute with the new value for attr_to_change in dict.keys(): attributes_dict[attr_to_change] = dict[attr_to_change] @@ -159,7 +159,7 @@ class renameIDs (object): newAttrArray =[] for attr in attribute_keys_list: newAttrArray.append(attr+'='+attributes_dict[attr]) - + # create the entire gff record output_array.append(';'.join(newAttrArray)) return output_array @@ -173,7 +173,7 @@ class renameIDs (object): for coord in sorted(coords): oldGeneId=self.geneMapCoord[chrom][str(coord)] geneNum += step - newGeneId = self.options.prefix + self.chromosomeMap[chrom] + self.options.version + str(geneNum).zfill(6) + newGeneId = self.options.prefix + self.chromosomeMap[chrom] + self.options.version + str(geneNum).zfill(8) if self.regexLC.search(oldGeneId): newGeneId+='LC' #print(" current gene {} at coordinate {} on chrom {}: new ID = {}".format(oldGeneId, coord, chrom, newGeneId)) @@ -189,7 +189,7 @@ class renameIDs (object): print("old_id\tnew_ID", file=tmpfh) for gene in sorted(self.correspondance.keys()): print("{}\t{}".format(gene, self.correspondance[gene]), file=tmpfh) - + # save overlapping gene info to BED file with open(self.options.overlap, 'w') as tmpfh: for gene in sorted(self.overlappingGenes.keys()): @@ -199,7 +199,7 @@ class renameIDs (object): def createCoordGeneMap(self): """ - populate dictionary self.geneMapCoord with structure: + populate dictionary self.geneMapCoord with structure: defaultdict(None, {'Chr1A': defaultdict(None, {'102794': 'TraesCS1A02G000400', @@ -251,12 +251,12 @@ class renameIDs (object): self.overlappingGenes[geneId] = {'start': int(coord), 'chrom': chrom, 'stop':int(stop)} coord = str(int(coord) +1) numGenesWithSamCoord+=1 - + self.geneMapCoord[chrom][coord] = geneId numGenesAddedInDict+=1 sys.stdout.write(" Added {} in dictionary\n".format(numGenesAddedInDict)) sys.stdout.write(" {} had same coordinates on chromosomes\n".format(numGenesWithSamCoord)) - + def getFeatureAttribute(self, gff, attribute): attr_array=gff.rstrip('\n').split('\t')[8].split(';') attr_dict=defaultdict() @@ -287,7 +287,7 @@ class renameIDs (object): self.newGeneMapCoord[name] = defaultdict() sys.stdout.write(' found {} chromosome records in file {}\n'.format(len(self.chromosomeMap.keys()),self.options.map )) pprint(self.chromosomeMap) - + def getOptions(self): parser=argparse.ArgumentParser(description='Check If some genes are missing in the final annotation') parser.add_argument('-i', '--input', help='Input GFF3 file to rename ', required=True) @@ -320,4 +320,4 @@ class renameIDs (object): if __name__== "__main__": run=renameIDs() - run.main() \ No newline at end of file + run.main() diff --git a/cluster-sibi.json b/cluster-sibi.json new file mode 100644 index 0000000000000000000000000000000000000000..71528bfd24cce8f2420f8a515f0d4d673854167e --- /dev/null +++ b/cluster-sibi.json @@ -0,0 +1,22 @@ +{ + "__default__" : + { + "c" : 1, + "mem" : "4G", + "jobName" : "magatt_{rule}", + "error" : "slurm/%x-%J.log", + "output" : "slurm/%x-%J.log" + }, + "mapHomologousRegions" : + { + "mem" : "8G" + }, + "gmapRescue": + { + "mem" : "8G" + }, + "gmapIndexTarget": + { + "mem" : "64G" + } +} diff --git a/config.yaml b/config.yaml index b1b9ce31c6a9e1d4347332057fc528ad3df66668..7aa033f54dde5120afc5667fb2d0a6d2788dda84 100644 --- a/config.yaml +++ b/config.yaml @@ -1,38 +1,38 @@ -##### QUERY related files/parameters (refseqv1.0) +##### QUERY related files/parameters (refseqv2.1) # GFF annotatin to transfert -annotationQuery: 'data/IWGSC_v1.2_20200615.gff3' +annotationQuery: 'data/IWGSC_refseqv2.1/IWGSC_refseqv2.1_annotation.gff3' # feature type used for anchoring on target genome featureType: 'gene' # FASTA of the query (used to check the sequences after the coordinates are calculated on the target genome) -queryFasta: 'data/161010_Chinese_Spring_v1.0_pseudomolecules.fasta' +queryFasta: 'data/IWGSC_refseqv2.1/IWGSC_refseqv2.1_genome.fasta' # blastdb of all mrnas. used to rescue genes which have failed in the transfert using the targeted approache -blastdb: 'data/IWGSC_v1.2_20200615.transcripts.fasta' +blastdb: 'data/IWGSC_refseqv2.1/IWGSC_refseqv2.1_mrna' # map of all chromosome ids --> NEED TO BE UPDATED in another version WITH ONE ARRAY FOR THE QUERY AND ONE ARRAY FOR THE TARGET GENOME ASSEMBLY chromosomes: ['1A', '2A', '3A', '4A', '5A', '6A', '7A', '1B', '2B', '3B', '4B', '5B', '6B', '7B', '1D', '2D', '3D', '4D', '5D', '6D', '7D', 'U'] refChrom: ['chr1A', 'chr1B', 'chr1D', 'chr2A', 'chr2B', 'chr2D', 'chr3A', 'chr3B', 'chr3D', 'chr4A', 'chr4B', 'chr4D', 'chr5A', 'chr5B', 'chr5D', 'chr6A', 'chr6B', 'chr6D', 'chr7A', 'chr7B', 'chr7D', 'chrUn'] -##### TARGET related files/parameters (refseqv2.1) -targetFasta: 'data/CS_pesudo_v2.1.fa' +##### TARGET related files/parameters (julius) +targetFasta: 'data/TraesJulius_pseudo.fasta' ##### ISBP/markers related config and parameters -# BAM file of markers/ISBPs mapped on the target genome (REFSEQ v2.1) -isbpBam: 'data/iwgsc_refseqv1_ISBP.bwav2.1.bam' -# BED file of coordinates on the query genome (REFSEQ v1.0) -isbpBed: 'data/ISBP_refseqv1.bed' +# BAM file of markers/ISBPs mapped on the target genome (Julius) +isbpBam: 'data/TraesJulius_pseudo.isbps.bam' +# BED file of coordinates on the query genome (REFSEQ v2.1) +isbpBed: 'data/IWGSC_refseqv2.1/IWGSC_refseqv2.1_ISBPs.bed' # minimum mapping quality of markers on the target genome mapq: 30 # max mismatches per ISBP/marker -mismatches: 0 +mismatches: 2 ##### OUTPUT directory -results: 'results_2008' -finalPrefix: 'IWGSC_refseqv2.1_annotv2.0_200819' +results: 'results' +finalPrefix: 'TaeJulius_magatt_20november' # this file contains two columns: the first is the chromosome name as it appears in the genome.fasta of the new reference, # and the second the chromosome name as it will appear in the new gene Names chromMapID: 'data/chromosomeMappingID.csv' ##### Nomenclature for final gene IDs # used in rule renameGeneIds (rules/geneAnchoring.smk) -gff_prefix: 'TraesCS' -gff_version: '03G' -gff_source: 'IWGSC_v2.1' +gff_prefix: 'TraesJU' +gff_version: '01G' +gff_source: 'MAGATT-IWGSCCSv2.1' diff --git a/report/dag.pdf b/report/dag.pdf deleted file mode 100644 index 95ad06c3e5004b691e2a3aa878711571e413a9be..0000000000000000000000000000000000000000 Binary files a/report/dag.pdf and /dev/null differ diff --git a/report/dag.png b/report/dag.png index 4f1c95fcb1eec2a9e25f1ef2c1570a2c57469cf2..c3579d73dfe01858409dba460ce21a2ad610770c 100644 Binary files a/report/dag.png and b/report/dag.png differ diff --git a/report/rulegraph.pdf b/report/rulegraph.pdf deleted file mode 100644 index 908a62d9677887e19eaebf22490a566791760c79..0000000000000000000000000000000000000000 Binary files a/report/rulegraph.pdf and /dev/null differ diff --git a/report/rulegraph.png b/report/rulegraph.png index e5e7065525801bdc680e1e6d412aa674b709877e..b4f59aba070d375f311922c07a1e02104bfe168c 100644 Binary files a/report/rulegraph.png and b/report/rulegraph.png differ diff --git a/rules/bedtoolsClosest.smk b/rules/bedtoolsClosest.smk index 4b1fcda78740fcb39c5f4bc9954b94b6cb02f82e..87c12bad4c2508183e7ede076b73361136f52745 100644 --- a/rules/bedtoolsClosest.smk +++ b/rules/bedtoolsClosest.smk @@ -1,30 +1,32 @@ rule selectMappedISBP: message: " Select only mapped ISBPS on new refseq" - input: mapped=config['results']+'/1.filteredISBPs/sorted.bed', original=config['isbpBed'] - output: temp(config['results']+'/2.mappedISBPs/coordsOnQuery.bed') - log: config['results']+'/2.mappedISBPs/coordsOnQuery.log' + input: mapped=config['results']+'/1.filteredISBPs/{chrom}/sorted.bed', + original=config['isbpBed'] + output: config['results']+'/2.mappedISBPs/{chrom}/coordsOnQuery.bed' + log: config['results']+'/2.mappedISBPs/{chrom}/coordsOnQuery.log' shell: """ cut -f 4 {input.mapped}|fgrep -wf - {input.original} |sort -k1,1 -k2,2n 1> {output} 2> {log} + fgrep -i chrun {input.original} |sort -k1,1 -k2,2n 1>> {output} 2>> {log} """ rule keepMappedOnSameChrom: message: " Select Mapped ISBPs on same chromosome (or on unknown chromosome)" - input: isbpOnQuery=config['results']+'/2.mappedISBPs/coordsOnQuery.bed', isbpOnTarget=config['results']+"/1.filteredISBPs/sorted.bed" - output: temp(config['results']+'/2.mappedISBPs/OnSameChrom.bed') - log: config['results']+'/2.mappedISBPs/OnSameChrom.log' + input: isbpOnQuery=config['results']+'/2.mappedISBPs/{chrom}/coordsOnQuery.bed', + isbpOnTarget=config['results']+'/1.filteredISBPs/{chrom}/sorted.bed', + output: config['results']+'/2.mappedISBPs/{chrom}/OnSameChrom.bed' + log: config['results']+'/2.mappedISBPs/{chrom}/OnSameChrom.log' shell: """ - join -1 4 -2 4 <(sort -k4,4 {input.isbpOnQuery}) <(sort -k4,4 {input.isbpOnTarget})|sed "s/ /\t/g"|awk "{{if (tolower(\$2) == tolower(\$5)) {{print \$2,\$3,\$4,\$1}} }}" |sort -k1,1 -k2,2n |sed "s/ /\t/g"> {output} - join -1 4 -2 4 <(sort -k4,4 {input.isbpOnQuery}) <(sort -k4,4 {input.isbpOnTarget})|sed "s/ /\t/g"|fgrep -i chrun | awk "{{print \$2,\$3,\$4,\$1}}" |sed "s/ /\t/g">> {output} - sort -k1,1 -k2,2n {output} > toto && mv toto {output} + cut -f 4 {input.isbpOnTarget}|fgrep -wf - {input.isbpOnQuery} |cut -f -4 |sort -k1,1 -k2,2n 1> {output} 2> {log} """ rule upstreamClosest: message: " Collect closest marker upstream of genes" - input: annot=config['results']+"/1.features.bed", markers=config['results']+'/2.mappedISBPs/OnSameChrom.bed' - output: temp(config['results']+'/2.closestbed/Upstream.txt') - log: config['results']+"/2.closestbed/Upstream.log" + input: annot=config['results']+"/1.features/{chrom}.bed", + markers=config['results']+'/2.mappedISBPs/{chrom}/OnSameChrom.bed' + output: config['results']+'/2.closestbed/{chrom}/upstream.txt' + log: config['results']+"/2.closestbed/{chrom}/upstream.log" shell: """ bedtools closest -k 5 -id -D ref -io -a {input.annot} -b {input.markers} 1> {output} 2> {log} @@ -32,26 +34,27 @@ rule upstreamClosest: rule downstreamClosest: message: " Collect Downstream marker downstream of genes" - input: annot=config['results']+"/1.features.bed", markers=config['results']+'/2.mappedISBPs/OnSameChrom.bed' - output: temp(config['results']+'/2.closestbed/downstream.txt') - log: config['results']+"/2.closestbed/downstream.log" + input: annot=config['results']+"/1.features/{chrom}.bed", + markers=config['results']+'/2.mappedISBPs/{chrom}/OnSameChrom.bed' + output: config['results']+'/2.closestbed/{chrom}/downstream.txt' + log: config['results']+"/2.closestbed/{chrom}/downstream.log" shell: """ bedtools closest -k 5 -iu -D ref -io -a {input.annot} -b {input.markers} 1> {output} 2> {log} """ -rule splitPerChrom: - message: "Split data per chromosome" - input: - upstream=config['results']+'/2.closestbed/Upstream.txt', - downstream=config['results']+'/2.closestbed/downstream.txt' - output: - splitUp=temp(config['results']+'/2.closestbed/{chrom}/upstream.txt'), - splitDown=config['results']+'/2.closestbed/{chrom}/downstream.txt' - params: chromPref='TraesCS{chrom}' - log: config['results']+'/2.closestbed/{chrom}/split.log' - shell: - """ - fgrep {params.chromPref} {input.upstream} 1> {output.splitUp} 2> {log} - fgrep {params.chromPref} {input.downstream} 1> {output.splitDown} 2>> {log} - """ +# rule splitPerChrom: +# message: "Split data per chromosome" +# input: +# upstream=config['results']+'/2.closestbed/{chrom}/upstream.txt', +# downstream=config['results']+'/2.closestbed/{chrom}/downstream.txt' +# output: +# splitUp=config['results']+'/2.closestbed/{chrom}/upstream.txt', +# splitDown=config['results']+'/2.closestbed/{chrom}/downstream.txt' +# params: chromPref='TraesCS{chrom}' +# log: config['results']+'/2.closestbed/{chrom}/split.log' +# shell: +# """ +# fgrep {params.chromPref} {input.upstream} 1> {output.splitUp} 2> {log} +# fgrep {params.chromPref} {input.downstream} 1> {output.splitDown} 2>> {log} +# """ diff --git a/rules/geneAnchoring.smk b/rules/geneAnchoring.smk index adc619abc18634a0a98fbc3831fdee022a80faca..f4154b94fe1b8902fb52f1b1bc9504b304f8b2b1 100644 --- a/rules/geneAnchoring.smk +++ b/rules/geneAnchoring.smk @@ -12,7 +12,7 @@ rule mapHomologousRegions: output: allblat=config['results']+'/3.mapping/{chrom}/allBlat.csv', summary=config['results']+'/3.mapping/{chrom}/mappingSummary.csv', - mapping=temp(directory(config['results']+'/3.mapping/{chrom}/temp')) + mapping=directory(config['results']+'/3.mapping/{chrom}/temp') params: dir=directory(config['results']+'/3.mapping/{chrom}') log: config['results']+'/3.mapping/{chrom}/microMappingPipeline.log' shell: @@ -118,7 +118,8 @@ rule concatAllChromResults: message: " concat all per chromosome results" input: annot=expand(config['results']+'/5.FINAL/{chrom}/annotation.gff3',chrom=config['chromosomes']), differentChrom=expand(config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome_differentChrom.gff3',chrom=config['chromosomes']), - missing=expand(config['results']+'/5.FINAL/{chrom}/missing.txt', chrom=config['chromosomes']) + missing=expand(config['results']+'/5.FINAL/{chrom}/missing.txt', chrom=config['chromosomes']), + mapping=expand(config['results']+'/3.mapping/{chrom}/temp', chrom=config['chromosomes']) output: annot=temp(config['finalPrefix']+'tmp.gff3'), diffChrom=config['finalPrefix']+'_gmap_differentChrom.gff3', missing=config['finalPrefix']+'_missing.txt' @@ -173,7 +174,8 @@ rule generateFastaSequencesLC: rule concatblatSummary: message: " Concat all Blat summary" - input:blat=expand(config['results']+'/3.mapping/{chrom}/allBlat.csv', chrom=config['chromosomes']) + input:blat=expand(config['results']+'/3.mapping/{chrom}/allBlat.csv', chrom=config['chromosomes']), + mapping=expand(config['results']+'/3.mapping/{chrom}/temp', chrom=config['chromosomes']) output: config['finalPrefix']+'_blatSummary.csv' shell: """ @@ -182,7 +184,8 @@ rule concatblatSummary: rule concatAnchoringSummary: message: " Concat all Anchoring summary" - input:anchoring=expand(config['results']+'/3.mapping/{chrom}/mappingSummary.csv', chrom=config['chromosomes']) + input:anchoring=expand(config['results']+'/3.mapping/{chrom}/mappingSummary.csv', chrom=config['chromosomes']), + mapping=expand(config['results']+'/3.mapping/{chrom}/temp', chrom=config['chromosomes']) output: config['finalPrefix']+'_anchoringSummary.csv' shell: """ diff --git a/rules/preprocessGenomes.smk b/rules/preprocessGenomes.smk index 3b9d80b1d44161498847af0e733128a2f49ab6e0..3e21db8e965e7dc36fcd0811889e917100bf992b 100644 --- a/rules/preprocessGenomes.smk +++ b/rules/preprocessGenomes.smk @@ -4,10 +4,7 @@ rule grepGffFeature: params: config['featureType'] output: temp(config['results']+"/1.features.bed") log: config['results']+"/1.grepGffFeature.log" - shell: - """ - bin/gff2bed.sh {params} {input} 1> {output} 2> {log} - """ + shell: "bin/gff2bed.sh {params} {input} 1> {output} 2> {log}" rule splitGffPerChrom: message: "Split Gff Features per chromosome: current is {wildcards.chrom}" @@ -15,10 +12,7 @@ rule splitGffPerChrom: output: temp(config['results']+"/1.features/{chrom}.bed") log: config['results']+"/1.features/{chrom}.fgrep.log" params: "chr{chrom}" - shell: - """ - fgrep -i {params} {input} 1> {output} 2> {log} - """ + shell: "fgrep -i {params} {input} 1> {output} 2> {log}" rule indexQuery: message: " Indexing Query fasta file using samtools faidx" diff --git a/rules/preprocessISBP.smk b/rules/preprocessISBP.smk index 0ba5a9737925249ea0adbfe1205630e07e745f56..705fe87833c4ef442d75a03aa8be351e31b1535b 100644 --- a/rules/preprocessISBP.smk +++ b/rules/preprocessISBP.smk @@ -9,30 +9,21 @@ rule filterBam: rule bam2bed: message: "Convert Filtered BAM file into BED" input: config['results']+"/1.filteredISBPs.bam" - output: temp(config['results']+"/1.filteredISBPs.bed") + output: config['results']+"/1.filteredISBPs.bed" log: config['results']+"/2.bam2bed.log" - shell: "bamToBed -i {input} 1> {output} 2> {log}" + shell: "bamToBed -i {input} |sort -k1,1 -k2,2n 1> {output} 2> {log}" -rule sortBed: - message: "Sort ISBPs bed file" - input: config['results']+"/1.filteredISBPs.bed" - output: temp(config['results']+"/1.filteredISBPs/sorted.bed") - log: config['results']+'/1.sortBed.log' - shell: "sort -k1,1 -k2,2n {input} 1> {output} 2> {log}" rule dumpISBPsID: message: "Dump ISBPs IDs" - input: config['results']+"/1.filteredISBPs/sorted.bed" - output: temp(config['results']+"/1.filteredISBPs.ids") + input: config['results']+"/1.filteredISBPs.bed" + output: config['results']+"/1.filteredISBPs.ids" shell: " cut -f 4 {input} > {output}" rule splitISBP: message: "Split isbps per chromosome" - input: config['results']+"/1.filteredISBPs/sorted.bed" - output: temp(config['results']+"/1.filteredISBPs/{chrom}/sorted.bed") + input: config['results']+"/1.filteredISBPs.bed" + output: config['results']+"/1.filteredISBPs/{chrom}/sorted.bed" log: config['results']+"/1.filteredISBPs/{chrom}/sorted.log" params: 'Chr{chrom}' - shell: - """ - fgrep -i {params} {input} 1> {output} 2> {log} - """ + shell: "fgrep -i {params} {input} 1> {output} 2> {log}"