From 21fe79681c1491dd6838b6b152dc935580bc9eb2 Mon Sep 17 00:00:00 2001
From: mariabernard <maria.bernard@jouy.inra.fr>
Date: Wed, 19 Jun 2019 12:57:24 +0200
Subject: [PATCH] 1000RNASEQ chicken ASE: correct wildcard constraint, rmdup
 command line and add fasta index and dict as inputs in SplitNCigarReads

---
 Snakemake/1000RNASeq_chicken/ASE/rules/STAR.smk | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/Snakemake/1000RNASeq_chicken/ASE/rules/STAR.smk b/Snakemake/1000RNASeq_chicken/ASE/rules/STAR.smk
index 4ffd456..315f8d7 100644
--- a/Snakemake/1000RNASeq_chicken/ASE/rules/STAR.smk
+++ b/Snakemake/1000RNASeq_chicken/ASE/rules/STAR.smk
@@ -8,7 +8,7 @@ Align RNASeq reads on masked genome
 '''
 
 wildcard_constraints:
-    properlyPaired_rmDup = '_pp_|_'
+    properlyPaired = '_pp_|_'
 
 STAR_INDEX_FILE=['genomeParameters.txt', 'chrName.txt', 'chrLength.txt', 'chrStart.txt', 'chrNameLength.txt', 'exonGeTrInfo.tab', 'geneInfo.tab', 'transcriptInfo.tab', 'exonInfo.tab', 'sjdbList.fromGTF.out.tab', 'sjdbInfo.txt', 'sjdbList.out.tab', 'Genome', 'SA', 'SAindex']
 
@@ -157,18 +157,20 @@ def get_properly_paired_bam(wildcards):
             return "Results/STAR_Aln_2/" + wildcards.sample + "_rg_genomic.bam"
 
 
+# Note: Picard MarkDuplicates uses the READ_NAME_REGEX and OPTICAL_DUPLICATE_PIXEL_DISTANCE options as the primary methods to identify and differentiate duplicate types. READ_NAME_REGEX is set to null in the command below to skip optical duplicate detection, as suggested for RNA-seq or other data where duplicate sets are extremely large and estimating library complexity is not an aim.
+
 rule RmDuplicates:
     input:
         bam = get_properly_paired_bam
     output:
-        bam = "Results/STAR_Aln_2/{sample}_rg_genomic{properlyPaired}.bam",
+        bam = "Results/STAR_Aln_2/{sample}_rg_genomic{properlyPaired}rmdup.bam",
         metrics = "Results/STAR_Aln_2/{sample}_rg_genomic{properlyPaired}.metrics.txt",
     params:
         mem=str(int(config["rmdup"]["mem"].replace("G","")) -4 )+"G" if int(config["rmdup"]["mem"].replace("G","")) -4 > 1 else config["rmdup"]["mem"] ,
         jar=find_jar("picard.jar")
     shell:
         """
-        java -Xmx{params.mem} -jar {params.jar} MarkDuplicates  READ_NAME_REGEX=null I={input.bam} O={output.bam} REMOVE_DUPLICATES=true CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT M={output.metrics}
+        java -Xmx{params.mem} -jar {params.jar} MarkDuplicates  CREATE_INDEX=false READ_NAME_REGEX=null I={input.bam} O={output.bam} REMOVE_DUPLICATES=true VALIDATION_STRINGENCY=SILENT M={output.metrics}
         """
 
 rule remove_multimap:
@@ -184,9 +186,11 @@ rule remove_multimap:
 rule SplitNCigarReads:
     input:
         fasta = 'Results/genomeMasked/' + os.path.splitext(os.path.basename(config['fasta_ref']))[0] + '_masked.fa', 
+        idx = 'Results/genomeMasked/' + os.path.splitext(os.path.basename(config['fasta_ref']))[0] + '_masked.fa.fai', 
+        dict = 'Results/genomeMasked/' + os.path.splitext(os.path.basename(config['fasta_ref']))[0] + '_masked.dict', 
         bam = "Results/STAR_Aln_2/{sample}_rg_genomic{properlyPaired}rmdup_uniq.bam"
     output:
-        bam = "Results/STAR_Aln_2/{sample}_rg_genomic{properlyPairedrmdup_uniq_split.bam"
+        bam = "Results/STAR_Aln_2/{sample}_rg_genomic{properlyPaired}rmdup_uniq_split.bam"
     params:
         mem=config["SplitNCigarReads"]["mem"],
         jar=find_jar("GenomeAnalysisTK.jar")
-- 
GitLab