Commit 5e59c856 authored by mariabernard

Merge branch 'develop'

parents 9ae7f1ff 33ef13bc
Showing changed files with 1017 additions and 337 deletions
......@@ -28,6 +28,7 @@ def check_param(rule,resource):
rule_resources = {
'indexFasta' : ['mem'],
'FS_QD_clust' : ['mem'],
'vcf_filtration' : ['cpu'],
'SelectVariants' : ['mem'],
'STAR_Index_1' : ['cpu'],
'STAR_Aln_1' : ['cpu'],
......@@ -150,7 +151,7 @@ final_outputs.append('Results_ASE/vcfFiltration/' + os.path.basename(config['vcf
final_outputs.append('Results_ASE/vcfFiltration/' + os.path.basename(config['vcf']).replace('.vcf.gz','_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_GT.tsv'))
final_outputs.append('Results_ASE/vcfFiltration/' + os.path.basename(config['vcf']).replace('.vcf.gz','_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_DP.tsv'))
final_outputs.append('Results_ASE/vcfFiltration/' + os.path.basename(config['vcf']).replace('.vcf.gz','_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_AD.tsv'))
final_outputs.append('Results_ASE/Summary/' + os.path.basename(config['vcf']).replace('.vcf.gz','') + '_VariantFiltration_summary.txt')
final_outputs.append('Results_ASE/Summary/VariantFiltration_summary.txt')
# counter
for sample in table["sample_name"].unique():
......@@ -172,7 +173,7 @@ for sample in table["sample_name"].unique():
# 'Results_ASE/vcfFiltration/variants_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_GT.tsv',
# 'Results_ASE/vcfFiltration/variants_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_DP.tsv',
# 'Results_ASE/vcfFiltration/variants_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_AD.tsv',
# 'Results_ASE/Summary/variants_VariantFiltration_summary.txt',
# 'Results_ASE/Summary/VariantFiltration_summary.txt',
# 'Results_ASE/ASEReadCounter/samplePE_ASEReadCounter.csv',
# 'Results_ASE/phASER/samplePE_phASER.allelic_counts.txt',
# 'Results_ASE/phASER_gene_ae/samplePE_phASER.gene_ae.txt',
......
......@@ -42,12 +42,6 @@ data_dir : data
SNP_filter : True
# quality trimming threshold used in trimgalore to remove low quality bases.
trimming_quality : 15
# minimum calling depth
depth : 10
# minimum percentage (between 0 and 1) of sample with known genotype
GTpopCR_th : 0.50
# minimum percentage (between 0 and 1) of sample with known genotype with DP > 5
DPgt05rPopCR_th : 0.20
# minimum base quality
baseQuality : 20
# minimum mapping quality
......
......@@ -4,28 +4,36 @@ __default__:
time : "4:00:00"
partition : workq
trim_pe :
time : "24:00:00"
trim_se :
time : "24:00:00"
FS_QD_clust:
mem : "20G"
SelectVariants:
mem : "20G"
VCF_for_ASE_and_table:
vcf_filtration:
cpu : 10
mem : "16G"
time : "96:00:00"
partition : unlimitq
partition : workq
STAR_Index_1:
mem : "18G"
mem : "50G"
cpu : 8
time : "24:00:00"
STAR_Aln_1:
mem : "12G"
mem : "30G"
cpu : 8
time : "24:00:00"
STAR_Aln_2:
mem : "12G"
mem : "30G"
cpu : 8
time : "24:00:00"
......@@ -37,9 +45,11 @@ RmDuplicates:
SplitNCigarReads:
mem : "30G"
time : "8:00:00"
phASER:
mem : "20G"
cpu : 8
mem : "130G"
phASER_expr_matrix:
cpu : 8
......
......@@ -51,18 +51,9 @@ rule ASEReadCounter:
baseQuality = config['baseQuality']
shell:
"""
gatk --java-options "-Xmx{params.mem}" ASEReadCounter -R {input.ref} -O {output} -I {input.bam} --variant {input.vcf} --min-mapping-quality {params.mappingQuality} --min-base-quality {params.baseQuality}
gatk --java-options "-Xmx{params.mem}" ASEReadCounter -R {input.ref} -O {output} -I {input.bam} --tmp-dir Results_ASE/ASEReadCounter --variant {input.vcf} --min-mapping-quality {params.mappingQuality} --min-base-quality {params.baseQuality}
"""
rule gtf2bed:
input:
gtf = config['gtf_ref']
output:
bed = 'Results_ASE/ref_bed/' + os.path.basename(config['gtf_ref']).replace('.gtf', '.bed')
script:
"../script/gtf2bed.py"
rule phASER:
input:
vcf = 'Results_ASE/vcfFiltration/' + os.path.basename(config['vcf']).replace('.vcf.gz','_FsQdBiall.vcf.gz'),
......@@ -80,9 +71,19 @@ rule phASER:
idSeparator = '_' if config['id_separator'] == '' else config['id_separator']
shell:
"""
python2.7 {config[bin_dir]}/phaser.py --vcf {input.vcf} --bam {input.bam} --sample {wildcards.sample} --threads {threads} --o Results_ASE/Counter/{wildcards.sample}_phASER --paired_end {params.paired_option} --baseq {params.baseQuality} --mapq {params.mappingQuality} --id_separator {params.idSeparator}
python2.7 {config[bin_dir]}/phaser.py --vcf {input.vcf} --bam {input.bam} --sample {wildcards.sample} --threads {threads} --o Results_ASE/phASER/{wildcards.sample}_phASER --paired_end {params.paired_option} --baseq {params.baseQuality} --mapq {params.mappingQuality} --id_separator {params.idSeparator}
"""
rule gtf2bed:
input:
gtf = config['gtf_ref']
output:
bed = 'Results_ASE/ref_bed/' + os.path.basename(config['gtf_ref']).replace('.gtf', '.bed')
script:
"../script/gtf2bed.py"
rule phASER_gene_ae:
input:
hap_count = "Results_ASE/phASER/{sample}_phASER.haplotypic_counts.txt",
......@@ -93,7 +94,7 @@ rule phASER_gene_ae:
idSeparator = '_' if config['id_separator'] == '' else config['id_separator']
shell:
"""
python2.7 {config[bin_dir]}/phaser_gene_ae.py --haplotypic_counts {input.hap_count} --features {input.bed} --o {output} --id_separa
python2.7 {config[bin_dir]}/phaser_gene_ae.py --haplotypic_counts {input.hap_count} --features {input.bed} --o {output} --id_separator {params.idSeparator}
"""
# TO FINISH AND TEST
......
......@@ -45,4 +45,4 @@ rule bam_index:
temp("{file}.bam.bai")
shell:
"samtools index {input} "
\ No newline at end of file
"samtools index {input} "
......@@ -6,13 +6,18 @@ __version__ = '1.0.0'
'''
?????
'''
import sys
from pyfaidx import Fasta
genome = Fasta(config['fasta_ref'])
chrom_list = [ seq.name for seq in genome if seq.name.isdigit() ]
PREFIX = os.path.basename(config['vcf']).replace('.vcf.gz','')
# GATK4
rule SelectVariants :
input :
vcf = config['vcf']
output :
vcf = temp("Results_ASE/vcfFiltration/{prefix}_SNP.vcf")
vcf = temp("Results_ASE/vcfFiltration/" + PREFIX + "_SNP.vcf")
params :
mem = config["SelectVariants"]["mem"]
shell:
......@@ -22,7 +27,7 @@ rule SelectVariants :
def get_SNP_vcf(wildcards):
if config['SNP_filter']:
return "Results_ASE/vcfFiltration/{prefix}_SNP.vcf"
return "Results_ASE/vcfFiltration/" + PREFIX + "_SNP.vcf"
else:
return config['vcf']
......@@ -34,7 +39,8 @@ rule FS_QD_clust :
vcf = get_SNP_vcf,
ref = config['fasta_ref']
output :
vcf = temp("Results_ASE/vcfFiltration/{prefix}_FS_QD_clust_tagged.vcf")
vcf = temp("Results_ASE/vcfFiltration/" + PREFIX + "_FS_QD_clust_tagged.vcf"),
idx = temp("Results_ASE/vcfFiltration/" + PREFIX + "_FS_QD_clust_tagged.vcf.idx")
params :
mem = config["FS_QD_clust"]["mem"]
shell:
......@@ -42,21 +48,116 @@ rule FS_QD_clust :
gatk --java-options "-Xmx{params.mem}" VariantFiltration -R {input.ref} -V {input.vcf} -O {output.vcf} --filter-name FS --filter-expression "FS > 30.0" --filter-name QD --filter-expression "QD < 2.0" -window 35 -cluster 3
'''
rule VCF_for_ASE_and_table :
rule vcf_filtration :
input :
vcf = "Results_ASE/vcfFiltration/{prefix}_FS_QD_clust_tagged.vcf",
vcf = "Results_ASE/vcfFiltration/" + PREFIX + "_FS_QD_clust_tagged.vcf",
idx = "Results_ASE/vcfFiltration/" + PREFIX + "_FS_QD_clust_tagged.vcf.idx",
gtf = config['gtf_ref'],
fasta = config['fasta_ref']
output :
vcf = temp("Results_ASE/vcfFiltration/{prefix}_FsQdBiall.vcf") ,
gt = "Results_ASE/vcfFiltration/{prefix}_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_GT.tsv" ,
dp = "Results_ASE/vcfFiltration/{prefix}_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_DP.tsv" ,
ad = "Results_ASE/vcfFiltration/{prefix}_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_AD.tsv" ,
summary = "Results_ASE/Summary/{prefix}_VariantFiltration_summary.txt"
vcf = temp("Results_ASE/vcfFiltration/by_chrom/{chrom}_" + PREFIX + "_FsQdBiall.vcf") ,
gt = temp("Results_ASE/vcfFiltration/by_chrom/{chrom}_" + PREFIX + "_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_GT.tsv") ,
dp = temp("Results_ASE/vcfFiltration/by_chrom/{chrom}_" + PREFIX + "_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_DP.tsv" ),
ad = temp("Results_ASE/vcfFiltration/by_chrom/{chrom}_" + PREFIX + "_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_AD.tsv" ),
summary = temp("Results_ASE/vcfFiltration/by_chrom/{chrom}_" + PREFIX + "_VariantFiltration_summary.txt")
params :
chrom_selection = lambda wildcards : "--accepted-chrom " + wildcards.chrom if wildcards.chrom != "other" else "--excluded-chrom " + " ".join(chrom_list),
cpu = config['vcf_filtration']['cpu']
shell:
'''
variantFiltration.py --nb-cpus {params.cpu} --in-vcf {input.vcf} --in-gtf {input.gtf} --in-fasta {input.fasta} {params.chrom_selection} --out-vcf {output.vcf} --out-gt {output.gt} --out-dp {output.dp} --out-ad {output.ad} --summary {output.summary}
'''
rule merge_filteredVariant :
input :
chr_vcf = expand("Results_ASE/vcfFiltration/by_chrom/{CHR}_" + PREFIX + "_FsQdBiall.vcf", CHR=chrom_list),
other_vcf = "Results_ASE/vcfFiltration/by_chrom/other_" + PREFIX + "_FsQdBiall.vcf"
output :
vcf = "Results_ASE/vcfFiltration/" + PREFIX + "_FsQdBiall.vcf"
shell:
'''
vcf_annotator.py --in-vcf {input.vcf} --in-gtf {input.gtf} --in-fasta {input.fasta} --out-vcf {output.vcf} --out-gt {output.gt} --out-dp {output.dp} --out-ad {output.ad} --summar {output.summary}
bcftools concat -o {output.vcf} -O v {input.chr_vcf} {input.other_vcf}
'''
def get_table(wildcards):
table_list = expand("Results_ASE/vcfFiltration/by_chrom/{chrom}_" + PREFIX + "_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_"+wildcards.suffix + ".tsv", chrom=chrom_list )
return table_list
rule merge_filtered_table:
input :
chr_tsv = get_table,
other_tsv = "Results_ASE/vcfFiltration/by_chrom/other_" + PREFIX + "_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_{suffix}.tsv"
output :
tsv = "Results_ASE/vcfFiltration/" + PREFIX + "_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_{suffix}.tsv"
wildcard_constraints:
suffix="GT|AD|DP"
shell:
'''
head -n 1 {input.other_tsv} > {output.tsv}
cat {input.chr_tsv} {input.other_tsv}| grep -v "#" >> {output.tsv}
'''
rule merge_filtered_summary:
input :
chr_tsv = expand("Results_ASE/vcfFiltration/by_chrom/{chrom}_" + PREFIX + "_VariantFiltration_summary.txt", chrom=chrom_list),
other_tsv = "Results_ASE/vcfFiltration/by_chrom/other_" + PREFIX + "_VariantFiltration_summary.txt"
output :
summary = "Results_ASE/Summary/VariantFiltration_summary.txt"
run :
dic_results = dict()
in_list = input.chr_tsv
in_list.append(input.other_tsv)
# in_list = ['Results_ASE/vcfFiltration/by_chrom/1_test_VariantFiltration_summary.txt', 'Results_ASE/vcfFiltration/by_chrom/25_test_VariantFiltration_summary.txt', 'Results_ASE/vcfFiltration/by_chrom/26_test_VariantFiltration_summary.txt', 'Results_ASE/vcfFiltration/by_chrom/other_test_VariantFiltration_summary.txt']
first=in_list.pop(0)
FH_first = open(first,'rt')
summary = ''
first_key = ''
percent_list = list()
for line in FH_first:
if ':' in line:
category = line.split(':')[0]
key = category.upper().strip().replace(' ','_')
# store the first key, considered as the total number of variants
if first_key == '':
first_key = key
# store keys for which a percentage should be computed
if '%)' in line:
percent_list.append(key)
value = int(line.split(':')[1].strip().split()[0])
summary += category + ': ###' + key + '###\n'
dic_results[key] = value
elif "# Input vcf file is" in line:
line = "# Input vcf file is " + "Results_ASE/vcfFiltration/" + PREFIX + "_FS_QD_clust_tagged.vcf\n"
summary += line
elif "those SNP are described in" in line:
line = "\t\t\tthose SNP are described in " + "Results_ASE/vcfFiltration/" + PREFIX + "_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_GT.tsv, Results_ASE/vcfFiltration/" + PREFIX + "_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_DP.tsv, Results_ASE/vcfFiltration/" + PREFIX + "_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_AD.tsv\n"
summary += line
else:
summary += line
FH_first.close()
for f in in_list:
FH_in = open(f,'rt')
for line in FH_in:
if ':' in line:
category = line.strip().split(':')[0]
key = category.upper().strip().replace(' ','_')
value = int(line.strip().split(':')[1].strip().split()[0])
dic_results[key] += value
FH_in.close()
for k in dic_results:
if k in percent_list:
percent = round(dic_results[k]*100/dic_results[first_key],2)
summary = summary.replace('###' + k + '###' , str(dic_results[k]) + ' (' + str(percent) + '%)')
else:
summary = summary.replace('###' + k + '###' , str(dic_results[k]))
FH_out = open(output.summary,'wt')
FH_out.write(summary)
FH_out.close()
#!/usr/bin/env python3
import argparse
import os,sys
import time
import subprocess
from subprocess import Popen, PIPE
import threading
import multiprocessing
##########################
# FUNCTION
def select_on_chrom(in_vcf, accepted_chrom, excluded_chrom, selected_vcf):
FH_in = open(in_vcf, 'rt')
FH_out = open(selected_vcf,'wt')
for line in FH_in:
if line.startswith('#'):
FH_out.write(line)
elif accepted_chrom and line.split()[0] in accepted_chrom :
FH_out.write(line)
elif excluded_chrom and line.split()[0] not in excluded_chrom :
FH_out.write(line)
def split_vcf(in_vcf, nb_cpus, tmp_files ):
# count variant
cmd = 'grep -vc "#" ' + in_vcf
p = Popen(cmd, shell=True, universal_newlines=True, stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
nb_var = int(stdout.strip())
nb_by_file = int(nb_var / nb_cpus)
# write split VCF chunks
tmp_FH = [ open(f,'wt') for f in tmp_files]
FH_in = open(in_vcf)
header = list()
c=0
var=list()
for line in FH_in:
if line.startswith('#'):
header.append(line)
else:
if len(header) > 0:
for t in tmp_FH:
t.write("".join(header))
header.clear()
c += 1
var.append(line)
# write variants in batches of 10000
if len(tmp_FH)>0 and len(var) == min(10000,nb_by_file):
tmp_FH[0].write(''.join(var))
var.clear()
# change temp output file
if len(tmp_FH)>0 and c == nb_by_file:
c = 0
tmp_FH[0].close()
del(tmp_FH[0])
# write remaining var
if len(var) > 0:
tmp_FH = open(tmp_files[-1],'a')
tmp_FH.write(''.join(var))
var.clear()
tmp_FH.close()
def submit_cmd( cmd, cwd=None):
"""
@summary: Submits a command on the system and raises an exception if it fails.
@param cmd: [list] The command.
@param cwd: [str] Optional working directory in which to run the command.
"""
p = Popen(cmd, universal_newlines=True, stdout=PIPE, stderr=PIPE, cwd=cwd)
stdout, stderr = p.communicate()
# check error status
if p.returncode != 0:
raise Exception( stderr )
def process_variantFiltering_and_Annot(in_vcf, in_gtf, in_fasta, out_vcf, out_gt, out_dp, out_ad, out_summary):
cmd = ['vcf_filter_and_annot.py', '--in-vcf', in_vcf, '--in-gtf', in_gtf, '--in-fasta', in_fasta,
'--out-vcf', out_vcf, '--out-gt', out_gt, '--out-dp', out_dp, '--out-ad', out_ad, '--summary', out_summary]
print(" ".join(cmd) + "\n")
submit_cmd( cmd )
def parallel_submission( function, inputs_vcf, in_gtf, in_fasta, outputs_vcf, outputs_gt, outputs_dp, outputs_ad, outputs_summary):
cpu_used = len(inputs_vcf)
processes = [{'process':None, 'input_vcf':None, 'gtf' : in_gtf, 'fasta':in_fasta,
'output_vcf':None, 'output_gt':None, 'output_dp':None, 'output_ad':None, 'output_summary':None } for idx in range(cpu_used)]
# Launch processes
for idx in range(cpu_used):
process_idx = idx % cpu_used
processes[process_idx]['input_vcf'] = inputs_vcf[idx]
processes[process_idx]['output_vcf'] = outputs_vcf[idx]
processes[process_idx]['output_gt'] = outputs_gt[idx]
processes[process_idx]['output_dp'] = outputs_dp[idx]
processes[process_idx]['output_ad'] = outputs_ad[idx]
processes[process_idx]['output_summary'] = outputs_summary[idx]
for idx, current_process in enumerate(processes):
if idx == 0: # First process is threaded with parent job
current_process['process'] = threading.Thread(target=function,
args=(current_process['input_vcf'],current_process['gtf'], current_process['fasta'], current_process['output_vcf'], current_process['output_gt'], current_process['output_dp'], current_process['output_ad'], current_process['output_summary']))
else: # Other processes run on a different CPU
current_process['process'] = multiprocessing.Process(target=function,
args=(current_process['input_vcf'],current_process['gtf'], current_process['fasta'], current_process['output_vcf'], current_process['output_gt'], current_process['output_dp'], current_process['output_ad'], current_process['output_summary']))
current_process['process'].start()
# Wait processes end
for current_process in processes:
current_process['process'].join()
# Check processes status
for current_process in processes:
if issubclass(current_process['process'].__class__, multiprocessing.Process) and current_process['process'].exitcode != 0:
raise Exception("Error in sub-process execution.")
def merge_vcf(in_list,out):
cmd = ['bcftools','concat','-o',out,'-O','v'] + in_list
print ("\n#merge temp VCFs\n" + " ".join(cmd) + "\n")
submit_cmd( cmd )
def merge_tsv(in_list, out):
print ("\n#merge temp tables\n\tInputs : " + " ".join(in_list) + "\n\tOutputs : " + out + "\n")
FH_out = open(out,'wt')
first = True
for f in in_list:
FH_in = open(f,'rt')
header=FH_in.readline()
if first :
FH_out.write(header)
first = False
for line in FH_in:
FH_out.write(line)
def merge_summary(in_list, out):
print ("\n#merge temp summaries\n\tInputs : " + " ".join(in_list) + "\n\tOutputs : " + out + "\n")
dic_results = dict()
first=in_list.pop(0)
FH_first = open(first,'rt')
# prefix = tmp_dir/time_pid_i_
prefix = os.path.join(os.path.dirname(first), '_'.join(os.path.basename(first).split('_')[0:3])) + '_'
summary = ''
first_key = ''
percent_list = list()
for line in FH_first:
if prefix in line:
summary += line.replace(prefix,'')
elif ':' in line:
category = line.split(':')[0]
key = category.upper().strip().replace(' ','_')
# store the first key, considered as the total number of variants
if first_key == '':
first_key = key
# store keys for which a percentage should be computed
if '%)' in line:
percent_list.append(key)
value = int(line.split(':')[1].strip().split()[0])
summary += category + ': ###' + key + '###\n'
dic_results[key] = value
else:
summary += line
for f in in_list:
FH_in = open(f,'rt')
for line in FH_in:
if ':' in line:
category = line.strip().split(':')[0]
key = category.upper().strip().replace(' ','_')
value = int(line.strip().split(':')[1].strip().split()[0])
dic_results[key] += value
for k in dic_results:
if k in percent_list:
percent = round(dic_results[k]*100/dic_results[first_key],2)
summary = summary.replace('###' + k + '###' , str(dic_results[k]) + ' (' + str(percent) + '%)')
else:
summary = summary.replace('###' + k + '###' , str(dic_results[k]))
FH_out = open(out,'wt')
FH_out.write(summary)
FH_out.close()
def process(params):
in_vcf=params.in_vcf
out_dir = os.path.dirname(params.out_vcf)
temp_dir = os.path.join(out_dir,str(time.time()) + "_" + str(os.getpid()) + '_temp_variantFiltration')
temp_files = list()
try:
# select variant based on chrom
if params.accepted_chrom is not None or params.excluded_chrom is not None:
os.mkdir(temp_dir)
selected_vcf = os.path.join(temp_dir, str(time.time()) + "_" + str(os.getpid()) + "_selectOnChrom_" + os.path.basename(in_vcf))
temp_files.append(selected_vcf)
select_on_chrom(in_vcf, params.accepted_chrom, params.excluded_chrom, selected_vcf)
in_vcf = selected_vcf
if params.nb_cpus == 1:
process_variantFiltering_and_Annot(in_vcf, params.in_gtf, params.in_fasta, params.out_vcf, params.out_gt, params.out_dp, params.out_ad, params.summary)
else :
#### create temp_dir
if not os.path.exists(temp_dir):
os.mkdir(temp_dir)
#### split VCF in [nb_cpus] files
prefix = str(time.time()) + "_" + str(os.getpid())
splitted_in_vcf_files = [ os.path.join(temp_dir, prefix + "_" + str(i) + "_" + os.path.basename(in_vcf)) for i in range(0,params.nb_cpus )]
temp_files += splitted_in_vcf_files
split_vcf(in_vcf, params.nb_cpus, splitted_in_vcf_files )
#### filter_and_annotate
splitted_out_vcf_files = [ os.path.join(temp_dir, prefix + "_" + str(i) + "_" + os.path.basename(params.out_vcf)) for i in range(0,params.nb_cpus)]
splitted_out_gt_files = [ os.path.join(temp_dir, prefix + "_" + str(i) + "_" + os.path.basename(params.out_gt)) for i in range(0,params.nb_cpus)]
splitted_out_dp_files = [ os.path.join(temp_dir, prefix + "_" + str(i) + "_" + os.path.basename(params.out_dp)) for i in range(0,params.nb_cpus)]
splitted_out_ad_files = [ os.path.join(temp_dir, prefix + "_" + str(i) + "_" + os.path.basename(params.out_ad)) for i in range(0,params.nb_cpus)]
splitted_out_summary_files = [ os.path.join(temp_dir, prefix + "_" + str(i) + "_" + os.path.basename(params.summary)) for i in range(0,params.nb_cpus)]
temp_files += splitted_out_vcf_files + splitted_out_gt_files + splitted_out_ad_files + splitted_out_dp_files + splitted_out_summary_files
parallel_submission( process_variantFiltering_and_Annot, splitted_in_vcf_files, params.in_gtf, params.in_fasta, splitted_out_vcf_files, splitted_out_gt_files, splitted_out_dp_files, splitted_out_ad_files, splitted_out_summary_files )
# merge outputs
merge_vcf(splitted_out_vcf_files, params.out_vcf)
merge_tsv(splitted_out_gt_files, params.out_gt)
merge_tsv(splitted_out_dp_files, params.out_dp)
merge_tsv(splitted_out_ad_files, params.out_ad)
merge_summary(splitted_out_summary_files, params.summary)
finally :
# remove temp_dir
if not params.debug and os.path.exists(temp_dir):
# remove temp files
for f in temp_files:
if os.path.exists(f):
os.remove(f)
# remove temp dir
os.rmdir(temp_dir)
##########################
# MAIN
if __name__ == '__main__':
# Parameters
parser = argparse.ArgumentParser(description="Annotate variants by adding INFO fields : \n " +
" - LOCALISATION: PASS or a combination of jnct5nt (near exon junction), homodimer5 (in homopolymer), cl3snp35nt (in SNP cluster) \n" +
" - GTpopCR= X% : percentage of individuals genotyped \n" +
" - DPgt05rPopCR= X% : percentage of individuals genotyped with DP>5\n")
parser.add_argument('-n','--nb-cpus', type=int, default=1, help='Number of CPUs to parallelise the processing')
parser.add_argument('-d','--debug', default=False, action='store_true', help='keep temporary files')
parser.add_argument('-a','--accepted-chrom', nargs="+", required=False, help='list of chromosomes to include')
parser.add_argument('-e','--excluded-chrom', nargs="+", required=False, help='list of chromosomes to exclude')
# Inputs
group_input = parser.add_argument_group( 'Inputs' )
group_input.add_argument('--in-vcf', required=True, help="The input vcf file to annotate.")
group_input.add_argument('--in-gtf', required=True, help="The genome annotation GTF file, to tag SNPs close to exon boundaries (+/- 5bp)")
group_input.add_argument('--in-fasta', required=True, help="The genome Fasta file, to tag SNPs in homopolymers (5 bp).")
# Outputs
group_output = parser.add_argument_group( 'Outputs' )
group_output.add_argument('--out-vcf', required=True, help="The annotated vcf file output.")
group_output.add_argument('--out-gt', required=True, help="A GT table of variants with FILTER PASS, GTpopCR > 50 and DPgt05rPopCR > 20 ")
group_output.add_argument('--out-dp', required=True, help="A DP table of variants with FILTER PASS, GTpopCR > 50 and DPgt05rPopCR > 20 ")
group_output.add_argument('--out-ad', required=True, help="An AD table of variants with FILTER PASS, GTpopCR > 50 and DPgt05rPopCR > 20 ")
group_output.add_argument('--summary', required=True, help="Annotation summary")
args = parser.parse_args()
if args.accepted_chrom is not None and args.excluded_chrom is not None:
intersect_chrom = [ c for c in args.accepted_chrom if c in args.excluded_chrom]
if len(intersect_chrom) > 0:
raise Exception("accepted_chrom and excluded_chrom list must not intersect\n")
process(args)
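# Example invocation of this script (file names are illustrative; the options are those defined above):
#   variantFiltration.py --nb-cpus 4 --in-vcf tagged.vcf --in-gtf annotation.gtf --in-fasta genome.fa \
#       --accepted-chrom 1 2 3 --out-vcf filtered.vcf --out-gt GT.tsv --out-dp DP.tsv --out-ad AD.tsv --summary summary.txt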
......@@ -78,6 +78,11 @@ def process(params):
# FILTER PASS, LOC PASS, GTpopCR > 50, DPgt05rPopCR > 20
PERFECT = 0
# buffers to write output records in blocks
write_variant = list()
write_table_dp = list()
write_table_ad = list()
write_table_gt = list()
# parse input file
for line in FH_in:
if line.startswith("#"):
......@@ -100,11 +105,11 @@ def process(params):
"\t".join([s + "_DP" for s in samples]) + "\n")
FH_ad.write("\t".join(["#chrom", "pos", "REF", "ALT", "DPpop", "jnct5nt", "homodimer5" ,"cl3snp35nt" , "LOCALISATION"]) + "\t" +
"\t".join([s + "_AD_" + a for s in samples for a in ["REF","ALT"] ]) + "\n")
print ("end parsing header\n")
continue
tot_snp += 1
line_list = line.split("\t")
line_list = line.strip().split("\t")
chrom = line_list[0]
pos = int(line_list[1])
ref = line_list[3]
......@@ -214,7 +219,11 @@ def process(params):
line_list[7] = ";".join(info)
# WRITE FILTER PASS, GTpopCR50 & DPgt05rPopCR20 VARIANTS TO THE GT, DP AND AD TABLES
if "PASS" in filter:
FH_out.write("\t".join(line_list))
write_variant.append("\t".join(line_list) + '\n')
if len(write_variant) == 10000:
print ("writing 10000 filtered variants")
FH_out.write("".join(write_variant))
write_variant=list()
if GTpopCR >= 50 and DPgt05rPopCR >= 20 :
GTpopCR50_DPgt05rPopCR20 += 1
# recover DP
......@@ -230,33 +239,51 @@ def process(params):
common_str.append("1") if "cl3snp35nt" in LOCALISATION else common_str.append("0")
common_str.append("PASS") if "PASS" in LOCALISATION else common_str.append(".")
FH_gt.write("\t".join(common_str) + "\t" + "\t".join(gt_list) + "\n")
FH_dp.write("\t".join(common_str) + "\t" + "\t".join(dp_list) + "\n")
FH_ad.write("\t".join(common_str) + "\t" + "\t".join([ad.replace(",","\t") for ad in ad_list]) + "\n")
write_table_gt.append("\t".join(common_str) + "\t" + "\t".join(gt_list) + "\n")
write_table_dp.append("\t".join(common_str) + "\t" + "\t".join(dp_list) + "\n")
write_table_ad.append("\t".join(common_str) + "\t" + "\t".join([ad.replace(",","\t") for ad in ad_list]) + "\n")
if len(write_table_gt) == 10000:
FH_gt.write("".join(write_table_gt))
FH_dp.write("".join(write_table_dp))
FH_ad.write("".join(write_table_ad))
write_table_dp = list()
write_table_ad = list()
write_table_gt = list()
if len(write_variant) > 0:
FH_out.write("".join(write_variant))
FH_out.close()
if len(write_table_gt) > 0:
FH_gt.write("".join(write_table_gt))
FH_dp.write("".join(write_table_dp))
FH_ad.write("".join(write_table_ad))
FH_gt.close()
FH_dp.close()
FH_ad.close()
# WRITE SUMMARY
FH_summary = open(params.summary, "wt")
FH_summary.write("# Input vcf file :" + params.in_vcf + "\n" \
FH_summary.write("# Input vcf file is " + params.in_vcf + "\n" \
+ "\t contains : " + str(tot_snp) + " variants\n" \
+ "# Filtering step\n" \
+ "\t filtered on non biallelic SNP: " + str(not_SNP_BiAll) + "\n" \
+ "\t filtered on FS: " + str(FS) + "\n" \
+ "\t filtered on QD: " + str(QD) + "\n" \
+ "\t kept SNP: " + str(FILTER_PASS) + " (" + str(round(FILTER_PASS * 100.0 / tot_snp , 2)) + "%)\n" \
+ "\t filtered on non biallelic SNP : " + str(not_SNP_BiAll) + "\n" \
+ "\t filtered on FS : " + str(FS) + "\n" \
+ "\t filtered on QD : " + str(QD) + "\n" \
+ "\t kept SNP : " + str(FILTER_PASS) + " (" + str(round(FILTER_PASS * 100.0 / tot_snp , 2)) + "%)\n" \
+ "\t# filter PASS SNP annotation step\n"
+"\t\t localized in SNP cluster: " + str(cl3snp35nt) + "\n" \
+"\t\t localized near exon junctions: " + str(jnct5nt) + "\n" \
+"\t\t localized in homopolymers: " + str(homodimer5) + "\n" \
+"\t\t localized in unbiased region: " + str(LOC_PASS) + "\n" \
+"\t\t localized in SNP cluster : " + str(cl3snp35nt) + "\n" \
+"\t\t localized near exon junctions : " + str(jnct5nt) + "\n" \
+"\t\t localized in homopolymers : " + str(homodimer5) + "\n" \
+"\t\t localized in unbiased region : " + str(LOC_PASS) + "\n" \
+ "\t# computing call rate step\n" \
+ "\t\t number of SNP with a call rate >= 50%: " + str(GTpopCR50) + "\n" \
+ "\t\t number of SNP with a signigicant (GT with DP>5) call rate >= 20%: " + str(DPgt05rPopCR20) + "\n" \
+ "\t\t number of SNP with GTpopCR >= 50% and DPgt05rPopCR >= 20: " + str(GTpopCR50_DPgt05rPopCR20) + " ("+ str(round(GTpopCR50_DPgt05rPopCR20 * 100.0/ tot_snp,2)) + "%)\n" \
+ "\t\t\t those SNP are described in TSV files : " + params.out_gt + ", "+ params.out_dp + ", " + params.out_ad + "\n" \
+ "\t\t number of SNP with FILTER PASS, LOCALISATION PASS, GTpopCR >=50%, DPgt05rPopCR >= 20%: " + str(PERFECT) + " (" + str(round(PERFECT * 100.0 / tot_snp , 2)) + "%)\n")
+ "\t\t number of SNP with a call rate >= 50% : " + str(GTpopCR50) + "\n" \
+ "\t\t number of SNP with a signigicant (GT with DP>5) call rate >= 20% : " + str(DPgt05rPopCR20) + "\n" \
+ "\t\t number of SNP with GTpopCR >= 50% and DPgt05rPopCR >= 20 : " + str(GTpopCR50_DPgt05rPopCR20) + " ("+ str(round(GTpopCR50_DPgt05rPopCR20 * 100.0/ tot_snp,2)) + "%)\n" \
+ "\t\t\t those SNP are described in " + params.out_gt + ", "+ params.out_dp + ", " + params.out_ad + "\n" \
+ "\t\t number of SNP with FILTER PASS, LOCALISATION PASS, GTpopCR >=50%, DPgt05rPopCR >= 20% : " + str(PERFECT) + " (" + str(round(PERFECT * 100.0 / tot_snp , 2)) + "%)\n")
##########################
......
......@@ -17,7 +17,7 @@ snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
# {cluster.cpu}, {cluster.mem} and {cluster.partition} will be replaced by the values defined in the resources_SLURM.yaml file
snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
--configfile config_calling.yaml.example \
--cluster-config $WORKFLOW_DIR/resources_SLURM.yaml \
--cluster-config $WORKFLOW_DIR/resources_calling_SLURM.yaml \
--cluster "sbatch -p {cluster.partition} --cpus-per-task={cluster.cpu} --mem-per-cpu={cluster.mem} --error=logs/%x.stderr --output=logs/%x.stdout " --latency-wait 30
# If you want to launch the workflow partially, indicate the final output files you want (see dryrun log)
......@@ -26,7 +26,7 @@ snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
PREFIX=?? # your sample_config TSV file without extension
snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
--configfile config_calling.yaml.example \
--cluster-config $WORKFLOW_DIR/resources_SLURM.yaml \
--cluster-config $WORKFLOW_DIR/resources_calling_SLURM.yaml \
--cluster "sbatch -p {cluster.partition} --cpus-per-task={cluster.cpu} --mem-per-cpu={cluster.mem} --error=logs/%x.stderr --output=logs/%x.stdout " --latency-wait 30 \
Results/Summary/Log_TrimGalore_summary.tsv Results/Summary/Log_STAR_Aln_1_summary.tsv Results/Summary/Log_STAR_Aln_2_summary.tsv \
Results/Summary/RSEM_multimap_summary_${PREFIX}_genes_expected_count.tsv \
......@@ -46,7 +46,7 @@ snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
PREFIX=?? # your sample_config TSV file without extension
snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
--configfile config_calling.yaml.example \
--cluster-config $WORKFLOW_DIR/resources_SLURM.yaml \
--cluster-config $WORKFLOW_DIR/resources_calling_SLURM.yaml \
--cluster "sbatch -p {cluster.partition} --cpus-per-task={cluster.cpu} --mem-per-cpu={cluster.mem} --error=logs/%x.stderr --output=logs/%x.stdout " --latency-wait 30 \
Results/Summary/Log_TrimGalore_summary.tsv Results/Summary/Log_STAR_Aln_1_summary.tsv Results/Summary/Log_STAR_Aln_2_summary.tsv \
Results/GATK/${PREFIX}_SNP.vcf.gz\
......
......@@ -2,7 +2,7 @@ __default__ :
mem : "8G"
cpu : 8
time : "24:00:00"
queue : workq
partition : workq
convertPhred:
cpu : 1
......
## Install snakemake 5 with a venv
```
module load system/Python-3.6.3
python3.6 -m venv env/snakemake-5_venv
module purge
source env/snakemake-5_venv/bin/activate
pip install snakemake
pip install drmaa
pip install pandas
```
## Install conda and singularity
Conda:
```
wget https://repo.continuum.io/miniconda/Miniconda2-4.6.14-Linux-x86_64.sh
sh Miniconda2-4.6.14-Linux-x86_64.sh
```
Restart shell:
```
conda config --set auto_activate_base false
conda config --add channels bioconda
```
Singularity:
```
sudo apt-get install singularity-container
```
## Create singularity image
[see build process](./env/README.md)
\ No newline at end of file
# How to install
# RNASeq Analysis workflow : quantification and SNP calling
## Install snakemake 5 with a venv
__authors__: Maria Bernard (INRA - SIGENAE) / Kevin Muret - Frederic Jehl - Sandrine Lagarrigue (INRA - PEGASE)
```
module load system/Python-3.6.3
python3.6 -m venv env/snakemake-5_venv
module purge
source env/snakemake-5_venv/bin/activate
pip install snakemake
pip install drmaa
pip install pandas
```
- [RNASeq Analysis workflow : quantification and SNP calling](#rnaseq-analysis-workflow---quantification-and-snp-calling)
* [What it does ?](#what-it-does--)
* [1. Input](#1-input)
* [2. Data Processing](#2-data-processing)
+ [Raw reads trimming](#raw-reads-trimming)
+ [Sequence alignment](#sequence-alignment)
+ [Transcriptomic Alignment post processing](#transcriptomic-alignment-post-processing)
+ [Gene/Isoform expression](#gene-isoform-expression)
+ [Genomic Alignment post processing](#genomic-alignment-post-processing)
+ [Variant Calling](#variant-calling)
* [3. Dependencies](#3-dependencies)
* [4. Configure and running workflow](#4-configure-and-running-workflow)
## Install conda and singularity
## What it does ?
Conda:
```
wget https://repo.continuum.io/miniconda/Miniconda2-4.6.14-Linux-x86_64.sh
sh Miniconda2-4.6.14-Linux-x86_64.sh
```
This pipeline performs a classical RNASeq analysis workflow (trimming, STAR mapping and RSEM expression quantification) but also calls variants following the [GATK RNASeq recommendations](https://software.broadinstitute.org/gatk/documentation/article.php?id=3891).
Restart shell:
```
conda config --set auto_activate_base false
conda config --add channels bioconda
```
The pipeline is represented in the graph below:
Singularity:
```
sudo apt-get install singularity-container
```
![](/home/maria/workspace/workflows_develop/Snakemake/1000RNASeq_chicken/calling/img/full_dryrun.png)
## Create singularity image
[see build process](./env/README.md)
## 1. Input
# How to run
The user must use the [config_calling.yaml](example/config_calling.yaml) file to provide all necessary inputs for the pipeline:
- A reference fasta file (fasta_ref option)
- A genome reference GTF file (gtf_ref option)
- A set of known variants used to recalibrate base quality in the GATK preprocessing steps RealignerTargetCreator and BaseRecalibrator (known_vcf option)
- A sample reads description TSV file (sample_config option), with the following columns (a minimal example is sketched after this list):
- idx : a unique index number to identify each fastq (pair) file
- name : the sample name
- forward_read and reverse_read are fastq file names. These files need to be in the directory specified by the data_dir option
- If single end, leave the reverse_read column empty
- If paired, file names need to end with _R1.fastq.gz or _R2.fastq.gz (or fq instead of fastq, and not necessarily compressed)
- sequencer : sequencer used to generate the sequences : "ILLUMINA", "SLX", "SOLEXA", "SOLID", "454", "LS454", "COMPLETE", "PACBIO", "IONTORRENT", "CAPILLARY", "HELICOS", "UNKNOWN"
- oriented : to indicate whether sequences were generated with a strand-specific protocol. It corresponds to the forward_prob RSEM parameter. Indicate :
- 1 for a strand-specific protocol where all (upstream) reads are derived from the forward strand,
- 0 for a strand-specific protocol where all (upstream) reads are derived from the reverse strand,
- 0.5 for a non-strand-specific protocol.
- phred scale: the phred score scale used to encode base quality: either 33 (Sanger and illumina 1.8+) or 64 (Solexa, illumina 1.3 to 1.8 excluded)
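A minimal sketch of such a sample_config file (tab separated; sample and file names are purely illustrative):

```
idx	name	forward_read	reverse_read	sequencer	read_length	oriented	phred_scale
1	sampleA	sampleA_lane1_R1.fastq.gz	sampleA_lane1_R2.fastq.gz	ILLUMINA	100	0.5	33
2	sampleA	sampleA_lane2_R1.fastq.gz	sampleA_lane2_R2.fastq.gz	ILLUMINA	100	0.5	33
3	sampleB	sampleB_SE.fastq.gz		ILLUMINA	75	1	33
```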
## 2. Data Processing
### Raw reads trimming
The first step is to filter and trim raw reads before alignment. This step converts phred 64 scale fastq files into phred 33 scale fastq files and then trims reads with [trim_galore](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/). It trims adapters and low-quality ends (see the trimming_quality option in config_calling.yaml) and filters out trimmed reads that are shorter than a third of the read length.
You will find in the Results/Summary directory a Log_TrimGalore_summary.tsv file that details the trim_galore statistics.
### Sequence alignment
Trimmed reads will be aligned against the reference genome with [STAR](https://academic.oup.com/bioinformatics/article/29/1/15/272537), following the multi-sample 2-pass mapping procedure with the GTF reference file.
After a first pass of alignment, the SJout (exon junction) output files will be filtered to create one SJout file that contains new canonical junctions covered by at least 2 uniquely mapped reads and present in at least 2 samples.
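As a rough illustration only (the pipeline uses its own script for this step, and these paths are hypothetical), those filtering criteria applied to STAR SJ.out.tab files could be expressed as:

```
# keep canonical (col 5 > 0), non-annotated (col 6 == 0) junctions supported by >= 2 unique reads (col 7),
# then keep junctions seen in at least 2 samples
cat Results/STAR_Aln_1/*_SJ.out.tab \
  | awk '$5 > 0 && $6 == 0 && $7 >= 2' \
  | cut -f1-4 | sort | uniq -c \
  | awk '$1 >= 2 {print $2"\t"$3"\t"$4"\t"$5}' > merged_SJ.tab
```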
STAR 2nd pass alignment will use this new junction file and the provided GTF to generate bam files in genome and in transcriptome coordinates.
You will find in the Results/Summary directory 2 files reporting alignment statistics: Log_STAR_Aln_1_summary.tsv and Log_STAR_Aln_2_summary.tsv
### Transcriptomic Alignment post processing
Reads with multiple alignments will be filtered out to generate a second transcriptomic bam:
```samtools view -q 255```
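In practice this amounts to something like the following (file names are illustrative):

```
samtools view -b -q 255 sample_toTranscriptome.bam > sample_toTranscriptome_uniq.bam
```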
### Gene/Isoform expression
Expression will be computed on all reads aligned to the transcriptome, or on uniquely mapped reads only, with [RSEM](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-323). For each sample, it will generate gene and isoform results.
You will find in the Results/Summary directory, expression quantification for all samples:
## Configure workflow
* for genes and isoforms
* for expected count, TPM and FPKM quantification methods
* based on uniquely mapped reads or on all alignments.
### 1. Create sample file based on data/sample.example.csv file
The file will be named :
`sample_config` file is a tabular file describing each input fastq file.
​ RSEM\__[multimap/uniqmap]_\_summary_PREFIX\__[gene/isoforms]_\__[TPM/expected\_count/FPKM]_.tsv
The expected header columns are : `idx name forward_read reverse_read sequencer read_length oriented phred_scale`
PREFIX is the name of your sample_config file defined in the config.yaml file.
#### `name` will be used to aggregate samples in the final VCF
#### `forward_read` and `reverse_read` are fastq file names. These files need to be in the data_dir directory. Samples may be paired end or single end.
* If single end, leave the reverse_read column empty
* If paired, file names need to end with _R1.fastq.gz or _R2.fastq.gz (or fq instead of fastq, and not necessarily compressed)
### Genomic Alignment post processing
Genomic alignments will be used to call variants. Each sample's bam is processed as follows (a sketch of one such GATK command follows the list):
- cleaned with cleanSam from GATK
- annotated with AddOrReplaceReadGroups from [Picard tools](https://broadinstitute.github.io/picard/) based on the information you provide in the sample config file
- multimapped reads filtered out (samtools -q 255)
- merged with the other bam files belonging to the same sample name (optional)
- PCR and optical duplicates marked with Picard tools
- split alignments converted into separate alignments, with mapping qualities of 250 reassigned to 60, by SplitNCigarReads from GATK
- indels realigned with RealignerTargetCreator and IndelRealigner, and base quality scores recalibrated with BaseRecalibrator and PrintReads from GATK, using the known_vcf file provided in the config.yaml.
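As an illustration, the SplitNCigarReads step typically corresponds to a GATK 3 command of this form (file names are illustrative and the actual rule may use additional options):

```
java -jar GenomeAnalysisTK.jar -T SplitNCigarReads -R reference.fa -I sample_md.bam -o sample_split.bam \
    -rf ReassignOneMappingQuality -RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS
```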
The final bam alignment files will be found in the Results/GATK/ directory and will be called : _[sample]_\_uniq_cs_rg_merge_md_split_real_recal.bam
### Variant Calling
Variants will be called with HaplotypeCaller from GATK, which will generate one gVCF per sample (Results/GATK/_[sample]_\_.g.vcf.gz files) with the following options:
- -stand_call_conf 20.0
- --min_base_quality_score 10
- --min_mapping_quality_score 20
GenotypeGVCFs from GATK will take all gVCF files to generate one VCF file called Results/GATK/PREFIX_full.vcf.gz
Finally, this VCF file will be split into two VCF files, for SNPs and INDELs, using SelectVariants from GATK: Results/GATK/PREFIX_SNP.vcf.gz and Results/GATK/PREFIX_INDEL.vcf.gz
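For example, the SNP selection corresponds to a GATK 3 command of roughly this form (file names are illustrative; the actual rule may add further options):

```
java -jar GenomeAnalysisTK.jar -T SelectVariants -R reference.fa -V Results/GATK/PREFIX_full.vcf.gz \
    -selectType SNP -o Results/GATK/PREFIX_SNP.vcf.gz
```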
# How to install
The workflow depends on:
* cutadapt
* trimgalore (version 0.4.5)
* STAR (version 2.5.2b)
* picard.jar (version 2.1.1)
* samtools (version 1.3.1 )
* rsem-prepare-reference ( RSEM version 1.3.0)
* rsem-calculate-expression ( RSEM version 1.3.0)
* GenomeAnalysisTK.jar (version 3.7)
* java (version 8)
These dependencies are all included in the singularity environment, or must be available in your PATH or in a directory that you must specify in the [config_calling.yaml](example/config_calling.yaml)
See [How_to_install.md](How_to_install.md) for more details on building singularity environment
# How to run
### Configure workflow
1. Create a sample file based on the example/sample.tsv file (see [config_calling.yaml](example/config_calling.yaml) or section [1. Input](#1-input)):
#### `sequencer`
* ["ILLUMINA","SLX","SOLEXA","SOLID","454","LS454","COMPLETE","PACBIO","IONTORRENT","CAPILLARY","HELICOS","UNKNOWN"]
- The name of the file will be used to prefix the final results
#### `oriented`
This is the forward_prob RSEM parameter to indicate :
* 1 for a strand-specific protocol where all (upstream) reads are derived from the forward strand,
* 0 for a strand-specific protocol where all (upstream) reads are derived from the reverse strand,
* 0.5 for a non-strand-specific protocol. (Default: off)
- sample with the same `name` will be aggregated
#### `phred scale` indicates the phred score scale used to encode base quality
* either 33 (Sanger and illumina 1.8+)
* or 64 (Solexa, illumina 1.3 to 1.8 excluded)
- fastq files need to be in the `data_dir` specified in the `config.yaml` file (see next point)
### 2. edit config file config_calling.yaml
Copy file `example/config_calling.yaml` and fill it with exepected values :
2. Copy file `example/config_calling.yaml` and fill it with expected values :
For workflow software : you can use either binary in path, directory path with all requiered binary or singularity image.
[see build process](./env/README.md)
For workflow software : you can use either binary in path, directory path with all required binary or singularity image (see [build process](./env/README.md))
Configure input path files:
* sample_config
* fasta_ref
* gtf_ref
* known_vcf
* resources
### 3. set environment variable
3. Launch the snakemake command as presented in SLURM.sh or Singularity_SLURM.sh. These scripts explain how to launch a dryrun, a full run, expression quantification only or variant calling only.
If you don't use the singularity image, you must set the path to the java jar files for Picard tools and GATK (3.8):
```
picard_jar : $PICARD_PATH
gatk_jar : $GATK_PATH
```
## Execution on a cluster
Create a script file `launch_snakemake.sh`:
```
#!/bin/sh
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH -J GATK_RSEM
#SBATCH -o log/%x.out
#SBATCH -e log/%x.err
4. If necessary, update [resources_SLURM.yaml](resources_SLURM.yaml) to define the cpu, memory and cluster partition resources for each rule. The __default__ section gives the minimal resources per job. The other sections are named after the rules and define only the resources that differ from the default section (a minimal sketch is shown further below).
source ./env/snakemake-5_venv/bin/activate
module load system/singularity-3.0.1
mkdir log
snakemake \
--snakefile Snakefile \
--configfile config_calling.yaml \
--jobs 200 --cluster-config example/resources_calling_SLURM.yaml \
--use-singularity \
--cluster "sbatch -p {cluster.queue} -e log/%x.err -o log/%x.out \
--cpus-per-task={cluster.cpu} --time={cluster.time} --mem-per-cpu={cluster.mem}" \
--printshellcmds --latency-wait 30
```
resources_SLURM.yaml, SLURM.sh and Singularity_SLURM.sh are specific to a SLURM cluster. If you use another scheduler, you will need to create equivalent files; you can use these as a starting point.
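For reference, resources_SLURM.yaml is expected to look roughly like this (values are illustrative; per-rule sections only list what differs from `__default__`):

```
__default__ :
    mem : "8G"
    cpu : 8
    time : "24:00:00"
    partition : workq
STAR_Aln_1:
    mem : "30G"
    time : "24:00:00"
```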
launch command with:
```
sbatch launch_snakemake.sh
```
See the official Snakemake documentation if you need more information: <https://snakemake.readthedocs.io/en/stable/tutorial/tutorial.html>
#!/bin/bash
##############################################################################
## launch workflow
# if used on the Genologin cluster (INRA Toulouse)
# module load system/Python-3.6.3
# Path to the SNAKEMAKE workflow directory
WORKFLOW_DIR=.
mkdir -p logs
# This is an example of the snakemake command line to launch a dryrun
snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
--configfile config_calling.yaml.example --dryrun
# This is an example of the snakemake command line to launch the full workflow on a SLURM cluster
# {cluster.cpu}, {cluster.mem} and {cluster.partition} will be replaced by the values defined in the resources_SLURM.yaml file
snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
--configfile config_calling.yaml.example \
--cluster-config $WORKFLOW_DIR/resources_SLURM.yaml \
--cluster "sbatch -p {cluster.partition} --cpus-per-task={cluster.cpu} --mem-per-cpu={cluster.mem} --error=logs/%x.stderr --output=logs/%x.stdout " --latency-wait 30
# If you want to launch the workflow partially, indicate the final output files you want (see dryrun log)
#
# For RSEM quantification only
PREFIX=?? # your sample_config TSV file without extension
snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
--configfile config_calling.yaml.example \
--cluster-config $WORKFLOW_DIR/resources_SLURM.yaml \
--cluster "sbatch -p {cluster.partition} --cpus-per-task={cluster.cpu} --mem-per-cpu={cluster.mem} --error=logs/%x.stderr --output=logs/%x.stdout " --latency-wait 30 \
Results/Summary/Log_TrimGalore_summary.tsv Results/Summary/Log_STAR_Aln_1_summary.tsv Results/Summary/Log_STAR_Aln_2_summary.tsv \
Results/Summary/RSEM_multimap_summary_${PREFIX}_genes_expected_count.tsv \
Results/Summary/RSEM_uniqmap_summary_${PREFIX}_genes_expected_count.tsv \
Results/Summary/RSEM_multimap_summary_${PREFIX}_genes_FPKM.tsv \
Results/Summary/RSEM_uniqmap_summary_${PREFIX}_genes_FPKM.tsv \
Results/Summary/RSEM_multimap_summary_${PREFIX}_genes_TPM.tsv \
Results/Summary/RSEM_uniqmap_summary_${PREFIX}_genes_TPM.tsv \
Results/Summary/RSEM_multimap_summary_${PREFIX}_isoforms_expected_count.tsv \
Results/Summary/RSEM_uniqmap_summary_${PREFIX}_isoforms_expected_count.tsv \
Results/Summary/RSEM_multimap_summary_${PREFIX}_isoforms_FPKM.tsv \
Results/Summary/RSEM_uniqmap_summary_${PREFIX}_isoforms_FPKM.tsv \
Results/Summary/RSEM_multimap_summary_${PREFIX}_isoforms_TPM.tsv \
Results/Summary/RSEM_uniqmap_summary_${PREFIX}_isoforms_TPM.tsv
# For GATK calling only
PREFIX=?? # your sample_config TSV file without extension
snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
--configfile config_calling.yaml.example \
--cluster-config $WORKFLOW_DIR/resources_SLURM.yaml \
--cluster "sbatch -p {cluster.partition} --cpus-per-task={cluster.cpu} --mem-per-cpu={cluster.mem} --error=logs/%x.stderr --output=logs/%x.stdout " --latency-wait 30 \
Results/Summary/Log_TrimGalore_summary.tsv Results/Summary/Log_STAR_Aln_1_summary.tsv Results/Summary/Log_STAR_Aln_2_summary.tsv \
Results/GATK/${PREFIX}_SNP.vcf.gz\
Results/GATK/${PREFIX}_INDEL.vcf.gz
#!/bin/bash
##############################################################################
## launch workflow
# if used on the Genologin cluster (INRA Toulouse)
# module load system/Python-3.6.3
# module load system/singularity-3.0.1
# Path to the SNAKEMAKE workflow directory
WORKFLOW_DIR=.
mkdir -p logs
# This is an example of the snakemake command line to launch a dryrun
snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
--configfile config_calling.yaml.example --dryrun
# This is an example of the snakemake command line to launch the full workflow on a SLURM cluster
# {cluster.cpu}, {cluster.mem} and {cluster.partition} will be replaced by the values defined in the resources_SLURM.yaml file
snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
--use-singularity \
--configfile config_calling.yaml.example \
--cluster-config $WORKFLOW_DIR/resources_SLURM.yaml \
--cluster "sbatch -p {cluster.partition} --cpus-per-task={cluster.cpu} --mem-per-cpu={cluster.mem} --error=logs/%x.stderr --output=logs/%x.stdout " --latency-wait 30
# If you want to launch the workflow partially, indicate the final output files you want (see dryrun log)
#
# For RSEM quantification only
PREFIX=?? # your sample_config TSV file without extension
snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
--configfile config_calling.yaml.example \
--use-singularity \
--cluster-config $WORKFLOW_DIR/resources_SLURM.yaml \
--cluster "sbatch -p {cluster.partition} --cpus-per-task={cluster.cpu} --mem-per-cpu={cluster.mem} --error=logs/%x.stderr --output=logs/%x.stdout " --latency-wait 30 \
Results/Summary/Log_TrimGalore_summary.tsv Results/Summary/Log_STAR_Aln_1_summary.tsv Results/Summary/Log_STAR_Aln_2_summary.tsv \
Results/Summary/RSEM_multimap_summary_${PREFIX}_genes_expected_count.tsv \
Results/Summary/RSEM_uniqmap_summary_${PREFIX}_genes_expected_count.tsv \
Results/Summary/RSEM_multimap_summary_${PREFIX}_genes_FPKM.tsv \
Results/Summary/RSEM_uniqmap_summary_${PREFIX}_genes_FPKM.tsv \
Results/Summary/RSEM_multimap_summary_${PREFIX}_genes_TPM.tsv \
Results/Summary/RSEM_uniqmap_summary_${PREFIX}_genes_TPM.tsv \
Results/Summary/RSEM_multimap_summary_${PREFIX}_isoforms_expected_count.tsv \
Results/Summary/RSEM_uniqmap_summary_${PREFIX}_isoforms_expected_count.tsv \
Results/Summary/RSEM_multimap_summary_${PREFIX}_isoforms_FPKM.tsv \
Results/Summary/RSEM_uniqmap_summary_${PREFIX}_isoforms_FPKM.tsv \
Results/Summary/RSEM_multimap_summary_${PREFIX}_isoforms_TPM.tsv \
Results/Summary/RSEM_uniqmap_summary_${PREFIX}_isoforms_TPM.tsv
# For GATK calling only
PREFIX=?? # your sample_config TSV file without extension
snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
--configfile config_calling.yaml.example \
--use-singularity \
--cluster-config $WORKFLOW_DIR/resources_SLURM.yaml \
--cluster "sbatch -p {cluster.partition} --cpus-per-task={cluster.cpu} --mem-per-cpu={cluster.mem} --error=logs/%x.stderr --output=logs/%x.stdout " --latency-wait 30 \
Results/Summary/Log_TrimGalore_summary.tsv Results/Summary/Log_STAR_Aln_1_summary.tsv Results/Summary/Log_STAR_Aln_2_summary.tsv \
Results/GATK/${PREFIX}_SNP.vcf.gz\
Results/GATK/${PREFIX}_INDEL.vcf.gz
################################################################################
#
# SOFTWARE : software should be in PATH or in bin_dir, or use a singularity image
# bin_dir is a directory containing the software binaries used in this pipeline:
# bin_dir:
#
### Singularity Environment
# Can be generated with env/rnaseq-gatk-singularity-receipe.txt
singulatiry_img: <ABS_PATH>/rnaseq-gatk-singularity
# for the singularity image, use the following environment variables.
### OR executable in a bin directory
# bin_dir is a directory containing the software binaries used in this pipeline:
# bin_dir:
# for bin_dir fix path to jar files
# for the singularity image, use the following environment variables: $PICARD_PATH and $GATK_PATH.
picard_jar: $PICARD_PATH
gatk_jar: $GATK_PATH
################################################################################
#
# WORKFLOW INPUTS
#
# data_dir contains fastq files described in sample_config file
data_dir: <ABS_PATH>
# sample file : see the README file for detailed explanations
# sample_config file is a tabular file describing each input fastq file
# sample_config file name will be used as prefix of the output files.
# header columns are : idx name forward_read reverse_read sequencer read_length oriented phred_scale
# forward_read and reverse_read are fastq file names. These files need to be in the data_dir directory
# sample may be paired end or single end.
# - If single end, leave an empty column in the reverse_read
# - If paired, file names need to end with _R1.fastq.gz or _R2.fastq.gz (or fq instead of fastq, and not necessarily compressed)
# sequencer needs to be chosen from the list : ["ILLUMINA","SLX","SOLEXA","SOLID","454","LS454","COMPLETE","PACBIO","IONTORRENT","CAPILLARY","HELICOS","UNKNOWN"]
# oriented is the forward_prob RSEM parameter to indicate :
# 1 for a strand-specific protocol where all (upstream) reads are derived from the forward strand,
# 0 for a strand-specific protocol where all (upstream) reads are derived from the reverse strand,
# 0.5 for a non-strand-specific protocol.
# phred scale indicates the phred score scale used to encode base quality: either 33 (Sanger and illumina 1.8+) or 64 (Solexa, illumina 1.3 to 1.8 excluded)
sample_config : sample.tsv
# fasta_ref file is the genome reference Fasta file
fasta_ref: <ABS_PATH>/reference.fa
......
......@@ -2,7 +2,7 @@ __default__ :
mem : "8G"
cpu : 8
time : "24:00:00"
queue : workq
partition : workq
convertPhred:
cpu : 1
......