Commit 5e59c856 authored by mariabernard

Merge branch 'develop'

parents 9ae7f1ff 33ef13bc
Showing changed files with 1017 additions and 337 deletions
......@@ -28,6 +28,7 @@ def check_param(rule,resource):
rule_resources = {
'indexFasta' : ['mem'],
'FS_QD_clust' : ['mem'],
'vcf_filtration' : ['cpu'],
'SelectVariants' : ['mem'],
'STAR_Index_1' : ['cpu'],
'STAR_Aln_1' : ['cpu'],
......@@ -150,7 +151,7 @@ final_outputs.append('Results_ASE/vcfFiltration/' + os.path.basename(config['vcf
final_outputs.append('Results_ASE/vcfFiltration/' + os.path.basename(config['vcf']).replace('.vcf.gz','_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_GT.tsv'))
final_outputs.append('Results_ASE/vcfFiltration/' + os.path.basename(config['vcf']).replace('.vcf.gz','_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_DP.tsv'))
final_outputs.append('Results_ASE/vcfFiltration/' + os.path.basename(config['vcf']).replace('.vcf.gz','_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_AD.tsv'))
final_outputs.append('Results_ASE/Summary/' + os.path.basename(config['vcf']).replace('.vcf.gz','') + '_VariantFiltration_summary.txt')
final_outputs.append('Results_ASE/Summary/VariantFiltration_summary.txt')
# counter
for sample in table["sample_name"].unique():
......@@ -172,7 +173,7 @@ for sample in table["sample_name"].unique():
# 'Results_ASE/vcfFiltration/variants_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_GT.tsv',
# 'Results_ASE/vcfFiltration/variants_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_DP.tsv',
# 'Results_ASE/vcfFiltration/variants_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_AD.tsv',
# 'Results_ASE/Summary/variants_VariantFiltration_summary.txt',
# 'Results_ASE/Summary/VariantFiltration_summary.txt',
# 'Results_ASE/ASEReadCounter/samplePE_ASEReadCounter.csv',
# 'Results_ASE/phASER/samplePE_phASER.allelic_counts.txt',
# 'Results_ASE/phASER_gene_ae/samplePE_phASER.gene_ae.txt',
......
......@@ -42,12 +42,6 @@ data_dir : data
SNP_filter : True
# quality trimming threshold used in trimgalore to remove low quality bases.
trimming_quality : 15
# minimum calling depth
depth : 10
# minimum percentage (between 0 and 1) of sample with known genotype
GTpopCR_th : 0.50
# minimum percentage (between 0 and 1) of sample with known genotype with DP > 5
DPgt05rPopCR_th : 0.20
# minimum base quality
baseQuality : 20
# minimum mapping quality
......
......@@ -4,28 +4,36 @@ __default__:
time : "4:00:00"
partition : workq
trim_pe :
time : "24:00:00"
trim_se :
time : "24:00:00"
FS_QD_clust:
mem : "20G"
SelectVariants:
mem : "20G"
VCF_for_ASE_and_table:
vcf_filtration:
cpu : 10
mem : "16G"
time : "96:00:00"
partition : unlimitq
partition : workq
STAR_Index_1:
mem : "18G"
mem : "50G"
cpu : 8
time : "24:00:00"
STAR_Aln_1:
mem : "12G"
mem : "30G"
cpu : 8
time : "24:00:00"
STAR_Aln_2:
mem : "12G"
mem : "30G"
cpu : 8
time : "24:00:00"
......@@ -37,9 +45,11 @@ RmDuplicates:
SplitNCigarReads:
mem : "30G"
time : "8:00:00"
phASER:
mem : "20G"
cpu : 8
mem : "130G"
phASER_expr_matrix:
cpu : 8
......
......@@ -51,18 +51,9 @@ rule ASEReadCounter:
baseQuality = config['baseQuality']
shell:
"""
gatk --java-options "-Xmx{params.mem}" ASEReadCounter -R {input.ref} -O {output} -I {input.bam} --variant {input.vcf} --min-mapping-quality {params.mappingQuality} --min-base-quality {params.baseQuality}
gatk --java-options "-Xmx{params.mem}" ASEReadCounter -R {input.ref} -O {output} -I {input.bam} --tmp-dir Results_ASE/ASEReadCounter --variant {input.vcf} --min-mapping-quality {params.mappingQuality} --min-base-quality {params.baseQuality}
"""
rule gtf2bed:
input:
gtf = config['gtf_ref']
output:
bed = 'Results_ASE/ref_bed/' + os.path.basename(config['gtf_ref']).replace('.gtf', '.bed')
script:
"../script/gtf2bed.py"
rule phASER:
input:
vcf = 'Results_ASE/vcfFiltration/' + os.path.basename(config['vcf']).replace('.vcf.gz','_FsQdBiall.vcf.gz'),
......@@ -80,9 +71,19 @@ rule phASER:
idSeparator = '_' if config['id_separator'] == '' else config['id_separator']
shell:
"""
python2.7 {config[bin_dir]}/phaser.py --vcf {input.vcf} --bam {input.bam} --sample {wildcards.sample} --threads {threads} --o Results_ASE/Counter/{wildcards.sample}_phASER --paired_end {params.paired_option} --baseq {params.baseQuality} --mapq {params.mappingQuality} --id_separator {params.idSeparator}
python2.7 {config[bin_dir]}/phaser.py --vcf {input.vcf} --bam {input.bam} --sample {wildcards.sample} --threads {threads} --o Results_ASE/phASER/{wildcards.sample}_phASER --paired_end {params.paired_option} --baseq {params.baseQuality} --mapq {params.mappingQuality} --id_separator {params.idSeparator}
"""
rule gtf2bed:
input:
gtf = config['gtf_ref']
output:
bed = 'Results_ASE/ref_bed/' + os.path.basename(config['gtf_ref']).replace('.gtf', '.bed')
script:
"../script/gtf2bed.py"
rule phASER_gene_ae:
input:
hap_count = "Results_ASE/phASER/{sample}_phASER.haplotypic_counts.txt",
......@@ -93,7 +94,7 @@ rule phASER_gene_ae:
idSeparator = '_' if config['id_separator'] == '' else config['id_separator']
shell:
"""
python2.7 {config[bin_dir]}/phaser_gene_ae.py --haplotypic_counts {input.hap_count} --features {input.bed} --o {output} --id_separa
python2.7 {config[bin_dir]}/phaser_gene_ae.py --haplotypic_counts {input.hap_count} --features {input.bed} --o {output} --id_separator {params.idSeparator}
"""
# TO FINISH AND TEST
......
......@@ -45,4 +45,4 @@ rule bam_index:
temp("{file}.bam.bai")
shell:
"samtools index {input} "
\ No newline at end of file
"samtools index {input} "
......@@ -6,13 +6,18 @@ __version__ = '1.0.0'
'''
?????
'''
import sys
from pyfaidx import Fasta
genome = Fasta(config['fasta_ref'])
chrom_list = [ seq.name for seq in genome if seq.name.isdigit() ]
PREFIX = os.path.basename(config['vcf']).replace('.vcf.gz','')
# GATK4
rule SelectVariants :
input :
vcf = config['vcf']
output :
vcf = temp("Results_ASE/vcfFiltration/{prefix}_SNP.vcf")
vcf = temp("Results_ASE/vcfFiltration/" + PREFIX + "_SNP.vcf")
params :
mem = config["SelectVariants"]["mem"]
shell:
......@@ -22,7 +27,7 @@ rule SelectVariants :
def get_SNP_vcf(wildcards):
if config['SNP_filter']:
return "Results_ASE/vcfFiltration/{prefix}_SNP.vcf"
return "Results_ASE/vcfFiltration/" + PREFIX + "_SNP.vcf"
else:
return config['vcf']
......@@ -34,7 +39,8 @@ rule FS_QD_clust :
vcf = get_SNP_vcf,
ref = config['fasta_ref']
output :
vcf = temp("Results_ASE/vcfFiltration/{prefix}_FS_QD_clust_tagged.vcf")
vcf = temp("Results_ASE/vcfFiltration/" + PREFIX + "_FS_QD_clust_tagged.vcf"),
idx = temp("Results_ASE/vcfFiltration/" + PREFIX + "_FS_QD_clust_tagged.vcf.idx")
params :
mem = config["FS_QD_clust"]["mem"]
shell:
......@@ -42,21 +48,116 @@ rule FS_QD_clust :
gatk --java-options "-Xmx{params.mem}" VariantFiltration -R {input.ref} -V {input.vcf} -O {output.vcf} --filter-name FS --filter-expression "FS > 30.0" --filter-name QD --filter-expression "QD < 2.0" -window 35 -cluster 3
'''
rule VCF_for_ASE_and_table :
rule vcf_filtration :
input :
vcf = "Results_ASE/vcfFiltration/{prefix}_FS_QD_clust_tagged.vcf",
vcf = "Results_ASE/vcfFiltration/" + PREFIX + "_FS_QD_clust_tagged.vcf",
idx = "Results_ASE/vcfFiltration/" + PREFIX + "_FS_QD_clust_tagged.vcf.idx",
gtf = config['gtf_ref'],
fasta = config['fasta_ref']
output :
vcf = temp("Results_ASE/vcfFiltration/{prefix}_FsQdBiall.vcf") ,
gt = "Results_ASE/vcfFiltration/{prefix}_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_GT.tsv" ,
dp = "Results_ASE/vcfFiltration/{prefix}_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_DP.tsv" ,
ad = "Results_ASE/vcfFiltration/{prefix}_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_AD.tsv" ,
summary = "Results_ASE/Summary/{prefix}_VariantFiltration_summary.txt"
vcf = temp("Results_ASE/vcfFiltration/by_chrom/{chrom}_" + PREFIX + "_FsQdBiall.vcf") ,
gt = temp("Results_ASE/vcfFiltration/by_chrom/{chrom}_" + PREFIX + "_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_GT.tsv") ,
dp = temp("Results_ASE/vcfFiltration/by_chrom/{chrom}_" + PREFIX + "_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_DP.tsv" ),
ad = temp("Results_ASE/vcfFiltration/by_chrom/{chrom}_" + PREFIX + "_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_AD.tsv" ),
summary = temp("Results_ASE/vcfFiltration/by_chrom/{chrom}_" + PREFIX + "_VariantFiltration_summary.txt")
params :
chrom_selection = lambda wildcards : "--accepted-chrom " + wildcards.chrom if wildcards.chrom != "other" else "--excluded-chrom " + " ".join(chrom_list),
cpu = config['vcf_filtration']['cpu']
shell:
'''
variantFiltration.py --nb-cpus {params.cpu} --in-vcf {input.vcf} --in-gtf {input.gtf} --in-fasta {input.fasta} {params.chrom_selection} --out-vcf {output.vcf} --out-gt {output.gt} --out-dp {output.dp} --out-ad {output.ad} --summary {output.summary}
'''
rule merge_filteredVariant :
input :
chr_vcf = expand("Results_ASE/vcfFiltration/by_chrom/{CHR}_" + PREFIX + "_FsQdBiall.vcf", CHR=chrom_list),
other_vcf = "Results_ASE/vcfFiltration/by_chrom/other_" + PREFIX + "_FsQdBiall.vcf"
output :
vcf = "Results_ASE/vcfFiltration/" + PREFIX + "_FsQdBiall.vcf"
shell:
'''
vcf_annotator.py --in-vcf {input.vcf} --in-gtf {input.gtf} --in-fasta {input.fasta} --out-vcf {output.vcf} --out-gt {output.gt} --out-dp {output.dp} --out-ad {output.ad} --summar {output.summary}
bcftools concat -o {output.vcf} -O v {input.chr_vcf} {input.other_vcf}
'''
def get_table(wildcards):
table_list = expand("Results_ASE/vcfFiltration/by_chrom/{chrom}_" + PREFIX + "_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_"+wildcards.suffix + ".tsv", chrom=chrom_list )
return table_list
rule merge_filtered_table:
input :
chr_tsv = get_table,
other_tsv = "Results_ASE/vcfFiltration/by_chrom/other_" + PREFIX + "_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_{suffix}.tsv"
output :
tsv = "Results_ASE/vcfFiltration/" + PREFIX + "_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_{suffix}.tsv"
wildcard_constraints:
suffix="GT|AD|DP"
shell:
'''
head -n 1 {input.other_tsv} > {output.tsv}
cat {input.chr_tsv} {input.other_tsv}| grep -v "#" >> {output.tsv}
'''
rule merge_filtered_summary:
input :
chr_tsv = expand("Results_ASE/vcfFiltration/by_chrom/{chrom}_" + PREFIX + "_VariantFiltration_summary.txt", chrom=chrom_list),
other_tsv = "Results_ASE/vcfFiltration/by_chrom/other_" + PREFIX + "_VariantFiltration_summary.txt"
output :
summary = "Results_ASE/Summary/VariantFiltration_summary.txt"
run :
dic_results = dict()
in_list = input.chr_tsv
in_list.append(input.other_tsv)
# in_list = ['Results_ASE/vcfFiltration/by_chrom/1_test_VariantFiltration_summary.txt', 'Results_ASE/vcfFiltration/by_chrom/25_test_VariantFiltration_summary.txt', 'Results_ASE/vcfFiltration/by_chrom/26_test_VariantFiltration_summary.txt', 'Results_ASE/vcfFiltration/by_chrom/other_test_VariantFiltration_summary.txt']
first=in_list.pop(0)
FH_first = open(first,'rt')
summary = ''
first_key = ''
percent_list = list()
for line in FH_first:
if ':' in line:
category = line.split(':')[0]
key = category.upper().strip().replace(' ','_')
# store the first key, considered as the total number of variants
if first_key == '':
first_key = key
# store keys for which a percentage should be computed
if '%)' in line:
percent_list.append(key)
value = int(line.split(':')[1].strip().split()[0])
summary += category + ': ###' + key + '###\n'
dic_results[key] = value
elif "# Input vcf file is" in line:
line = "# Input vcf file is " + "Results_ASE/vcfFiltration/" + PREFIX + "_FS_QD_clust_tagged.vcf\n"
summary += line
elif "those SNP are described in" in line:
line = "\t\t\tthose SNP are described in " + "Results_ASE/vcfFiltration/" + PREFIX + "_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_GT.tsv, Results_ASE/vcfFiltration/" + PREFIX + "_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_DP.tsv, Results_ASE/vcfFiltration/" + PREFIX + "_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_AD.tsv\n"
summary += line
else:
summary += line
FH_first.close()
for f in in_list:
FH_in = open(f,'rt')
for line in FH_in:
if ':' in line:
category = line.strip().split(':')[0]
key = category.upper().strip().replace(' ','_')
value = int(line.strip().split(':')[1].strip().split()[0])
dic_results[key] += value
FH_in.close()
for k in dic_results:
if k in percent_list:
percent = round(dic_results[k]*100/dic_results[first_key],2)
summary = summary.replace('###' + k + '###' , str(dic_results[k]) + ' (' + str(percent) + '%)')
else:
summary = summary.replace('###' + k + '###' , str(dic_results[k]))
FH_out = open(output.summary,'wt')
FH_out.write(summary)
FH_out.close()
#!/usr/bin/env python3
import argparse
import os,sys
import time
import subprocess
from subprocess import Popen, PIPE
import threading
import multiprocessing
##########################
# FUNCTION
def select_on_chrom(in_vcf, accepted_chrom, excluded_chrom, selected_vcf):
FH_in = open(in_vcf, 'rt')
FH_out = open(selected_vcf,'wt')
for line in FH_in:
if line.startswith('#'):
FH_out.write(line)
elif accepted_chrom and line.split()[0] in accepted_chrom :
FH_out.write(line)
elif excluded_chrom and line.split()[0] not in excluded_chrom :
FH_out.write(line)
def split_vcf(in_vcf, nb_cpus, tmp_files ):
# count variant
cmd = 'grep -vc "#" ' + in_vcf
p = Popen(cmd, shell=True, universal_newlines=True, stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
nb_var = int(stdout.strip())
nb_by_file = int(nb_var / nb_cpus)
# write split VCF chunks
tmp_FH = [ open(f,'wt') for f in tmp_files]
FH_in = open(in_vcf)
header = list()
c=0
var=list()
for line in FH_in:
if line.startswith('#'):
header.append(line)
else:
if len(header) > 0:
for t in tmp_FH:
t.write("".join(header))
header.clear()
c += 1
var.append(line)
# write variants in batches of 10000
if len(tmp_FH)>0 and len(var) == min(10000,nb_by_file):
tmp_FH[0].write(''.join(var))
var.clear()
# change temp output file
if len(tmp_FH)>0 and c == nb_by_file:
c = 0
tmp_FH[0].close()
del(tmp_FH[0])
# write remaining var
if len(var) > 0:
tmp_FH = open(tmp_files[-1],'a')
tmp_FH.write(''.join(var))
var.clear()
tmp_FH.close()
def submit_cmd( cmd, cwd=None):
"""
@summary: Submits a command on the system and raises an exception if it fails.
@param cmd: [list] The command.
@param cwd: [str] Optional working directory in which to run the command.
"""
p = Popen(cmd, universal_newlines=True, stdout=PIPE, stderr=PIPE, cwd=cwd)
stdout, stderr = p.communicate()
# check error status
if p.returncode != 0:
raise Exception( stderr )
def process_variantFiltering_and_Annot(in_vcf, in_gtf, in_fasta, out_vcf, out_gt, out_dp, out_ad, out_summary):
cmd = ['vcf_filter_and_annot.py', '--in-vcf', in_vcf, '--in-gtf', in_gtf, '--in-fasta', in_fasta,
'--out-vcf', out_vcf, '--out-gt', out_gt, '--out-dp', out_dp, '--out-ad', out_ad, '--summary', out_summary]
print(" ".join(cmd) + "\n")
submit_cmd( cmd )
def parallel_submission( function, inputs_vcf, in_gtf, in_fasta, outputs_vcf, outputs_gt, outputs_dp, outputs_ad, outputs_summary):
cpu_used = len(inputs_vcf)
processes = [{'process':None, 'input_vcf':None, 'gtf' : in_gtf, 'fasta':in_fasta,
'output_vcf':None, 'output_gt':None, 'output_dp':None, 'output_ad':None, 'output_summary':None } for idx in range(cpu_used)]
# Launch processes
for idx in range(cpu_used):
process_idx = idx % cpu_used
processes[process_idx]['input_vcf'] = inputs_vcf[idx]
processes[process_idx]['output_vcf'] = outputs_vcf[idx]
processes[process_idx]['output_gt'] = outputs_gt[idx]
processes[process_idx]['output_dp'] = outputs_dp[idx]
processes[process_idx]['output_ad'] = outputs_ad[idx]
processes[process_idx]['output_summary'] = outputs_summary[idx]
for idx, current_process in enumerate(processes):
if idx == 0: # First process is threaded with parent job
current_process['process'] = threading.Thread(target=function,
args=(current_process['input_vcf'],current_process['gtf'], current_process['fasta'], current_process['output_vcf'], current_process['output_gt'], current_process['output_dp'], current_process['output_ad'], current_process['output_summary']))
else: # Other processes run on a different CPU
current_process['process'] = multiprocessing.Process(target=function,
args=(current_process['input_vcf'],current_process['gtf'], current_process['fasta'], current_process['output_vcf'], current_process['output_gt'], current_process['output_dp'], current_process['output_ad'], current_process['output_summary']))
current_process['process'].start()
# Wait processes end
for current_process in processes:
current_process['process'].join()
# Check processes status
for current_process in processes:
if issubclass(current_process['process'].__class__, multiprocessing.Process) and current_process['process'].exitcode != 0:
raise Exception("Error in sub-process execution.")
def merge_vcf(in_list,out):
cmd = ['bcftools','concat','-o',out,'-O','v'] + in_list
print ("\n#merge temp VCFs\n" + " ".join(cmd) + "\n")
submit_cmd( cmd )
def merge_tsv(in_list, out):
print ("\n#merge temp tables\n\tInputs : " + " ".join(in_list) + "\n\tOutputs : " + out + "\n")
FH_out = open(out,'wt')
first = True
for f in in_list:
FH_in = open(f,'rt')
header=FH_in.readline()
if first :
FH_out.write(header)
first = False
for line in FH_in:
FH_out.write(line)
def merge_summary(in_list, out):
print ("\n#merge temp summaries\n\tInputs : " + " ".join(in_list) + "\n\tOutputs : " + out + "\n")
dic_results = dict()
first=in_list.pop(0)
FH_first = open(first,'rt')
# prefix = tmp_dir/time_pid_i_
prefix = os.path.join(os.path.dirname(first), '_'.join(os.path.basename(first).split('_')[0:3])) + '_'
summary = ''
first_key = ''
percent_list = list()
for line in FH_first:
if prefix in line:
summary += line.replace(prefix,'')
elif ':' in line:
category = line.split(':')[0]
key = category.upper().strip().replace(' ','_')
# store the first key, considered as the total number of variants
if first_key == '':
first_key = key
# store keys for which a percentage should be computed
if '%)' in line:
percent_list.append(key)
value = int(line.split(':')[1].strip().split()[0])
summary += category + ': ###' + key + '###\n'
dic_results[key] = value
else:
summary += line
for f in in_list:
FH_in = open(f,'rt')
for line in FH_in:
if ':' in line:
category = line.strip().split(':')[0]
key = category.upper().strip().replace(' ','_')
value = int(line.strip().split(':')[1].strip().split()[0])
dic_results[key] += value
for k in dic_results:
if k in percent_list:
percent = round(dic_results[k]*100/dic_results[first_key],2)
summary = summary.replace('###' + k + '###' , str(dic_results[k]) + ' (' + str(percent) + '%)')
else:
summary = summary.replace('###' + k + '###' , str(dic_results[k]))
FH_out = open(out,'wt')
FH_out.write(summary)
FH_out.close()
def process(params):
in_vcf=params.in_vcf
out_dir = os.path.dirname(params.out_vcf)
temp_dir = os.path.join(out_dir,str(time.time()) + "_" + str(os.getpid()) + '_temp_variantFiltration')
temp_files = list()
try:
# select variant based on chrom
if params.accepted_chrom is not None or params.excluded_chrom is not None:
os.mkdir(temp_dir)
selected_vcf = os.path.join(temp_dir, str(time.time()) + "_" + str(os.getpid()) + "_selectOnChrom_" + os.path.basename(in_vcf))
temp_files.append(selected_vcf)
select_on_chrom(in_vcf, params.accepted_chrom, params.excluded_chrom, selected_vcf)
in_vcf = selected_vcf
if params.nb_cpus == 1:
process_variantFiltering_and_Annot(in_vcf, params.in_gtf, params.in_fasta, params.out_vcf, params.out_gt, params.out_dp, params.out_ad, params.summary)
else :
#### create temp_dir
if not os.path.exists(temp_dir):
os.mkdir(temp_dir)
#### split VCF in [nb_cpus] files
prefix = str(time.time()) + "_" + str(os.getpid())
splitted_in_vcf_files = [ os.path.join(temp_dir, prefix + "_" + str(i) + "_" + os.path.basename(in_vcf)) for i in range(0,params.nb_cpus )]
temp_files += splitted_in_vcf_files
split_vcf(in_vcf, params.nb_cpus, splitted_in_vcf_files )
#### filter_and_annotate
splitted_out_vcf_files = [ os.path.join(temp_dir, prefix + "_" + str(i) + "_" + os.path.basename(params.out_vcf)) for i in range(0,params.nb_cpus)]
splitted_out_gt_files = [ os.path.join(temp_dir, prefix + "_" + str(i) + "_" + os.path.basename(params.out_gt)) for i in range(0,params.nb_cpus)]
splitted_out_dp_files = [ os.path.join(temp_dir, prefix + "_" + str(i) + "_" + os.path.basename(params.out_dp)) for i in range(0,params.nb_cpus)]
splitted_out_ad_files = [ os.path.join(temp_dir, prefix + "_" + str(i) + "_" + os.path.basename(params.out_ad)) for i in range(0,params.nb_cpus)]
splitted_out_summary_files = [ os.path.join(temp_dir, prefix + "_" + str(i) + "_" + os.path.basename(params.summary)) for i in range(0,params.nb_cpus)]
temp_files += splitted_out_vcf_files + splitted_out_gt_files + splitted_out_ad_files + splitted_out_dp_files + splitted_out_summary_files
parallel_submission( process_variantFiltering_and_Annot, splitted_in_vcf_files, params.in_gtf, params.in_fasta, splitted_out_vcf_files, splitted_out_gt_files, splitted_out_dp_files, splitted_out_ad_files, splitted_out_summary_files )
# merge outputs
merge_vcf(splitted_out_vcf_files, params.out_vcf)
merge_tsv(splitted_out_gt_files, params.out_gt)
merge_tsv(splitted_out_dp_files, params.out_dp)
merge_tsv(splitted_out_ad_files, params.out_ad)
merge_summary(splitted_out_summary_files, params.summary)
finally :
# remove temp_dir
if not params.debug and os.path.exists(temp_dir):
# remove temp files
for f in temp_files:
if os.path.exists(f):
os.remove(f)
# remove temp dir
os.rmdir(temp_dir)
##########################
# MAIN
if __name__ == '__main__':
# Parameters
parser = argparse.ArgumentParser(description="Annotate variants by adding INFO fields : \n " +
" - LOCALISATION: PASS or a combination of jnct5nt (near exon junction), homodimer5 (in homopolymer), cl3snp35nt (in SNP cluster) \n" +
" - GTpopCR= X% : percentage of individuals genotyped \n" +
" - DPgt05rPopCR= X% : percentage of individuals genotyped with DP>5\n")
parser.add_argument('-n','--nb-cpus', type=int, default=1, help='Number of CPUs to parallelise the processing')
parser.add_argument('-d','--debug', default=False, action='store_true', help='keep temporary files')
parser.add_argument('-a','--accepted-chrom', nargs="+", required=False, help='list of chromosomes to include')
parser.add_argument('-e','--excluded-chrom', nargs="+", required=False, help='list of chromosomes to exclude')
# Inputs
group_input = parser.add_argument_group( 'Inputs' )
group_input.add_argument('--in-vcf', required=True, help="The input vcf file to annotate.")
group_input.add_argument('--in-gtf', required=True, help="The genome annotation GTF file, to tag SNPs close to exon boundaries (+/- 5bp)")
group_input.add_argument('--in-fasta', required=True, help="The genome Fasta file, to tag SNPs in homopolymers (5 bp).")
# Outputs
group_output = parser.add_argument_group( 'Outputs' )
group_output.add_argument('--out-vcf', required=True, help="The annotated vcf file output.")
group_output.add_argument('--out-gt', required=True, help="A GT table of variants with FILTER PASS, GTpopCR > 50 and DPgt05rPopCR > 20 ")
group_output.add_argument('--out-dp', required=True, help="A DP table of variants with FILTER PASS, GTpopCR > 50 and DPgt05rPopCR > 20 ")
group_output.add_argument('--out-ad', required=True, help="An AD table of variants with FILTER PASS, GTpopCR > 50 and DPgt05rPopCR > 20 ")
group_output.add_argument('--summary', required=True, help="Annotation summary")
args = parser.parse_args()
if args.accepted_chrom is not None and args.excluded_chrom is not None:
intersect_chrom = [ c for c in args.accepted_chrom if c in args.excluded_chrom]
if len(intersect_chrom) > 0:
raise Exception("accepted_chrom and excluded_chrom list must not intersect\n")
process(args)
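# Example invocation of this script (file names are illustrative; the options are those defined above):
#   variantFiltration.py --nb-cpus 4 --in-vcf tagged.vcf --in-gtf annotation.gtf --in-fasta genome.fa \
#       --accepted-chrom 1 2 3 --out-vcf filtered.vcf --out-gt GT.tsv --out-dp DP.tsv --out-ad AD.tsv --summary summary.txt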
......@@ -78,6 +78,11 @@ def process(params):
# FILTER PASS, LOC PASS, GTpopCR > 50, DPgt05rPopCR > 20
PERFECT = 0
# buffers to write output records in blocks
write_variant = list()
write_table_dp = list()
write_table_ad = list()
write_table_gt = list()
# parse input file
for line in FH_in:
if line.startswith("#"):
......@@ -100,11 +105,11 @@ def process(params):
"\t".join([s + "_DP" for s in samples]) + "\n")
FH_ad.write("\t".join(["#chrom", "pos", "REF", "ALT", "DPpop", "jnct5nt", "homodimer5" ,"cl3snp35nt" , "LOCALISATION"]) + "\t" +
"\t".join([s + "_AD_" + a for s in samples for a in ["REF","ALT"] ]) + "\n")
print ("end parsing header\n")
continue
tot_snp += 1
line_list = line.split("\t")
line_list = line.strip().split("\t")
chrom = line_list[0]
pos = int(line_list[1])
ref = line_list[3]
......@@ -214,7 +219,11 @@ def process(params):
line_list[7] = ";".join(info)
# WRITE FILTER PASS, GTpopCR50 & DPgt05rPopCR20 VARIANTS TO THE GT, DP AND AD TABLES
if "PASS" in filter:
FH_out.write("\t".join(line_list))
write_variant.append("\t".join(line_list) + '\n')
if len(write_variant) == 10000:
print ("writing 10000 filtered variants")
FH_out.write("".join(write_variant))
write_variant=list()
if GTpopCR >= 50 and DPgt05rPopCR >= 20 :
GTpopCR50_DPgt05rPopCR20 += 1
# recover DP
......@@ -230,33 +239,51 @@ def process(params):
common_str.append("1") if "cl3snp35nt" in LOCALISATION else common_str.append("0")
common_str.append("PASS") if "PASS" in LOCALISATION else common_str.append(".")
FH_gt.write("\t".join(common_str) + "\t" + "\t".join(gt_list) + "\n")
FH_dp.write("\t".join(common_str) + "\t" + "\t".join(dp_list) + "\n")
FH_ad.write("\t".join(common_str) + "\t" + "\t".join([ad.replace(",","\t") for ad in ad_list]) + "\n")
write_table_gt.append("\t".join(common_str) + "\t" + "\t".join(gt_list) + "\n")
write_table_dp.append("\t".join(common_str) + "\t" + "\t".join(dp_list) + "\n")
write_table_ad.append("\t".join(common_str) + "\t" + "\t".join([ad.replace(",","\t") for ad in ad_list]) + "\n")
if len(write_table_gt) == 10000:
FH_gt.write("".join(write_table_gt))
FH_dp.write("".join(write_table_dp))
FH_ad.write("".join(write_table_ad))
write_table_dp = list()
write_table_ad = list()
write_table_gt = list()
if len(write_variant) > 0:
FH_out.write("".join(write_variant))
FH_out.close()
if len(write_table_gt) > 0:
FH_gt.write("".join(write_table_gt))
FH_dp.write("".join(write_table_dp))
FH_ad.write("".join(write_table_ad))
FH_gt.close()
FH_dp.close()
FH_ad.close()
# WRITE SUMMARY
FH_summary = open(params.summary, "wt")
FH_summary.write("# Input vcf file :" + params.in_vcf + "\n" \
FH_summary.write("# Input vcf file is " + params.in_vcf + "\n" \
+ "\t contains : " + str(tot_snp) + " variants\n" \
+ "# Filtering step\n" \
+ "\t filtered on non biallelic SNP: " + str(not_SNP_BiAll) + "\n" \
+ "\t filtered on FS: " + str(FS) + "\n" \
+ "\t filtered on QD: " + str(QD) + "\n" \
+ "\t kept SNP: " + str(FILTER_PASS) + " (" + str(round(FILTER_PASS * 100.0 / tot_snp , 2)) + "%)\n" \
+ "\t filtered on non biallelic SNP : " + str(not_SNP_BiAll) + "\n" \
+ "\t filtered on FS : " + str(FS) + "\n" \
+ "\t filtered on QD : " + str(QD) + "\n" \
+ "\t kept SNP : " + str(FILTER_PASS) + " (" + str(round(FILTER_PASS * 100.0 / tot_snp , 2)) + "%)\n" \
+ "\t# filter PASS SNP annotation step\n"
+"\t\t localized in SNP cluster: " + str(cl3snp35nt) + "\n" \
+"\t\t localized near exon junctions: " + str(jnct5nt) + "\n" \
+"\t\t localized in homopolymers: " + str(homodimer5) + "\n" \
+"\t\t localized in unbiased region: " + str(LOC_PASS) + "\n" \
+"\t\t localized in SNP cluster : " + str(cl3snp35nt) + "\n" \
+"\t\t localized near exon junctions : " + str(jnct5nt) + "\n" \
+"\t\t localized in homopolymers : " + str(homodimer5) + "\n" \
+"\t\t localized in unbiased region : " + str(LOC_PASS) + "\n" \
+ "\t# computing call rate step\n" \
+ "\t\t number of SNP with a call rate >= 50%: " + str(GTpopCR50) + "\n" \
+ "\t\t number of SNP with a signigicant (GT with DP>5) call rate >= 20%: " + str(DPgt05rPopCR20) + "\n" \
+ "\t\t number of SNP with GTpopCR >= 50% and DPgt05rPopCR >= 20: " + str(GTpopCR50_DPgt05rPopCR20) + " ("+ str(round(GTpopCR50_DPgt05rPopCR20 * 100.0/ tot_snp,2)) + "%)\n" \
+ "\t\t\t those SNP are described in TSV files : " + params.out_gt + ", "+ params.out_dp + ", " + params.out_ad + "\n" \
+ "\t\t number of SNP with FILTER PASS, LOCALISATION PASS, GTpopCR >=50%, DPgt05rPopCR >= 20%: " + str(PERFECT) + " (" + str(round(PERFECT * 100.0 / tot_snp , 2)) + "%)\n")
+ "\t\t number of SNP with a call rate >= 50% : " + str(GTpopCR50) + "\n" \
+ "\t\t number of SNP with a signigicant (GT with DP>5) call rate >= 20% : " + str(DPgt05rPopCR20) + "\n" \
+ "\t\t number of SNP with GTpopCR >= 50% and DPgt05rPopCR >= 20 : " + str(GTpopCR50_DPgt05rPopCR20) + " ("+ str(round(GTpopCR50_DPgt05rPopCR20 * 100.0/ tot_snp,2)) + "%)\n" \
+ "\t\t\t those SNP are described in " + params.out_gt + ", "+ params.out_dp + ", " + params.out_ad + "\n" \
+ "\t\t number of SNP with FILTER PASS, LOCALISATION PASS, GTpopCR >=50%, DPgt05rPopCR >= 20% : " + str(PERFECT) + " (" + str(round(PERFECT * 100.0 / tot_snp , 2)) + "%)\n")
##########################
......
......@@ -17,7 +17,7 @@ snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
# {cluster.cpu}, {cluster.mem} and {cluster.partition} will be replaced by the values defined in the resources_SLURM.yaml file
snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
--configfile config_calling.yaml.example \
--cluster-config $WORKFLOW_DIR/resources_SLURM.yaml \
--cluster-config $WORKFLOW_DIR/resources_calling_SLURM.yaml \
--cluster "sbatch -p {cluster.partition} --cpus-per-task={cluster.cpu} --mem-per-cpu={cluster.mem} --error=logs/%x.stderr --output=logs/%x.stdout " --latency-wait 30
# If you want to launch the workflow partially, indicate the final output files you want (see dryrun log)
......@@ -26,7 +26,7 @@ snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
PREFIX=?? # your sample_config TSV file without extension
snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
--configfile config_calling.yaml.example \
--cluster-config $WORKFLOW_DIR/resources_SLURM.yaml \
--cluster-config $WORKFLOW_DIR/resources_calling_SLURM.yaml \
--cluster "sbatch -p {cluster.partition} --cpus-per-task={cluster.cpu} --mem-per-cpu={cluster.mem} --error=logs/%x.stderr --output=logs/%x.stdout " --latency-wait 30 \
Results/Summary/Log_TrimGalore_summary.tsv Results/Summary/Log_STAR_Aln_1_summary.tsv Results/Summary/Log_STAR_Aln_2_summary.tsv \
Results/Summary/RSEM_multimap_summary_${PREFIX}_genes_expected_count.tsv \
......@@ -46,7 +46,7 @@ snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
PREFIX=?? # your sample_config TSV file without extension
snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
--configfile config_calling.yaml.example \
--cluster-config $WORKFLOW_DIR/resources_SLURM.yaml \
--cluster-config $WORKFLOW_DIR/resources_calling_SLURM.yaml \
--cluster "sbatch -p {cluster.partition} --cpus-per-task={cluster.cpu} --mem-per-cpu={cluster.mem} --error=logs/%x.stderr --output=logs/%x.stdout " --latency-wait 30 \
Results/Summary/Log_TrimGalore_summary.tsv Results/Summary/Log_STAR_Aln_1_summary.tsv Results/Summary/Log_STAR_Aln_2_summary.tsv \
Results/GATK/${PREFIX}_SNP.vcf.gz\
......
......@@ -2,7 +2,7 @@ __default__ :
mem : "8G"
cpu : 8
time : "24:00:00"
queue : workq
partition : workq
convertPhred:
cpu : 1
......
## Install snakemake 5 with a venv
```
module load system/Python-3.6.3
python3.6 -m venv env/snakemake-5_venv
module purge
source env/snakemake-5_venv/bin/activate
pip install snakemake
pip install drmaa
pip install pandas
```
## Install conda and singularity
Conda:
```
wget https://repo.continuum.io/miniconda/Miniconda2-4.6.14-Linux-x86_64.sh
sh Miniconda2-4.6.14-Linux-x86_64.sh
```
Restart shell:
```
conda config --set auto_activate_base false
conda config --add channels bioconda
```
Singularity:
```
sudo apt-get install singularity-container
```
## Create singularity image
[see build process](./env/README.md)
\ No newline at end of file
# How to install
# RNASeq Analysis workflow : quantification and SNP calling
## Install snakemake 5 with a venv
__authors__: Maria Bernard (INRA - SIGENAE) / Kevin Muret - Frederic Jehl - Sandrine Lagarrigue (INRA - PEGASE)
```
module load system/Python-3.6.3
python3.6 -m venv env/snakemake-5_venv
module purge
source env/snakemake-5_venv/bin/activate
pip install snakemake
pip install drmaa
pip install pandas
```
- [RNASeq Analysis workflow : quantification and SNP calling](#rnaseq-analysis-workflow---quantification-and-snp-calling)
* [What it does ?](#what-it-does--)
* [1. Input](#1-input)
* [2. Data Processing](#2-data-processing)
+ [Raw reads trimming](#raw-reads-trimming)
+ [Sequence alignment](#sequence-alignment)
+ [Transcriptomic Alignment post processing](#transcriptomic-alignment-post-processing)
+ [Gene/Isoform expression](#gene-isoform-expression)
+ [Genomic Alignment post processing](#genomic-alignment-post-processing)
+ [Variant Calling](#variant-calling)
* [3. Dependencies](#3-dependencies)
* [4. Configure and running workflow](#4-configure-and-running-workflow)
## Install conda and singularity
## What it does ?
Conda:
```
wget https://repo.continuum.io/miniconda/Miniconda2-4.6.14-Linux-x86_64.sh
sh Miniconda2-4.6.14-Linux-x86_64.sh
```
This pipeline performs a classical RNASeq analysis workflow (trimming, STAR mapping and RSEM expression quantification) but also calls variants following the [GATK RNASeq recommendations](https://software.broadinstitute.org/gatk/documentation/article.php?id=3891).
Restart shell:
```
conda config --set auto_activate_base false
conda config --add channels bioconda
```
The pipeline is represented in the graph below:
Singularity:
```
sudo apt-get install singularity-container
```
![](/home/maria/workspace/workflows_develop/Snakemake/1000RNASeq_chicken/calling/img/full_dryrun.png)
## Create singularity image
[see build process](./env/README.md)
## 1. Input
# How to run
The user must use the [config_calling.yaml](example/config_calling.yaml) file to provide all necessary inputs for the pipeline:
- A reference fasta file (fasta_ref option)
- A genome reference GTF file (gtf_ref option)
- A set of known variants used to recalibrate base quality in the GATK preprocessing steps RealignerTargetCreator and BaseRecalibrator (known_vcf option)
- A sample reads description TSV file (sample_config option), with the following columns (a minimal example is sketched after this list):
- idx : a unique index number to identify each fastq (pair) file
- name : the sample name
- forward_read and reverse_read are fastq file names. These files need to be in the directory specified by the data_dir option
- If single end, leave the reverse_read column empty
- If paired, file names need to end with _R1.fastq.gz or _R2.fastq.gz (or fq instead of fastq, and not necessarily compressed)
- sequencer : sequencer used to generate the sequences : "ILLUMINA", "SLX", "SOLEXA", "SOLID", "454", "LS454", "COMPLETE", "PACBIO", "IONTORRENT", "CAPILLARY", "HELICOS", "UNKNOWN"
- oriented : to indicate whether sequences were generated with a strand-specific protocol. It corresponds to the forward_prob RSEM parameter. Indicate :
- 1 for a strand-specific protocol where all (upstream) reads are derived from the forward strand,
- 0 for a strand-specific protocol where all (upstream) reads are derived from the reverse strand,
- 0.5 for a non-strand-specific protocol.
- phred scale: the phred score scale used to encode base quality: either 33 (Sanger and illumina 1.8+) or 64 (Solexa, illumina 1.3 to 1.8 excluded)
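A minimal sketch of such a sample_config file (tab separated; sample and file names are purely illustrative):

```
idx	name	forward_read	reverse_read	sequencer	read_length	oriented	phred_scale
1	sampleA	sampleA_lane1_R1.fastq.gz	sampleA_lane1_R2.fastq.gz	ILLUMINA	100	0.5	33
2	sampleA	sampleA_lane2_R1.fastq.gz	sampleA_lane2_R2.fastq.gz	ILLUMINA	100	0.5	33
3	sampleB	sampleB_SE.fastq.gz		ILLUMINA	75	1	33
```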
## 2. Data Processing
### Raw reads trimming
The first step is to filter and trim raw reads before alignment. This step converts phred 64 scale fastq files into phred 33 scale fastq files and then trims reads with [trim_galore](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/). It trims adapters and low-quality ends (see the trimming_quality option in config_calling.yaml) and filters out trimmed reads that are shorter than a third of the read length.
You will find in the Results/Summary directory a Log_TrimGalore_summary.tsv file that details the trim_galore statistics.
### Sequence alignment
Trimmed reads will be aligned against the reference genome with [STAR](https://academic.oup.com/bioinformatics/article/29/1/15/272537), following the multi-sample 2-pass mapping procedure with the GTF reference file.
After a first pass of alignment, the SJout (exon junction) output files will be filtered to create one SJout file that contains new canonical junctions covered by at least 2 uniquely mapped reads and present in at least 2 samples.
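As a rough illustration only (the pipeline uses its own script for this step, and these paths are hypothetical), those filtering criteria applied to STAR SJ.out.tab files could be expressed as:

```
# keep canonical (col 5 > 0), non-annotated (col 6 == 0) junctions supported by >= 2 unique reads (col 7),
# then keep junctions seen in at least 2 samples
cat Results/STAR_Aln_1/*_SJ.out.tab \
  | awk '$5 > 0 && $6 == 0 && $7 >= 2' \
  | cut -f1-4 | sort | uniq -c \
  | awk '$1 >= 2 {print $2"\t"$3"\t"$4"\t"$5}' > merged_SJ.tab
```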
STAR 2nd pass alignment will use this new junction file and the provided GTF to generate bam files in genome and in transcriptome coordinates.
You will find in the Results/Summary directory 2 files reporting alignment statistics: Log_STAR_Aln_1_summary.tsv and Log_STAR_Aln_2_summary.tsv
### Transcriptomic Alignment post processing
Reads with multiple alignments will be filtered out to generate a second transcriptomic bam:
```samtools view -q 255```
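In practice this amounts to something like the following (file names are illustrative):

```
samtools view -b -q 255 sample_toTranscriptome.bam > sample_toTranscriptome_uniq.bam
```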
### Gene/Isoform expression
Expression will be computed on all reads aligned to the transcriptome, or on uniquely mapped reads only, with [RSEM](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-323). For each sample, it will generate gene and isoform results.
You will find in the Results/Summary directory, expression quantification for all samples:
## Configure workflow
* for genes and isoforms
* for expected count, TPM and FPKM quantification methods
* based on uniquely mapped reads or on all alignments.
### 1. Create sample file based on data/sample.example.csv file
The file will be named :
`sample_config` file is a tabular file describing each input fastq file.
​ RSEM\__[multimap/uniqmap]_\_summary_PREFIX\__[gene/isoforms]_\__[TPM/expected\_count/FPKM]_.tsv
The expected header columns are : `idx name forward_read reverse_read sequencer read_length oriented phred_scale`
PREFIX is the name of your sample_config file defined in the config.yaml file.
#### `name` will be used to aggregate samples in the final VCF
#### `forward_read` and `reverse_read` are fastq file names. These files need to be in the data_dir directory. Samples may be paired end or single end.
* If single end, leave the reverse_read column empty
* If paired, file names need to end with _R1.fastq.gz or _R2.fastq.gz (or fq instead of fastq, and not necessarily compressed)
### Genomic Alignment post processing
Genomic alignments will be used to call variants. Each sample's bam is processed as follows (a sketch of one such GATK command follows the list):
- cleaned with cleanSam from GATK
- annotated with AddOrReplaceReadGroups from [Picard tools](https://broadinstitute.github.io/picard/) based on the information you provide in the sample config file
- multimapped reads filtered out (samtools -q 255)
- merged with the other bam files belonging to the same sample name (optional)
- PCR and optical duplicates marked with Picard tools
- split alignments converted into separate alignments, with mapping qualities of 250 reassigned to 60, by SplitNCigarReads from GATK
- indels realigned with RealignerTargetCreator and IndelRealigner, and base quality scores recalibrated with BaseRecalibrator and PrintReads from GATK, using the known_vcf file provided in the config.yaml.
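As an illustration, the SplitNCigarReads step typically corresponds to a GATK 3 command of this form (file names are illustrative and the actual rule may use additional options):

```
java -jar GenomeAnalysisTK.jar -T SplitNCigarReads -R reference.fa -I sample_md.bam -o sample_split.bam \
    -rf ReassignOneMappingQuality -RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS
```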
The final bam alignment files will be found in the Results/GATK/ directory and will be called : _[sample]_\_uniq_cs_rg_merge_md_split_real_recal.bam
### Variant Calling
Variants will be called with HaplotypeCaller from GATK, which will generate one gVCF per sample (Results/GATK/_[sample]_\_.g.vcf.gz files) with the following options:
- -stand_call_conf 20.0
- --min_base_quality_score 10
- --min_mapping_quality_score 20
GenotypeGVCFs from GATK will take all gVCF files to generate one VCF file called Results/GATK/PREFIX_full.vcf.gz
Finally, this VCF file will be split into two VCF files, for SNPs and INDELs, using SelectVariants from GATK: Results/GATK/PREFIX_SNP.vcf.gz and Results/GATK/PREFIX_INDEL.vcf.gz
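For example, the SNP selection corresponds to a GATK 3 command of roughly this form (file names are illustrative; the actual rule may add further options):

```
java -jar GenomeAnalysisTK.jar -T SelectVariants -R reference.fa -V Results/GATK/PREFIX_full.vcf.gz \
    -selectType SNP -o Results/GATK/PREFIX_SNP.vcf.gz
```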
# How to install
The workflow depends on:
* cutadapt
* trimgalore (version 0.4.5)
* STAR (version 2.5.2b)
* picard.jar (version 2.1.1)
* samtools (version 1.3.1 )
* rsem-prepare-reference ( RSEM version 1.3.0)
* rsem-calculate-expression ( RSEM version 1.3.0)
* GenomeAnalysisTK.jar (version 3.7)
* java (version 8)
These dependencies are all included in the singularity environment, or must be available in your PATH or in a directory that you must specify in the [config_calling.yaml](example/config_calling.yaml)
See [How_to_install.md](How_to_install.md) for more details on building singularity environment
# How to run
### Configure workflow
1. Create a sample file based on the example/sample.tsv file (see [config_calling.yaml](example/config_calling.yaml) or section [1. Input](#1-input)):
#### `sequencer`
* ["ILLUMINA","SLX","SOLEXA","SOLID","454","LS454","COMPLETE","PACBIO","IONTORRENT","CAPILLARY","HELICOS","UNKNOWN"]
- The name of the file will be used to prefix the final results
#### `oriented`
This is the forward_prob RSEM parameter to indicate :
* 1 for a strand-specific protocol where all (upstream) reads are derived from the forward strand,
* 0 for a strand-specific protocol where all (upstream) reads are derived from the reverse strand,
* 0.5 for a non-strand-specific protocol. (Default: off)
- sample with the same `name` will be aggregated
#### `phred scale` indicates the phred score scale used to encode base quality
* either 33 (Sanger and illumina 1.8+)
* or 64 (Solexa, illumina 1.3 to 1.8 excluded)
- fastq files need to be in the `data_dir` specified in the `config.yaml` file (see next point)
### 2. edit config file config_calling.yaml
Copy file `example/config_calling.yaml` and fill it with exepected values :
2. Copy file `example/config_calling.yaml` and fill it with expected values :
For workflow software : you can use either binary in path, directory path with all requiered binary or singularity image.
[see build process](./env/README.md)
For workflow software : you can use either binary in path, directory path with all required binary or singularity image (see [build process](./env/README.md))
Configure input path files:
* sample_config
* fasta_ref
* gtf_ref
* known_vcf
* resources
### 3. set environment variable
3. Launch the snakemake command as presented in SLURM.sh or Singularity_SLURM.sh. These scripts explain how to launch a dryrun, a full run, expression quantification only or variant calling only.
If you don't use the singularity image, you must set the path to the java jar files for Picard tools and GATK (3.8):
```
picard_jar : $PICARD_PATH
gatk_jar : $GATK_PATH
```
## Execution on a cluster
Create a script file `launch_snakemake.sh`:
```
#!/bin/sh
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH -J GATK_RSEM
#SBATCH -o log/%x.out
#SBATCH -e log/%x.err
4. If necessary, update [resources_SLURM.yaml](resources_SLURM.yaml) to define the cpu, memory and cluster partition resources for each rule. The __default__ section gives the minimal resources per job. The other sections are named after the rules and define only the resources that differ from the default section (a minimal sketch is shown further below).
source ./env/snakemake-5_venv/bin/activate
module load system/singularity-3.0.1
mkdir log
snakemake \
--snakefile Snakefile \
--configfile config_calling.yaml \
--jobs 200 --cluster-config example/resources_calling_SLURM.yaml \
--use-singularity \
--cluster "sbatch -p {cluster.queue} -e log/%x.err -o log/%x.out \
--cpus-per-task={cluster.cpu} --time={cluster.time} --mem-per-cpu={cluster.mem}" \
--printshellcmds --latency-wait 30
```
resources_SLURM.yaml, SLURM.sh and Singularity_SLURM.sh are specific to a SLURM cluster. If you use another scheduler, you will need to create equivalent files; you can use these as a starting point.
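For reference, resources_SLURM.yaml is expected to look roughly like this (values are illustrative; per-rule sections only list what differs from `__default__`):

```
__default__ :
    mem : "8G"
    cpu : 8
    time : "24:00:00"
    partition : workq
STAR_Aln_1:
    mem : "30G"
    time : "24:00:00"
```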
launch command with:
```
sbatch launch_snakemake.sh
```
See the official Snakemake documentation if you need more information: <https://snakemake.readthedocs.io/en/stable/tutorial/tutorial.html>
#!/bin/bash
##############################################################################
## launch workflow
# if used on the Genologin cluster (INRA Toulouse)
# module load system/Python-3.6.3
# Path to the SNAKEMAKE workflow directory
WORKFLOW_DIR=.
mkdir -p logs
# This is an example of the snakemake command line to launch a dryrun
snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
--configfile config_calling.yaml.example --dryrun
# This is an example of the snakemake command line to launch the full workflow on a SLURM cluster
# {cluster.cpu}, {cluster.mem} and {cluster.partition} will be replaced by the values defined in the resources_SLURM.yaml file
snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
--configfile config_calling.yaml.example \
--cluster-config $WORKFLOW_DIR/resources_SLURM.yaml \
--cluster "sbatch -p {cluster.partition} --cpus-per-task={cluster.cpu} --mem-per-cpu={cluster.mem} --error=logs/%x.stderr --output=logs/%x.stdout " --latency-wait 30
# If you want to launch the workflow partially, indicate the final output files you want (see dryrun log)
#
# For RSEM quantification only
PREFIX=?? # your sample_config TSV file without extension
snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
--configfile config_calling.yaml.example \
--cluster-config $WORKFLOW_DIR/resources_SLURM.yaml \
--cluster "sbatch -p {cluster.partition} --cpus-per-task={cluster.cpu} --mem-per-cpu={cluster.mem} --error=logs/%x.stderr --output=logs/%x.stdout " --latency-wait 30 \
Results/Summary/Log_TrimGalore_summary.tsv Results/Summary/Log_STAR_Aln_1_summary.tsv Results/Summary/Log_STAR_Aln_2_summary.tsv \
Results/Summary/RSEM_multimap_summary_${PREFIX}_genes_expected_count.tsv \
Results/Summary/RSEM_uniqmap_summary_${PREFIX}_genes_expected_count.tsv \
Results/Summary/RSEM_multimap_summary_${PREFIX}_genes_FPKM.tsv \
Results/Summary/RSEM_uniqmap_summary_${PREFIX}_genes_FPKM.tsv \
Results/Summary/RSEM_multimap_summary_${PREFIX}_genes_TPM.tsv \
Results/Summary/RSEM_uniqmap_summary_${PREFIX}_genes_TPM.tsv \
Results/Summary/RSEM_multimap_summary_${PREFIX}_isoforms_expected_count.tsv \
Results/Summary/RSEM_uniqmap_summary_${PREFIX}_isoforms_expected_count.tsv \
Results/Summary/RSEM_multimap_summary_${PREFIX}_isoforms_FPKM.tsv \
Results/Summary/RSEM_uniqmap_summary_${PREFIX}_isoforms_FPKM.tsv \
Results/Summary/RSEM_multimap_summary_${PREFIX}_isoforms_TPM.tsv \
Results/Summary/RSEM_uniqmap_summary_${PREFIX}_isoforms_TPM.tsv
# For GATK calling only
PREFIX=?? # your sample_config TSV file without extension
snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
--configfile config_calling.yaml.example \
--cluster-config $WORKFLOW_DIR/resources_SLURM.yaml \
--cluster "sbatch -p {cluster.partition} --cpus-per-task={cluster.cpu} --mem-per-cpu={cluster.mem} --error=logs/%x.stderr --output=logs/%x.stdout " --latency-wait 30 \
Results/Summary/Log_TrimGalore_summary.tsv Results/Summary/Log_STAR_Aln_1_summary.tsv Results/Summary/Log_STAR_Aln_2_summary.tsv \
Results/GATK/${PREFIX}_SNP.vcf.gz\
Results/GATK/${PREFIX}_INDEL.vcf.gz
#!/bin/bash
##############################################################################
## launch workflow
# if used on the Genologin cluster (INRA Toulouse)
# module load system/Python-3.6.3
# module load system/singularity-3.0.1
# Path to the SNAKEMAKE workflow directory
WORKFLOW_DIR=.
mkdir -p logs
# This is an example of the snakemake command line to launch a dryrun
snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
--configfile config_calling.yaml.example --dryrun
# This is an example of the snakemake command line to launch the full workflow on a SLURM cluster
# {cluster.cpu}, {cluster.mem} and {cluster.partition} will be replaced by the values defined in the resources_SLURM.yaml file
snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
--use-singularity \
--configfile config_calling.yaml.example \
--cluster-config $WORKFLOW_DIR/resources_SLURM.yaml \
--cluster "sbatch -p {cluster.partition} --cpus-per-task={cluster.cpu} --mem-per-cpu={cluster.mem} --error=logs/%x.stderr --output=logs/%x.stdout " --latency-wait 30
# If you want to launch the workflow partially, indicate the final output files you want (see dryrun log)
#
# For RSEM quantification only
PREFIX=?? # your sample_config TSV file without extension
snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
--configfile config_calling.yaml.example \
--use-singularity \
--cluster-config $WORKFLOW_DIR/resources_SLURM.yaml \
--cluster "sbatch -p {cluster.partition} --cpus-per-task={cluster.cpu} --mem-per-cpu={cluster.mem} --error=logs/%x.stderr --output=logs/%x.stdout " --latency-wait 30 \
Results/Summary/Log_TrimGalore_summary.tsv Results/Summary/Log_STAR_Aln_1_summary.tsv Results/Summary/Log_STAR_Aln_2_summary.tsv \
Results/Summary/RSEM_multimap_summary_${PREFIX}_genes_expected_count.tsv \
Results/Summary/RSEM_uniqmap_summary_${PREFIX}_genes_expected_count.tsv \
Results/Summary/RSEM_multimap_summary_${PREFIX}_genes_FPKM.tsv \
Results/Summary/RSEM_uniqmap_summary_${PREFIX}_genes_FPKM.tsv \
Results/Summary/RSEM_multimap_summary_${PREFIX}_genes_TPM.tsv \
Results/Summary/RSEM_uniqmap_summary_${PREFIX}_genes_TPM.tsv \
Results/Summary/RSEM_multimap_summary_${PREFIX}_isoforms_expected_count.tsv \
Results/Summary/RSEM_uniqmap_summary_${PREFIX}_isoforms_expected_count.tsv \
Results/Summary/RSEM_multimap_summary_${PREFIX}_isoforms_FPKM.tsv \
Results/Summary/RSEM_uniqmap_summary_${PREFIX}_isoforms_FPKM.tsv \
Results/Summary/RSEM_multimap_summary_${PREFIX}_isoforms_TPM.tsv \
Results/Summary/RSEM_uniqmap_summary_${PREFIX}_isoforms_TPM.tsv
# For GATK calling only
PREFIX=?? # your sample_config TSV file without extension
snakemake -s $WORKFLOW_DIR/Snakefile_calling --printshellcmds --jobs 200 \
--configfile config_calling.yaml.example \
--use-singularity \
--cluster-config $WORKFLOW_DIR/resources_SLURM.yaml \
--cluster "sbatch -p {cluster.partition} --cpus-per-task={cluster.cpu} --mem-per-cpu={cluster.mem} --error=logs/%x.stderr --output=logs/%x.stdout " --latency-wait 30 \
Results/Summary/Log_TrimGalore_summary.tsv Results/Summary/Log_STAR_Aln_1_summary.tsv Results/Summary/Log_STAR_Aln_2_summary.tsv \
Results/GATK/${PREFIX}_SNP.vcf.gz\
Results/GATK/${PREFIX}_INDEL.vcf.gz
################################################################################
#
# SOFTWARE : software should be in PATH or in bin_dir, or use a singularity image
# bin_dir is a directory containing the software binaries used in this pipeline:
# bin_dir:
#
### Singularity Environment
# Can be generated with env/rnaseq-gatk-singularity-receipe.txt
singulatiry_img: <ABS_PATH>/rnaseq-gatk-singularity
# for the singularity image, use the following environment variables.
### OR executable in a bin directory
# bin_dir is a directory containing the software binaries used in this pipeline:
# bin_dir:
# for bin_dir fix path to jar files
# for the singularity image, use the following environment variables: $PICARD_PATH and $GATK_PATH.
picard_jar: $PICARD_PATH
gatk_jar: $GATK_PATH
################################################################################
#
# WORKFLOW INPUTS
#
# data_dir contains fastq files described in sample_config file
data_dir: <ABS_PATH>
# sample file : see the README file for detailed explanations
# sample_config file is a tabular file describing each input fastq file
# sample_config file name will be used as prefix of the output files.
# header columns are : idx name forward_read reverse_read sequencer read_length oriented phred_scale
# forward_read and reverse_read are fastq file names. These files need to be in the data_dir directory
# sample may be paired end or single end.
# - If single end, leave an empty column in the reverse_read
# - If paired, file names need to end with _R1.fastq.gz or _R2.fastq.gz (or fq instead of fastq, and not necessarily compressed)
# sequencer needs to be chosen from the list : ["ILLUMINA","SLX","SOLEXA","SOLID","454","LS454","COMPLETE","PACBIO","IONTORRENT","CAPILLARY","HELICOS","UNKNOWN"]
# oriented is the forward_prob RSEM parameter to indicate :
# 1 for a strand-specific protocol where all (upstream) reads are derived from the forward strand,
# 0 for a strand-specific protocol where all (upstream) reads are derived from the reverse strand,
# 0.5 for a non-strand-specific protocol.
# phred scale indicates the phred score scale used to encode base quality: either 33 (Sanger and illumina 1.8+) or 64 (Solexa, illumina 1.3 to 1.8 excluded)
sample_config : sample.tsv
# fasta_ref file is the genome reference Fasta file
fasta_ref: <ABS_PATH>/reference.fa
......
......@@ -2,7 +2,7 @@ __default__ :
mem : "8G"
cpu : 8
time : "24:00:00"
queue : workq
partition : workq
convertPhred:
cpu : 1
......