##########################################################
### Import section
import os
import yaml
import pandas as pd
##########################################################
### system options
# Load per-rule memory and CPU resources from the resources config file
with open(config["resources"]) as yml:
    config.update(yaml.safe_load(yml))
if not "__default__" in config or not "cpu" in config["__default__"] or not "mem" in config["__default__"] :
raise Exception("resources config file need to define at least default resources cpu mem (minimum 1G) in a __default__ section")
if int(config["__default__"]["mem"].replace("G","")) < 1 :
raise Exception("you need at least 5G of memory per job to run this pipeline")
#~ print(config)
def check_param(rule, resource):
    """Fill a missing rule resource from the __default__ section."""
    if rule not in config:
        config[rule] = {resource: config["__default__"][resource]}
    elif resource not in config[rule]:
        config[rule][resource] = config["__default__"][resource]
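# Rules with specific resource requirements; any value not set in the resources
# config file falls back to __default__ via check_param below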
rule_resources = {
'indexFasta' : ['mem'],
'FS_QD_clust' : ['mem'],
'vcf_filtration' : ['cpu'],
'SelectVariants' : ['mem'],
'STAR_Index_1' : ['cpu'],
'STAR_Aln_1' : ['cpu'],
'STAR_Aln_2' : ['cpu'],
'AddOrReplaceReadGroups' : ['mem'],
'merge_bam' : ['cpu'],
'rmdup' : ['mem'],
'SplitNCigarReads' : ['mem'],
'ASEReadCounter' : ['mem'],
'phASER' : ['cpu']
}
for rule in rule_resources:
for resource in rule_resources[rule]:
check_param(rule,resource)
# add bin directory to PATH
if "bin_dir" in config:
os.environ['PATH'] = config["bin_dir"] + os.pathsep + os.environ['PATH']
# add script dir to PATH
SCRIPT_DIR = os.path.abspath(os.path.join(workflow.basedir, "script"))
os.environ['PATH'] = SCRIPT_DIR + os.pathsep + os.environ['PATH']
##########################################################
### workflows options
# check the reference genome files
if not "fasta_ref" in config or not os.path.exists(config["fasta_ref"]):
raise Exception("You need to provide an existing fasta reference file, using fasta_ref in your config.yaml file")
elif not os.path.exists(config["fasta_ref"] + ".fai") or not os.path.exists(os.path.splitext(config["fasta_ref"])[0] + ".dict"):
raise Exception("Your fasta reference file need to be indexed with samtools faidx and GATK CreateSequenceDictionnary ")
if not "gtf_ref" in config or not os.path.exists(config["gtf_ref"]):
raise Exception("You need to provide an existing gtf reference file, using gtf_ref in your config.yaml file")
# check the input VCF file
if 'vcf' not in config or not os.path.exists(config['vcf']):
    raise Exception('You need to provide an existing VCF file, using the vcf parameter in the config.yaml file')
if not config['vcf'].endswith('.gz'):
    raise Exception('Your input VCF file ' + config['vcf'] + ' is not compressed with bgzip')
if not os.path.exists(config['vcf'] + '.tbi'):
    raise Exception('Your input VCF file ' + config['vcf'] + ' is not indexed by tabix')
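# Illustrative commands to prepare the VCF (the file name is a placeholder):
#     bgzip variants.vcf
#     tabix -p vcf variants.vcf.gz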
# check sample config TSV file
if not "sample_config" in config or not os.path.exists(config["sample_config"]):
raise Exception("You need to provide an existing sample config tabular file, using sample_config in your config.yaml file")
if 'SNP_filter' not in config:
    config['SNP_filter'] = False
elif not isinstance(config['SNP_filter'], bool):
    raise Exception('SNP_filter must be set to True or False')
# load sample_config
table = pd.read_csv(config["sample_config"], sep='\t', dtype=str)
# check sequencer type
available_PL=["ILLUMINA","SLX","SOLEXA","SOLID","454","LS454","COMPLETE","PACBIO","IONTORRENT","CAPILLARY","HELICOS","UNKNOWN"]
sequencer=table.sequencer.unique()
for i in sequencer:
    if i.upper() not in available_PL:
        raise Exception(i.upper() + " sequencer type provided in your sample_config file is not allowed, please choose among: " + ",".join(available_PL))
if table.shape[0] != len(table["forward_read"].unique()):
    raise Exception("Some forward_read files are not unique in your sample config file")
#################################################################################################
## FUNCTIONS
def find_jar(jar):
    """Return the first occurrence of a jar file found on PATH."""
    for bin_dir in os.environ['PATH'].split(os.pathsep):
        path = os.path.join(bin_dir, jar)
        if os.path.exists(path):
            return path
    raise Exception("Could not find " + jar + " in PATH")
def gatk_multi_arg(flag, files):
    """Repeat a command line flag before each file, for multi-input GATK/Picard calls."""
    flag += " "
    return " ".join(flag + f for f in files)
##########################################################
### Snakemake rules
# genome processing
include : "rules/basics.smk"
include : "rules/variantFiltration.smk"
# reads preprocessing and alignment
include : "rules/TrimGalore.smk"
include : "rules/alignmentProcess.smk"
# ASE
include : "rules/ASE_counter.smk"
localrules: all
final_outputs=list()
# STAR alignment
for sample in table["sample_name"].unique():
sample_table = table[table["sample_name"] == sample]
# paired end, filter on properly paired
    if len(sample_table["forward_read"]) == len(sample_table["reverse_read"].dropna()):
final_outputs.append("Results_ASE/STAR_Aln_2/" + sample + "_rg_genomic_pp_rmdup_uniq_split.bam")
# single end, no filter on properly paired
    elif len(sample_table["reverse_read"].dropna()) == 0:
final_outputs.append("Results_ASE/STAR_Aln_2/" + sample + "_rg_genomic_rmdup_uniq_split.bam")
final_outputs.append("Results_ASE/Summary/Log_TrimGalore_summary.tsv")
final_outputs.append("Results_ASE/Summary/Log_STAR_Aln_1_summary.tsv")
final_outputs.append("Results_ASE/Summary/Log_STAR_Aln_2_summary.tsv")
final_outputs.append('Results_ASE/vcfFiltration/' + os.path.basename(config['vcf']).replace('.vcf.gz','_FsQdBiall.vcf.gz'))
final_outputs.append('Results_ASE/vcfFiltration/' + os.path.basename(config['vcf']).replace('.vcf.gz','_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_GT.tsv'))
final_outputs.append('Results_ASE/vcfFiltration/' + os.path.basename(config['vcf']).replace('.vcf.gz','_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_DP.tsv'))
final_outputs.append('Results_ASE/vcfFiltration/' + os.path.basename(config['vcf']).replace('.vcf.gz','_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_AD.tsv'))
final_outputs.append('Results_ASE/Summary/VariantFiltration_summary.txt')
# counter
for sample in table["sample_name"].unique():
final_outputs.append('Results_ASE/ASEReadCounter/' + sample + '_ASEReadCounter.csv')
final_outputs.append('Results_ASE/phASER/' + sample + '_phASER.allelic_counts.txt')
final_outputs.append('Results_ASE/phASER_gene_ae/' + sample + '_phASER.gene_ae.txt')
final_outputs.append('Results_ASE/phASER_pop/' + os.path.basename(config['vcf']).replace('.vcf.gz','_phASER.bed.gz'))
final_outputs.append('Results_ASE/phASER_pop/' + os.path.basename(config['vcf']).replace('.vcf.gz','_phASER.gw_phased.bed.gz'))
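# Example of final_outputs for a run with one paired-end (samplePE) and one
# single-end (sampleSE) sample: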
# ['Results_ASE/STAR_Aln_2/samplePE_rg_genomic_pp_rmdup_uniq_split.bam',
# 'Results_ASE/STAR_Aln_2/sampleSE_rg_genomic_rmdup_uniq_split.bam',
# 'Results_ASE/Summary/Log_TrimGalore_summary.tsv',
# 'Results_ASE/Summary/Log_STAR_Aln_1_summary.tsv',
# 'Results_ASE/Summary/Log_STAR_Aln_2_summary.tsv',
# 'Results_ASE/vcfFiltration/variants_FsQdBiall.vcf.gz',
# 'Results_ASE/vcfFiltration/variants_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_GT.tsv',
# 'Results_ASE/vcfFiltration/variants_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_DP.tsv',
# 'Results_ASE/vcfFiltration/variants_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_AD.tsv',
# 'Results_ASE/Summary/VariantFiltration_summary.txt',
# 'Results_ASE/ASEReadCounter/samplePE_ASEReadCounter.csv',
# 'Results_ASE/phASER/samplePE_phASER.allelic_counts.txt',
# 'Results_ASE/phASER_gene_ae/samplePE_phASER.gene_ae.txt',
# 'Results_ASE/ASEReadCounter/sampleSE_ASEReadCounter.csv',
# 'Results_ASE/phASER/sampleSE_phASER.allelic_counts.txt',
# 'Results_ASE/phASER_gene_ae/sampleSE_phASER.gene_ae.txt']
rule all:
input:
final_outputs