Snakefile

##########################################################
### Import section

import os
import yaml
import pandas as pd

##########################################################
### system options

# Recover software memory and cpus resources
with open(config["resources"]) as yml:
    config.update(yaml.load(yml, Loader=yaml.SafeLoader))

if not "__default__" in config or not "cpu" in config["__default__"] or not "mem" in config["__default__"] :
    raise Exception("resources config file need to define at least default resources cpu mem (minimum 1G) in a __default__ section")
if int(config["__default__"]["mem"].replace("G","")) < 1 :
    raise Exception("you need at least 5G of memory per job to run this pipeline")

#~ print(config)

def check_param(rule,resource):
    if not rule in config:
        config[rule]={resource:config["__default__"][resource]}
    elif not resource in config[rule]:
        config[rule][resource] = config["__default__"][resource]
        
rule_resources = {
    'indexFasta' : ['mem'],
    'FS_QD_clust' : ['mem'],
    'vcf_filtration' : ['cpu'],
    'SelectVariants' : ['mem'],
    'STAR_Index_1' : ['cpu'],
    'STAR_Aln_1' : ['cpu'],
    'STAR_Aln_2' : ['cpu'],
    'AddOrReplaceReadGroups' : ['mem'],
    'merge_bam' : ['cpu'],
    'rmdup' : ['mem'],
    'SplitNCigarReads' : ['mem'],
    'ASEReadCounter' : ['mem'],
    'phASER' : ['cpu']
    }

for rule in rule_resources:
    for resource in rule_resources[rule]:
        check_param(rule,resource)

# add bin directory in PATH
if "bin_dir" in config:
    os.environ['PATH'] = config["bin_dir"] + os.pathsep + os.environ['PATH']
# add script dir to PATH
SCRIPT_DIR= os.path.abspath(os.path.join(workflow.basedir, "script"))
os.environ['PATH'] = SCRIPT_DIR + os.pathsep + os.environ['PATH'] 

##########################################################
### workflows options

# check the reference genome files
if not "fasta_ref" in config or not os.path.exists(config["fasta_ref"]):
    raise Exception("You need to provide an existing fasta reference file, using fasta_ref in your config.yaml file")
elif not os.path.exists(config["fasta_ref"] + ".fai") or not os.path.exists(os.path.splitext(config["fasta_ref"])[0] + ".dict"):
    raise Exception("Your fasta reference file need to be indexed with samtools faidx and GATK CreateSequenceDictionnary ")

if not "gtf_ref" in config or not os.path.exists(config["gtf_ref"]):
    raise Exception("You need to provide an existing gtf reference file, using gtf_ref in your config.yaml file")    

# check about VCF files
if not 'vcf' in config or not os.path.exists(config['vcf']):
        raise Exception('Your need to provide an existing VCF files, using the vcf parameter in the config.yaml file')

if not os.path.exists(config['vcf'] + '.tbi'):
        raise Exception('Your input VCF file : ' + config['vcf'] + ', is not indexed by tabix.')

if not config['vcf'].endswith('.gz'):
        raise Exception('Your input VCF file : ' + config['vcf'] + ', is not compressed with bgzip')

# check sample config TSV file
if not "sample_config" in config or not os.path.exists(config["sample_config"]):
    raise Exception("You need to provide an existing sample config tabular file, using sample_config in your config.yaml file")

if not 'SNP_filter' in config:
    config['SNP_filter'] = False
elif config['SNP_filter'] != False and config['SNP_filter'] != True:
    raise Exception('SNP_filter must be set to True or False')

# load sample_config
table = pd.read_csv(config["sample_config"], sep='\t', dtype=str)

# check sequencer type
available_PL=["ILLUMINA","SLX","SOLEXA","SOLID","454","LS454","COMPLETE","PACBIO","IONTORRENT","CAPILLARY","HELICOS","UNKNOWN"]
sequencer=table.sequencer.unique()
for i in sequencer:
    if not i.upper() in available_PL:
        raise Exception(i.upper()+" sequencer type povided in your sample_config file is not allowed, please choose among : " + ",".join(available_PL))

if table.shape[0] != len(table["forward_read"].unique()):
    raise Exception("Some forward_read files is not unique in your sample config file\n")


#################################################################################################
## FUNCTIONS

def find_jar(jar):
    path=""
    for bin_dir in os.environ['PATH'].split(os.pathsep):
        if os.path.exists(os.path.join(bin_dir,jar)):
           path = os.path.join(bin_dir,jar)
    if path != "":
        return path
    else:
        raise Exception("Could not find "+jar+" path in PATH")


def gatk_multi_arg(flag, files):
    flag += " "
    return " ".join(flag + f for f in files)

##########################################################
### Snakemake rules

# genome processing
include : "rules/basics.smk"
include : "rules/variantFiltration.smk"
# reads preprocessing and alignment
include : "rules/TrimGalore.smk"
include : "rules/alignmentProcess.smk"
# ASE
include : "rules/ASE_counter.smk"

localrules: all

final_outputs=list()

# STAR alignment
for sample in table["sample_name"].unique():
	sample_table = table[table["sample_name"] == sample]
	# paired end, filter on properly paired
	if len(sample_table["forward_read"]) == len(sample_table["reverse_read"].dropna()) :
	    final_outputs.append("Results_ASE/STAR_Aln_2/" + sample + "_rg_genomic_pp_rmdup_uniq_split.bam")
	# single end, no filter on properly paired
	elif len(sample_table["reverse_read"].dropna() ) == 0:
	    final_outputs.append("Results_ASE/STAR_Aln_2/" + sample + "_rg_genomic_rmdup_uniq_split.bam")

# # Summary
final_outputs.append("Results_ASE/Summary/Log_TrimGalore_summary.tsv")
final_outputs.append("Results_ASE/Summary/Log_STAR_Aln_1_summary.tsv")
final_outputs.append("Results_ASE/Summary/Log_STAR_Aln_2_summary.tsv")

# variant Filtration
final_outputs.append('Results_ASE/vcfFiltration/' + os.path.basename(config['vcf']).replace('.vcf.gz','_FsQdBiall.vcf.gz'))
final_outputs.append('Results_ASE/vcfFiltration/' + os.path.basename(config['vcf']).replace('.vcf.gz','_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_GT.tsv'))
final_outputs.append('Results_ASE/vcfFiltration/' + os.path.basename(config['vcf']).replace('.vcf.gz','_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_DP.tsv'))
final_outputs.append('Results_ASE/vcfFiltration/' + os.path.basename(config['vcf']).replace('.vcf.gz','_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_AD.tsv'))
final_outputs.append('Results_ASE/Summary/VariantFiltration_summary.txt')

# counter
for sample in table["sample_name"].unique():
	final_outputs.append('Results_ASE/ASEReadCounter/' + sample + '_ASEReadCounter.csv')
	final_outputs.append('Results_ASE/phASER/' + sample + '_phASER.allelic_counts.txt')
	final_outputs.append('Results_ASE/phASER_gene_ae/' + sample + '_phASER.gene_ae.txt')

final_outputs.append('Results_ASE/phASER_pop/' + os.path.basename(config['vcf']).replace('.vcf.gz','_phASER.bed.gz'))
final_outputs.append('Results_ASE/phASER_pop/' + os.path.basename(config['vcf']).replace('.vcf.gz','_phASER.gw_phased.bed.gz'))

# print(final_outputs)
# ['Results_ASE/STAR_Aln_2/samplePE_rg_genomic_pp_rmdup_uniq_split.bam', 
# 'Results_ASE/STAR_Aln_2/sampleSE_rg_genomic_rmdup_uniq_split.bam', 
# 'Results_ASE/Summary/Log_TrimGalore_summary.tsv',
# 'Results_ASE/Summary/Log_STAR_Aln_1_summary.tsv',
# 'Results_ASE/Summary/Log_STAR_Aln_2_summary.tsv',
# 'Results_ASE/vcfFiltration/variants_FsQdBiall.vcf.gz',
# 'Results_ASE/vcfFiltration/variants_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_GT.tsv', 
# 'Results_ASE/vcfFiltration/variants_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_DP.tsv',
# 'Results_ASE/vcfFiltration/variants_FsQdBiall_withGTpopCR50_DPgt05rPopCR20_AD.tsv', 
# 'Results_ASE/Summary/VariantFiltration_summary.txt', 
# 'Results_ASE/ASEReadCounter/samplePE_ASEReadCounter.csv', 
# 'Results_ASE/phASER/samplePE_phASER.allelic_counts.txt', 
# 'Results_ASE/phASER_gene_ae/samplePE_phASER.gene_ae.txt', 
# 'Results_ASE/ASEReadCounter/sampleSE_ASEReadCounter.csv',
# 'Results_ASE/phASER/sampleSE_phASER.allelic_counts.txt',
# 'Results_ASE/phASER_gene_ae/sampleSE_phASER.gene_ae.txt']


rule all:
    input:
        final_outputs