Skip to content
Snippets Groups Projects
popsim.snk 7.5 KiB
Newer Older
import sys
import os

Thomas Faraut's avatar
Thomas Faraut committed
include: "tools/utils.snk"


ROOTPATH = os.path.dirname(os.path.dirname(workflow.snakefile))
sys.path.insert(0, os.path.join(ROOTPATH, "lib"))
os.environ["PYTHONPATH"] = os.path.join(ROOTPATH, "lib")
os.environ["PATH"] = ":".join([os.path.join(ROOTPATH, "bin"), os.path.join(ROOTPATH, "popsim"), os.environ["PATH"]])

WDIR = config['wdir']
workdir: WDIR

NB_INDS = config["nb_inds"]
REFERENCE = config["reference"]
REFBUNDLE = config["refbundle"] if "refbundle" in config else None
NSTRETCHES = "exclusion/gaps.bed"
SV_LIST = config["sv_list"] if "sv_list" in config else None
COVERAGE = config["coverage"]
FORCE_POLYMORPHISME = config["force_polymorphism"]
HAPLOID = config["haploid"]
PROBA_DEL = config["proba_del"]
PROBA_INV = config["proba_inv"]
READ_LEN = config["read_len"]
INSERT_LEN_MEAN = config["insert_len_mean"]
INSERT_LEN_SD = config["insert_len_sd"]
MIN_DELETIONS = config["min_deletions"]
MIN_INVERSIONS = config["min_inversions"]
MIN_DUPLICATIONS = config["min_duplications"]
FREQ_DEL = config["freq_del"]
FREQ_INV = config["freq_inv"]
MAX_TRY = config["max_try"]
GENOTYPES = config["genotypes"] if "genotypes" in config else None
OVERLAP_CUTOFF = config["overlap_cutoff"]
LEFT_PRECISION = config["left_precision"]
RIGHT_PRECISION = config["right_precision"]
CHROMS = config['chromosomes']


# Functions here

def buildpop_inputs():
    inputs = {
        "reference": REFERENCE
    }
    params = {
        "nstretches": False,
        "genotypes": False,
        "sv_list": False
    }
    if NSTRETCHES is not None:
        inputs["nstretches"] = NSTRETCHES
        params["nstretches"] = True
    if GENOTYPES is not None:
        inputs["genotypes"] = GENOTYPES
        params["genotypes"] = True
    if SV_LIST is not None:
        inputs["sv_list"] = SV_LIST
        params["sv_list"] = True

    return inputs, params

def buildpop_outputs():
    outputs = []
    for n in range(1, NB_INDS+1):
        outputs += [os.path.abspath(os.path.join("pop", "indiv%d_1.fq.gz" % n)), os.path.abspath(os.path.join("pop", "indiv%d_2.fq.gz" % n))]
    return outputs

bp_inputs, bp_params = buildpop_inputs()
bp_outputs = buildpop_outputs()


include: "align.snk"
include: "referencebundle.snk"
include: "detection.snk"
localrules: all_popsim
rule all_popsim:
        expand("results/{svtype}/Summarized_results_{svtype}.ipynb", svtype=config["varianttypes"]),
        expand("results/{svtype}/Summarized_results_{svtype}.html", svtype=config["varianttypes"]),
        #set_all_inputs
        #expand("bams/{sample}.bam", sample=["indiv%d" % i for i in range(1, NB_INDS+1)])
        #*bp_outputs
rule get_fasta_popsim:
        "reference_raw.fasta"
        "reference.fasta"
    run:
        from Bio import SeqIO
        with open(input[0], "rU") as handle, open(output[0], "w") as f:
            for seq in SeqIO.parse(handle, "fasta"):
                if seq.id in CHROMS:
                    SeqIO.write([seq], f, "fasta")
        i_fai = input[0] + ".fai"
        if os.path.exists(i_fai):
            os.remove(i_fai)
rule index_fasta:
    input:
        "reference.fasta"
    output:
        "reference.fasta.fai"
    log:
        "logs/index_fasta.log"
    shell:
        "samtools faidx {input} &> {log}"

rule buildpop:
    input:
        *bp_outputs,
        os.path.join("pop", "genotypes.vcf.gz")
        nb_inds = NB_INDS,
        out_dir = "pop",
        coverage = COVERAGE,
        force_polymorphism = FORCE_POLYMORPHISME,
        haploid = HAPLOID,
        proba_del = PROBA_DEL,
        proba_inv = PROBA_INV,
        read_len = READ_LEN,
        insert_len_mean = INSERT_LEN_MEAN,
        insert_len_sd = INSERT_LEN_SD,
        min_deletions = MIN_DELETIONS,
        min_inversions = MIN_INVERSIONS,
Floreal Cabanettes's avatar
Floreal Cabanettes committed
        freq_inv = FREQ_INV,
        max_try = MAX_TRY
    threads:
        8
    log:
        stdout = "logs/popsim.o",
        stderr = "logs/popsim.e"
    run:
        command = ["build_pop.py",
                   "-n", str(params.nb_inds),
                   "-r", input.reference,
                   "-c", str(params.coverage),
                   "-o", params.out_dir,
                   "-pd", str(params.proba_del),
                   "-pi", str(params.proba_inv),
                   "-l", str(params.read_len),
                   "-m", str(params.insert_len_mean),
                   "-v", str(params.insert_len_sd),
                   "-q",
                   "-md", str(params.min_deletions),
                   "-mi", str(params.min_inversions),
Floreal Cabanettes's avatar
Floreal Cabanettes committed
                   "-fd", " ".join(map(str,params.freq_del)),
                   "-fi", " ".join(map(str,params.freq_inv)),
                   "-fu", " ".join(map(str,params.freq_dup)),
                   "--max-try", str(params.max_try),
                   "-t", str(threads),
                   "-e"]
        if hasattr(input, "nstretches"):
            command += ["-ns", input.nstretches]
        if hasattr(input, "sv_list"):
            command += ["-s", input.sv_list]
        if params.force_polymorphism:
            command.append("-f")
        if params.haploid:
            command.append("-a")
        shell(" ".join(command) + " 1> %s 2> %s" % (log.stdout, log.stderr))

rule linkdatabams:
    input:
        bam="bams/{sample}.bam",
        bai="bams/{sample}.bam.bai"
    output:
        bam="data/bams/{sample}.bam",
        bai="data/bams/{sample}.bam.bai"
    run:
        os.symlink(os.path.abspath(input.bam), output.bam)
        os.symlink(os.path.abspath(input.bai), output.bai)

rule buildinfilesresults:
    input:
        genotypes = expand("{batch}/genotypes/{{svtype}}/svtyper_{chrom}_{{svtype}}_genotypes.vcf.gz",
                           chrom=chromosomes,
                           batch=samples.keys()),
        filtered = expand("{batch}/filtered/{{svtype}}/svtyper_{chrom}_{{svtype}}_genotypes_filtered.vcf.gz",
                          chrom=chromosomes,
                          batch=samples.keys())
    output:
        genotypes = "list_files/tools_results_files_{svtype}.list",
        filtered = "list_files/filtered_results_files_{svtype}.list"
    threads:
        1
    run:
        with open(output.genotypes, "w") as o_gt:
            for i_gt in input.genotypes:
                o_gt.write("%s\n" % i_gt)
        with open(output.filtered, "w") as o_ft:
            for i_ft in input.filtered:
                o_ft.write("%s\n" % i_ft)

rule buildresults:
    input:
        genotypes = "list_files/tools_results_files_{svtype}.list",
        filtered = "list_files/filtered_results_files_{svtype}.list",
        true_vcf = "pop/genotypes.vcf.gz"
    output:
        "results/{svtype}/Summarized_results_{svtype}.ipynb",
        "results/{svtype}/Summarized_results_{svtype}.html"
    params:
        overlap_cutoff = OVERLAP_CUTOFF,
        left_precision = LEFT_PRECISION,
        right_precision = RIGHT_PRECISION,
        svlist = SV_LIST,
        haploid = HAPLOID
    threads:
        1
    log:
Floreal Cabanettes's avatar
Floreal Cabanettes committed
        stdout = "logs/results_{svtype}.o",
        stderr = "logs/results_{svtype}.e"
    shell:
        """
            touch results/{wildcards.svtype}/Summarized_results_{wildcards.svtype}.ipynb
            touch results/{wildcards.svtype}/Summarized_results_{wildcards.svtype}.html
        """