Skip to content
Snippets Groups Projects
Commit 8492f706 authored by Thomas Faraut's avatar Thomas Faraut
Browse files

all snakefiles for refactor

parent 8d56ce38
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python
import sys
import os
import configparser
import traceback
from subprocess import call
import yaml
# PATCH convert yaml config file to conf file in INI format for build_variant_pop
def reformat_configyaml(yaml_config, ini_conf_file):
yaml_config = read_yaml(yaml_config)
config = configparser.ConfigParser()
config["Defaults"] = yaml_config
with open(ini_conf_file, 'w') as configfile: # save
config.write(configfile)
def read_yaml(yaml_config_file):
with open(yaml_config_file, 'r') as stream:
try:
config = yaml.safe_load(stream)
except yaml.YAMLError as exc:
print(exc)
exit(1)
return config
def launch(reference, conf_file, outdir, threads, nstretches=None):
# Convert yaml config file to ini file
ini_conf_file = "conffile.ini"
reformat_configyaml(conf_file, ini_conf_file)
command = "build_variant_pop.py "
command += "--reference {ref} --conf-file {cfile} -o {odir} "\
"-t {threads} -e -q ".format(ref=reference,
cfile=ini_conf_file,
odir=outdir,
threads=threads)
if nstretches is not None:
command += "--nstretches %s" % nstretches
returncode = call(command, shell=True)
if returncode == 0:
os.remove(ini_conf_file)
else:
print("build_variant_pop.py command failed", file=sys.stderr)
return 1
return 0
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(
prog="buildpop_launcher.py",
description="Launch builpop simulation")
parser.add_argument("-r", "--reference", required=True,
help="refernece genome")
parser.add_argument("--conf-file", required=True,
help="config file, yaml forma",
metavar="FILE")
parser.add_argument("-ns", "--nstretches",
help="N-stretches positions bed file",
required=False)
parser.add_argument("-o", "--outdir", required=True,
help="Specify the output dir")
parser.add_argument("-t", "--threads", type=int,
help="Number of threads to use", default=1)
args = parser.parse_args()
exit(launch(**{k: v for k, v in vars(args).items() if k in launch.__code__.co_varnames
or ("kwargs" in launch.__code__.co_varnames and k != "func")}))
......@@ -40,14 +40,12 @@ def launch(bamlist, input_vcf, output_vcf, threads):
vcf_out=vcf_out,
cores=threads)
tmpdir = tempfile.mkdtemp()
command = "bcftools sort -O z -t {tmpdir} ". format(tmpdir=tmpdir)
command = "bcftools sort -O z -T {tmpdir} ". format(tmpdir=tmpdir)
command += "-o {ovcf} {ivcf} ".format(ivcf=vcf_out, ovcf=output_vcf)
returncode = call(command, shell=True)
if returncode == 0:
os.remove(vcf_out)
tabix_index(output_vcf, force=True, preset="vcf")
shutil.rmtree(tmpdir)
else:
print("Bctools sort command failed", file=sys.stderr)
return 1
......
......@@ -164,9 +164,7 @@ class Detection(Command):
# WARNING wich set the bam files in the working dir
#Detection.setabsolute_bam_paths(self.samples)
set_absolute_path(params, "samples")
self.samples = pd.read_table(params["samples"]).set_index("sample",
drop=False)
self.samples = self.set_samples_dataframe(params)
self.chromosomes = self.params['chromosomes']
varianttypes = ['DEL', 'INV', 'DUP', 'mCNV']
......@@ -181,13 +179,14 @@ class Detection(Command):
self.sample_batches = self.set_sample_batches()
self.chrom_batches = self.get_all_chrom_batches()
self.sample_batches = {'batch001': ['indiv1', 'indiv2',
'indiv3', 'indiv4']}
# TODO have to clean this
self.empty_template = os.path.join(self.root_path, "lib", "svreader",
"resources", "template.vcf")
def set_samples_dataframe(self, params):
set_absolute_path(params, "samples")
return pd.read_table(params["samples"]).set_index("sample", drop=False)
@staticmethod
def setabsolute_bam_paths(samples):
absolutepaths = []
......@@ -222,11 +221,10 @@ class Detection(Command):
return bam
def get_all_outputs(self, wildcards):
outputs = []
outputs += expand("{batch}/filtered/{svtype}/svtyper_{chrom}_{svtype}_genotypes_filtered.vcf.gz",
outputs = expand("{batch}/filtered/{svtype}/svtyper_{chrom}_{svtype}_genotypes_filtered.vcf.gz",
chrom=self.chromosomes,
svtype=[x for x in self.varianttypes if x != "mCNV"],
batch=self.sample_batches.keys())
batch=list(self.sample_batches.keys()))
return outputs
def get_input_bams(self, wildcards):
......@@ -420,45 +418,63 @@ class MergeBatches(Detection):
return outputs
class Popsim(Command):
class Simulation(Detection):
def __init__(self, params):
Command.__init__(self, params)
self._nb_inds = params["nb_inds"]
Detection.__init__(self, params)
self.popsim_outdir = "pop"
def set_samples_dataframe(self, params):
samples = {"sample": ["indiv%d" % i for i in range(1, self._nb_inds+1)]}
return pd.DataFrame(samples, index=samples["sample"])
def get_popsim_outdir(self):
return self.popsim_outdir
@property
def nb_inds(self):
return self._nb_inds
def get_read_group(self, wildcards):
" Denote sample name and platform in read group. "
return r"'@RG\tID:{sample}\tSM:{sample}\tPL:{sample}'".format(
sample=wildcards.sample)
def get_all_outputs(self, wildcards):
outputs = ["Summarized_results.html"]
outputs += expand("filtered/{svtype}/svtyper_{chrom}_{svtype}_genotypes_filtered.vcf.gz",
chrom=self.chromosomes,
svtype=[x for x in self.varianttypes if x != "mCNV"])
outputs = []
outputs += expand("results/{svtype}/Summarized_results_{svtype}.ipynb",
svtype=[x for x in self.varianttypes if x != "mCNV"])
outputs += expand("results/{svtype}/Summarized_results_{svtype}.html",
svtype=[x for x in self.varianttypes if x != "mCNV"])
return outputs
def buildpop_inputs(self, wildcards):
inputs = {
"reference": self.reference
}
params = {
"nstretches": False,
"genotypes": False,
"sv_list": False
}
if NSTRETCHES is not None:
inputs["nstretches"] = NSTRETCHES
params["nstretches"] = True
if GENOTYPES is not None:
inputs["genotypes"] = GENOTYPES
params["genotypes"] = True
if SV_LIST is not None:
inputs["sv_list"] = SV_LIST
params["sv_list"] = True
return inputs, params
def buildpop_outputs(self):
def get_builpop_fastqs(self):
outputs = []
for n in range(1, NB_INDS+1):
outputs += [os.path.abspath(os.path.join("pop", "indiv%d_1.fq.gz" % n)),
os.path.abspath(os.path.join("pop", "indiv%d_2.fq.gz" % n))]
for s in self.get_samples:
outputs += [os.path.join(self.popsim_outdir, "%s_1.fq.gz" % s),
os.path.join(self.popsim_outdir, "%s_2.fq.gz" % s)]
return outputs
def get_sample_fastq(self, wildcards):
fastq = [os.path.join(self.popsim_outdir, "%s_1.fq.gz" % wildcards.sample),
os.path.join(self.popsim_outdir, "%s_2.fq.gz" % wildcards.sample)]
return {'fastq': fastq}
def get_sample_bam(self, sample):
return "data/bams/%s.bam" % sample
def buildinfilesresults(self, wildcards):
genotypes = expand("{batch}/filtered/{{svtype}}/svtyper_{chrom}_{{svtype}}_genotypes_filtered.vcf.gz",
chrom=self.chromosomes,
batch=self.sample_batches.keys())
filtered = expand("{batch}/filtered/{{svtype}}/svtyper_{chrom}_{{svtype}}_genotypes_filtered.vcf.gz",
chrom=self.chromosomes,
batch=self.sample_batches.keys())
return {"genotypes": genotypes, "filtered": filtered}
def set_absolute_path(params, key):
params["_"+key] = params[key]
......@@ -495,8 +511,3 @@ def which(program):
if is_exe(exe_file):
return exe_file
return None
if __name__ == "__main__":
params = {'samples': "samples.tsv"}
align = CommandFactory.factory("align", params)
......@@ -18,8 +18,8 @@ rule bwaindex:
rule bwamap:
input:
unpack(cmd.get_sample_fastq),
reference = reference,
ref_index = reference + ".bwt"
reference = cmd.get_reference,
ref_index = "%s.bwt" % cmd.get_reference
output:
bam = temp("bwa/{sample}.bam")
params:
......
......@@ -3,7 +3,7 @@
# TODO this should appear in another place (default values crom cnvpiepelines)
config['read_length'] = 100
localrules: gendermap, bamlist, batchesdump
localrules: gendermap, bamlist
rule filtering:
input:
......@@ -135,10 +135,3 @@ rule bamlist:
batch = "\w+"
run:
cmd.dump_batch_bamlist(wildcards.batch, output.bamlist)
# rule batchesdump:
# output:
# yamldump = "sample_batches.yaml"
# run:
# with open(output.yamldump, "w") as f:
# print(yaml.dump(cmd.sample_batches), file=f)
import sys
import os
rule get_fasta_popsim:
input:
fasta = "reference_raw.fasta",
index = "reference_raw.fasta.fai"
output:
"reference.fasta"
params:
chromosomes = config['chromosomes']
shell:
"samtools faidx {input.fasta} {params.chromosomes} > {output}; "
"samtools faidx {output}"
rule buildpop:
input:
reference=cmd.get_reference,
nstretches="exclusion/gaps.bed"
output:
touch("tmp.txt"),
cmd.get_builpop_fastqs(),
genotype = "pop/genotypes.vcf.gz",
yamlconf = temp("builpop_yamlconf.yaml"),
params:
outdir = "pop"
log:
stdout = "logs/popsim.o",
stderr = "logs/popsim.e"
run:
import yaml
with open(output.yamlconf, 'w') as outfile:
yaml.dump(config, outfile, default_flow_style=False)
command = ["buildpop_launcher.py",
"--reference %s" % input.reference,
"--conf-file %s" % output.yamlconf,
"--nstretches %s" % input.nstretches,
"--outdir %s" % params.outdir,
"--threads %s" % threads]
shell(" ".join(command) + " 1> %s 2> %s" % (log.stdout, log.stderr))
rule linkdatabams:
input:
bam = "bams/{sample}.bam",
bai = "bams/{sample}.bam.bai"
output:
bam = "data/bams/{sample}.bam",
bai = "data/bams/{sample}.bam.bai"
run:
os.symlink(os.path.abspath(input.bam), output.bam)
os.symlink(os.path.abspath(input.bai), output.bai)
rule buildinfilesresults:
input:
unpack(cmd.buildinfilesresults)
output:
genotypes = "list_files/tools_results_files_{svtype}.list",
filtered = "list_files/filtered_results_files_{svtype}.list"
threads:
1
run:
with open(output.genotypes, "w") as o_gt:
for i_gt in input.genotypes:
o_gt.write("%s\n" % i_gt)
with open(output.filtered, "w") as o_ft:
for i_ft in input.filtered:
o_ft.write("%s\n" % i_ft)
rule buildresults:
input:
genotypes = "list_files/tools_results_files_{svtype}.list",
filtered = "list_files/filtered_results_files_{svtype}.list",
true_vcf = "pop/genotypes.vcf.gz"
output:
"results/{svtype}/Summarized_results_{svtype}.ipynb",
"results/{svtype}/Summarized_results_{svtype}.html"
# params:
# overlap_cutoff = OVERLAP_CUTOFF,
# left_precision = LEFT_PRECISION,
# right_precision = RIGHT_PRECISION,
# svlist = SV_LIST,
# haploid = HAPLOID
threads:
1
log:
stdout = "logs/results_{svtype}.o",
stderr = "logs/results_{svtype}.e"
shell:
"""
touch results/{wildcards.svtype}/Summarized_results_{wildcards.svtype}.ipynb
touch results/{wildcards.svtype}/Summarized_results_{wildcards.svtype}.html
"""
include: "common.smk"
cmd = CommandFactory.factory(config)
workdir: config['wdir']
rule all:
input:
cmd.get_all_outputs
include: "rules/popsim.smk"
include: "rules/alignment.smk"
include: "rules/detectioncommon.smk"
include: "rules/tools/delly.smk"
include: "rules/tools/lumpy.smk"
include: "rules/tools/pindel.smk"
include: "rules/tools/genomestrip.smk"
include: "rules/tools/genotyping.smk"
include: "rules/tools/cnvpipeline.smk"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment