Skip to content
Snippets Groups Projects
Commit 9452d0d0 authored by Floreal Cabanettes's avatar Floreal Cabanettes
Browse files

Add refbundle workflow to the simulation pipeline

parent afbac7cc
No related branches found
No related tags found
1 merge request!11Integrate popsim simulation software to cnvpipelines
......@@ -627,7 +627,7 @@ class CnvPipeline:
"wdir": self.wdir,
"reference": reference,
"species": species,
"readlength": read_length,
"read_len": read_length,
"maxNstretches": max_n_stretches
}
......@@ -792,7 +792,7 @@ class CnvPipeline:
def run_simulation(self, nb_inds, reference, nstretches, sv_list, coverage, force_polymorphism,
haploid, proba_del, proba_inv, read_len, insert_len_mean, insert_len_sd, min_deletions,
min_inversions, max_try, genotypes, tools, force_wdir=False, **kwargs):
min_inversions, max_try, genotypes, tools, species, max_n_stretches, force_wdir=False, **kwargs):
"""
Run simulation
:param nb_inds:
......@@ -813,6 +813,9 @@ class CnvPipeline:
:param genotypes:
:param tools:
:param kwargs:
:param species:
:param max_n_stretches:
:param force_wdir:
:return:
"""
try:
......@@ -837,6 +840,8 @@ class CnvPipeline:
raise ValueError("proba_inv must be positive or null")
if proba_inv == 0 and proba_del == 0:
raise ValueError("At least one proba of variant (del or inv) must be positive")
if "genomestrip" in tools and species is None:
raise ValueError("Species parameter is required for genomestrip")
self._check_wdir(force_wdir)
......@@ -878,9 +883,16 @@ class CnvPipeline:
"start_batch_nb": 1,
"chromosomes": "all",
"varianttypes": variant_types,
"no_detection_index": True
"no_detection_index": True,
"build_refbundle": "genomestrip" in tools,
"species": species,
"maxNstretches": max_n_stretches
}
if "genomestrip" in tools:
config["refbundle"] = os.path.join("refbundle", "reference.fasta")
config["rb_subdir"] = "refbundle"
nstretches_final = None
if nstretches is not None:
nstretches_final = os.path.join(self.wdir, os.path.basename(nstretches))
......@@ -1162,6 +1174,10 @@ if __name__ == "__main__":
simul_parser.add_argument("-g", "--genotypes", help="Position of SV with genotypes of individuals")
simul_parser.add_argument('-t', '--tools', type=str, required=True, help="Tools to launch, coma separated",
nargs="+", choices=TOOLS)
simul_parser.add_argument('-sp', '--species', type=str, required=False,
help="[refbundle] Species name, according to the NCBI Taxonomy database")
simul_parser.add_argument('-mn', '--max-n-stretches', type=int, required=False, default=100,
help="[refbundle] Max size of nstretches to consider them as it")
add_run_and_rerun_options(simul_parser)
simul_parser.add_argument('-w', '--wdir', type=str, required=True,
help="Output folder where data will be stored")
......
......@@ -228,7 +228,7 @@ rule excluded:
" -t {threads}"
" 1>{log.stdout} 2>{log.stderr}"
rule nstretches:
rule nstretches_d:
input:
reference = REFERENCE
output:
......
......@@ -63,6 +63,7 @@ bp_outputs = buildpop_outputs()
include: "tools/threads.snk"
include: "align.snk"
include: "referencebundle.snk"
include: "detection.snk"
localrules: all_popsim
......
......@@ -26,6 +26,7 @@ shell.prefix("export PATH={root_dir}/bin:{sv_dir}/bwa:\"$PATH\"; export LD_LIBRA
root_dir=ROOTPATH, sv_dir=SV_DIR))
PREFIX = "reference"
SUBDIR = config["rb_subdir"] + os.path.sep if "rb_subdir" in config else ""
SUFF_TYPES = ("gcmask", "svmask", "lcmask")
p_repeats = re.compile('.*(Satellite|Simple_repeat|Low_complexity).*')
......@@ -74,21 +75,21 @@ CHROMS = getChroms(config['reference'])
rule final:
input:
PREFIX+".fasta.fai",
PREFIX+".gcmask.fasta.fai",
PREFIX+".svmask.fasta.fai",
PREFIX+".lcmask.fasta.fai",
PREFIX+".rdmask.bed",
PREFIX+".dict",
PREFIX+".ploidymap.txt",
PREFIX+".gendermask.bed",
PREFIX+".Nstretch.bed"
SUBDIR + PREFIX+".fasta.fai",
SUBDIR + PREFIX+".gcmask.fasta.fai",
SUBDIR + PREFIX+".svmask.fasta.fai",
SUBDIR + PREFIX+".lcmask.fasta.fai",
SUBDIR + PREFIX+".rdmask.bed",
SUBDIR + PREFIX+".dict",
SUBDIR + PREFIX+".ploidymap.txt",
SUBDIR + PREFIX+".gendermask.bed",
SUBDIR + PREFIX+".Nstretch.bed"
rule get_fasta:
input:
config['reference']
output:
PREFIX+".fasta"
SUBDIR + PREFIX+".fasta"
run:
from Bio import SeqIO
with open(input[0], "rU") as handle, open(output[0], "w") as f:
......@@ -98,9 +99,9 @@ rule get_fasta:
rule index:
input:
"{type}.fasta"
SUBDIR + "{type}.fasta"
output:
"{type}.fasta.fai"
SUBDIR + "{type}.fasta.fai"
log:
"logs/index/{type}.log"
shell:
......@@ -108,20 +109,20 @@ rule index:
rule length:
input:
fasta = PREFIX+".fasta",
fai = PREFIX+".fasta.fai"
fasta = SUBDIR + PREFIX+".fasta",
fai = SUBDIR + PREFIX+".fasta.fai"
output:
"reference.fasta.length"
SUBDIR + "reference.fasta.length"
shell:
"fastalength {input.fasta} > {output}"
rule preparesvmask:
input:
fasta = PREFIX+".fasta",
fai = PREFIX+".fasta.fai"
fasta = SUBDIR + PREFIX+".fasta",
fai = SUBDIR + PREFIX+".fasta.fai"
output:
seq = "work/reference.fasta",
index = "work/reference.fasta.bwt"
seq = SUBDIR + "work/reference.fasta",
index = SUBDIR + "work/reference.fasta.bwt"
log:
"logs/preparesvmask.log"
shell:
......@@ -132,20 +133,20 @@ rule preparesvmask:
rule mergesvmask:
input:
expand("work/svmask_{chrom}.fasta", chrom=CHROMS)
expand(SUBDIR + "work/svmask_{chrom}.fasta", chrom=CHROMS)
output:
"reference.svmask.fasta"
SUBDIR + "reference.svmask.fasta"
shell:
"cat {input} > {output}"
rule chromsvmask:
input:
"work/reference.fasta.bwt",
fa = "work/reference.fasta"
SUBDIR + "work/reference.fasta.bwt",
fa = SUBDIR + "work/reference.fasta"
output:
"work/svmask_{chrom}.fasta"
SUBDIR + "work/svmask_{chrom}.fasta"
params:
rl = config['readlength']
rl = config['read_len']
log:
"logs/svmask/{chrom}.log"
shell:
......@@ -155,11 +156,11 @@ rule chromsvmask:
rule bed2fasta:
input:
PREFIX+".fasta.fai",
ref = PREFIX+".fasta",
bed = "{type}.bed"
SUBDIR + PREFIX+".fasta.fai",
ref = SUBDIR + PREFIX+".fasta",
bed = SUBDIR + "{type}.bed"
output:
"{type}.fasta"
SUBDIR + "{type}.fasta"
wildcard_constraints:
type = ".*(lc|sv|gc)mask"
shell:
......@@ -171,10 +172,10 @@ rule bed2fasta:
rule repeatmask:
input:
fasta = "reference.fasta",
fai = PREFIX+".fasta.fai"
fasta = SUBDIR + "reference.fasta",
fai = SUBDIR + PREFIX+".fasta.fai"
output:
"reference.fasta.out"
SUBDIR + "reference.fasta.out"
params:
sp = config['species']
threads:
......@@ -185,9 +186,9 @@ rule repeatmask:
rule rdmask:
input:
length = PREFIX+".fasta.length"
length = SUBDIR + PREFIX+".fasta.length"
output:
bed = PREFIX+".rdmask.bed"
bed = SUBDIR + PREFIX+".rdmask.bed"
run:
with open(input.length) as f:
with open(output.bed, "w") as fout:
......@@ -199,9 +200,9 @@ rule rdmask:
rule lcmaskbed:
input:
repeats = PREFIX+".fasta.out"
repeats = SUBDIR + PREFIX+".fasta.out"
output:
bed = PREFIX+".lcmask.bed"
bed = SUBDIR + PREFIX+".lcmask.bed"
run:
repeats = []
with open(output.bed, "w") as fout:
......@@ -218,9 +219,9 @@ rule lcmaskbed:
rule gcmaskbed:
input:
repeats = "reference.fasta.out"
repeats = SUBDIR + "reference.fasta.out"
output:
bed = "reference.gcmask.bed"
bed = SUBDIR + "reference.gcmask.bed"
run:
repeats = []
with open(output.bed, "w") as fout:
......@@ -237,10 +238,10 @@ rule gcmaskbed:
rule dict:
input:
fasta = PREFIX+".fasta",
fai = PREFIX+".fasta.fai"
fasta = SUBDIR + PREFIX+".fasta",
fai = SUBDIR + PREFIX+".fasta.fai"
output:
PREFIX+".dict"
SUBDIR + PREFIX+".dict"
shell:
"picard CreateSequenceDictionary "
" R= {input.fasta} "
......@@ -248,10 +249,10 @@ rule dict:
rule nstretches:
input:
fasta = PREFIX+".fasta",
fai = PREFIX+".fasta.fai"
fasta = SUBDIR + PREFIX+".fasta",
fai = SUBDIR + PREFIX+".fasta.fai"
output:
PREFIX+".Nstretch.bed"
SUBDIR + PREFIX+".Nstretch.bed"
params:
nstrech = config['maxNstretches']
threads:
......@@ -262,12 +263,12 @@ rule nstretches:
rule ploidy:
output:
PREFIX+".ploidymap.txt"
SUBDIR + PREFIX+".ploidymap.txt"
shell:
" printf \"*\t*\t*\t*\t2\n\" > {output}"
rule gendermask:
output:
PREFIX+".gendermask.bed"
SUBDIR + PREFIX+".gendermask.bed"
shell:
" touch {output}"
......@@ -11,21 +11,34 @@ rule gendermap:
fout.write("\t".join([sample, "F"])+"\n")
fout.close()
rule preprocess:
input:
unpack(snakemake_utils.get_inputs_bams),
bamlist = "{batch}/bamlist.list",
gendermap = "{batch}/genomestrip/gendermap.txt",
genome = config['refbundle'] if 'refbundle' in config else REFERENCE
output:
metadata = "{batch}/genomestrip/metadata/sample_gender.report.txt"
log:
stdout = "{batch}/logs/genomestrip/preprocess.o",
stderr = "{batch}/logs/genomestrip/preprocess.e"
shell:
"gstrip_preprocess.sh {input.bamlist} {input.gendermap}"
" {input.genome} {wildcards.batch}/genomestrip preprocess "
" 1>{log.stdout} 2>{log.stderr}"
if "genomestrip" in tools:
PREFIX = os.path.splitext(config["refbundle"])[0]
rule preprocess:
input:
PREFIX+".fasta.fai",
PREFIX+".gcmask.fasta.fai",
PREFIX+".svmask.fasta.fai",
PREFIX+".lcmask.fasta.fai",
PREFIX+".rdmask.bed",
PREFIX+".dict",
PREFIX+".ploidymap.txt",
PREFIX+".gendermask.bed",
PREFIX+".Nstretch.bed",
unpack(snakemake_utils.get_inputs_bams),
bamlist = "{batch}/bamlist.list",
gendermap = "{batch}/genomestrip/gendermap.txt",
genome = config['refbundle']
output:
metadata = "{batch}/genomestrip/metadata/sample_gender.report.txt"
log:
stdout = "{batch}/logs/genomestrip/preprocess.o",
stderr = "{batch}/logs/genomestrip/preprocess.e"
shell:
"gstrip_preprocess.sh {input.bamlist} {input.gendermap}"
" {input.genome} {wildcards.batch}/genomestrip preprocess "
" 1>{log.stdout} 2>{log.stderr}"
rule genomestrip:
input:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment