Skip to content
Snippets Groups Projects
Commit e4a96663 authored by Floreal Cabanettes's avatar Floreal Cabanettes
Browse files

Some improvements

parent 0465abac
No related branches found
No related tags found
No related merge requests found
......@@ -2,6 +2,7 @@
import sys
import os
import shutil
import random
import argparse
from collections import OrderedDict
......@@ -11,6 +12,7 @@ import tempfile
from lib.svrunner_utils import eprint
from pysam import tabix_compress, tabix_index
from variants_simulator import VariantsSimulator
from exceptions import InputException
def parse_args():
......@@ -25,10 +27,12 @@ def parse_args():
parser.add_argument("-s", "--sv-list", help="File containing the SVs", required=False)
parser.add_argument("-c", "--coverage", help="Coverage of reads", default=15, type=int)
parser.add_argument("-o", "--output-directory", help="Output directory", default="res")
parser.add_argument("-f", "--force-polymorphism", help="Force polymorphism for each SV", action='store_const', const=True,
default=False)
parser.add_argument("-a", "--haploid", help="Make a haploid genome, instead of diploid one", action="store_const", const=True,
default=False)
parser.add_argument("-e", "--erase-output", help="Delete output, if directory already exists", action='store_const',
const=True, default=False)
parser.add_argument("-f", "--force-polymorphism", help="Force polymorphism for each SV", action='store_const',
const=True, default=False)
parser.add_argument("-a", "--haploid", help="Make a haploid genome, instead of diploid one", action="store_const",
const=True, default=False)
parser.add_argument("-p", "--proba-del", help="Probabilty to have a deletion", type=float, default=0.000001)
parser.add_argument("-l", "--read-len", help="Generate reads having a length of LEN", type=int, default=100)
parser.add_argument("-m", "--insert-len-mean", help="Generate inserts (fragments) having an average length of LEN",
......@@ -285,11 +289,24 @@ def generate_samples_fastq(haploid, nb_inds, output_dir, coverage, read_len, ins
insert_len_mean, insert_len_sd, threads))
def confirm(deletions: dict):
def confirm(deletions: dict, variants: dict):
nb_dels = 0
variants = sorted(variants["DEL"], key=lambda x: x["min"])
variants_counts = OrderedDict()
for variant in variants:
variants_counts["{0}-{1}".format(variant["min"], variant["max"])] = 0
for deletes in deletions.values():
nb_dels += len(deletes)
for delete in deletes:
v_len = delete["length"]
for variant in variants:
if variant["min"] < v_len < variant["max"]:
variants_counts["{0}-{1}".format(variant["min"], variant["max"])] += 1
break
print("We generate {0} deletion variants.".format(nb_dels))
for v_range, v_count in variants_counts.items():
print("\t- Range {0}: {1}".format(v_range, v_count))
print("")
return input("Continue [Y/n]? ") in ["y", "Y", ""]
......@@ -303,8 +320,11 @@ def main():
nb_inds = args.nb_inds
if os.path.isdir(output_dir):
eprint("Error: output directory {0} already exists.".format(output_dir))
return 1
if args.erase_output:
shutil.rmtree(output_dir)
else:
eprint("Error: output directory {0} already exists.".format(output_dir))
return 1
elif os.path.isfile(output_dir):
eprint("Error: unable to create output directory {0}: file exists.".format(output_dir))
return 1
......@@ -329,9 +349,13 @@ def main():
################
print("GENERATE RANDOM DELETIONS VARIANTS...\n")
sv_sim = VariantsSimulator(sv_list)
deletions = sv_sim.get_random_deletions(args.proba_del, args.reference)
if args.quiet or confirm(deletions):
try:
sv_sim = VariantsSimulator(sv_list)
deletions = sv_sim.get_random_deletions(args.proba_del, args.reference)
except InputException as e:
print(e)
return False
if args.quiet or confirm(deletions, sv_sim.variants):
genotypes_for_inds = build_genotypes_vcf_list(deletions, output_vcf, haploid, args.force_polymorphism, nb_inds,
tmp_dir, prg_path)
build_fastas_chromosomes(reference, genotypes_for_inds, haploid, output_dir)
......
DEL 100 200 0.7
DEL 200 500 0.2
DEL 500 1000 0.07
DEL 1000 2000 0.02
DEL 2000 10000 0.01
\ No newline at end of file
#!/usr/bin/env python3
import os
import sys
import re
import random
......@@ -11,39 +12,9 @@ from Bio import SeqIO
class VariantsSimulator:
def __init__(self, sv_list=None):
if sv_list is not None:
self.variants = self.__load_variants(sv_list)
else:
self.variants = {
"DEL": [
{
"min": 100,
"max": 200,
"proba": 0.7,
"proba_cumul": 0.7
}, {
"min": 200,
"max": 500,
"proba": 0.2,
"proba_cumul": 0.9
}, {
"min": 500,
"max": 1000,
"proba": 0.07,
"proba_cumul": 0.97
}, {
"min": 1000,
"max": 2000,
"proba": 0.02,
"proba_cumul": 0.99
}, {
"min": 2000,
"max": 10000,
"proba": 0.01,
"proba_cumul": 1.0
}
]
}
if sv_list is None:
sv_list = os.path.join(os.path.dirname(os.path.realpath(__file__)), "defaults.rules")
self.variants = self.__load_variants(sv_list)
self.deletions = {}
@staticmethod
......@@ -52,7 +23,7 @@ class VariantsSimulator:
allowed_sv = {"DEL", "INV", "DUP"}
with open(sv_list, "r") as sv_list_f:
for line in sv_list_f:
parts = re.split("\s", line.strip("\n"))
parts = re.split("\s+", line.strip("\n"))
type_sv = parts[0]
if type_sv not in allowed_sv:
raise InputException("Disallowed variant type: " + type_sv)
......@@ -119,9 +90,8 @@ class VariantsSimulator:
freq = 0.2 if random.uniform(0, 1) < 0.5 else 0.5
if chrm not in deletions:
deletions[chrm] = []
nb_total_str = str(nb_total)
deletions[chrm].append({
"name": "DEL{0}_{1}_{2}".format("0" * (5-len(nb_total_str)) + nb_total_str, chrm, nb_on_ch),
"name": "DEL{0}_{1}_{2}".format("%05d" % nb_total, chrm, nb_on_ch),
"start": i+1,
"end": i+1+del_size,
"length": del_size,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment