Skip to content
Snippets Groups Projects
build_pop.py 27.8 KiB
Newer Older
Floreal Cabanettes's avatar
Floreal Cabanettes committed
#!/usr/bin/env python3

import sys
Floreal Cabanettes's avatar
Floreal Cabanettes committed
import os
Floreal Cabanettes's avatar
Floreal Cabanettes committed
import shutil
Floreal Cabanettes's avatar
Floreal Cabanettes committed
import random
import argparse
Floreal Cabanettes's avatar
Floreal Cabanettes committed
from collections import OrderedDict
import vcf
from Bio import SeqIO
from pysam import tabix_compress, tabix_index, VariantFile
Floreal Cabanettes's avatar
Floreal Cabanettes committed
from lib.variants_simulator import VariantsSimulator
from lib.exceptions import InputException, ExecException
import multiprocessing
Floreal Cabanettes's avatar
Floreal Cabanettes committed

class Print:
    def __init__(self, stdout=None, stderr=None):
        self.stdout = stdout
        self.stderr = stderr
        if self.stdout is not None and os.path.isfile(self.stdout):
            os.remove(self.stdout)
        if self.stderr is not None and os.path.isfile(self.stderr):
            os.remove(self.stderr)

    def out(self, message):
        if self.stdout is not None:
            with open(self.stdout, "a") as stdout_f:
                print(message, file=stdout_f)
        else:
            print(message, file=sys.stdout)

    def err(self, message):
        if self.stderr is not None:
            with open(self.stderr, "a") as stderr_f:
                print(message, file=stderr_f)
        else:
            print(message, file=sys.stderr)


def check_min_size(value):
    ivalue = int(value)
    if ivalue <= 1:
        raise argparse.ArgumentTypeError("%s is an invalid size value (>= 1)" % value)
    return ivalue


def parse_args():
    """
    Parse script arguments
    :return: argparse arguments object
    """
    parser = argparse.ArgumentParser(description='Generate simulated populations with SV',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-n", "--nb-inds", help="Number of individuals to generate", type=int)
    parser.add_argument("-r", "--reference", help="Reference genome", required=True)
    parser.add_argument("-ns", "--nstretches", help="N-stretches positions bed file", required=False)
    parser.add_argument("-s", "--sv-list", help="File containing the SVs", required=False)
    parser.add_argument("-c", "--coverage", help="Coverage of reads", default=15, type=int)
    parser.add_argument("-o", "--output-dir", help="Output directory", default="res")
    parser.add_argument("-e", "--force-outputdir", help="Delete output directory before start, if already exists",
                        action='store_const', const=True, default=False)
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    parser.add_argument("-f", "--force-polymorphism", help="Force polymorphism for each SV", action='store_const',
                        const=True, default=False)
    parser.add_argument("-a", "--haploid", help="Make a haploid genome, instead of diploid one", action="store_const",
                        const=True, default=False)
    parser.add_argument("-pd", "--proba-del", help="Probabilty to have a deletion", type=float, default=0.000001)
    parser.add_argument("-pi", "--proba-inv", help="Probablity to have an insertion", type=float, default=0.000001)
    parser.add_argument("-l", "--read-len", help="Generate reads having a length of LEN", type=int, default=100)
    parser.add_argument("-m", "--insert-len-mean", help="Generate inserts (fragments) having an average length of LEN",
                        type=int, default=300)
    parser.add_argument("-v", "--insert-len-sd", help="Set the standard deviation of the insert (fragment) length (%%)",
                        type=int, default=30)
    parser.add_argument("-q", "--quiet", help="Don't ask anything, choose default answer instead", action="store_const",
                        const=True, default=False)
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    parser.add_argument("-md", "--min-deletions", help="Minimum of deletions to generate (>=1)", default=0,
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    parser.add_argument("-mi", "--min-inversions", help="Minimum of inversions to generate (>=1)", default=0,
                        type=check_min_size)
    parser.add_argument("--max-try", help="Maximum of tries", default=10, type=check_min_size)
    parser.add_argument("-t", "--threads", help="Number of threads", default=multiprocessing.cpu_count(), type=int)
    parser.add_argument("-g", "--genotypes", help="Position of SV with genotypes of individuals")

    args = parser.parse_args()

    if args.nb_inds is None and args.genotypes is None:
        parser.error("--nb-inds parameter is required (except if --genotypes parameter is given)")

    if args.proba_del > 1 or args.proba_del < 0:
        parser.error("Probability of deletions must be between 0 and 1")

    if args.proba_inv > 1 or args.proba_inv < 0:
        parser.error("Probability of inversions must be between 0 and 1")

    if args.proba_del + args.proba_inv > 1:
        parser.error("Sum of inversions and deletions probabilities must be lesser than 1")

    return args


def _allele(frequency):
    """
    Get randomly an allele, given a frequency
    :param frequency: frequency of the allele {float}
    :return: allele randomly choosen {0 or 1}
    """
    return 1 if random.uniform(0, 1) < frequency else 0
def _get_genotypes_for_inds(nb_inds, haploid, freq):
    """
    Get genotypes for each individual
    :param nb_inds: number of individuals {int}
    :param haploid: is the genome hamploid {bool}
    :param freq: frequency of the allele {float}
    :return: list of genotypes, vcf data {list}
    """
    all_genotypes = []
    genotypes_row = []
    for i in range(1, nb_inds + 1):
Floreal Cabanettes's avatar
Floreal Cabanettes committed
        if not haploid:
            genotype = str(_allele(freq)) + "/" + str(_allele(freq))
Floreal Cabanettes's avatar
Floreal Cabanettes committed
        else:
            genotype = str(_allele(freq))
        genotype_data = vcf.model._Call(None, "indiv" + str(i), vcf.model.make_calldata_tuple("GT")(GT=genotype))
        genotypes_row.append(genotype_data)
        all_genotypes.append(genotype)
    return all_genotypes, genotypes_row


def _svsort(sv, chrm, genotypes_for_inds):
    """
    Function to sort regions
    :param sv: the variant
    :param chrm: chromosome {str}
    :param genotypes_for_inds: dictionary of genotypes for each individual
    return int(genotypes_for_inds[chrm][sv]["start"])
def _build_vcf_header(vcf_file, prg_path, tmp_dir, nb_inds):
    """
    Build header of the VCF file
    :param vcf_file: vcf file full path name {str}
    :param prg_path: program path {str}
    :param tmp_dir: temporary directory {str}
    :param nb_inds: number of individuals {int}
    """
    try:
        with open(os.path.join(prg_path, "template.vcf"), "r") as template:
            try:
                with open(vcf_file, "w") as my_template:
                    for line in template:
                        if line[:6] == "#CHROM":
                            line = line.replace("\n", "")
                            for i in range(0, nb_inds):
                                line += "\tindiv" + str(i+1)
                            line += "\n"
                        my_template.write(line)
            except IOError:
                raise ExecException("ERROR: unable to create template file \"{0}\" in temp dir.".
                                format(os.path.join(tmp_dir, "template.vcf")))
    except IOError:
        raise ExecException("ERROR: template file not found in program directory.")
def build_genotypes_vcf_list(deletions: dict, inversions: dict, output_vcf, haploid, force_polymorphism, nb_inds,
                             tmp_dir, prg_path):
    """
    Build VCF file describing genotypes for each individual (and the associated python dictionary)
    :param deletions: deletions description {dict}
    :param inversions: inversions description {dict}
    :param output_vcf: output VCF file full path name {str}
    :param haploid: is haploid {bool}
    :param force_polymorphism: force polymorphism {bool}
    :param nb_inds: number of individuals {int}
    :param tmp_dir: temporary directory {str}
    :param prg_path: program path {str}
    :return: dictionary of genotypes for each variant for each dictionary {OrderedDict}
    """
    genotypes_for_inds_DEL = OrderedDict()
    # { chr: { id_indiv: {start: #start, end: #end, genotypes: [[0,1],[1,1],...], ...}, # ...}

    vcf_file = os.path.join(tmp_dir, "template.vcf")

    # Build VCF header:
    _build_vcf_header(vcf_file, prg_path, tmp_dir, nb_inds)

    for chrm, deletes in deletions.items():
        for delete in deletes:
            if chrm not in genotypes_for_inds_DEL:
                genotypes_for_inds_DEL[chrm] = {}
            genotypes_for_inds_DEL[chrm][delete["name"]] = {"start": delete["start"], "end": delete["end"], "genotypes": []}

            # Get genotypes:
            all_genotypes, genotypes = [], []
            if force_polymorphism:
                polymorph = False
                while not polymorph:
                    all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, delete["freq"])
                    polymorph = len(set(all_genotypes)) > 1
            else:
                all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, delete["freq"])
            genotypes_for_inds_DEL[chrm][delete["name"]]["genotypes"] = [x.split("/") for x in all_genotypes]

            info = {"END": delete["end"], "AF": delete["freq"]}
            vcf_record = vcf.model._Record(chrm, delete["start"], delete["name"], "N", [vcf.model._SV("DEL")], ".",
                                           ".", info, "GT", [0], genotypes)
            records.append(vcf_record)

    # Inversions:
    genotypes_for_inds_INV = OrderedDict()
    for chrm, the_inversions in inversions.items():
        for inversion in the_inversions:
            if chrm not in genotypes_for_inds_INV:
                genotypes_for_inds_INV[chrm] = {}
            genotypes_for_inds_INV[chrm][inversion["name"]] = {"start": inversion["start"], "end": inversion["end"],
                                                           "genotypes": []}
            # Get genotypes:
            all_genotypes, genotypes = [], []
            if force_polymorphism:
                polymorph = False
                while not polymorph:
                    all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, inversion["freq"])
                    polymorph = len(set(all_genotypes)) > 1
            else:
                all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, inversion["freq"])
            genotypes_for_inds_INV[chrm][inversion["name"]]["genotypes"] = [x.split("/") for x in all_genotypes]

            info = {"END": inversion["end"], "AF": inversion["freq"]}
            vcf_record = vcf.model._Record(chrm, inversion["start"], inversion["name"], "N", [vcf.model._SV("INV")],
                                           ".", ".", info, "GT", [0], genotypes)
            records.append(vcf_record)
    records.sort(key=lambda r: (r.CHROM, r.start))

    with open(output_vcf, "w") as o_vcf:
        vcf_reader = vcf.Reader(filename=vcf_file)
        vcf_writer = vcf.Writer(o_vcf, vcf_reader)
        for record in records:
            vcf_writer.write_record(record)
        vcf_writer.close()

    tabix_compress(output_vcf, output_vcf + ".gz", True)
    tabix_index(output_vcf + ".gz", force=True, preset="vcf")
    os.remove(output_vcf)

    return genotypes_for_inds_DEL, genotypes_for_inds_INV
def load_genotypes_from_file(genotypes_file):
    variants = VariantFile(genotypes_file)
    genotypes_for_inds_DEL = {}
    genotypes_for_inds_INV = {}
    nb_inds = 0
    for variant in variants:
        chrm = variant.chrom
        id_v = variant.id
        start = variant.start
        end = variant.stop
        type_v = variant.alts[0][1:-1]
        samples = variant.samples
        gt = [list(map(str, samples[x]["GT"])) for x in samples]
        genotypes_for_inds = None
        if type_v == "DEL":
            genotypes_for_inds = genotypes_for_inds_DEL
        elif type_v == "INV":
            genotypes_for_inds = genotypes_for_inds_INV
        else:
            print("ERROR: Variant type not supported: %s. Ignoring this line..." % type_v)
        if genotypes_for_inds is not None:
            if chrm not in genotypes_for_inds:
                genotypes_for_inds[chrm] = {}
            genotypes_for_inds[chrm][id_v] = {
                "start": start,
                "end": end,
                "genotypes": gt
            }
            nb_inds = len(gt)
    return nb_inds, genotypes_for_inds_DEL, genotypes_for_inds_INV
def _compute_keeped_genomic_regions(svs, svs_infos, haploid):
    """
    Get list of all regions keeped (not deleted)
    :param svs: list of variants (deletions) {list}
    :param svs_infos: infos of variants {dict}
    :param haploid: is haploid {bool}
    :return: regions for each individuals {OrderedDict}, the last position for each region {OrderedDict}
    """
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    regions = OrderedDict()
    current_region_pointer = OrderedDict()
    for svs_i in svs:
        i = 0
        for genotypes in svs_infos[svs_i]["genotypes"]:
            if i not in regions:
                regions[i] = {}
                current_region_pointer[i] = {}
Floreal Cabanettes's avatar
Floreal Cabanettes committed
            for j in range(0, 2 if not haploid else 1):  # For each chromosome of the diploid genome, or the chromosome
                # of the haploid genome
Floreal Cabanettes's avatar
Floreal Cabanettes committed
                if j not in regions[i]:
                    regions[i][j] = []
                    current_region_pointer[i][j] = "0"
                if svs_infos[svs_i]["genotypes"][i][j] == "1":
                    regions[i][j].append([current_region_pointer[i][j], svs_infos[svs_i]["start"]])
                    current_region_pointer[i][j] = svs_infos[svs_i]["end"]
            i += 1
    return regions, current_region_pointer
Floreal Cabanettes's avatar
Floreal Cabanettes committed
def _build_fastas_deletions(chrm, regions, current_region_pointer, output_dir, fasta_orig_chr, last_nt):
    """
    Build fasta files
    :param chrm:
    :param regions:
    :param current_region_pointer:
    :param output_dir:
    :param fasta_orig_chr:
    :param last_nt:
    :return:
    """
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    fastas = []
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    for indiv, chrs_dips in regions.items():
        id_chrs = list(chrs_dips.keys())
        id_chrs = sorted(id_chrs)
Floreal Cabanettes's avatar
Floreal Cabanettes committed
        fastas.append([])
Floreal Cabanettes's avatar
Floreal Cabanettes committed
        for id_chr in id_chrs:
            chr_dip = chrs_dips[id_chr]  # SVs for each diploid chromosome
            logs_regions = []  # Store logs
            # Build FASTA and store logs:
Floreal Cabanettes's avatar
Floreal Cabanettes committed

            fasta = ""
            last_one = 0
            for chr_region in chr_dip:
                fasta += fasta_orig_chr[n][c][int(chr_region[0]):int(chr_region[1])]
                logs_regions.append("\t".join(str(x) for x in chr_region))
                last_one = int(chr_region[1])
            if last_one < last_nt:
                logs_regions.append("\t".join([str(current_region_pointer[indiv][id_chr]), str(last_nt)]))
                fasta += fasta_orig_chr[n][c][int(current_region_pointer[indiv][id_chr]):last_nt]
            fastas[n].append(fasta)
Floreal Cabanettes's avatar
Floreal Cabanettes committed

            # Write logs
                with open(os.path.join(output_dir, "indiv" + str(indiv + 1) + ".regions.log"), "a") as log_handle:
                    log_handle.write(chrm + "_" + str(id_chr) + "\t")
                    log_handle.write("\n\t".join(logs_regions))
                    log_handle.write("\n")
            except IOError:
                raise ExecException("ERROR: unable to write log file: \"{0}\"".
                                    format(os.path.join(output_dir, "indiv" + str(indiv + 1) + ".regions.log")))
            c += 1
        n += 1
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    return fastas
def _build_fastas_inversions(fasta_orig_chrm: dict, genotypes_for_inds: dict, nb_inds, haploid):
    fastas = []
    for j in range(0, nb_inds):
        fastas.append([fasta_orig_chrm] if haploid else [fasta_orig_chrm, fasta_orig_chrm])
    for inv_name, props in genotypes_for_inds.items():
        i = 0
        for genotype in props["genotypes"]:
            for k in range(0, len(genotype)):
                if genotype[k] == "1":
                    start = int(props["start"])
                    end = int(props["end"])
Floreal Cabanettes's avatar
Floreal Cabanettes committed
                    o_id = fastas[i][k].id
                    o_name = fastas[i][k].name
                    o_description = fastas[i][k].description
                    fastas[i][k] = fastas[i][k][:start] + fastas[i][k][start:end].reverse_complement() + \
                                   fastas[i][k][end:]
Floreal Cabanettes's avatar
Floreal Cabanettes committed
                    fastas[i][k].id = o_id
                    fastas[i][k].name = o_name
                    fastas[i][k].description = o_description
            i += 1
    return fastas
Floreal Cabanettes's avatar
Floreal Cabanettes committed
def _write_fastas(fastas, output_dir):
    for indiv, chrs_dips in zip(range(0, len(fastas)), fastas):
        for id_chr, fasta in zip(range(0, len(chrs_dips)), chrs_dips):
            try:
                with open(os.path.join(output_dir, "indiv" + str(indiv + 1) + "_chr_" + str(id_chr) + ".fasta"),
                          "a") as output_handle:
                    SeqIO.write(fasta, output_handle, "fasta")
            except IOError:
                raise ExecException("ERROR: unable to write \"{0}\" file.".
                                    format(os.path.join(output_dir, "indiv" + str(indiv + 1) + "_chr_" + str(id_chr) +
                                                        ".fasta")))


def build_fastas_chromosomes(reference, genotypes_for_inds_DEL, genotypes_for_inds_INV, haploid, output_dir, nb_inds,
                             printer):
    printer.out("BUILD FASTA GENOME FOR EACH INDIVIDUAL...\n")
    fasta_orig = SeqIO.index(reference, "fasta")

    # First, inversions (does not change coordinates):
    printer.out("STARTING WITH INVERSIONS...")
    fastas_by_chrm = {}
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    for chrm, svs_infos in genotypes_for_inds_INV.items():
        printer.out("PROCESSING CHROMOSOME {0}...\n".format(chrm))
        svs = list(svs_infos.keys())
        fastas_by_chrm[chrm] = _build_fastas_inversions(fasta_orig[chrm], genotypes_for_inds_INV[chrm], nb_inds, haploid)

    printer.out("STARTING WITH DELETIONS...")
    # Then, deletions:
    for chrm, svs_infos in genotypes_for_inds_DEL.items():
        printer.out("PROCESSING CHROMOSOME {0}...\n".format(chrm))

        svs = list(svs_infos.keys())
        svs = sorted(svs, key=lambda x:_svsort(x, chrm, genotypes_for_inds_DEL))
Floreal Cabanettes's avatar
Floreal Cabanettes committed
        if chrm in fastas_by_chrm:
            fastas_orig_chr = fastas_by_chrm[chrm]
        else:  # Build data as _build_fastas_inversions does
            fastas_orig_chr = []
            for i in range(0, nb_inds):
                fastas_orig_chr.append([fasta_orig[chrm]] if haploid else [fasta_orig[chrm], fasta_orig[chrm]])
        last_nt = len(fastas_orig_chr[0][0])-1

        # Compute keeped genomic regions for each diploid chromosome:
        regions, current_region_pointer = _compute_keeped_genomic_regions(svs, svs_infos, haploid)

        # Build FASTA of each diploid/haploid chromosome:
Floreal Cabanettes's avatar
Floreal Cabanettes committed
        fastas_by_chrm[chrm] = _build_fastas_deletions(chrm, regions, current_region_pointer, output_dir, fastas_orig_chr, last_nt)

    printer.out("WRITING FASTAS...")
    for chrm, fastas in fastas_by_chrm.items():
        _write_fastas(fastas, output_dir)
    printer.out("")


def generate_samples_fastq(haploid, nb_inds, output_dir, coverage, read_len, insert_len_mean, insert_len_sd,
                           prg_path, threads, stdout, stderr):

    # Generate reads for all individuals:
    cmd = [prg_path + "/pirs/pirs", "simulate", "-z", "-x", str(coverage), "-d", "-B",
           prg_path + "/pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz", "-I",
           prg_path + "/pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix", "-l", str(read_len), "-m",
           str(insert_len_mean), "-v", str(insert_len_sd),
           "-G", prg_path + "/pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat", "-t", str(threads), "-o"]
    if haploid:
        cmd.remove("-d")
    for i in range(1, nb_inds+1):
        cmd_full = cmd + [output_dir, "-s", "indiv%d" % i]
        prefix_fasta = os.path.join(output_dir, "indiv" + str(i))
        chr0 = prefix_fasta + "_chr_0.fasta"
        if not haploid:
            chr1 = prefix_fasta + "_chr_1.fasta"
            cmd_full += [chr0, chr1]
        subprocess.call(cmd_full, stdout=open(stdout, "a") if stdout is not None else None,
                        stderr=open(stderr, "a") if stderr is not None else None)
def confirm(deletions: dict, inversions: dict, variants: dict, printer: Print, ask=True):
    # Deletions:
    variants_del = sorted(variants["DEL"], key=lambda x: x["min"])
    variants_del_counts = OrderedDict()
    variants_del_counts_chr = {}
    for variant in variants_del:
        variants_del_counts["{0}-{1}".format(variant["min"], variant["max"])] = 0
    for chrm, deletes in deletions.items():
        nb_dels_chr = len(deletes)
        nb_dels += nb_dels_chr
        variants_del_counts_chr[chrm] = nb_dels_chr
Floreal Cabanettes's avatar
Floreal Cabanettes committed
        for delete in deletes:
            v_len = delete["length"]
            for variant in variants_del:
                if variant["min"] < v_len <= variant["max"]:
                    variants_del_counts["{0}-{1}".format(variant["min"], variant["max"])] += 1
Floreal Cabanettes's avatar
Floreal Cabanettes committed
                    break
    printer.out("We generate {0} deletion variants.".format(nb_dels))
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    if nb_dels > 0:
        printer.out("Ranges:")
        for v_range, v_count in variants_del_counts.items():
            printer.out("\t- Range {0}: {1}".format(v_range, v_count))
        printer.out("Chromosomes:")
        for chrm in sorted(list(variants_del_counts_chr.keys())):
            printer.out("\t- {0}: {1}".format(chrm, variants_del_counts_chr[chrm]))

    # inversions:
    nb_invs = 0
    variants_inv = sorted(variants["INV"], key=lambda x: x["min"])
    variants_inv_counts = OrderedDict()
    variants_inv_counts_chr = {}
    for variant in variants_inv:
        variants_inv_counts["{0}-{1}".format(variant["min"], variant["max"])] = 0
    for chrm, the_inversions in inversions.items():
        nb_invs_chr = len(the_inversions)
        nb_invs += nb_invs_chr
        variants_inv_counts_chr[chrm] = nb_invs_chr
        for inversion in the_inversions:
            v_len = inversion["length"]
            for variant in variants_inv:
                if variant["min"] < v_len <= variant["max"]:
                    variants_inv_counts["{0}-{1}".format(variant["min"], variant["max"])] += 1
                    break
    printer.out("We generate {0} inversion variants.".format(nb_invs))
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    if nb_invs > 0:
        printer.out("Ranges:")
        for v_range, v_count in variants_inv_counts.items():
            printer.out("\t- Range {0}: {1}".format(v_range, v_count))
        printer.out("Chromosomes:")
        for chrm in sorted(list(variants_inv_counts_chr.keys())):
            printer.out("\t- {0}: {1}".format(chrm, variants_inv_counts_chr[chrm]))
    printer.out("")

    # Confirm:
    if ask:
        return input("Continue [Y/n]? ") in ["y", "Y", ""]
    return True
def init(output_dir, force_outputdir, sv_list, nstretches, nb_inds, reference, proba_del, proba_inv, haploid,
         force_polymorphism, coverage, read_len, insert_len_mean, insert_len_sd, threads, genotypes=None,
         min_deletions=1, min_inversions=1, max_try=10, quiet=True, stdout=None, stderr=None):

    printer = Print(stdout=stdout, stderr=stderr)

    if os.path.isdir(output_dir):
        if force_outputdir:
Floreal Cabanettes's avatar
Floreal Cabanettes committed
            shutil.rmtree(output_dir)
        else:
            printer.err("Error: output directory {0} already exists.".format(output_dir))
Floreal Cabanettes's avatar
Floreal Cabanettes committed
            return 1
    elif os.path.isfile(output_dir):
        printer.err("Error: unable to create output directory {0}: file exists.".format(output_dir))
    if nb_inds is not None and nb_inds < 2:
        printer.err("nb-inds must be at least 2")
        return 1

    if not os.path.isfile(reference + ".fai"):
        os.system("samtools faidx " + reference)

    tmp_dir = tempfile.mkdtemp()

    prg_path = os.path.dirname(os.path.realpath(__file__))

Floreal Cabanettes's avatar
Floreal Cabanettes committed
    os.mkdir(output_dir)
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    ############################
    # Define fixed files names #
    ############################
    output_vcf = os.path.join(output_dir, "genotypes.vcf")

    ################
    # Launch steps #
    ################

    inversions = None
    sv_sim = None

    if genotypes is None:
        printer.out("GENERATE RANDOM DELETIONS VARIANTS...\n")
        try:
Floreal Cabanettes's avatar
Floreal Cabanettes committed
            nb_deletions = -1  # To allow 0 deletions
            nb_inversions = -1  # To allow 0 inversions
            nb_try = 0
            sv_sim = VariantsSimulator(sv_list, nstretches, threads, stdout, stderr)
Floreal Cabanettes's avatar
Floreal Cabanettes committed
            printer.out("Try to generate at least %s deletions and %s inversions..." % (min_deletions, min_inversions))
Floreal Cabanettes's avatar
Floreal Cabanettes committed
            while (nb_deletions < min_deletions or nb_inversions < min_inversions) and nb_try < max_try:
                printer.out("\nTry {0} / {1}...".format(nb_try + 1, max_try))
                nb_deletions, deletions, nb_inversions, inversions = sv_sim.get_random_variants(proba_del, proba_inv,
                                                                                                reference)
Floreal Cabanettes's avatar
Floreal Cabanettes committed
                max_del = max(max_del, nb_deletions)
                max_inv = max(max_inv, nb_inversions)
                sv_sim.print_flush()
                nb_try += 1
            if nb_deletions < min_deletions:
                printer.err("\nUnable to generate %s deletions. Try to reduce size of deletions or increase "
                                     "general probability to have a deletion." % min_deletions)
                printer.out("\nMaximum of each event: %d deletions, %d inversions." % (max_del, max_inv))
            if nb_inversions < min_inversions:
                printer.err("\nUnable to generate %s inversions. Try to reduce size of inversions or increase "
                            "general probability to have an inversion." % min_inversions)
Floreal Cabanettes's avatar
Floreal Cabanettes committed
                printer.out("\nMaximum of each event: %d deletions, %d inversions." % (max_del, max_inv))
                return 1
            printer.out("")
        except InputException as e:
            printer.err(e)
        except IOError as e:
            printer.err(e)
            return 1
        except Exception:
            printer.err(traceback.format_exc())
            return 1
        printer_confirm = printer
Floreal Cabanettes's avatar
Floreal Cabanettes committed
        ok = True
        if genotypes is None:
            if quiet:
Floreal Cabanettes's avatar
Floreal Cabanettes committed
                printer_confirm = Print(stdout=os.path.join(output_dir, "confirm.txt"), stderr=stderr)
                ok = confirm(deletions, inversions, sv_sim.variants, printer_confirm, False)
Floreal Cabanettes's avatar
Floreal Cabanettes committed
            else:
                ok = confirm(deletions, inversions, sv_sim.variants, printer_confirm)
        if ok:
            try:
                if genotypes is None:
                    printer.out("GENERATE SUMMARY VCF FILE...\n")
                    genotypes_for_inds_DEL, genotypes_for_inds_INV = \
                        build_genotypes_vcf_list(deletions, inversions, output_vcf, haploid, force_polymorphism, nb_inds,
                                                 tmp_dir, prg_path)
                else:
                    nb_inds, genotypes_for_inds_DEL, genotypes_for_inds_INV = load_genotypes_from_file(genotypes)
                    if nb_inds < 2:
                        printer.err("nb-inds must be at least 2")
                        return 1
                build_fastas_chromosomes(reference, genotypes_for_inds_DEL, genotypes_for_inds_INV, haploid, output_dir,
                                         nb_inds, printer)
                printer.out("GENERATE RANDOM READS FOR EACH INDIVIDUAL FROM GENOME...\n")
                generate_samples_fastq(haploid, nb_inds, output_dir, coverage, read_len, insert_len_mean,
                                       insert_len_sd, prg_path, threads, stdout, stderr)
                printer.out("DONE!\n")
            except ExecException as e:
                printer.err(e)
                return 1
            except Exception:
                printer.err(traceback.format_exc())
                return 1
        else:
            printer.out("Aborted!\n")
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    return 0

def main():
    args = parse_args()
    init(**{k: v for k, v in vars(args).items() if k in init.__code__.co_varnames})

if __name__ == '__main__':
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    sys.exit(main())