build_pop.py

#!/usr/bin/env python3

import sys
import os
import shutil
import random
import argparse
from collections import OrderedDict
import vcf
from Bio import SeqIO
import tempfile
from lib.svrunner_utils import eprint
from pysam import tabix_compress, tabix_index
from variants_simulator import VariantsSimulator
from exceptions import InputException
import multiprocessing
import subprocess

def parse_args():
    """
    Parse script arguments
    :return: argparse arguments object
    """
    parser = argparse.ArgumentParser(description='Generate simulated populations with SV',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-n", "--nb-inds", help="Number of individuals to generate", required=True, type=int)
    parser.add_argument("-r", "--reference", help="Reference genome", required=True)
    parser.add_argument("-s", "--sv-list", help="File containing the SVs", required=False)
    parser.add_argument("-c", "--coverage", help="Coverage of reads", default=15, type=int)
    parser.add_argument("-o", "--output-directory", help="Output directory", default="res")
    parser.add_argument("-e", "--erase-output", help="Delete output, if directory already exists", action='store_const',
                        const=True, default=False)
    parser.add_argument("-f", "--force-polymorphism", help="Force polymorphism for each SV", action='store_const',
                        const=True, default=False)
    parser.add_argument("-a", "--haploid", help="Make a haploid genome, instead of diploid one", action="store_const",
                        const=True, default=False)
    parser.add_argument("-p", "--proba-del", help="Probabilty to have a deletion", type=float, default=0.000001)
    parser.add_argument("-l", "--read-len", help="Generate reads having a length of LEN", type=int, default=100)
    parser.add_argument("-m", "--insert-len-mean", help="Generate inserts (fragments) having an average length of LEN",
                        type=int, default=300)
    parser.add_argument("-v", "--insert-len-sd", help="Set the standard deviation of the insert (fragment) length (%%)",
                        type=int, default=30)
    parser.add_argument("-q", "--quiet", help="Don't ask anything, choose default answer instead", action="store_const",
                        const=True, default=False)
    parser.add_argument("-t", "--threads", help="Number of threads", default=multiprocessing.cpu_count(), type=int)

    args = parser.parse_args()
    return args


def _allele(frequency):
    """
    Get randomly an allele, given a frequency
    :param frequency: frequency of the allele {float}
    :return: allele randomly choosen {0 or 1}
    """
    return 1 if random.uniform(0, 1) < frequency else 0


def _get_genotypes_for_inds(nb_inds, haploid, freq):
    """
    Get genotypes for each individual
    :param nb_inds: number of individuals {int}
    :param haploid: is the genome hamploid {bool}
    :param freq: frequency of the allele {float}
    :return: list of genotypes, vcf data {list}
    """
    all_genotypes = []
    genotypes_row = []
    for i in range(1, nb_inds + 1):
        if not haploid:
            genotype = str(_allele(freq)) + "/" + str(_allele(freq))
        else:
            genotype = str(_allele(freq))
        genotype_data = vcf.model._Call(None, "INDIV_" + str(i), vcf.model.make_calldata_tuple("GT")(GT=genotype))
        genotypes_row.append(genotype_data)
        all_genotypes.append(genotype)
    return all_genotypes, genotypes_row


def _svsort(sv, chrm, genotypes_for_inds):
    """
    Function to sort regions
    :param sv: the variant
    :param chrm: chromosome {str}
    :param genotypes_for_inds: dictionary of genotypes for each individual
    """
    return int(genotypes_for_inds[chrm][sv]["start"])


def _build_vcf_header(vcf_file, prg_path, tmp_dir, nb_inds):
    """
    Build header of the VCF file
    :param vcf_file: vcf file full path name {str}
    :param prg_path: program path {str}
    :param tmp_dir: temporary directory {str}
    :param nb_inds: number of individuals {int}
    """
    try:
        with open(os.path.join(prg_path, "template.vcf"), "r") as template:
            try:
                with open(vcf_file, "w") as my_template:
                    for line in template:
                        if line[:6] == "#CHROM":
                            line = line.replace("\n", "")
                            for i in range(0, nb_inds):
                                line += "\tINDIV_" + str(i+1)
                            line += "\n"
                        my_template.write(line)
            except IOError:
                eprint("ERROR: unable to create template file \"{0}\" in temp dir.".format(os.path.join(tmp_dir, "template.vcf")))
                exit(1)
    except IOError:
        eprint("ERROR: template file not found in program directory.")
        exit(1)


def build_genotypes_vcf_list(deletions: dict, output_vcf, haploid, force_polymorphism, nb_inds, tmp_dir, prg_path):
    """
    Build VCF file describing genotypes for each individual (and the associated python dictionary)
    :param deletions: deletions description {dict}
    :param output_vcf: output VCF file full path name {str}
    :param haploid: is haploid {bool}
    :param force_polymorphism: force polymorphism {bool}
    :param nb_inds: number of individuals {int}
    :param tmp_dir: temporary directory {str}
    :param prg_path: program path {str}
    :return: dictionary of genotypes for each variant for each dictionary {OrderedDict}
    """
    print("GENERATE SUMMARY VCF FILE...\n")
    genotypes_for_inds = OrderedDict()
    # { chr: { id_indiv: {start: #start, end: #end, genotypes: [[0,1],[1,1],...], ...}, # ...}

    vcf_file = os.path.join(tmp_dir, "template.vcf")

    # Build VCF header:
    _build_vcf_header(vcf_file, prg_path, tmp_dir, nb_inds)

    vcf_reader = vcf.Reader(filename=vcf_file)
    vcf_writer = vcf.Writer(open(output_vcf, "w"), vcf_reader)
    for chrm, deletes in deletions.items():
        for delete in deletes:
            if chrm not in genotypes_for_inds:
                genotypes_for_inds[chrm] = {}
            genotypes_for_inds[chrm][delete["name"]] = {"start": delete["start"], "end": delete["end"], "genotypes": []}

            # Get genotypes:
            all_genotypes, genotypes = [], []
            if force_polymorphism:
                polymorph = False
                while not polymorph:
                    all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, delete["freq"])
                    polymorph = len(set(all_genotypes)) > 1
            else:
                all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, delete["freq"])
            genotypes_for_inds[chrm][delete["name"]]["genotypes"] = [x.split("/") for x in all_genotypes]

            info = {"END": delete["end"], "AF": delete["freq"]}
            vcf_record = vcf.model._Record(chrm, delete["start"], delete["name"], "N", [vcf.model._SV("DEL")], ".",
                                           ".", info, "GT", [0], genotypes)
            vcf_writer.write_record(vcf_record)
    vcf_writer.close()

    tabix_compress(output_vcf, output_vcf + ".gz", True)
    tabix_index(output_vcf + ".gz", force=True, preset="vcf")

    return genotypes_for_inds


def _compute_keeped_genomic_regions(svs, svs_infos, haploid):
    """
    Get list of all regions keeped (not deleted)
    :param svs: list of variants (deletions) {list}
    :param svs_infos: infos of variants {dict}
    :param haploid: is haploid {bool}
    :return: regions for each individuals {OrderedDict}, the last position for each region {OrderedDict}
    """
    regions = OrderedDict()
    current_region_pointer = OrderedDict()
    for svs_i in svs:
        i = 0
        for genotypes in svs_infos[svs_i]["genotypes"]:
            if i not in regions:
                regions[i] = {}
                current_region_pointer[i] = {}
            for j in range(0, 2 if not haploid else 1):  # For each chromosome of the diploid genome, or the chromosome
                # of the haploid genome
                if j not in regions[i]:
                    regions[i][j] = []
                    current_region_pointer[i][j] = "0"
                if svs_infos[svs_i]["genotypes"][i][j] == "1":
                    regions[i][j].append([current_region_pointer[i][j], svs_infos[svs_i]["start"]])
                    current_region_pointer[i][j] = svs_infos[svs_i]["end"]
            i += 1
    return regions, current_region_pointer


def _build_fastas(chrm, regions, current_region_pointer, output_dir, fasta_orig_chr, last_nt):
    """
    Build fasta files
    :param chrm:
    :param regions:
    :param current_region_pointer:
    :param output_dir:
    :param fasta_orig_chr:
    :param last_nt:
    :return:
    """
    for indiv, chrs_dips in regions.items():
        id_chrs = list(chrs_dips.keys())
        id_chrs = sorted(id_chrs)
        for id_chr in id_chrs:
            chr_dip = chrs_dips[id_chr]  # SVs for each diploid chromosome
            logs_regions = []  # Store logs

            # Build FASTA and store logs:
            try:
                with open(os.path.join(output_dir, "INDIV_" + str(indiv + 1) + "_chr_" + str(id_chr) + ".fasta"),
                          "a") as output_handle:
                    fasta = ""
                    last_one = 0
                    for chr_region in chr_dip:
                        fasta += fasta_orig_chr[int(chr_region[0]):int(chr_region[1])]
                        logs_regions.append("\t".join(str(x) for x in chr_region))
                        last_one = int(chr_region[1])
                    if last_one < last_nt:
                        logs_regions.append("\t".join([str(current_region_pointer[indiv][id_chr]), str(last_nt)]))
                        fasta += fasta_orig_chr[int(current_region_pointer[indiv][id_chr]):last_nt]
                    SeqIO.write(fasta, output_handle, "fasta")
            except IOError:
                eprint("ERROR: unable to write \"{0}\" file.".
                       format(os.path.join(output_dir, "INDIV_" + str(indiv + 1) + "_chr_" + str(id_chr) + ".fasta")))
                exit(1)

            # Write logs
            try:
                with open(os.path.join(output_dir, "INDIV_" + str(indiv + 1) + ".regions.log"), "a") as log_handle:
                    log_handle.write(chrm + "_" + str(id_chr) + "\t")
                    log_handle.write("\n\t".join(logs_regions))
                    log_handle.write("\n")
            except IOError:
                eprint("ERROR: unable to write log file: \"{0}\"".
                       format(os.path.join(output_dir, "INDIV_" + str(indiv + 1) + ".regions.log")))


def build_fastas_chromosomes(reference, genotypes_for_inds, haploid, output_dir):
    print("BUILD FASTA GENOME FOR EACH INDIVIDUAL...\n")
    fasta_orig = SeqIO.index(reference, "fasta")

    for chrm, svs_infos in genotypes_for_inds.items():

        print("PROCESSING CHROMOSOME {0}...\n".format(chrm))

        svs = list(svs_infos.keys())
        svs = sorted(svs, key=lambda x:_svsort(x, chrm, genotypes_for_inds))

        fasta_orig_chr = fasta_orig[chrm]
        last_nt = len(fasta_orig_chr)-1

        # Compute keeped genomic regions for each diploid chromosome:
        regions, current_region_pointer = _compute_keeped_genomic_regions(svs, svs_infos, haploid)

        # Build FASTA of each diploid/haploid chromosome:
        _build_fastas(chrm, regions, current_region_pointer, output_dir, fasta_orig_chr, last_nt)


def generate_samples_fastq(haploid, nb_inds, output_dir, coverage, read_len, insert_len_mean, insert_len_sd,
                           prg_path, threads):

    print("GENERATE RANDOM READS FOR EACH INDIVIDUAL FROM GENOME...\n")

    # Generate reads for all individuals:
    cmd = [prg_path + "/pirs/pirs", "simulate", "-z", "-x", str(coverage), "-d", "-B",
           prg_path + "/pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz", "-I",
           prg_path + "/pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix", "-l", str(read_len), "-m",
           str(insert_len_mean), "-v", str(insert_len_sd),
           "-G", prg_path + "/pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat", "-t", str(threads), "-o"]
    if haploid:
        cmd.remove("-d")
    for i in range(1, nb_inds+1):
        prefix = os.path.join(output_dir, "INDIV_" + str(i))
        cmd_full = cmd + [prefix]
        chr0 = prefix + "_chr_0.fasta"
        if not haploid:
            chr1 = prefix + "_chr_1.fasta"
            cmd_full += [chr0, chr1]
        else:
            cmd_full.append(chr0)
        subprocess.call(cmd_full, stdout=sys.stdout, stderr=sys.stderr)


def confirm(deletions: dict, variants: dict):
    nb_dels = 0
    variants = sorted(variants["DEL"], key=lambda x: x["min"])
    variants_counts = OrderedDict()
    variants_counts_chr = {}
    for variant in variants:
        variants_counts["{0}-{1}".format(variant["min"], variant["max"])] = 0
    for chrm, deletes in deletions.items():
        nb_dels_chr = len(deletes)
        nb_dels += nb_dels_chr
        variants_counts_chr[chrm] = nb_dels_chr
        for delete in deletes:
            v_len = delete["length"]
            for variant in variants:
                if variant["min"] < v_len < variant["max"]:
                    variants_counts["{0}-{1}".format(variant["min"], variant["max"])] += 1
                    break
    print("We generate {0} deletion variants.".format(nb_dels))
    print("Ranges:")
    for v_range, v_count in variants_counts.items():
        print("\t- Range {0}: {1}".format(v_range, v_count))
    print("Chromosomes:")
    for chrm in sorted(list(variants_counts_chr.keys())):
        print("\t- {0}: {1}".format(chrm, variants_counts_chr[chrm]))
    print("")
    return input("Continue [Y/n]? ") in ["y", "Y", ""]


def init(output_dir, erase_output, sv_list, nb_inds, reference, proba_del, haploid, force_polymorphism,
         coverage, read_len, insert_len_mean, insert_len_sd, threads, quiet=True):
    if os.path.isdir(output_dir):
        if erase_output:
            shutil.rmtree(output_dir)
        else:
            eprint("Error: output directory {0} already exists.".format(output_dir))
            return 1
    elif os.path.isfile(output_dir):
        eprint("Error: unable to create output directory {0}: file exists.".format(output_dir))
        return 1

    if nb_inds < 2:
        raise Exception("nb-inds must be at least 2")

    if not os.path.isfile(reference + ".fai"):
        os.system("samtools faidx " + reference)

    tmp_dir = tempfile.mkdtemp()

    prg_path = os.path.dirname(os.path.realpath(__file__))

    os.mkdir(output_dir)

    ############################
    # Define fixed files names #
    ############################
    output_vcf = os.path.join(output_dir, "genotypes.vcf")

    ################
    # Launch steps #
    ################

    print("GENERATE RANDOM DELETIONS VARIANTS...\n")
    try:
        sv_sim = VariantsSimulator(sv_list, threads)
        deletions = sv_sim.get_random_deletions(proba_del, reference)
        print("")
    except InputException as e:
        print(e)
        return False
    if quiet or confirm(deletions, sv_sim.variants):
        genotypes_for_inds = build_genotypes_vcf_list(deletions, output_vcf, haploid, force_polymorphism, nb_inds,
                                                      tmp_dir, prg_path)
        build_fastas_chromosomes(reference, genotypes_for_inds, haploid, output_dir)
        generate_samples_fastq(haploid, nb_inds, output_dir, coverage, read_len, insert_len_mean,
                               insert_len_sd, prg_path, threads)
        print("DONE!\n")
    else:
        print("Aborted!\n")


def main():
    args = parse_args()
    reference = args.reference
    sv_list = args.sv_list
    output_dir = args.output_directory
    haploid = args.haploid
    nb_inds = args.nb_inds
    erase_output = args.erase_output
    proba_del = args.proba_del
    threads = args.threads
    force_polymorphism = args.force_polymorphism
    coverage = args.coverage
    read_len = args.read_len
    insert_len_mean = args.insert_len_mean
    insert_len_sd = args.insert_len_sd
    quiet = args.quiet
    
    init(output_dir, erase_output, sv_list, nb_inds, reference, proba_del, haploid, force_polymorphism,
         coverage, read_len, insert_len_mean, insert_len_sd, threads, quiet)

if __name__ == '__main__':
    sys.exit(main())