Skip to content
Snippets Groups Projects
build_pop.py 14.5 KiB
Newer Older
Floreal Cabanettes's avatar
Floreal Cabanettes committed
#!/usr/bin/env python3

import sys
Floreal Cabanettes's avatar
Floreal Cabanettes committed
import os
Floreal Cabanettes's avatar
Floreal Cabanettes committed
import random
import argparse
from collections import OrderedDict
import vcf
from Bio import SeqIO
import tempfile
from lib.svrunner_utils import eprint
from pysam import tabix_compress, tabix_index
from variants_simulator import VariantsSimulator
def parse_args():
    """
    Parse script arguments
    :return: argparse arguments object
    """
    parser = argparse.ArgumentParser(description='Generate simulated populations with SV',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--nb-inds", help="Number of individuals to generate", required=True, type=int)
    parser.add_argument("--reference", help="Reference genome", required=True)
    parser.add_argument("--sv-list", help="File containing the SVs", required=False)
    parser.add_argument("--coverage", help="Coverage of reads", default=15, type=int)
    parser.add_argument("--output-directory", help="Output directory", default="res")
    parser.add_argument("--force-polymorphism", help="Force polymorphism for each SV", action='store_const', const=True,
                        default=False)
    parser.add_argument("--haploid", help="Make a haploid genome, instead of diploid one", action="store_const", const=True,
                        default=False)
    parser.add_argument("--proba-del", help="Probabilty to have a deletion", type=float, default=0.000001)
    parser.add_argument("-l", "--read-len", help="Generate reads having a length of LEN", type=int, default=100)
    parser.add_argument("-m", "--insert-len-mean", help="Generate inserts (fragments) having an average length of LEN",
                        type=int, default=300)
    parser.add_argument("-v", "--insert-len-sd", help="Set the standard deviation of the insert (fragment) length (%%)",
                        type=int, default=30)
    parser.add_argument("-q", "--quiet", help="Don't ask anything, choose default answer instead", action="store_const",
                        const=True, default=False)
    parser.add_argument("-t", "--threads", help="Number of threads", default=4)

    args = parser.parse_args()
    return args


def _allele(frequency):
    """
    Get randomly an allele, given a frequency
    :param frequency: frequency of the allele {float}
    :return: allele randomly choosen {0 or 1}
    """
    return 1 if random.uniform(0, 1) < frequency else 0
Floreal Cabanettes's avatar
Floreal Cabanettes committed

def _get_genotypes_for_inds(nb_inds, haploid, freq):
    """
    Get genotypes for each individual
    :param nb_inds: number of individuals {int}
    :param haploid: is the genome hamploid {bool}
    :param freq: frequency of the allele {float}
    :return: list of genotypes, vcf data {list}
    """
    all_genotypes = []
    genotypes_row = []
    for i in range(1, nb_inds + 1):
Floreal Cabanettes's avatar
Floreal Cabanettes committed
        if not haploid:
            genotype = str(_allele(freq)) + "/" + str(_allele(freq))
Floreal Cabanettes's avatar
Floreal Cabanettes committed
        else:
            genotype = str(_allele(freq))
        genotype_data = vcf.model._Call(None, "INDIV_" + str(i), vcf.model.make_calldata_tuple("GT")(GT=genotype))
        genotypes_row.append(genotype_data)
        all_genotypes.append(genotype)
    return all_genotypes, genotypes_row


def _svsort(sv, chrm, genotypes_for_inds):
    """
    Function to sort regions
    :param sv: the variant
    :param chrm: chromosome {str}
    :param genotypes_for_inds: dictionary of genotypes for each individual
    return int(genotypes_for_inds[chrm][sv]["start"])
def _build_vcf_header(vcf_file, prg_path, tmp_dir, nb_inds):
    """
    Build header of the VCF file
    :param vcf_file: vcf file full path name {str}
    :param prg_path: program path {str}
    :param tmp_dir: temporary directory {str}
    :param nb_inds: number of individuals {int}
    """
    try:
        with open(os.path.join(prg_path, "template.vcf"), "r") as template:
            try:
                with open(vcf_file, "w") as my_template:
                    for line in template:
                        if line[:6] == "#CHROM":
                            line = line.replace("\n", "")
                            for i in range(0, nb_inds):
                                line += "\tINDIV_" + str(i+1)
                            line += "\n"
                        my_template.write(line)
            except IOError:
                eprint("ERROR: unable to create template file \"{0}\" in temp dir.".format(os.path.join(tmp_dir, "template.vcf")))
                exit(1)
    except IOError:
        eprint("ERROR: template file not found in program directory.")
        exit(1)
def build_genotypes_vcf_list(deletions: dict, output_vcf, haploid, force_polymorphism, nb_inds, tmp_dir, prg_path):
    """
    Build VCF file describing genotypes for each individual (and the associated python dictionary)
    :param deletions: deletions description {dict}
    :param output_vcf: output VCF file full path name {str}
    :param haploid: is haploid {bool}
    :param force_polymorphism: force polymorphism {bool}
    :param nb_inds: number of individuals {int}
    :param tmp_dir: temporary directory {str}
    :param prg_path: program path {str}
    :return: dictionary of genotypes for each variant for each dictionary {OrderedDict}
    """
    print("GENERATE SUMMARY VCF FILE...\n")
    genotypes_for_inds = OrderedDict()
    # { chr: { id_indiv: {start: #start, end: #end, genotypes: [[0,1],[1,1],...], ...}, # ...}

    vcf_file = os.path.join(tmp_dir, "template.vcf")

    # Build VCF header:
    _build_vcf_header(vcf_file, prg_path, tmp_dir, nb_inds)

    vcf_reader = vcf.Reader(filename=vcf_file)
    vcf_writer = vcf.Writer(open(output_vcf, "w"), vcf_reader)
    for chrm, deletes in deletions.items():
        for delete in deletes:
            if chrm not in genotypes_for_inds:
                genotypes_for_inds[chrm] = {}
            genotypes_for_inds[chrm][delete["name"]] = {"start": delete["start"], "end": delete["end"], "genotypes": []}

            # Get genotypes:
            all_genotypes, genotypes = [], []
            if force_polymorphism:
                polymorph = False
                while not polymorph:
                    all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, delete["freq"])
                    polymorph = len(set(all_genotypes)) > 1
            else:
                all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, delete["freq"])
            genotypes_for_inds[chrm][delete["name"]]["genotypes"] = [x.split("/") for x in all_genotypes]

            info = {"END": delete["end"], "AF": delete["freq"]}
            vcf_record = vcf.model._Record(chrm, delete["start"], delete["name"], "N", [vcf.model._SV("DEL")], ".",
                                           ".", info, "GT", [0], genotypes)
            vcf_writer.write_record(vcf_record)
    vcf_writer.close()

    tabix_compress(output_vcf, output_vcf + ".gz", True)
    tabix_index(output_vcf + ".gz", force=True, preset="vcf")

    return genotypes_for_inds

def _compute_keeped_genomic_regions(svs, svs_infos, haploid):
    """
    Get list of all regions keeped (not deleted)
    :param svs: list of variants (deletions) {list}
    :param svs_infos: infos of variants {dict}
    :param haploid: is haploid {bool}
    :return: regions for each individuals {OrderedDict}, the last position for each region {OrderedDict}
    """
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    regions = OrderedDict()
    current_region_pointer = OrderedDict()
    for svs_i in svs:
        i = 0
        for genotypes in svs_infos[svs_i]["genotypes"]:
            if i not in regions:
                regions[i] = {}
                current_region_pointer[i] = {}
Floreal Cabanettes's avatar
Floreal Cabanettes committed
            for j in range(0, 2 if not haploid else 1):  # For each chromosome of the diploid genome, or the chromosome
                # of the haploid genome
Floreal Cabanettes's avatar
Floreal Cabanettes committed
                if j not in regions[i]:
                    regions[i][j] = []
                    current_region_pointer[i][j] = "0"
                if svs_infos[svs_i]["genotypes"][i][j] == "1":
                    regions[i][j].append([current_region_pointer[i][j], svs_infos[svs_i]["start"]])
                    current_region_pointer[i][j] = svs_infos[svs_i]["end"]
            i += 1
    return regions, current_region_pointer

def _build_fastas(chrm, regions, current_region_pointer, output_dir, fasta_orig_chr, last_nt):
    """
    Build fasta files
    :param chrm:
    :param regions:
    :param current_region_pointer:
    :param output_dir:
    :param fasta_orig_chr:
    :param last_nt:
    :return:
    """
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    for indiv, chrs_dips in regions.items():
        id_chrs = list(chrs_dips.keys())
        id_chrs = sorted(id_chrs)
        for id_chr in id_chrs:
            chr_dip = chrs_dips[id_chr]  # SVs for each diploid chromosome
            logs_regions = []  # Store logs
            # Build FASTA and store logs:
                with open(os.path.join(output_dir, "INDIV_" + str(indiv + 1) + "_chr_" + str(id_chr) + ".fasta"),
                          "a") as output_handle:
                    fasta = ""
                    last_one = 0
                    for chr_region in chr_dip:
                        fasta += fasta_orig_chr[int(chr_region[0]):int(chr_region[1])]
                        logs_regions.append("\t".join(str(x) for x in chr_region))
                        last_one = int(chr_region[1])
                    if last_one < last_nt:
                        logs_regions.append("\t".join([str(current_region_pointer[indiv][id_chr]), str(last_nt)]))
                        fasta += fasta_orig_chr[int(current_region_pointer[indiv][id_chr]):last_nt]
                    SeqIO.write(fasta, output_handle, "fasta")
            except IOError:
                eprint("ERROR: unable to write \"{0}\" file.".
                       format(os.path.join(output_dir, "INDIV_" + str(indiv + 1) + "_chr_" + str(id_chr) + ".fasta")))
Floreal Cabanettes's avatar
Floreal Cabanettes committed

            # Write logs
                with open(os.path.join(output_dir, "INDIV_" + str(indiv + 1) + ".regions.log"), "a") as log_handle:
                    log_handle.write(chrm + "_" + str(id_chr) + "\t")
                    log_handle.write("\n\t".join(logs_regions))
                    log_handle.write("\n")
            except IOError:
                eprint("ERROR: unable to write log file: \"{0}\"".
                       format(os.path.join(output_dir, "INDIV_" + str(indiv + 1) + ".regions.log")))


def build_fastas_chromosomes(reference, genotypes_for_inds, haploid, output_dir):
    print("BUILD FASTA GENOME FOR EACH INDIVIDUAL...\n")
    fasta_orig = SeqIO.index(reference, "fasta")

    for chrm, svs_infos in genotypes_for_inds.items():

        print("PROCESSING CHROMOSOME {0}...\n".format(chrm))

        svs = list(svs_infos.keys())
        svs = sorted(svs, key=lambda x:_svsort(x, chrm, genotypes_for_inds))

        fasta_orig_chr = fasta_orig[chrm]
        last_nt = len(fasta_orig_chr)-1

        # Compute keeped genomic regions for each diploid chromosome:
        regions, current_region_pointer = _compute_keeped_genomic_regions(svs, svs_infos, haploid)

        # Build FASTA of each diploid/haploid chromosome:
        _build_fastas(chrm, regions, current_region_pointer, output_dir, fasta_orig_chr, last_nt)


def generate_samples_fastq(haploid, nb_inds, output_dir, coverage, read_len, insert_len_mean, insert_len_sd,
                           prg_path, threads):

    print("GENERATE RANDOM READS FOR EACH INDIVIDUAL FROM GENOME...\n")

    # Generate reads for all individuals:
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    if not haploid:
        cmd = str("{4}pirs/pirs simulate -z -x {3} -d -B {4}pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz "
                  "-I {4}pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix -l {5} -m {6} -v {7} "
                  "-G {4}pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat -o {0} -t {8} {1} {2}")
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    else:
        cmd = str("{4}pirs/pirs simulate -z -x {3} -B {4}pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz "
                  "-I {4}pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix -l {5} -m {6} -v {7} "
                  "-G {4}pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat -o {0} -t {8} {1}")
    for i in range(1, nb_inds+1):
        prefix = os.path.join(output_dir, "INDIV_" + str(i))
        chr0 = prefix + "_chr_0.fasta"
        chr1 = prefix + "_chr_1.fasta"
        if not haploid:
            os.system(cmd.format(prefix, chr0, chr1, coverage, prg_path + os.path.sep, read_len,
                                 insert_len_mean, insert_len_sd, threads))
        else:
            os.system(cmd.format(prefix, chr0, "", coverage, prg_path + os.path.sep, read_len,
                                 insert_len_mean, insert_len_sd, threads))
def confirm(deletions: dict):
    nb_dels = 0
    for deletes in deletions.values():
        nb_dels += len(deletes)
    print("We generate {0} deletion variants.".format(nb_dels))
    return input("Continue [Y/n]? ") in ["y", "Y", ""]


def main():
    args = parse_args()
    reference = args.reference
    sv_list = args.sv_list
    output_dir = args.output_directory
    tmp_dir = tempfile.mkdtemp()
    haploid = args.haploid
    nb_inds = args.nb_inds

    if os.path.isdir(output_dir):
        eprint("Error: output directory {0} already exists.".format(output_dir))
        return 1
    elif os.path.isfile(output_dir):
        eprint("Error: unable to create output directory {0}: file exists.".format(output_dir))
        return 1

    if nb_inds < 2:
        raise Exception("nb-inds must be at least 2")

    if not os.path.isfile(reference + ".fai"):
        os.system("samtools faidx " + reference)

    prg_path = os.path.dirname(os.path.realpath(__file__))

Floreal Cabanettes's avatar
Floreal Cabanettes committed
    os.mkdir(output_dir)
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    ############################
    # Define fixed files names #
    ############################
    output_vcf = os.path.join(output_dir, "genotypes.vcf")

    ################
    # Launch steps #
    ################

    print("GENERATE RANDOM DELETIONS VARIANTS...\n")
    sv_sim = VariantsSimulator(sv_list)
    deletions = sv_sim.get_random_deletions(args.proba_del, args.reference)
    if args.quiet or confirm(deletions):
        genotypes_for_inds = build_genotypes_vcf_list(deletions, output_vcf, haploid, args.force_polymorphism, nb_inds,
                                                      tmp_dir, prg_path)
        build_fastas_chromosomes(reference, genotypes_for_inds, haploid, output_dir)
        generate_samples_fastq(haploid, nb_inds, output_dir, args.coverage, args.read_len, args.insert_len_mean,
                               args.insert_len_sd, prg_path, args.threads)

if __name__ == '__main__':
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    sys.exit(main())