build_pop.py

#!/usr/bin/env python3

import os
import random
import argparse
from collections import OrderedDict
import vcf
from Bio import SeqIO

parser = argparse.ArgumentParser(description='Generate simulated populations with SV')
parser.add_argument("--nb-inds", help="Number of individuals to generate", required=True)
parser.add_argument("--reference", help="Reference genome", required=True)
parser.add_argument("--sv-list", help="File containing the SVs", required=True)
parser.add_argument("--coverage", help="Coverage of reads (default: 15)", default=15)
parser.add_argument("--output-directory", help="Output directory (default: res)", default="res")
parser.add_argument("--tmp-directory", help="Temporary directory (default: tmp)", default="tmp")

args = parser.parse_args()

nb_inds = args.nb_inds
reference = args.reference
sv_list = args.sv_list
output_dir = args.output_directory
tmp_dir = args.tmp_directory

if not os.path.isfile(reference + ".fai"):
    os.system("samtools faidx " + reference)


def allele(frequency):
    return 1 if random.uniform(0, 1) < frequency else 0

prg_path = os.path.dirname(os.path.realpath(__file__))

if not os.path.isdir(tmp_dir):
    os.mkdir(tmp_dir)
if not os.path.isdir(output_dir):
    os.mkdir(output_dir)


###########################
# 1. Get random deletions #
###########################

print("GENERATE SV DELs...\n")

os.system(os.path.join(prg_path, "SVsim -W -i {0} -r {1} -o {2}{3}reference-sv".format(sv_list, reference, tmp_dir, os.path.sep)))


###################################
# 2. Build BED files of deletions #
###################################

with open(os.path.join(tmp_dir, "reference-sv.bed"), "w") as bed:
    with open(os.path.join(tmp_dir, "reference-sv.bedpe"), "r") as bedpe:
        for line in bedpe:
            freq = 0.2 if random.uniform(0,1) < 0.5 else 0.5
            parts = line.split("\t")
            bed.write("\t".join([parts[0], parts[2], parts[4], parts[6].replace("::", "_"), str(freq)]) + "\n")


###############################################################################
# 3. Build VCF files containing genotypes for the given number of individuals #
###############################################################################
# VCF file is a result file, not used by this script

print("GENERATE SUMMARY VCF FILE...\n")

genotypes_for_inds = OrderedDict()
# { chr: { id_indiv: {start: #start, end: #end, genotypes: [[0,1],[1,1],...], ...}, # ...}

# Build VCF header:
with open(os.path.join(prg_path, "template.vcf"), "r") as template:
    with open(os.path.join(tmp_dir, "template.vcf"), "w") as my_template:
        for line in template:
            if line[:6] == "#CHROM":
                line = line.replace("\n", "")
                for i in range(0, int(args.nb_inds)):
                    line += "\tINDIV_" + str(i+1)
                line += "\n"
            my_template.write(line)

with open(os.path.join("tmp", "reference-sv.bed"), "r") as bed:
    vcf_reader = vcf.Reader(filename=os.path.join('tmp', 'template.vcf'))
    vcf_writer = vcf.Writer(open(os.path.join(output_dir, "genotypes.vcf"), "w"), vcf_reader)
    for line in bed:
        parts = line.replace("\n", "").split("\t")
        freq = float(parts[4])
        if parts[0] not in genotypes_for_inds:
            genotypes_for_inds[parts[0]] = {}
        genotypes_for_inds[parts[0]][parts[3]] = {"start": int(parts[1]), "end": int(parts[2]), "genotypes": []}
        genotypes = []
        polymorph = False
        while not polymorph:
            unique_genotypes = set()
            all_genotypes = []
            genotypes_tmp = []
            for i in range(1, int(args.nb_inds)+1):
                genotype = str(allele(freq)) + "/" + str(allele(freq))
                genotype_data = vcf.model._Call(None, "INDIV_" + str(i), vcf.model.make_calldata_tuple("GT")(GT=genotype))
                genotypes_tmp.append(genotype_data)
                unique_genotypes.add(genotype)
                all_genotypes.append(genotype)
            polymorph = len(unique_genotypes) > 1
            if polymorph:
                genotypes += genotypes_tmp
                genotypes_for_inds[parts[0]][parts[3]]["genotypes"] = [x.split("/") for x in all_genotypes]

        info = {"END": int(parts[2]), "AF": freq}
        vcf_record = vcf.model._Record(parts[0], int(parts[1]), parts[3], "N", [vcf.model._SV("DEL")], ".", ".", info, "GT", [0], genotypes)
        vcf_writer.write_record(vcf_record)
    vcf_writer.close()


###############################################
# Build fasta chromosomes for each individual #
###############################################

print("BUILD FASTA GENOME FOR EACH INDIVIDUAL...\n")


def svsort(sv, chr):
    """
    Function to sort regions
    """
    return int(genotypes_for_inds[chr][sv]["start"])


fasta_orig = SeqIO.index(reference, "fasta")

for chr, svs_infos in genotypes_for_inds.items():

    print("PROCESSING CHROMOSOME {0}...\n".format(chr))

    svs = list(svs_infos.keys())
    svs = sorted(svs, key=lambda x:svsort(x, chr))

    fasta_orig_chr = fasta_orig[chr]
    last_nt = len(fasta_orig_chr)-1

    # Compute keeped genomic regions for each diploid chromosome:
    regions = OrderedDict()
    current_region_pointer = OrderedDict()
    for svs_i in svs:
        i = 0
        for genotypes in svs_infos[svs_i]["genotypes"]:
            if i not in regions:
                regions[i] = {}
                current_region_pointer[i] = {}
            for j in range(0,2): # For each chromosome of the diploid genome
                if j not in regions[i]:
                    regions[i][j] = []
                    current_region_pointer[i][j] = "0"
                if svs_infos[svs_i]["genotypes"][i][j] == "1":
                    regions[i][j].append([current_region_pointer[i][j], svs_infos[svs_i]["start"]])
                    current_region_pointer[i][j] = svs_infos[svs_i]["end"]
            i += 1

    # Build FASTA of each diploid chromosome:
    for indiv, chrs_dips in regions.items():
        id_chrs = list(chrs_dips.keys())
        id_chrs = sorted(id_chrs)
        for id_chr in id_chrs:
            chr_dip = chrs_dips[id_chr] # SVs for each diploid chromosome
            logs_regions = [] # Store logs

            # Build FASTA and store logs:
            with open(os.path.join(output_dir, "INDIV_" + str(indiv+1) + "_chr_" + str(id_chr) + ".fasta"), "a") as output_handle:
                fasta = ""
                last_one = 0
                for chr_region in chr_dip:
                    fasta += fasta_orig_chr[int(chr_region[0]):int(chr_region[1])]
                    logs_regions.append("\t".join(str(x) for x in chr_region))
                    last_one = int(chr_region[1])
                if last_one < last_nt:
                    logs_regions.append("\t".join([str(current_region_pointer[indiv][id_chr]), str(last_nt)]))
                    fasta += fasta_orig_chr[int(current_region_pointer[indiv][id_chr]):last_nt]
                SeqIO.write(fasta, output_handle, "fasta")

            # Write logs
            with open(os.path.join(output_dir, "INDIV_" + str(indiv+1) + ".regions.log"), "a") as log_handle:
                log_handle.write(chr + "_" + str(id_chr) + "\t")
                log_handle.write("\n\t".join(logs_regions))
                log_handle.write("\n")

print("GENERATE RANDOM READS FOR EACH INDIVIDUAL FROM GENOME...\n")

# Generate reads for all individuals:
cmd = "{4}pirs/pirs simulate -z -x {3} -d -B {4}pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz -I {4}pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix -G {4}pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat -o {0} {1} {2}"
for i in range(1, int(args.nb_inds)+1):
    prefix = os.path.join(output_dir, "INDIV_" + str(i))
    chr0 = prefix + "_chr_0.fasta"
    chr1 = prefix + "_chr_1.fasta"
    os.system(cmd.format(prefix, chr0, chr1, args.coverage, prg_path + os.path.sep))

print("DONE!\n")