Newer
Older
from collections import OrderedDict
import vcf
from Bio import SeqIO
Floreal Cabanettes
committed
import tempfile
from pysam import tabix_compress, tabix_index, VariantFile
Floreal Cabanettes
committed
from variants_simulator import VariantsSimulator
from exceptions import InputException, ExecException
import multiprocessing
class Print:
def __init__(self, stdout=None, stderr=None):
self.stdout = stdout
self.stderr = stderr
if self.stdout is not None and os.path.isfile(self.stdout):
os.remove(self.stdout)
if self.stderr is not None and os.path.isfile(self.stderr):
os.remove(self.stderr)
def out(self, message):
if self.stdout is not None:
with open(self.stdout, "a") as stdout_f:
print(message, file=stdout_f)
else:
print(message, file=sys.stdout)
def err(self, message):
if self.stderr is not None:
with open(self.stderr, "a") as stderr_f:
print(message, file=stderr_f)
else:
print(message, file=sys.stderr)
def check_min_size(value):
ivalue = int(value)
if ivalue <= 1:
raise argparse.ArgumentTypeError("%s is an invalid size value (>= 1)" % value)
return ivalue
def parse_args():
"""
Parse script arguments
:return: argparse arguments object
"""
Floreal Cabanettes
committed
parser = argparse.ArgumentParser(description='Generate simulated populations with SV',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("-n", "--nb-inds", help="Number of individuals to generate", type=int)
parser.add_argument("-r", "--reference", help="Reference genome", required=True)
parser.add_argument("-ns", "--nstretches", help="N-stretches positions bed file")
parser.add_argument("-s", "--sv-list", help="File containing the SVs", required=False)
parser.add_argument("-c", "--coverage", help="Coverage of reads", default=15, type=int)
parser.add_argument("-o", "--output-directory", help="Output directory", default="res")
parser.add_argument("-e", "--force-outputdir", help="Delete output directory before start, if already exists",
action='store_const', const=True, default=False)
parser.add_argument("-f", "--force-polymorphism", help="Force polymorphism for each SV", action='store_const',
const=True, default=False)
parser.add_argument("-a", "--haploid", help="Make a haploid genome, instead of diploid one", action="store_const",
const=True, default=False)
parser.add_argument("-p", "--proba-del", help="Probabilty to have a deletion", type=float, default=0.000001)
parser.add_argument("-l", "--read-len", help="Generate reads having a length of LEN", type=int, default=100)
parser.add_argument("-m", "--insert-len-mean", help="Generate inserts (fragments) having an average length of LEN",
type=int, default=300)
parser.add_argument("-v", "--insert-len-sd", help="Set the standard deviation of the insert (fragment) length (%%)",
type=int, default=30)
Floreal Cabanettes
committed
parser.add_argument("-q", "--quiet", help="Don't ask anything, choose default answer instead", action="store_const",
const=True, default=False)
parser.add_argument("-md", "--min-deletions", help="Minimum of deletions to generate (>=1)", default=1,
type=check_min_size)
parser.add_argument("-t", "--threads", help="Number of threads", default=multiprocessing.cpu_count(), type=int)
parser.add_argument("-g", "--genotypes", help="Position of SV with genotypes of individuals")
if args.nb_inds is None and args.genotypes is None:
parser.error("--nb-inds parameter is required (except if --genotypes parameter is given)")
return args
def _allele(frequency):
"""
Get randomly an allele, given a frequency
:param frequency: frequency of the allele {float}
:return: allele randomly choosen {0 or 1}
"""
return 1 if random.uniform(0, 1) < frequency else 0
def _get_genotypes_for_inds(nb_inds, haploid, freq):
"""
Get genotypes for each individual
:param nb_inds: number of individuals {int}
:param haploid: is the genome hamploid {bool}
:param freq: frequency of the allele {float}
:return: list of genotypes, vcf data {list}
"""
all_genotypes = []
genotypes_row = []
for i in range(1, nb_inds + 1):
genotype = str(_allele(freq)) + "/" + str(_allele(freq))
genotype_data = vcf.model._Call(None, "indiv" + str(i), vcf.model.make_calldata_tuple("GT")(GT=genotype))
genotypes_row.append(genotype_data)
all_genotypes.append(genotype)
return all_genotypes, genotypes_row
def _svsort(sv, chrm, genotypes_for_inds):
"""
Function to sort regions
:param sv: the variant
:param chrm: chromosome {str}
:param genotypes_for_inds: dictionary of genotypes for each individual
return int(genotypes_for_inds[chrm][sv]["start"])
def _build_vcf_header(vcf_file, prg_path, tmp_dir, nb_inds):
"""
Build header of the VCF file
:param vcf_file: vcf file full path name {str}
:param prg_path: program path {str}
:param tmp_dir: temporary directory {str}
:param nb_inds: number of individuals {int}
"""
try:
with open(os.path.join(prg_path, "template.vcf"), "r") as template:
try:
with open(vcf_file, "w") as my_template:
for line in template:
if line[:6] == "#CHROM":
line = line.replace("\n", "")
for i in range(0, nb_inds):
line += "\n"
my_template.write(line)
except IOError:
raise ExecException("ERROR: unable to create template file \"{0}\" in temp dir.".
format(os.path.join(tmp_dir, "template.vcf")))
raise ExecException("ERROR: template file not found in program directory.")
Floreal Cabanettes
committed
def build_genotypes_vcf_list(deletions: dict, output_vcf, haploid, force_polymorphism, nb_inds, tmp_dir, prg_path):
"""
Build VCF file describing genotypes for each individual (and the associated python dictionary)
Floreal Cabanettes
committed
:param deletions: deletions description {dict}
:param output_vcf: output VCF file full path name {str}
:param haploid: is haploid {bool}
:param force_polymorphism: force polymorphism {bool}
:param nb_inds: number of individuals {int}
:param tmp_dir: temporary directory {str}
:param prg_path: program path {str}
:return: dictionary of genotypes for each variant for each dictionary {OrderedDict}
"""
genotypes_for_inds = OrderedDict()
# { chr: { id_indiv: {start: #start, end: #end, genotypes: [[0,1],[1,1],...], ...}, # ...}
vcf_file = os.path.join(tmp_dir, "template.vcf")
# Build VCF header:
_build_vcf_header(vcf_file, prg_path, tmp_dir, nb_inds)
Floreal Cabanettes
committed
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
vcf_reader = vcf.Reader(filename=vcf_file)
vcf_writer = vcf.Writer(open(output_vcf, "w"), vcf_reader)
for chrm, deletes in deletions.items():
for delete in deletes:
if chrm not in genotypes_for_inds:
genotypes_for_inds[chrm] = {}
genotypes_for_inds[chrm][delete["name"]] = {"start": delete["start"], "end": delete["end"], "genotypes": []}
# Get genotypes:
all_genotypes, genotypes = [], []
if force_polymorphism:
polymorph = False
while not polymorph:
all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, delete["freq"])
polymorph = len(set(all_genotypes)) > 1
else:
all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, delete["freq"])
genotypes_for_inds[chrm][delete["name"]]["genotypes"] = [x.split("/") for x in all_genotypes]
info = {"END": delete["end"], "AF": delete["freq"]}
vcf_record = vcf.model._Record(chrm, delete["start"], delete["name"], "N", [vcf.model._SV("DEL")], ".",
".", info, "GT", [0], genotypes)
vcf_writer.write_record(vcf_record)
vcf_writer.close()
tabix_compress(output_vcf, output_vcf + ".gz", True)
tabix_index(output_vcf + ".gz", force=True, preset="vcf")
def load_genotypes_from_file(genotypes_file):
variants = VariantFile(genotypes_file)
genotypes_for_inds = {}
nb_inds = 0
for variant in variants:
chrm = variant.chrom
id_v = variant.id
start = variant.start
end = variant.stop
samples = variant.samples
gt = [list(map(str, samples[x]["GT"])) for x in samples]
if chrm not in genotypes_for_inds:
genotypes_for_inds[chrm] = {}
genotypes_for_inds[chrm][id_v] = {
"start": start,
"end": end,
"genotypes": gt
}
nb_inds = len(gt)
return nb_inds, genotypes_for_inds
def _compute_keeped_genomic_regions(svs, svs_infos, haploid):
"""
Get list of all regions keeped (not deleted)
:param svs: list of variants (deletions) {list}
:param svs_infos: infos of variants {dict}
:param haploid: is haploid {bool}
:return: regions for each individuals {OrderedDict}, the last position for each region {OrderedDict}
"""
regions = OrderedDict()
current_region_pointer = OrderedDict()
for svs_i in svs:
i = 0
for genotypes in svs_infos[svs_i]["genotypes"]:
if i not in regions:
regions[i] = {}
current_region_pointer[i] = {}
for j in range(0, 2 if not haploid else 1): # For each chromosome of the diploid genome, or the chromosome
if j not in regions[i]:
regions[i][j] = []
current_region_pointer[i][j] = "0"
if svs_infos[svs_i]["genotypes"][i][j] == "1":
regions[i][j].append([current_region_pointer[i][j], svs_infos[svs_i]["start"]])
current_region_pointer[i][j] = svs_infos[svs_i]["end"]
i += 1
return regions, current_region_pointer
def _build_fastas(chrm, regions, current_region_pointer, output_dir, fasta_orig_chr, last_nt):
"""
Build fasta files
:param chrm:
:param regions:
:param current_region_pointer:
:param output_dir:
:param fasta_orig_chr:
:param last_nt:
:return:
"""
for indiv, chrs_dips in regions.items():
id_chrs = list(chrs_dips.keys())
id_chrs = sorted(id_chrs)
for id_chr in id_chrs:
chr_dip = chrs_dips[id_chr] # SVs for each diploid chromosome
logs_regions = [] # Store logs
Floreal Cabanettes
committed
try:
with open(os.path.join(output_dir, "indiv" + str(indiv + 1) + "_chr_" + str(id_chr) + ".fasta"),
Floreal Cabanettes
committed
fasta = ""
last_one = 0
for chr_region in chr_dip:
fasta += fasta_orig_chr[int(chr_region[0]):int(chr_region[1])]
logs_regions.append("\t".join(str(x) for x in chr_region))
last_one = int(chr_region[1])
if last_one < last_nt:
logs_regions.append("\t".join([str(current_region_pointer[indiv][id_chr]), str(last_nt)]))
fasta += fasta_orig_chr[int(current_region_pointer[indiv][id_chr]):last_nt]
SeqIO.write(fasta, output_handle, "fasta")
except IOError:
raise ExecException("ERROR: unable to write \"{0}\" file.".
format(os.path.join(output_dir, "indiv" + str(indiv + 1) + "_chr_" + str(id_chr) +
Floreal Cabanettes
committed
try:
with open(os.path.join(output_dir, "indiv" + str(indiv + 1) + ".regions.log"), "a") as log_handle:
log_handle.write(chrm + "_" + str(id_chr) + "\t")
Floreal Cabanettes
committed
log_handle.write("\n\t".join(logs_regions))
log_handle.write("\n")
except IOError:
raise ExecException("ERROR: unable to write log file: \"{0}\"".
format(os.path.join(output_dir, "indiv" + str(indiv + 1) + ".regions.log")))
def build_fastas_chromosomes(reference, genotypes_for_inds, haploid, output_dir, printer):
printer.out("BUILD FASTA GENOME FOR EACH INDIVIDUAL...\n")
fasta_orig = SeqIO.index(reference, "fasta")
for chrm, svs_infos in genotypes_for_inds.items():
printer.out("PROCESSING CHROMOSOME {0}...\n".format(chrm))
svs = list(svs_infos.keys())
svs = sorted(svs, key=lambda x:_svsort(x, chrm, genotypes_for_inds))
fasta_orig_chr = fasta_orig[chrm]
last_nt = len(fasta_orig_chr)-1
# Compute keeped genomic regions for each diploid chromosome:
regions, current_region_pointer = _compute_keeped_genomic_regions(svs, svs_infos, haploid)
# Build FASTA of each diploid/haploid chromosome:
_build_fastas(chrm, regions, current_region_pointer, output_dir, fasta_orig_chr, last_nt)
def generate_samples_fastq(haploid, nb_inds, output_dir, coverage, read_len, insert_len_mean, insert_len_sd,
prg_path, threads, stdout, stderr):
# Generate reads for all individuals:
cmd = [prg_path + "/pirs/pirs", "simulate", "-z", "-x", str(coverage), "-d", "-B",
prg_path + "/pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz", "-I",
prg_path + "/pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix", "-l", str(read_len), "-m",
str(insert_len_mean), "-v", str(insert_len_sd),
"-G", prg_path + "/pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat", "-t", str(threads), "-o"]
if haploid:
cmd.remove("-d")
cmd_full = cmd + [output_dir, "-s", "indiv%d" % i]
prefix_fasta = os.path.join(output_dir, "indiv" + str(i))
chr0 = prefix_fasta + "_chr_0.fasta"
chr1 = prefix_fasta + "_chr_1.fasta"
cmd_full += [chr0, chr1]
cmd_full.append(chr0)
subprocess.call(cmd_full, stdout=open(stdout, "a") if stdout is not None else None,
stderr=open(stderr, "a") if stderr is not None else None)
def confirm(deletions: dict, variants: dict, printer: Print):
Floreal Cabanettes
committed
nb_dels = 0
variants = sorted(variants["DEL"], key=lambda x: x["min"])
variants_counts = OrderedDict()
variants_counts_chr = {}
for variant in variants:
variants_counts["{0}-{1}".format(variant["min"], variant["max"])] = 0
for chrm, deletes in deletions.items():
nb_dels_chr = len(deletes)
nb_dels += nb_dels_chr
variants_counts_chr[chrm] = nb_dels_chr
for delete in deletes:
v_len = delete["length"]
for variant in variants:
if variant["min"] < v_len < variant["max"]:
variants_counts["{0}-{1}".format(variant["min"], variant["max"])] += 1
break
printer.out("We generate {0} deletion variants.".format(nb_dels))
printer.out("Ranges:")
printer.out("\t- Range {0}: {1}".format(v_range, v_count))
printer.out("Chromosomes:")
for chrm in sorted(list(variants_counts_chr.keys())):
printer.out("\t- {0}: {1}".format(chrm, variants_counts_chr[chrm]))
printer.out("")
Floreal Cabanettes
committed
return input("Continue [Y/n]? ") in ["y", "Y", ""]
def init(output_dir, force_outputdir, sv_list, nstretches, nb_inds, reference, proba_del, haploid, force_polymorphism,
coverage, read_len, insert_len_mean, insert_len_sd, threads, genotypes=None, min_deletions=1, quiet=True,
stdout=None, stderr=None):
printer = Print(stdout=stdout, stderr=stderr)
printer.err("Error: output directory {0} already exists.".format(output_dir))
printer.err("Error: unable to create output directory {0}: file exists.".format(output_dir))
if nb_inds is not None and nb_inds < 2:
printer.err("nb-inds must be at least 2")
return 1
if not os.path.isfile(reference + ".fai"):
os.system("samtools faidx " + reference)
tmp_dir = tempfile.mkdtemp()
prg_path = os.path.dirname(os.path.realpath(__file__))
############################
# Define fixed files names #
############################
output_vcf = os.path.join(output_dir, "genotypes.vcf")
################
# Launch steps #
################
deletions = None
sv_sim = None
if genotypes is None:
printer.out("GENERATE RANDOM DELETIONS VARIANTS...\n")
try:
nb_deletions = 0
nb_try = 0
sv_sim = VariantsSimulator(sv_list, nstretches, threads, stdout, stderr)
printer.out("Try to generate at least %s deletions..." % min_deletions)
max_try = 10
while nb_deletions < min_deletions and nb_try < max_try:
printer.out("\nTry {0} / {1}...".format(nb_try + 1, max_try))
nb_deletions, deletions = sv_sim.get_random_deletions(proba_del, reference)
sv_sim.print_flush()
nb_try += 1
if nb_deletions < min_deletions:
printer.err("\nUnable to generate %s deletions. Try to reduce size of deletions or increase "
"general probability to have a deletion." % min_deletions)
return 1
printer.out("")
except InputException as e:
printer.err(e)
except IOError as e:
printer.err(e)
return 1
except Exception:
printer.err(traceback.format_exc())
return 1
if quiet or genotypes is not None or confirm(deletions, sv_sim.variants, printer):
genotypes_for_inds = None
if genotypes is None:
printer.out("GENERATE SUMMARY VCF FILE...\n")
genotypes_for_inds = build_genotypes_vcf_list(deletions, output_vcf, haploid, force_polymorphism, nb_inds,
tmp_dir, prg_path)
else:
nb_inds, genotypes_for_inds = load_genotypes_from_file(genotypes)
if nb_inds < 2:
printer.err("nb-inds must be at least 2")
return 1
build_fastas_chromosomes(reference, genotypes_for_inds, haploid, output_dir, printer)
printer.out("GENERATE RANDOM READS FOR EACH INDIVIDUAL FROM GENOME...\n")
generate_samples_fastq(haploid, nb_inds, output_dir, coverage, read_len, insert_len_mean,
insert_len_sd, prg_path, threads, stdout, stderr)
printer.out("DONE!\n")
except ExecException as e:
printer.err(e)
return 1
except Exception:
printer.err(traceback.format_exc())
return 1
Floreal Cabanettes
committed
else:
printer.out("Aborted!\n")
def main():
args = parse_args()
reference = args.reference
sv_list = args.sv_list
nstretches = args.nstretches
output_dir = args.output_directory
haploid = args.haploid
nb_inds = args.nb_inds
force_outputdir = args.force_outputdir
proba_del = args.proba_del
threads = args.threads
force_polymorphism = args.force_polymorphism
coverage = args.coverage
read_len = args.read_len
insert_len_mean = args.insert_len_mean
insert_len_sd = args.insert_len_sd
quiet = args.quiet
min_deletions = args.min_deletions
genotypes = args.genotypes
return init(output_dir, force_outputdir, sv_list, nstretches, nb_inds, reference, proba_del, haploid,
force_polymorphism, coverage, read_len, insert_len_mean, insert_len_sd, threads, genotypes,
min_deletions, quiet)