Newer
Older
#! /usr/bin/env python3
"""mergeSV : merge SV on the basis of their intervals
The merging is done by identifiying connected components in a graph.
The graph is constructing taking the SV a node and an edge connects two nodes
if the conresponding intervals exhibits a reciprocal overlap of a least R0%.
Additional constraints on the precision of breakpoints (left and right) can be specified.
Each connected component is termed a CNVR (Copy Number Variation Region)
and is associated to the corresponding CNVs.
"""
import argparse
from pysam import VariantFile
import lib.genotype_results as gtres
import lib.vcf as vcf
prg_path = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(prg_path, "svlib"))
from svinterval import construct_overlap_graph, connected_components
ALPHABET = list(string.ascii_uppercase)
XLSX_COLS = ALPHABET.copy()
COLOR_NOT_FOUND = "#FE2E2E"
COLOR_NOT_FOUND_2 = "#dddddd"
COLOR_COL_FILTER = "#BEF781"
COLOR_IS_KEPT = "#81F781"
COLOR_FALSE_POSITIVE = "#FE642E"
COLOR_WRONG_GT = "#B40404"
"""
Get arguments from argparse
:return: argparse arguments object
"""
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="\
Build Results \n \
description: Build results of the simulated data detection")
parser.add_argument('-v', '--vcfs', type=str, required=True, help='File listing vcf files for each detection tool')
parser.add_argument('-t', '--true-vcf', type=str, required=True, help='VCF file containing the simulated deletions')
parser.add_argument('-f', '--filtered-vcf', type=str, required=True,
help='File listing VCF files containing the filtered results')
parser.add_argument('-y', '--type', required=True, type=str, choices=vcf.ALLOW_VARIANTS, help="Type of variant")
parser.add_argument('--overlap_cutoff', type=float, default=0.5, help='cutoff for reciprocal overlap')
parser.add_argument('--left_precision', type=int, default=-1, help='left breakpoint precision')
parser.add_argument('--right_precision', type=int, default=-1, help='right breakpoint precision')
parser.add_argument('-o', '--output', type=str, default="results", help='output folder')
parser.add_argument('--no-xls', action='store_const', const=True, default=False, help='Do not build Excel file')
parser.add_argument('--haploid', action='store_const', const=True, default=False, help='The organism is haploid')
parser.add_argument('-r', '--rules', type=str, required=False, help="Simulation rule file")
# parse the arguments
args = parser.parse_args()
if args.left_precision == -1:
args.left_precision = sys.maxsize
if args.right_precision == -1:
args.right_precision = sys.maxsize
if args.rules is None:
args.rules = os.path.join(prg_path, "defaults.rules")
# send back the user input
return args
"""
Print to stderr
"""
print(*args, file=sys.stderr, **kwargs)
def svsort(sv, records):
"""
Function to sort regions
"""
if records[sv]["start"] != "":
start = str(records[sv]["start"])
else:
first_tool = list(records[sv]["tools"].keys())[0]
start = str(records[sv]["tools"][first_tool]["start"])
def get_gt(geno, true_gt):
"""
Get the genotype: consider 1/0 and 0/1 as equivalent (we can't identify a specific chromosome)
We want geno equal to true_gt in these cases
:param geno: the genotype
:param true_gt: the true genotype
:return: the genotype
"""
if true_gt is not None:
if geno == "1/0" and true_gt == "0/1":
geno = "0/1"
elif geno == "0/1" and true_gt == "1/0":
geno = "1/0"
return geno
def get_max_len(cell, col, max_col_len):
"""
Get the max content length of a column
:param cell: the cell content
:param col: the column number
:param max_col_len: current max content length for each column
:return: the new max content length for the given column
"""
return max(len(str(cell)), max_col_len[col] if col in max_col_len else 0)
def get_quality_color(quality):
"""
Returns the color associated to a genotype quality value
:param quality: the quality value
:return: the associated color
"""
color_very_low_quality = "#FE2E2E"
color_low_quality = "#FE9A2E"
color_medium_quality = "#FFFF00"
color_high_quality = "#81F781"
if quality > 60:
return color_high_quality
elif quality > 40:
return color_medium_quality
elif quality > 20:
return color_low_quality
else:
return color_very_low_quality
def get_genotypes(genotypes_files, true_vcf_file):
"""
Get genotype of each individual for each SV
:param genotypes_file: VCF file containing genotypes
:param true_vcf_file: VCF file containing real data
:return: genotypes of each individual for each SV, quality of each genotype, number of individuals
"""
reader_t = VariantFile(true_vcf_file)
samples_t = rec_t.samples
genotypes[rec_t.id] = ["/".join(map(str, samples_t[x]["GT"])) for x in samples_t]
nb_inds = len(list(genotypes.values())[0])
# Samples:
for genotypes_file in genotypes_files:
reader = VariantFile(genotypes_file)
for rec in reader:
samples = rec.samples
genotypes[rec.id] = ["/".join(map(str, samples[x]["GT"])) for x in samples_t.keys()] # Fixed: use samples keys
# from real data to keep the same order
gt_quality[rec.id] = [samples[x]["GQ"] for x in samples_t.keys()]
return genotypes, gt_quality, nb_inds
def build_records(genotypes, SVSet, true_ones_records, filtered_records, gt_quality):
"""
Build records for each SV
:param genotypes: list of genotypes of each individual for each SV
:param SVSet: set of all SVs
:param true_ones_records: records of the real data
:param filtered_records: records of the filtered data
:param gt_quality: list of qualities of each genotype of individuals for each SV
:return: records dict, tools set, list of orphans records (associated to None real data)
"""
number = 1
records = {}
tools = set()
orphans = 0
for components in connected_components(SVSet):
names = []
results = {}
true_one = None
for node in components:
names.append(node.id)
if node.id in true_ones_records:
true_one = node.id
records[true_one] = {
"start": node.start,
"end": node.end,
"length": int(node.end) - int(node.start),
"tools": {},
"genotypes": genotypes[true_one] if true_one in genotypes else None,
"chromosome": node.chrom
}
else:
tool_name = node.id.split("_")[0]
results[tool_name] = {
"start": node.start,
"end": node.end,
"length": int(node.end) - int(node.start),
"filtered": (node.id in filtered_records) if filtered_records is not None else False,
"genotypes": genotypes[node.id] if node.id in genotypes else None,
"qualities": gt_quality[node.id] if node.id in gt_quality else None
if true_one is not None:
records[true_one]["tools"] = results
else:
orphans += 1
records["orphan_" + str(orphans)] = {
"start": "",
"end": "",
"length": "",
"tools": results,
"genotypes": None,
"chromosome": chromosome
}
names.sort()
eprint("Group #%i: %s" % (number, ", ".join(names)))
number += 1
return records, tools, orphans
def build_header(tools, filtered_records, nb_records, max_col_len, nb_inds):
"""
Build tools headers, and header cells for each tool
:param tools: list of tools
:param cells: cells of the first sheet (sv description)
:param cells_gt: cells of the second sheet (genotype)
:param cells_gq: cells of the third sheet (genotype quality)
:param filtered_records: (bool) is there filtered records
:param nb_records: number of records
:param max_col_len: max content length for each column
:param nb_inds: number of individuals
:return: headers (list of tools + read and filtered data) ; cells, cells_gt, cells_gq and max_col_len updated
"""
# First one (SV description)
cells = {
"A1": {"text": "RESULTS", "format": {"bg_color": "#ffe856"}},
"A2": {"text": "Deletion", "format": {"bold": True}},
"A" + str(1 + nb_records + 3): {"text": "DIFFS", "format": {"bg_color": "#ffe856"}},
"A" + str(2 + nb_records + 3): {"text": "Deletion", "format": {"bold": True}}
}
# Second one (genotype)
cells_gt = {
"A1": {"text": "GENOTYPES", "format": {"bg_color": "#ffe856"}},
"A2": {"text": "Deletion", "format": {"bold": True}}
}
# Third one (genotype quality)
cells_gq = {
"A1": {"text": "GT QUALITY", "format": {"bg_color": "#ffe856"}},
"A2": {"text": "Deletion", "format": {"bold": True}}
}
for tool in ["Real data"] + tools + (["Filtered results"] if filtered_records else []):
headers.append(tool)
l_format = {"bold": True}
if tool == "Filtered results":
l_format["bg_color"] = COLOR_COL_FILTER
cells[XLSX_COLS[i] + "2"] = cells[XLSX_COLS[i] + str(2 + nb_records + 3)] = {"text": "Start",
"format": l_format}
max_col_len[i] = get_max_len("Start", i, max_col_len)
cells[XLSX_COLS[i + 1] + "2"] = cells[XLSX_COLS[i + 1] + str(2 + nb_records + 3)] = {"text": "End",
"format": l_format}
max_col_len[i + 1] = get_max_len("End", i + 1, max_col_len)
cells[XLSX_COLS[i + 2] + "2"] = cells[XLSX_COLS[i + 2] + str(2 + nb_records + 3)] = {"text": "Length",
"format": l_format}
max_col_len[i + 2] = get_max_len("Length", i + 2, max_col_len)
# Genotypes:
for k in range(0, nb_inds):
cells_gt[XLSX_COLS[j + k] + "2"] = cells_gq[XLSX_COLS[j + k] + "2"] = \
{"text": "indiv" + str(k + 1), "format": {"bold": True}}
i += 3
j += nb_inds
return headers, cells, cells_gt, cells_gq, max_col_len
def fill_real_data(row, cells, cells_gt, cells_gq, max_col_len, record, rec_id, nb_records, chromosome):
"""
Fill cells of the first data column (real data simulated)
:param cells: cells of the first sheet (sv description)
:param cells_gt: cells of the second sheet (genotype)
:param cells_gq: cells of the third sheet (genotype quality)
:param max_col_len: max text length for each column
:param record: data of the record
:param nb_records: number of records (total)
:return: cells, cells_gt, cells_gq and max_col_len updated
"""
# SV ID:
cells[XLSX_COLS[0] + str(row)] = cells[XLSX_COLS[0] + str(row + nb_records + 3)] = {"text": rec_id, "format": {}}
max_col_len[0] = get_max_len(rec_id, 0, max_col_len)
cells_gt[XLSX_COLS[0] + str(row)] = cells_gq[XLSX_COLS[0] + str(row)] = {"text": rec_id, "format": {}}
cells[XLSX_COLS[1] + str(row)] = cells[XLSX_COLS[1] + str(row + nb_records + 3)] = {"text": chromosome,
cells_gt[XLSX_COLS[1] + str(row)] = cells_gq[XLSX_COLS[1] + str(row)] = {"text": chromosome, "format": {}}
cells[XLSX_COLS[2] + str(row)] = cells[XLSX_COLS[2] + str(row + nb_records + 3)] = {"text": record["start"],
max_col_len[2] = get_max_len(record["start"], 2, max_col_len)
cells[XLSX_COLS[3] + str(row)] = cells[XLSX_COLS[3] + str(row + nb_records + 3)] = {"text": record["end"],
max_col_len[3] = get_max_len(record["end"], 3, max_col_len)
cells[XLSX_COLS[4] + str(row)] = cells[XLSX_COLS[4] + str(row + nb_records + 3)] = {"text": record["length"],
max_col_len[4] = get_max_len(record["length"], 4, max_col_len)
# GENOTYPES:
if record["genotypes"] is not None:
for gt in range(0, len(record["genotypes"])):
cells_gt[XLSX_COLS[2 + gt] + str(row)] = cells_gq[XLSX_COLS[2 + 0 + gt] + str(row)] = \
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
{"text": record["genotypes"][gt], "format": {}}
return cells, cells_gt, cells_gq, max_col_len
def fill_tool_data(row, col, cells, max_col_len, record, nb_records, my_start, my_end, my_length,
sv_format=None):
"""
Fill cells for a tool and a record
:param row: row position
:param col: columns position
:param cells: cells of the first sheet (sv description)
:param max_col_len: max text length for each column
:param record: data of the record for a tool
:param nb_records: number of records (total)
:param my_start: real start of the SV
:param my_end: real end of the SV
:param my_length: real length of the SV
:param sv_format: format for the cell
:return: cells and max_col_len updated
"""
if sv_format is None:
sv_format = {}
#############
# RAW DATA: #
#############
# START:
cells[XLSX_COLS[col] + str(row)] = {"text": record["start"], "format": sv_format}
max_col_len[col] = get_max_len(record["start"], col, max_col_len)
# END:
cells[XLSX_COLS[col + 1] + str(row)] = {"text": record["end"], "format": sv_format}
max_col_len[col + 1] = get_max_len(record["end"], col + 1, max_col_len)
# LENGTH:
cells[XLSX_COLS[col + 2] + str(row)] = {"text": record["length"], "format": sv_format}
max_col_len[col + 2] = get_max_len(record["length"], col + 2, max_col_len)
###########
# DIFFS: #
###########
if my_start != "":
start = record["start"] - my_start
end = record["end"] - my_end
length = record["length"] - my_length
else:
start = end = length = "NA"
# START:
cells[XLSX_COLS[col] + str(row + nb_records + 3)] = {"text": start, "format": sv_format}
cells[XLSX_COLS[col + 1] + str(row + nb_records + 3)] = {"text": end, "format": sv_format}
cells[XLSX_COLS[col + 2] + str(row + nb_records + 3)] = {"text": length,
"format": sv_format}
return cells, max_col_len
def fill_genotypes_data(row, col, cells_gt, cells_gq, record, my_genotypes, haploid=False):
"""
Fill cells for a tool and a record (genotype/genotype quality parts)
:param row: row position
:param col: column position
:param cells_gt: cells for genotype sheet (2)
:param cells_gq: cells for the genotype quality sheet (3)
:param record: data of the record for a tool
:param my_genotypes: real genotypes for each individual
:return: cells_gt and cells_gq updated
"""
the_genotypes = record["genotypes"]
for gt in range(0, len(the_genotypes)):
true_gt = my_genotypes[gt] if my_genotypes is not None else ""
geno = get_gt(the_genotypes[gt], true_gt)
gt_format = {"bg_color": get_quality_color(int(record["qualities"][gt]) if record["qualities"][gt] is not None else 0)}
if (not haploid and geno != true_gt) or \
(haploid and ((true_gt == "1" and geno != "1/1") or (true_gt == "0" and geno != "0/0"))):
gt_format["font_color"] = "#ff0000"
# Genotype:
cells_gt[XLSX_COLS[col + gt] + str(row)] = {"text": geno, "format": gt_format}
cells_gq[XLSX_COLS[col + gt] + str(row)] = {"text": int(record["qualities"][gt]) if record["qualities"][gt] is not None else 0, "format": gt_format}
return cells_gt, cells_gq
def write_headers(workbook, worksheet, headers, cell_len, rows, filtered_records):
"""
Write header on the given sheet of the XLSX file
:param worksheet: th sheet
:param headers: headers values
:param cell_len: length of each cell of the header
:param rows: (list) rows into write the header
:param filtered_records: (bool) is there filtered records
"""
for row in rows:
merge_format = workbook.add_format({'bold': True})
worksheet.write(XLSX_COLS[1] + str(row + 1) + ":" + XLSX_COLS[cell_len] + str(row + 1), headers[0], merge_format)
for header in headers[1:]:
if i < len(headers) - 2 or not filtered_records:
merge_format = workbook.add_format({'align': 'center'})
else:
merge_format = workbook.add_format({'align': 'center', 'bg_color': COLOR_COL_FILTER})
worksheet.merge_range(XLSX_COLS[2 + i * cell_len] + str(row) + ":" + XLSX_COLS[1 + cell_len + i * cell_len] + str(row),
header, merge_format)
i += 1
def create_xls_document(output, genotypes, headers, filtered_records, nb_records, nb_inds, cells, cells_gt, cells_gq,
max_col_len):
"""
Create the XLSX file
:param output: output directory
:param genotypes: genotypes file
:param headers: headers of each sheet
:param filtered_records: (bool) has filtered records
:param nb_records: number of records
:param nb_inds: number of individuals
:param cells: cells for first sheet (SV description)
:param cells_gt: cells for second sheet (genotypes)
:param cells_gq: cells for third sheet (genotype quality)
:param max_col_len: max content length for each column
"""
try:
import xlsxwriter
except ImportError:
print("\nWARN: Excel file not built: xlsxwriter python module not installed")
with xlsxwriter.Workbook(os.path.join(output, "results.xlsx")) as workbook:
#################################
# First sheet (SV description): #
#################################
worksheet = workbook.add_worksheet("SVs")
write_headers(workbook, worksheet, headers, 3, [1, 1+nb_records+3], filtered_records)
# Body:
for cell_id, cell_content in cells.items():
cell_format = workbook.add_format(cell_content["format"])
worksheet.write(cell_id, cell_content["text"], cell_format)
# Resize columns:
for col, max_len in max_col_len.items():
worksheet.set_column(col, col, max_len+1)
if genotypes:
#############################
# Second sheet (Genotypes): #
#############################
worksheet_gt = workbook.add_worksheet("Genotypes")
write_headers(workbook, worksheet_gt, headers, nb_inds, [1], filtered_records)
# Body:
for cell_id, cell_content in cells_gt.items():
cell_format = workbook.add_format(cell_content["format"])
worksheet_gt.write(cell_id, cell_content["text"], cell_format)
worksheet_gt.freeze_panes(0, 2+nb_inds)
# Resize columns:
worksheet_gt.set_column(0, 0, max_col_len[0]+1)
####################################
# Third sheet (Genotypes quality): #
####################################
worksheet_gq = workbook.add_worksheet("Gt quality")
write_headers(workbook, worksheet_gq, headers, nb_inds, [1], filtered_records)
# Body
for cell_id, cell_content in cells_gq.items():
cell_format = workbook.add_format(cell_content["format"])
worksheet_gq.write(cell_id, cell_content["text"], cell_format)
worksheet_gq.freeze_panes(0, 2+nb_inds)
# Resize columns:
worksheet_gq.set_column(0, 0, max_col_len[0]+1)
def create_tsv_file(filename: str, headers: list, cells: dict, nb_tools: int, nb_per_tool: int, records_range: ()):
"""
Create tabulated separated values file
:param filename: filename of the file
:param headers: headers of each sheet
:param cells: cells of the table to save
:param nb_tools: number of tools
:param nb_per_tool: number per tools
:param records_range: range of records to treat {tuple(2)}
:return:
"""
# Define top headers to each column:
for i in range(0, nb_per_tool):
top_headers[h] = header
head.append("")
h += 1
rows = [head]
for i in range(0, records_range[1]-records_range[0]):
rows.append(["" for x in range(0, (nb_tools * nb_per_tool) + 2)])
# Fill content:
for id_cell, cell in cells.items():
id_m = re.match(r"^([A-Z]+)(\d+)$", id_cell)
col = XLSX_COLS.index(id_m.group(1))
row = int(id_m.group(2))
if records_range[0] <= row <= records_range[1]:
r = row - records_range[0]
if r == 0 and col > 0:
if col > 1:
rows[r][col] = top_headers[col].replace(" ", "_") + "__" + cell["text"]
else:
rows[r][col] = headers[0]
else:
rows[r][col] = str(cell["text"])
# List as text:
for r in range(0, len(rows)):
rows[r] = "\t".join(rows[r])
tsv = "\n".join(rows)
with open(filename, "w") as tsv_file:
tsv_file.write(tsv)
def print_results(nb_records, orphans, with_xlsx, output, do_genotype):
"""
Print list of outputs
:param nb_records: number of records {int}
:param orphans: sv found in tools but not present in real data {dict}
:param with_xlsx: build xlsx file {bool}
:param output: output prefix {str}
:param do_genotype: do the genotype {bool}
"""
print("")
print("###########")
print("# RESULTS #")
print("###########")
print("")
print(str(nb_records) + " Results found")
print(str(orphans) + " False Positive")
print("")
if with_xlsx:
print("Results saved in :\n\t- " + os.path.join(output, "results.xlsx"))
else:
print("Results:")
print("")
print("TSV files:")
print("\t- " + os.path.join(output, "results_sv_per_tools.tsv"))
print("\t- " + os.path.join(output, "results_sv_diffs_per_tools.tsv"))
print("\t- " + os.path.join(output, "results_sv_genotypes_per_tools.tsv"))
print("\t- " + os.path.join(output, "results_sv_genotypes_quality_per_tools.tsv"))
print("")
print("Summary:")
print("\t- " + os.path.join(output, "Summarized_results.html"))
print("")
def fill_cells_no_tools(cells, cells_gt, cells_gq, i, j, g, nb_records, nb_inds, do_genotype):
"""
Fill cells when a tool does not detect a SV
:param cells: cells definition
:param cells_gt: cells definition for genotypes
:param cells_gq: cells definition for genotypes quality
:param i: row index
:param j: column index
:param g: column index for genotypes and genotypes quality tables
:param nb_records: total number of records
:param nb_inds: total number of individuals
:param do_genotype: do the genotypes
:return: cells, completed
"""
for k in range(0, 3):
# noinspection PyUnresolvedReferences
cells[XLSX_COLS[j + k] + str(i)] = {"text": "", "format": {"bg_color": COLOR_NOT_FOUND}}
cells[XLSX_COLS[j + k] + str(i + nb_records + 3)] = {"text": "",
"format": {"bg_color": COLOR_NOT_FOUND}}
# Genotype:
for gt in range(0, nb_inds):
# noinspection PyUnresolvedReferences
cells_gt[XLSX_COLS[g + gt] + str(i)] = cells_gq[XLSX_COLS[g + gt] + str(i)] = \
{"text": "", "format": {"bg_color": COLOR_NOT_FOUND_2}}
return cells, cells_gt, cells_gq
def apply_style_of_filter_cells(cells, cells_gt, cells_gq, i, is_kept, nb_records, nb_inds, nb_tools, filtered_records,
"""
Apply style of cells
:param cells: cells of the default table {dict}
:param cells_gt: cells of the genotypes table {dict}
:param cells_gq: cells of the genotypes quality table {dict}
:param i: row index {int}
:param is_kept: is the variant kept after filtering {bool}
:param nb_records: number of records {int}
:param nb_inds: number of individuals {int}
:param nb_tools: number of tools {int}
:param filtered_records: file containing filtered records {str}
:param do_genotype: do the genotype {bool}
:param rec_id: id of the record {str}
:return:
"""
if is_kept:
# SV is kept: color bg in green in the corresponding tool
cells[XLSX_COLS[0] + str(i)]["format"]["bg_color"] = \
cells[XLSX_COLS[0] + str(i + nb_records + 3)]["format"]["bg_color"] = COLOR_IS_KEPT
elif filtered_records is not None:
# SV does not pass the filter: color bg in red in the filter column
cells[XLSX_COLS[2 + ((nb_tools + 1) * 3)] + str(i)] = {"text": "", "format": {"bg_color": COLOR_NOT_FOUND}}
cells[XLSX_COLS[2 + ((nb_tools + 1) * 3) + 1] + str(i)] = {"text": "",
"format": {"bg_color": COLOR_NOT_FOUND}}
cells[XLSX_COLS[2 + ((nb_tools + 1) * 3) + 2] + str(i)] = {"text": "",
"format": {"bg_color": COLOR_NOT_FOUND}}
cells[XLSX_COLS[2 + ((nb_tools + 1) * 3)] + str(i + nb_records + 3)] = {"text": "",
"format": {
"bg_color": COLOR_NOT_FOUND}}
cells[XLSX_COLS[2 + ((nb_tools + 1) * 3) + 1] + str(i + nb_records + 3)] = {
"text": "", "format": {"bg_color": COLOR_NOT_FOUND}}
cells[XLSX_COLS[2 + ((nb_tools + 1) * 3) + 2] + str(i + nb_records + 3)] = {
"text": "", "format": {"bg_color": COLOR_NOT_FOUND}}
# Genotype:
# Color in gray in the filter column
for gt in range(0, nb_inds):
cells_gt[XLSX_COLS[2 + ((nb_tools + 1) * nb_inds) + gt] + str(i)] = \
cells_gq[XLSX_COLS[2 + ((nb_tools + 1) * nb_inds) + gt] + str(i)] = {"text": "", "format":
{"bg_color": COLOR_NOT_FOUND_2}
}
# False positives (orphans) in orange:
if re.match(r"^orphan_\d+$", rec_id):
cells[XLSX_COLS[0] + str(i)]["format"]["bg_color"] = \
cells[XLSX_COLS[0] + str(i + nb_records + 3)]["format"]["bg_color"] = COLOR_FALSE_POSITIVE
return cells, cells_gt, cells_gq
def build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells, cells_gt, cells_gq, max_col_len,
nb_tools, do_genotype, haploid, filtered_records):
for rec_id in rec_keys:
record = records[rec_id]
my_start = record["start"]
my_end = record["end"]
my_length = record["length"]
my_genotypes = record["genotypes"]
# Real data of the simulation:
cells, cells_gt, cells_gq, max_col_len = fill_real_data(i, cells, cells_gt, cells_gq, max_col_len, record,
rec_id, nb_records, record["chromosome"])
is_kept = False
for tool in tools:
if tool in record["tools"]:
###########################
# IF TOOL DETECTS THE SV: #
###########################
######################
# SET FILTERED DATA: #
######################
sv_format = {"bg_color": COLOR_IS_KEPT}
cells, max_col_len = fill_tool_data(i, 2 + ((nb_tools + 1) * 3), cells, max_col_len,
record["tools"][tool], nb_records, my_start, my_end, my_length,
{"bg_color": COLOR_COL_FILTER})
# Genotype (sheets 2&3):
cells_gt, cells_gq = fill_genotypes_data(i, 2 + ((nb_tools + 1) * nb_inds), cells_gt, cells_gq,
record["tools"][tool], my_genotypes, haploid)
#################
# SET TOOL DATA #
#################
# SV data (sheet 1):
cells, max_col_len = fill_tool_data(i, j, cells, max_col_len,
record["tools"][tool], nb_records, my_start, my_end, my_length,
sv_format)
# Genotype (sheets 2&3):
cells_gt, cells_gq = fill_genotypes_data(i, g, cells_gt, cells_gq,
record["tools"][tool], my_genotypes, haploid)
###############################
# TOOL DOES NOT DETECT THE SV #
###############################
cells, cells_gt, cells_gq = fill_cells_no_tools(cells, cells_gt, cells_gq, i, j, g, nb_records, nb_inds,
###############################################################################
# Until we have filled all tools, check if the record is kept after filtering: #
###############################################################################
if filtered_records is not None:
cells, cells_gt, cells_gq = apply_style_of_filter_cells(cells, cells_gt, cells_gq, i, is_kept, nb_records,
nb_inds, nb_tools, filtered_records, do_genotype,
rec_id)
return cells, cells_gt, cells_gq, max_col_len
for alp in ALPHABET:
for j in ALPHABET:
XLSX_COLS.append(alp + j)
def init(output, vcf_files, true_vcf, rules, filtered_vcfs=None, type_v="del", overlap_cutoff=0.5,
left_precision=sys.maxsize, right_precision=sys.maxsize, no_xls=False, haploid=False):
if not os.path.exists(output):
os.makedirs(output)
genotypes = {}
gt_quality = {}
nb_inds = 0
filtered = None
filtered_all = None
filtered_records = None
do_genotype = False
if filtered_vcfs:
filtered = {}
filtered_all = {}
for filtered_vcf in filtered_vcfs:
eprint(" Reading file %s" % filtered_vcf)
vcf.readvariants(filtered_vcf, type_v, filtered, True, filtered_all)
filtered_records = filtered.keys()
genotypes, gt_quality, nb_inds = get_genotypes(filtered_vcfs, true_vcf)
do_genotype = True
# Reading all the vcf files
sv_set = []
for infile in vcf_files:
eprint(" Reading file %s" % infile)
try:
sv_set += vcf.readvariants(infile, type_v).values()
except:
print("Ignoreing file %s" % infile)
eprint(" Reading file %s" % true_vcf)
true_variants = vcf.readvariants(true_vcf, type_v)
sv_set += true_variants.values()
# Compute connected components:
eprint("Computing Connected components")
construct_overlap_graph(sv_set, overlap_cutoff, left_precision, right_precision)
# Build records:
records, tools, orphans = build_records(genotypes, sv_set, true_variants.keys(), filtered_records, gt_quality)
nb_records = len(records)
#################################################
# Define cells of each sheet of the excel file: #
#################################################
tools = sorted(tools)
nb_tools = len(tools)
max_col_len = {}
######################
# BUILD HEADER CELLS #
######################
headers, cells, cells_gt, cells_gq, max_col_len = build_header(tools, filtered_records is not None, nb_records,
max_col_len, nb_inds)
rec_keys = sorted(records.keys(), key=lambda x: (records[x]["chromosome"], svsort(x, records)))
####################
# BUILD BODY CELLS #
####################
cells, cells_gt, cells_gq, max_col_len = build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells,
cells_gt, cells_gq, max_col_len, nb_tools, do_genotype,
haploid, filtered_records)
if not no_xls:
with_xlsx = create_xls_document(output, genotypes, headers, filtered_records is not None, nb_records, nb_inds,
cells, cells_gt, cells_gq, max_col_len)
create_tsv_file(os.path.join(output, "results_sv_per_tools.tsv"), headers, cells,
nb_tools + (2 if filtered_records is not None else 1),
3, (2, nb_records + 2))
create_tsv_file(os.path.join(output, "results_sv_diffs_per_tools.tsv"), headers, cells,
nb_tools + (2 if filtered_records is not None else 1),
3, (2 + nb_records + 3, nb_records * 2 + 5))
create_tsv_file(os.path.join(output, "results_sv_genotypes_per_tools.tsv"), headers, cells_gt,
nb_tools + (2 if filtered_records is not None else 1),
nb_inds, (2, nb_records + 2))
create_tsv_file(os.path.join(output, "results_sv_genotypes_quality_per_tools.tsv"), headers, cells_gq,
nb_tools + (2 if filtered_records is not None else 1),
nb_inds, (2, nb_records + 2))
###############################
# Build genotypes result file #
###############################
gtres.build(true_genotypes=true_variants,
pred_genotypes=filtered_all,
filtered_genotypes=filtered,
output=os.path.join(output, "results_genotype.tsv"))
#######################################
# Build Jupyter HTML notebook summary #
#######################################
# Copy necessary files:
ipynb = os.path.join(prg_path, "Summarized_results.ipynb")
shutil.copy(ipynb, os.path.join(output, os.path.basename(ipynb)))
with open(rules, "r") as rules_in, open(os.path.join(output, "rules.sim"), "w") as rules_out:
for line in rules_in:
parts = line.rstrip().split("\t")
if parts[0] == type_v.upper():
rules_out.write(line)
# Build HTML summary:
ipynb = os.path.join(output, "Summarized_results.ipynb")
exit_code = os.system("jupyter nbconvert --to html --template %s --execute %s" % (template, ipynb))
if exit_code != 0:
raise Exception("Jupyter notebook fails with exit status %d" % exit_code)
print_results(nb_records, orphans, with_xlsx, output, do_genotype)
def get_vcf_files(vcf_list):
"""
Get vcf files saved in a list file
:param vcf_list: the list file
:return: list of vcf files
"""
files = []
with open(vcf_list, "r") as vcf:
for line in vcf:
line = line.rstrip()
if line != "":
files.append(line)
return files
def main():
"""
Main function
"""
# parse the command line args
args = get_args()
init(output=args.output,
vcf_files=get_vcf_files(args.vcfs),
true_vcf=args.true_vcf,
filtered_vcfs=get_vcf_files(args.filtered_vcf),
type_v=args.type,
overlap_cutoff=args.overlap_cutoff,
left_precision=args.left_precision,
right_precision=args.right_precision,
no_xls=args.no_xls,
haploid=args.haploid,
rules=args.rules)
# initialize the script
if __name__ == '__main__':