Newer
Older
#! /usr/bin/env python3
"""mergeSV : merge SV on the basis of their intervals
The merging is done by identifiying connected components in a graph.
The graph is constructing taking the SV a node and an edge connects two nodes
if the conresponding intervals exhibits a reciprocal overlap of a least R0%.
Additional constraints on the precision of breakpoints (left and right) can be specified.
Each connected component is termed a CNVR (Copy Number Variation Region)
and is associated to the corresponding CNVs.
"""
import argparse
from pysam import VariantFile
import lib.genotype_results as gtres
import lib.vcf as vcf
prg_path = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(prg_path, "svlib"))
from svinterval import construct_overlap_graph, connected_components
ALPHABET = list(string.ascii_uppercase)
XLSX_COLS = ALPHABET.copy()
COLOR_NOT_FOUND = "#FE2E2E"
COLOR_NOT_FOUND_2 = "#dddddd"
COLOR_COL_FILTER = "#BEF781"
COLOR_IS_KEPT = "#81F781"
COLOR_FALSE_POSITIVE = "#FE642E"
COLOR_WRONG_GT = "#B40404"
"""
Get arguments from argparse
:return: argparse arguments object
"""
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="\
Build Results \n \
description: Build results of the simulated data detection")
parser.add_argument('-v', '--vcfs', type=str, required=True, help='File listing vcf files for each detection tool')
parser.add_argument('-t', '--true-vcf', type=str, required=True, help='VCF file containing the simulated deletions')
parser.add_argument('-f', '--filtered-vcf', type=str, required=True,
help='File listing VCF files containing the filtered results')
parser.add_argument('-y', '--type', required=True, type=str, choices=vcf.ALLOW_VARIANTS, help="Type of variant")
parser.add_argument('--overlap_cutoff', type=float, default=0.5, help='cutoff for reciprocal overlap')
parser.add_argument('--left_precision', type=int, default=-1, help='left breakpoint precision')
parser.add_argument('--right_precision', type=int, default=-1, help='right breakpoint precision')
parser.add_argument('-o', '--output', type=str, default="results", help='output folder')
parser.add_argument('--no-xls', action='store_const', const=True, default=False, help='Do not build Excel file')
parser.add_argument('--haploid', action='store_const', const=True, default=False, help='The organism is haploid')
parser.add_argument('-r', '--rules', type=str, required=False, help="Simulation rule file")
# parse the arguments
args = parser.parse_args()
if args.left_precision == -1:
args.left_precision = sys.maxsize
if args.right_precision == -1:
args.right_precision = sys.maxsize
if args.rules is None:
args.rules = os.path.join(prg_path, "defaults.rules")
# send back the user input
return args
"""
Print to stderr
"""
print(*args, file=sys.stderr, **kwargs)
def svsort(sv, records):
"""
Function to sort regions
"""
if records[sv]["start"] != "":
start = str(records[sv]["start"])
else:
first_tool = list(records[sv]["tools"].keys())[0]
start = str(records[sv]["tools"][first_tool]["start"])
def get_gt(geno, true_gt):
"""
Get the genotype: consider 1/0 and 0/1 as equivalent (we can't identify a specific chromosome)
We want geno equal to true_gt in these cases
:param geno: the genotype
:param true_gt: the true genotype
:return: the genotype
"""
if true_gt is not None:
if geno == "1/0" and true_gt == "0/1":
geno = "0/1"
elif geno == "0/1" and true_gt == "1/0":
geno = "1/0"
return geno
def get_max_len(cell, col, max_col_len):
"""
Get the max content length of a column
:param cell: the cell content
:param col: the column number
:param max_col_len: current max content length for each column
:return: the new max content length for the given column
"""
return max(len(str(cell)), max_col_len[col] if col in max_col_len else 0)
def get_quality_color(quality):
"""
Returns the color associated to a genotype quality value
:param quality: the quality value
:return: the associated color
"""
color_very_low_quality = "#FE2E2E"
color_low_quality = "#FE9A2E"
color_medium_quality = "#FFFF00"
color_high_quality = "#81F781"
if quality > 60:
return color_high_quality
elif quality > 40:
return color_medium_quality
elif quality > 20:
return color_low_quality
else:
return color_very_low_quality
def get_genotypes(genotypes_files, true_vcf_file):
"""
Get genotype of each individual for each SV
:param genotypes_file: VCF file containing genotypes
:param true_vcf_file: VCF file containing real data
:return: genotypes of each individual for each SV, quality of each genotype, number of individuals
"""
reader_t = VariantFile(true_vcf_file)
samples_t = rec_t.samples
genotypes[rec_t.id] = ["/".join(map(str, samples_t[x]["GT"])) for x in samples_t]
nb_inds = len(list(genotypes.values())[0])
# Samples:
for genotypes_file in genotypes_files:
reader = VariantFile(genotypes_file)
for rec in reader:
samples = rec.samples
genotypes[rec.id] = ["/".join(map(str, samples[x]["GT"])) for x in samples_t.keys()] # Fixed: use samples keys
# from real data to keep the same order
gt_quality[rec.id] = [samples[x]["GQ"] for x in samples_t.keys()]
return genotypes, gt_quality, nb_inds
def build_records(genotypes, SVSet, true_ones_records, filtered_records, gt_quality, all_variants_filter):
"""
Build records for each SV
:param genotypes: list of genotypes of each individual for each SV
:param SVSet: set of all SVs
:param true_ones_records: records of the real data
:param filtered_records: records of the filtered data
:param gt_quality: list of qualities of each genotype of individuals for each SV
:return: records dict, tools set, list of orphans records (associated to None real data)
"""
number = 1
records = {}
tools = set()
orphans = 0
for components in connected_components(SVSet):
names = []
results = {}
true_one = None
for node in components:
names.append(node.id)
if node.id in true_ones_records:
true_one = node.id
records[true_one] = {
"start": node.start,
"end": node.end,
"length": int(node.end) - int(node.start),
"tools": {},
"genotypes": genotypes[true_one] if true_one in genotypes else None,
"chromosome": node.chrom
}
else:
tool_name = node.id.split("_")[0]
results[tool_name] = {
"start": node.start,
"end": node.end,
"length": int(node.end) - int(node.start),
"filtered": (node.id in filtered_records) if filtered_records is not None else False,
"filters": ",".join([x for x in all_variants_filter[node.id].filter]) if
node.id in all_variants_filter else "",
"genotypes": genotypes[node.id] if node.id in genotypes else None,
"qualities": gt_quality[node.id] if node.id in gt_quality else None
if results[tool_name]["filters"] == "":
results[tool_name]["filters"] = "PASS"
if true_one is not None:
records[true_one]["tools"] = results
else:
orphans += 1
records["orphan_" + str(orphans)] = {
"start": "",
"end": "",
"length": "",
"tools": results,
"genotypes": None,
"chromosome": chromosome
}
names.sort()
eprint("Group #%i: %s" % (number, ", ".join(names)))
number += 1
return records, tools, orphans
def build_header(tools, filtered_records, nb_records, max_col_len, nb_inds):
"""
Build tools headers, and header cells for each tool
:param tools: list of tools
:param cells: cells of the first sheet (sv description)
:param cells_gt: cells of the second sheet (genotype)
:param cells_gq: cells of the third sheet (genotype quality)
:param filtered_records: (bool) is there filtered records
:param nb_records: number of records
:param max_col_len: max content length for each column
:param nb_inds: number of individuals
:return: headers (list of tools + read and filtered data) ; cells, cells_gt, cells_gq and max_col_len updated
"""
# First one (SV description)
cells = {
"A1": {"text": "RESULTS", "format": {"bg_color": "#ffe856"}},
"A2": {"text": "Deletion", "format": {"bold": True}},
"A" + str(1 + nb_records + 3): {"text": "DIFFS", "format": {"bg_color": "#ffe856"}},
"A" + str(2 + nb_records + 3): {"text": "Deletion", "format": {"bold": True}}
}
# Second one (genotype)
cells_gt = {
"A1": {"text": "GENOTYPES", "format": {"bg_color": "#ffe856"}},
"A2": {"text": "Deletion", "format": {"bold": True}}
}
# Third one (genotype quality)
cells_gq = {
"A1": {"text": "GT QUALITY", "format": {"bg_color": "#ffe856"}},
"A2": {"text": "Deletion", "format": {"bold": True}}
}
for tool in ["Real data"] + tools + (["Filtered results"] if filtered_records else []):
headers.append(tool)
l_format = {"bold": True}
if tool == "Filtered results":
l_format["bg_color"] = COLOR_COL_FILTER
cells[XLSX_COLS[i] + "2"] = cells[XLSX_COLS[i] + str(2 + nb_records + 3)] = {"text": "Start",
"format": l_format}
max_col_len[i] = get_max_len("Start", i, max_col_len)
cells[XLSX_COLS[i + 1] + "2"] = cells[XLSX_COLS[i + 1] + str(2 + nb_records + 3)] = {"text": "End",
"format": l_format}
max_col_len[i + 1] = get_max_len("End", i + 1, max_col_len)
cells[XLSX_COLS[i + 2] + "2"] = cells[XLSX_COLS[i + 2] + str(2 + nb_records + 3)] = {"text": "Length",
"format": l_format}
max_col_len[i + 2] = get_max_len("Length", i + 2, max_col_len)
if tool not in ["Real data", "Filtered results"]:
cells[XLSX_COLS[i + 3] + "2"] = cells[XLSX_COLS[i + 3] + str(2 + nb_records + 3)] = {"text": "Filters",
"format": l_format}
max_col_len[i + 3] = get_max_len("Filters", i + 3, max_col_len)
i += 1
# Genotypes:
for k in range(0, nb_inds):
cells_gt[XLSX_COLS[j + k] + "2"] = cells_gq[XLSX_COLS[j + k] + "2"] = \
{"text": "indiv" + str(k + 1), "format": {"bold": True}}
i += 3
j += nb_inds
return headers, cells, cells_gt, cells_gq, max_col_len
def fill_real_data(row, cells, cells_gt, cells_gq, max_col_len, record, rec_id, nb_records, chromosome):
"""
Fill cells of the first data column (real data simulated)
:param cells: cells of the first sheet (sv description)
:param cells_gt: cells of the second sheet (genotype)
:param cells_gq: cells of the third sheet (genotype quality)
:param max_col_len: max text length for each column
:param record: data of the record
:param nb_records: number of records (total)
:return: cells, cells_gt, cells_gq and max_col_len updated
"""
# SV ID:
cells[XLSX_COLS[0] + str(row)] = cells[XLSX_COLS[0] + str(row + nb_records + 3)] = {"text": rec_id, "format": {}}
max_col_len[0] = get_max_len(rec_id, 0, max_col_len)
cells_gt[XLSX_COLS[0] + str(row)] = cells_gq[XLSX_COLS[0] + str(row)] = {"text": rec_id, "format": {}}
cells[XLSX_COLS[1] + str(row)] = cells[XLSX_COLS[1] + str(row + nb_records + 3)] = {"text": chromosome,
cells_gt[XLSX_COLS[1] + str(row)] = cells_gq[XLSX_COLS[1] + str(row)] = {"text": chromosome, "format": {}}
cells[XLSX_COLS[2] + str(row)] = cells[XLSX_COLS[2] + str(row + nb_records + 3)] = {"text": record["start"],
max_col_len[2] = get_max_len(record["start"], 2, max_col_len)
cells[XLSX_COLS[3] + str(row)] = cells[XLSX_COLS[3] + str(row + nb_records + 3)] = {"text": record["end"],
max_col_len[3] = get_max_len(record["end"], 3, max_col_len)
cells[XLSX_COLS[4] + str(row)] = cells[XLSX_COLS[4] + str(row + nb_records + 3)] = {"text": record["length"],
max_col_len[4] = get_max_len(record["length"], 4, max_col_len)
# GENOTYPES:
if record["genotypes"] is not None:
for gt in range(0, len(record["genotypes"])):
cells_gt[XLSX_COLS[2 + gt] + str(row)] = cells_gq[XLSX_COLS[2 + 0 + gt] + str(row)] = \
{"text": record["genotypes"][gt], "format": {}}
return cells, cells_gt, cells_gq, max_col_len
def fill_tool_data(row, col, cells, max_col_len, record, nb_records, my_start, my_end, my_length,
sv_format=None, no_filter=False):
"""
Fill cells for a tool and a record
:param row: row position
:param col: columns position
:param cells: cells of the first sheet (sv description)
:param max_col_len: max text length for each column
:param record: data of the record for a tool
:param nb_records: number of records (total)
:param my_start: real start of the SV
:param my_end: real end of the SV
:param my_length: real length of the SV
:param sv_format: format for the cell
:return: cells and max_col_len updated
"""
if sv_format is None:
sv_format = {}
#############
# RAW DATA: #
#############
# START:
cells[XLSX_COLS[col] + str(row)] = {"text": record["start"], "format": sv_format}
max_col_len[col] = get_max_len(record["start"], col, max_col_len)
# END:
cells[XLSX_COLS[col + 1] + str(row)] = {"text": record["end"], "format": sv_format}
max_col_len[col + 1] = get_max_len(record["end"], col + 1, max_col_len)
# LENGTH:
cells[XLSX_COLS[col + 2] + str(row)] = {"text": record["length"], "format": sv_format}
max_col_len[col + 2] = get_max_len(record["length"], col + 2, max_col_len)
if "filters" in record and not no_filter:
cells[XLSX_COLS[col + 3] + str(row)] = {"text": record["filters"], "format": sv_format}
max_col_len[col + 3] = get_max_len(record["filters"], col + 3, max_col_len)
###########
# DIFFS: #
###########
if my_start != "":
start = record["start"] - my_start
end = record["end"] - my_end
length = record["length"] - my_length
else:
start = end = length = "NA"
# START:
cells[XLSX_COLS[col] + str(row + nb_records + 3)] = {"text": start, "format": sv_format}
cells[XLSX_COLS[col + 1] + str(row + nb_records + 3)] = {"text": end, "format": sv_format}
cells[XLSX_COLS[col + 2] + str(row + nb_records + 3)] = {"text": length,
# FILTERS:
if "filters" in record and not no_filter:
cells[XLSX_COLS[col + 3] + str(row + nb_records + 3)] = {"text": record["filters"],
"format": sv_format}
return cells, max_col_len
def fill_genotypes_data(row, col, cells_gt, cells_gq, record, my_genotypes, haploid=False):
"""
Fill cells for a tool and a record (genotype/genotype quality parts)
:param row: row position
:param col: column position
:param cells_gt: cells for genotype sheet (2)
:param cells_gq: cells for the genotype quality sheet (3)
:param record: data of the record for a tool
:param my_genotypes: real genotypes for each individual
:return: cells_gt and cells_gq updated
"""
the_genotypes = record["genotypes"]
for gt in range(0, len(the_genotypes)):
true_gt = my_genotypes[gt] if my_genotypes is not None else ""
geno = get_gt(the_genotypes[gt], true_gt)
gt_format = {"bg_color": get_quality_color(int(record["qualities"][gt]) if record["qualities"][gt] is not None else 0)}
if (not haploid and geno != true_gt) or \
(haploid and ((true_gt == "1" and geno != "1/1") or (true_gt == "0" and geno != "0/0"))):
gt_format["font_color"] = "#ff0000"
# Genotype:
cells_gt[XLSX_COLS[col + gt] + str(row)] = {"text": geno, "format": gt_format}
cells_gq[XLSX_COLS[col + gt] + str(row)] = {"text": int(record["qualities"][gt]) if record["qualities"][gt] is not None else 0, "format": gt_format}
return cells_gt, cells_gq
def write_headers(workbook, worksheet, headers, cell_len, rows, filtered_records):
"""
Write header on the given sheet of the XLSX file
:param worksheet: th sheet
:param headers: headers values
:param cell_len: length of each cell of the header
:param rows: (list) rows into write the header
:param filtered_records: (bool) is there filtered records
"""
orig_cell_len = cell_len
for row in rows:
merge_format = workbook.add_format({'bold': True})
worksheet.write(XLSX_COLS[1] + str(row + 1) + ":" + XLSX_COLS[cell_len] + str(row + 1), headers[0], merge_format)
left_cell_lenth = 2
if header not in ["Real data", "Filtered results"]:
cell_len = orig_cell_len + 1
else:
cell_len = orig_cell_len
if i < len(headers) - 2 or not filtered_records:
merge_format = workbook.add_format({'align': 'center'})
else:
merge_format = workbook.add_format({'align': 'center', 'bg_color': COLOR_COL_FILTER})
worksheet.merge_range(XLSX_COLS[left_cell_lenth] + str(row) + ":" +
XLSX_COLS[left_cell_lenth + cell_len - 1] + str(row),
left_cell_lenth += cell_len
def create_xls_document(output, genotypes, headers, filtered_records, nb_records, nb_inds, cells, cells_gt, cells_gq,
max_col_len):
"""
Create the XLSX file
:param output: output directory
:param genotypes: genotypes file
:param headers: headers of each sheet
:param filtered_records: (bool) has filtered records
:param nb_records: number of records
:param nb_inds: number of individuals
:param cells: cells for first sheet (SV description)
:param cells_gt: cells for second sheet (genotypes)
:param cells_gq: cells for third sheet (genotype quality)
:param max_col_len: max content length for each column
"""
try:
import xlsxwriter
except ImportError:
print("\nWARN: Excel file not built: xlsxwriter python module not installed")
with xlsxwriter.Workbook(os.path.join(output, "results.xlsx")) as workbook:
#################################
# First sheet (SV description): #
#################################
worksheet = workbook.add_worksheet("SVs")
write_headers(workbook, worksheet, headers, 3, [1, 1+nb_records+3], filtered_records)
# Body:
for cell_id, cell_content in cells.items():
cell_format = workbook.add_format(cell_content["format"])
worksheet.write(cell_id, cell_content["text"], cell_format)
# Resize columns:
for col, max_len in max_col_len.items():
worksheet.set_column(col, col, max_len+1)
if genotypes:
#############################
# Second sheet (Genotypes): #
#############################
worksheet_gt = workbook.add_worksheet("Genotypes")
write_headers(workbook, worksheet_gt, headers, nb_inds, [1], filtered_records)
# Body:
for cell_id, cell_content in cells_gt.items():
cell_format = workbook.add_format(cell_content["format"])
worksheet_gt.write(cell_id, cell_content["text"], cell_format)
worksheet_gt.freeze_panes(0, 2+nb_inds)
# Resize columns:
worksheet_gt.set_column(0, 0, max_col_len[0]+1)
####################################
# Third sheet (Genotypes quality): #
####################################
worksheet_gq = workbook.add_worksheet("Gt quality")
write_headers(workbook, worksheet_gq, headers, nb_inds, [1], filtered_records)
# Body
for cell_id, cell_content in cells_gq.items():
cell_format = workbook.add_format(cell_content["format"])
worksheet_gq.write(cell_id, cell_content["text"], cell_format)
worksheet_gq.freeze_panes(0, 2+nb_inds)
# Resize columns:
worksheet_gq.set_column(0, 0, max_col_len[0]+1)
def create_tsv_file(filename: str, headers: list, cells: dict, nb_tools: int, nb_per_tool: int, records_range: ()):
"""
Create tabulated separated values file
:param filename: filename of the file
:param headers: headers of each sheet
:param cells: cells of the table to save
:param nb_tools: number of tools
:param nb_per_tool: number per tools
:param records_range: range of records to treat {tuple(2)}
:return:
"""
nb_only_tools = nb_tools # Number of tools, except real data and filtered results
nb_others = 0
# Define top headers to each column:
for i in range(0, nb_per_tool):
top_headers[h] = header
head.append("")
h += 1
if header not in ["Real data", "Filtered results"]:
top_headers[h] = header
head.append("")
h += 1
else:
nb_only_tools -= 1
nb_others += 1
for i in range(0, records_range[1]-records_range[0]):
rows.append(["" for x in range(0, (nb_only_tools * (nb_per_tool + 1)) + (nb_others * nb_per_tool) + 2)])
# Fill content:
for id_cell, cell in cells.items():
id_m = re.match(r"^([A-Z]+)(\d+)$", id_cell)
col = XLSX_COLS.index(id_m.group(1))
row = int(id_m.group(2))
if records_range[0] <= row <= records_range[1]:
r = row - records_range[0]
if r == 0 and col > 0:
if col > 1:
rows[r][col] = top_headers[col].replace(" ", "_") + "__" + cell["text"]
else:
rows[r][col] = headers[0]
else:
rows[r][col] = str(cell["text"])
# List as text:
for r in range(0, len(rows)):
rows[r] = "\t".join(rows[r])
tsv = "\n".join(rows)
with open(filename, "w") as tsv_file:
tsv_file.write(tsv)
def print_results(nb_records, orphans, with_xlsx, output, do_genotype, jupyter_file):
"""
Print list of outputs
:param nb_records: number of records {int}
:param orphans: sv found in tools but not present in real data {dict}
:param with_xlsx: build xlsx file {bool}
:param output: output prefix {str}
:param do_genotype: do the genotype {bool}
"""
print("")
print("###########")
print("# RESULTS #")
print("###########")
print("")
print(str(nb_records) + " Results found")
print(str(orphans) + " False Positive")
print("")
if with_xlsx:
print("Results saved in :\n\t- " + os.path.join(output, "results.xlsx"))
else:
print("Results:")
print("")
print("TSV files:")
print("\t- " + os.path.join(output, "results_sv_per_tools.tsv"))
print("\t- " + os.path.join(output, "results_sv_diffs_per_tools.tsv"))
print("\t- " + os.path.join(output, "results_sv_genotypes_per_tools.tsv"))
print("\t- " + os.path.join(output, "results_sv_genotypes_quality_per_tools.tsv"))
print("")
def fill_cells_no_tools(cells, cells_gt, cells_gq, i, j, g, nb_records, nb_inds, do_genotype):
"""
Fill cells when a tool does not detect a SV
:param cells: cells definition
:param cells_gt: cells definition for genotypes
:param cells_gq: cells definition for genotypes quality
:param i: row index
:param j: column index
:param g: column index for genotypes and genotypes quality tables
:param nb_records: total number of records
:param nb_inds: total number of individuals
:param do_genotype: do the genotypes
:return: cells, completed
"""
for k in range(0, 3):
# noinspection PyUnresolvedReferences
cells[XLSX_COLS[j + k] + str(i)] = {"text": "", "format": {"bg_color": COLOR_NOT_FOUND}}
cells[XLSX_COLS[j + k] + str(i + nb_records + 3)] = {"text": "",
"format": {"bg_color": COLOR_NOT_FOUND}}
# Genotype:
for gt in range(0, nb_inds):
# noinspection PyUnresolvedReferences
cells_gt[XLSX_COLS[g + gt] + str(i)] = cells_gq[XLSX_COLS[g + gt] + str(i)] = \
{"text": "", "format": {"bg_color": COLOR_NOT_FOUND_2}}
return cells, cells_gt, cells_gq
def apply_style_of_filter_cells(cells, cells_gt, cells_gq, i, is_kept, nb_records, nb_inds, nb_tools, filtered_records,
"""
Apply style of cells
:param cells: cells of the default table {dict}
:param cells_gt: cells of the genotypes table {dict}
:param cells_gq: cells of the genotypes quality table {dict}
:param i: row index {int}
:param is_kept: is the variant kept after filtering {bool}
:param nb_records: number of records {int}
:param nb_inds: number of individuals {int}
:param nb_tools: number of tools {int}
:param filtered_records: file containing filtered records {str}
:param do_genotype: do the genotype {bool}
:param rec_id: id of the record {str}
:return:
"""
if is_kept:
# SV is kept: color bg in green in the corresponding tool
cells[XLSX_COLS[0] + str(i)]["format"]["bg_color"] = \
cells[XLSX_COLS[0] + str(i + nb_records + 3)]["format"]["bg_color"] = COLOR_IS_KEPT
elif filtered_records is not None:
# SV does not pass the filter: color bg in red in the filter column
cells[XLSX_COLS[2 + ((nb_tools * 4) + 3)] + str(i)] = {"text": "", "format": {"bg_color": COLOR_NOT_FOUND}}
cells[XLSX_COLS[2 + ((nb_tools * 4) + 3) + 1] + str(i)] = {"text": "",
"format": {"bg_color": COLOR_NOT_FOUND}}
cells[XLSX_COLS[2 + ((nb_tools * 4) + 3) + 2] + str(i)] = {"text": "",
"format": {"bg_color": COLOR_NOT_FOUND}}
cells[XLSX_COLS[2 + ((nb_tools * 4) + 3)] + str(i + nb_records + 3)] = {"text": "",
"format": {
"bg_color": COLOR_NOT_FOUND}}
cells[XLSX_COLS[2 + ((nb_tools * 4) + 3) + 1] + str(i + nb_records + 3)] = {
"text": "", "format": {"bg_color": COLOR_NOT_FOUND}}
cells[XLSX_COLS[2 + ((nb_tools * 4) + 3) + 2] + str(i + nb_records + 3)] = {
"text": "", "format": {"bg_color": COLOR_NOT_FOUND}}
# Genotype:
# Color in gray in the filter column
for gt in range(0, nb_inds):
cells_gt[XLSX_COLS[2 + ((nb_tools + 1) * nb_inds) + gt] + str(i)] = \
cells_gq[XLSX_COLS[2 + ((nb_tools + 1) * nb_inds) + gt] + str(i)] = {"text": "", "format":
{"bg_color": COLOR_NOT_FOUND_2}
}
# False positives (orphans) in orange:
if re.match(r"^orphan_\d+$", rec_id):
cells[XLSX_COLS[0] + str(i)]["format"]["bg_color"] = \
cells[XLSX_COLS[0] + str(i + nb_records + 3)]["format"]["bg_color"] = COLOR_FALSE_POSITIVE
return cells, cells_gt, cells_gq
def build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells, cells_gt, cells_gq, max_col_len,
nb_tools, do_genotype, haploid, filtered_records):
for rec_id in rec_keys:
record = records[rec_id]
my_start = record["start"]
my_end = record["end"]
my_length = record["length"]
my_genotypes = record["genotypes"]
# Real data of the simulation:
cells, cells_gt, cells_gq, max_col_len = fill_real_data(i, cells, cells_gt, cells_gq, max_col_len, record,
rec_id, nb_records, record["chromosome"])
is_kept = False
for tool in tools:
if tool in record["tools"]:
###########################
# IF TOOL DETECTS THE SV: #
###########################
######################
# SET FILTERED DATA: #
######################
sv_format = {"bg_color": COLOR_IS_KEPT}
cells, max_col_len = fill_tool_data(i, 2 + ((nb_tools * 4) + 3), cells, max_col_len,
record["tools"][tool], nb_records, my_start, my_end, my_length,
{"bg_color": COLOR_COL_FILTER}, True)
# Genotype (sheets 2&3):
cells_gt, cells_gq = fill_genotypes_data(i, 2 + ((nb_tools + 1) * nb_inds), cells_gt, cells_gq,
record["tools"][tool], my_genotypes, haploid)
#################
# SET TOOL DATA #
#################
# SV data (sheet 1):
cells, max_col_len = fill_tool_data(i, j, cells, max_col_len,
record["tools"][tool], nb_records, my_start, my_end, my_length,
sv_format)
# Genotype (sheets 2&3):
cells_gt, cells_gq = fill_genotypes_data(i, g, cells_gt, cells_gq,
record["tools"][tool], my_genotypes, haploid)
###############################
# TOOL DOES NOT DETECT THE SV #
###############################
cells, cells_gt, cells_gq = fill_cells_no_tools(cells, cells_gt, cells_gq, i, j, g, nb_records, nb_inds,
###############################################################################
# Until we have filled all tools, check if the record is kept after filtering: #
###############################################################################
if filtered_records is not None:
cells, cells_gt, cells_gq = apply_style_of_filter_cells(cells, cells_gt, cells_gq, i, is_kept, nb_records,
nb_inds, nb_tools, filtered_records, do_genotype,
rec_id)
return cells, cells_gt, cells_gq, max_col_len
for alp in ALPHABET:
for j in ALPHABET:
XLSX_COLS.append(alp + j)
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
def _format_cell(content, length):
content = str(content)
if len(content) < length:
content += " " * (length - len(content))
return content
def add_jupyter_header(records: dict, groups: list, type_v: str, jupyter_file: str):
summary = ["Type of variants: %s.\n" % type_v.upper(), "\n", "Counts by size:\n", "\n"]
count_by_group = {group: 0 for group in groups}
count_by_chromosomes = {}
total = 0
for idr, record in records.items():
if not idr.startswith("orphan_"):
chromosome = record["chromosome"]
if chromosome not in count_by_chromosomes:
count_by_chromosomes[chromosome] = 0
count_by_chromosomes[chromosome] += 1
for group in groups:
if group[0] < record["length"] <= group[1]:
count_by_group[group] += 1
total += 1
break
summary += ["min size | max size | count | percent\n",
":------: | :------: | :---: | :-----:\n"]
for group in groups:
summary.append("{start} | {end} | {count} | {percent} %\n".format(
start=_format_cell(group[0], 8),
end=_format_cell(group[1], 8),
count=_format_cell(count_by_group[group], 5),
percent=_format_cell((count_by_group[group] * 100) // total, 7))
)
summary += ["\n", "Counts by chromosome:\n", "\n",
"chromosome | count | percent\n",
":--------: | :---: | :-----:\n"]
for chromosome, count in count_by_chromosomes.items():
summary.append("{chr} | {count} | {percent} %\n".format(
chr=_format_cell(chromosome, 10),
count=_format_cell(count, 5),
percent=_format_cell((count * 100) // total, 7)
))
with open(jupyter_file, "r") as jupy_f:
jupy = json.loads(jupy_f.read())
jupy["cells"][2]["source"] = summary
with open(jupyter_file, "w") as jupy_f:
jupy_f.write(json.dumps(jupy, indent=4))
def init(output, vcf_files, true_vcf, rules, filtered_vcfs=None, type_v="del", overlap_cutoff=0.5,
left_precision=sys.maxsize, right_precision=sys.maxsize, no_xls=False, haploid=False):
if not os.path.exists(output):
os.makedirs(output)
genotypes = {}
gt_quality = {}
nb_inds = 0
filtered = None
filtered_all = None
filtered_records = None
do_genotype = False
if filtered_vcfs:
filtered = {}
filtered_all = {}
for filtered_vcf in filtered_vcfs:
eprint(" Reading file %s" % filtered_vcf)
vcf.readvariants(filtered_vcf, type_v, filtered, True, filtered_all)
filtered_records = filtered.keys()
genotypes, gt_quality, nb_inds = get_genotypes(filtered_vcfs, true_vcf)
do_genotype = True
# Reading all the vcf files
sv_set = []
for infile in vcf_files:
eprint(" Reading file %s" % infile)
try:
sv_set += vcf.readvariants(infile, type_v).values()
except:
print("Ignoreing file %s" % infile)
eprint(" Reading file %s" % true_vcf)
true_variants = vcf.readvariants(true_vcf, type_v)
sv_set += true_variants.values()
# Compute connected components:
eprint("Computing Connected components")
construct_overlap_graph(sv_set, overlap_cutoff, left_precision, right_precision)
# Build records:
records, tools, orphans = build_records(genotypes, sv_set, true_variants.keys(), filtered_records, gt_quality,
filtered_all)
nb_records = len(records)
#################################################
# Define cells of each sheet of the excel file: #
#################################################
tools = sorted(tools)
nb_tools = len(tools)
max_col_len = {}
######################
# BUILD HEADER CELLS #
######################
headers, cells, cells_gt, cells_gq, max_col_len = build_header(tools, filtered_records is not None, nb_records,
max_col_len, nb_inds)
rec_keys = sorted(records.keys(), key=lambda x: (records[x]["chromosome"], svsort(x, records)))
####################
# BUILD BODY CELLS #
####################
cells, cells_gt, cells_gq, max_col_len = build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells,
cells_gt, cells_gq, max_col_len, nb_tools, do_genotype,
haploid, filtered_records)
if not no_xls:
with_xlsx = create_xls_document(output, genotypes, headers, filtered_records is not None, nb_records, nb_inds,
cells, cells_gt, cells_gq, max_col_len)
create_tsv_file(os.path.join(output, "results_sv_per_tools.tsv"), headers, cells,
nb_tools + (2 if filtered_records is not None else 1),
3, (2, nb_records + 2))
create_tsv_file(os.path.join(output, "results_sv_diffs_per_tools.tsv"), headers, cells,
nb_tools + (2 if filtered_records is not None else 1),
3, (2 + nb_records + 3, nb_records * 2 + 5))
create_tsv_file(os.path.join(output, "results_sv_genotypes_per_tools.tsv"), headers, cells_gt,
nb_tools + (2 if filtered_records is not None else 1),
nb_inds, (2, nb_records + 2))
create_tsv_file(os.path.join(output, "results_sv_genotypes_quality_per_tools.tsv"), headers, cells_gq,
nb_tools + (2 if filtered_records is not None else 1),
nb_inds, (2, nb_records + 2))
###############################
# Build genotypes result file #
###############################
gtres.build(true_genotypes=true_variants,
pred_genotypes=filtered_all,
filtered_genotypes=filtered,
output=os.path.join(output, "results_genotype.tsv"))
#######################################
# Build Jupyter HTML notebook summary #
#######################################