Skip to content
Snippets Groups Projects
build_results.py 41.4 KiB
Newer Older
#! /usr/bin/env python3

"""mergeSV  : merge SV on the basis of their intervals

The merging is done by identifiying connected components in a graph.

The graph is constructing taking the SV a node and an edge connects two nodes
if the conresponding intervals exhibits a reciprocal overlap of a least R0%.

Additional constraints on the precision of breakpoints (left and right) can be specified.

Each connected component is termed a CNVR (Copy Number Variation Region)
and is associated to the corresponding CNVs.

"""

import string
import re
import json
from pysam import VariantFile

import lib.genotype_results as gtres
import lib.vcf as vcf

import sys
prg_path = os.path.dirname(os.path.realpath(__file__))
Floreal Cabanettes's avatar
Floreal Cabanettes committed
sys.path.insert(0, os.path.join(prg_path, "svlib"))

from svinterval import construct_overlap_graph, connected_components
ALPHABET = list(string.ascii_uppercase)
XLSX_COLS = ALPHABET.copy()
COLOR_NOT_FOUND = "#FE2E2E"
COLOR_NOT_FOUND_2 = "#dddddd"
COLOR_COL_FILTER = "#BEF781"
COLOR_IS_KEPT = "#81F781"
COLOR_FALSE_POSITIVE = "#FE642E"
COLOR_WRONG_GT = "#B40404"


def get_args():
    """
    Get arguments from argparse
    :return: argparse arguments object
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="\
Build Results \n \
description: Build results of the simulated data detection")
    parser.add_argument('-v', '--vcfs', type=str, required=True, help='File listing vcf files for each detection tool')
    parser.add_argument('-t', '--true-vcf', type=str, required=True, help='VCF file containing the simulated deletions')
    parser.add_argument('-f', '--filtered-vcf', type=str, required=True,
                        help='File listing VCF files containing the filtered results')
    parser.add_argument('-y', '--type', required=True, type=str, choices=vcf.ALLOW_VARIANTS, help="Type of variant")
    parser.add_argument('--overlap_cutoff',  type=float, default=0.5, help='cutoff for reciprocal overlap')
    parser.add_argument('--left_precision',  type=int, default=-1, help='left breakpoint precision')
    parser.add_argument('--right_precision',  type=int, default=-1, help='right breakpoint precision')
    parser.add_argument('-o', '--output', type=str, default="results", help='output folder')
    parser.add_argument('--no-xls', action='store_const', const=True, default=False, help='Do not build Excel file')
    parser.add_argument('--haploid', action='store_const', const=True, default=False, help='The organism is haploid')
    parser.add_argument('-r', '--rules', type=str, required=False, help="Simulation rule file")
    # parse the arguments
    args = parser.parse_args()

    if args.left_precision == -1:
        args.left_precision = sys.maxsize

    if args.right_precision == -1:
        args.right_precision = sys.maxsize

    if args.rules is None:
        args.rules = os.path.join(prg_path, "defaults.rules")

    # send back the user input
    return args

def eprint(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)

Thomas Faraut's avatar
Thomas Faraut committed
    
def svsort(sv, records):
    """
    Function to sort regions
    """
    if records[sv]["start"] != "":
        start =  str(records[sv]["start"])
    else:
        first_tool = list(records[sv]["tools"].keys())[0]
        start = str(records[sv]["tools"][first_tool]["start"])
Thomas Faraut's avatar
Thomas Faraut committed
    return start    
    

def get_gt(geno, true_gt):
    """
    Get the genotype: consider 1/0 and 0/1 as equivalent (we can't identify a specific chromosome)
    We want geno equal to true_gt in these cases
    :param geno: the genotype
    :param true_gt: the true genotype
    :return: the genotype
    """
    if true_gt is not None:
        if geno == "1/0" and true_gt == "0/1":
            geno = "0/1"
        elif geno == "0/1" and true_gt == "1/0":
            geno = "1/0"
    return geno


def get_max_len(cell, col, max_col_len):
    """
    Get the max content length of a column
    :param cell: the cell content
    :param col: the column number
    :param max_col_len: current max content length for each column
    :return: the new max content length for the given column
    """
    return max(len(str(cell)), max_col_len[col] if col in max_col_len else 0)


def get_quality_color(quality):
    """
    Returns the color associated to a genotype quality value
    :param quality: the quality value
    :return: the associated color
    """
    color_very_low_quality = "#FE2E2E"
    color_low_quality = "#FE9A2E"
    color_medium_quality = "#FFFF00"
    color_high_quality = "#81F781"
    if quality > 60:
        return color_high_quality
    elif quality > 40:
        return color_medium_quality
    elif quality > 20:
        return color_low_quality
    else:
        return color_very_low_quality


def get_genotypes(genotypes_files, true_vcf_file):
    """
    Get genotype of each individual for each SV
    :param genotypes_file: VCF file containing genotypes
    :param true_vcf_file: VCF file containing real data
    :return: genotypes of each individual for each SV, quality of each genotype, number of individuals
    """
    reader_t = VariantFile(true_vcf_file)
    samples_t = None
    for rec_t in reader_t:
        samples_t = rec_t.samples
        genotypes[rec_t.id] = ["/".join(map(str, samples_t[x]["GT"])) for x in samples_t]

    nb_inds = len(list(genotypes.values())[0])

    # Samples:
    for genotypes_file in genotypes_files:
        reader = VariantFile(genotypes_file)
        for rec in reader:
            samples = rec.samples
            genotypes[rec.id] = ["/".join(map(str, samples[x]["GT"])) for x in samples_t.keys()]  # Fixed: use samples keys
            # from real data to keep the same order
            gt_quality[rec.id] = [samples[x]["GQ"] for x in samples_t.keys()]

    return genotypes, gt_quality, nb_inds


def build_records(genotypes, SVSet, true_ones_records, filtered_records, gt_quality, all_variants_filter):
    """
    Build records for each SV
    :param genotypes: list of genotypes of each individual for each SV
    :param SVSet: set of all SVs
    :param true_ones_records: records of the real data
    :param filtered_records: records of the filtered data
    :param gt_quality: list of qualities of each genotype of individuals for each SV
    :return: records dict, tools set, list of orphans records (associated to None real data)
    """
    number = 1
    records = {}
    tools = set()
    orphans = 0
    for components in connected_components(SVSet):
        names = []
        results = {}
        true_one = None

        chromosome = None

        for node in components:
            names.append(node.id)
            if node.id in true_ones_records:
                true_one = node.id
                records[true_one] = {
                    "start": node.start,
                    "end": node.end,
                    "length": int(node.end) - int(node.start),
                    "tools": {},
                    "genotypes": genotypes[true_one] if true_one in genotypes else None,
                    "chromosome": node.chrom
                }
            else:
                tool_name = node.id.split("_")[0]
                results[tool_name] = {
                    "start": node.start,
                    "end": node.end,
                    "length": int(node.end) - int(node.start),
                    "filtered": (node.id in filtered_records) if filtered_records is not None else False,
                    "filters": ",".join([x for x in all_variants_filter[node.id].filter]) if
                               node.id in all_variants_filter else "",
                    "genotypes": genotypes[node.id] if node.id in genotypes else None,
                    "qualities": gt_quality[node.id] if node.id in gt_quality else None
                if results[tool_name]["filters"] == "":
                    results[tool_name]["filters"] = "PASS"
                tools.add(tool_name)
                chromosome = node.chrom

        if true_one is not None:
            records[true_one]["tools"] = results
        else:
            orphans += 1
            records["orphan_" + str(orphans)] = {
                "start": "",
                "end": "",
                "length": "",
                "tools": results,
                "genotypes": None,
                "chromosome": chromosome
            }

        names.sort()

        eprint("Group #%i: %s" % (number, ", ".join(names)))
        number += 1

    return records, tools, orphans


def build_header(tools, filtered_records, nb_records, max_col_len, nb_inds):
    """
    Build tools headers, and header cells for each tool
    :param tools: list of tools
    :param cells: cells of the first sheet (sv description)
    :param cells_gt: cells of the second sheet (genotype)
    :param cells_gq: cells of the third sheet (genotype quality)
    :param filtered_records: (bool) is there filtered records
    :param nb_records: number of records
    :param max_col_len: max content length for each column
    :param nb_inds: number of individuals
    :return: headers (list of tools + read and filtered data) ; cells, cells_gt, cells_gq and max_col_len updated
    """
    # First one (SV description)
    cells = {
        "A1": {"text": "RESULTS", "format": {"bg_color": "#ffe856"}},
        "A2": {"text": "Deletion", "format": {"bold": True}},
        "A" + str(1 + nb_records + 3): {"text": "DIFFS", "format": {"bg_color": "#ffe856"}},
        "A" + str(2 + nb_records + 3): {"text": "Deletion", "format": {"bold": True}}
    }

    # Second one (genotype)
    cells_gt = {
        "A1": {"text": "GENOTYPES", "format": {"bg_color": "#ffe856"}},
        "A2": {"text": "Deletion", "format": {"bold": True}}
    }

    # Third one (genotype quality)
    cells_gq = {
        "A1": {"text": "GT QUALITY", "format": {"bg_color": "#ffe856"}},
        "A2": {"text": "Deletion", "format": {"bold": True}}
    }

    i = 2
    j = 2
    headers = ["Chr"]
    for tool in ["Real data"] + tools + (["Filtered results"] if filtered_records else []):
        headers.append(tool)
        l_format = {"bold": True}
        if tool == "Filtered results":
            l_format["bg_color"] = COLOR_COL_FILTER
        cells[XLSX_COLS[i] + "2"] = cells[XLSX_COLS[i] + str(2 + nb_records + 3)] = {"text": "Start",
                                                                                     "format": l_format}
        max_col_len[i] = get_max_len("Start", i, max_col_len)
        cells[XLSX_COLS[i + 1] + "2"] = cells[XLSX_COLS[i + 1] + str(2 + nb_records + 3)] = {"text": "End",
                                                                                             "format": l_format}
        max_col_len[i + 1] = get_max_len("End", i + 1, max_col_len)
        cells[XLSX_COLS[i + 2] + "2"] = cells[XLSX_COLS[i + 2] + str(2 + nb_records + 3)] = {"text": "Length",
                                                                                             "format": l_format}
        max_col_len[i + 2] = get_max_len("Length", i + 2, max_col_len)

        if tool not in ["Real data", "Filtered results"]:
            cells[XLSX_COLS[i + 3] + "2"] = cells[XLSX_COLS[i + 3] + str(2 + nb_records + 3)] = {"text": "Filters",
                                                                                                 "format": l_format}
            max_col_len[i + 3] = get_max_len("Filters", i + 3, max_col_len)
            i += 1

        # Genotypes:
        for k in range(0, nb_inds):
            cells_gt[XLSX_COLS[j + k] + "2"] = cells_gq[XLSX_COLS[j + k] + "2"] = \
                {"text": "indiv" + str(k + 1), "format": {"bold": True}}

        i += 3
        j += nb_inds

    return headers, cells, cells_gt, cells_gq, max_col_len


def fill_real_data(row, cells, cells_gt, cells_gq, max_col_len, record, rec_id, nb_records, chromosome):
    """
    Fill cells of the first data column (real data simulated)
    :param cells: cells of the first sheet (sv description)
    :param cells_gt: cells of the second sheet (genotype)
    :param cells_gq: cells of the third sheet (genotype quality)
    :param max_col_len: max text length for each column
    :param record: data of the record
    :param nb_records: number of records (total)
    :return: cells, cells_gt, cells_gq and max_col_len updated
    """
    # SV ID:
    cells[XLSX_COLS[0] + str(row)] = cells[XLSX_COLS[0] + str(row + nb_records + 3)] = {"text": rec_id, "format": {}}
    max_col_len[0] = get_max_len(rec_id, 0, max_col_len)
    cells_gt[XLSX_COLS[0] + str(row)] = cells_gq[XLSX_COLS[0] + str(row)] = {"text": rec_id, "format": {}}
    # Chromosome:
    cells[XLSX_COLS[1] + str(row)] = cells[XLSX_COLS[1] + str(row + nb_records + 3)] = {"text": chromosome,
                                                                                        "format": {}}
    cells_gt[XLSX_COLS[1] + str(row)] = cells_gq[XLSX_COLS[1] + str(row)] = {"text": chromosome, "format": {}}
    cells[XLSX_COLS[2] + str(row)] = cells[XLSX_COLS[2] + str(row + nb_records + 3)] = {"text": record["start"],
    max_col_len[2] = get_max_len(record["start"], 2, max_col_len)
    cells[XLSX_COLS[3] + str(row)] = cells[XLSX_COLS[3] + str(row + nb_records + 3)] = {"text": record["end"],
    max_col_len[3] = get_max_len(record["end"], 3, max_col_len)
    cells[XLSX_COLS[4] + str(row)] = cells[XLSX_COLS[4] + str(row + nb_records + 3)] = {"text": record["length"],
    max_col_len[4] = get_max_len(record["length"], 4, max_col_len)

    # GENOTYPES:
    if record["genotypes"] is not None:
        for gt in range(0, len(record["genotypes"])):
            cells_gt[XLSX_COLS[2 + gt] + str(row)] = cells_gq[XLSX_COLS[2 + 0 + gt] + str(row)] = \
                {"text": record["genotypes"][gt], "format": {}}

    return cells, cells_gt, cells_gq, max_col_len


def fill_tool_data(row, col, cells, max_col_len, record, nb_records, my_start, my_end, my_length,
                   sv_format=None, no_filter=False):
    """
    Fill cells for a tool and a record
    :param row: row position
    :param col: columns position
    :param cells: cells of the first sheet (sv description)
    :param max_col_len: max text length for each column
    :param record: data of the record for a tool
    :param nb_records: number of records (total)
    :param my_start: real start of the SV
    :param my_end: real end of the SV
    :param my_length: real length of the SV
    :param sv_format: format for the cell
    :return: cells and max_col_len updated
    """

    if sv_format is None:
        sv_format = {}

    #############
    # RAW DATA: #
    #############

    # START:
    cells[XLSX_COLS[col] + str(row)] = {"text": record["start"], "format": sv_format}
    max_col_len[col] = get_max_len(record["start"], col, max_col_len)

    # END:
    cells[XLSX_COLS[col + 1] + str(row)] = {"text": record["end"], "format": sv_format}
    max_col_len[col + 1] = get_max_len(record["end"], col + 1, max_col_len)

    # LENGTH:
    cells[XLSX_COLS[col + 2] + str(row)] = {"text": record["length"], "format": sv_format}
    max_col_len[col + 2] = get_max_len(record["length"], col + 2, max_col_len)

    if "filters" in record and not no_filter:
        cells[XLSX_COLS[col + 3] + str(row)] = {"text": record["filters"], "format": sv_format}
        max_col_len[col + 3] = get_max_len(record["filters"], col + 3, max_col_len)

    ###########
    #  DIFFS: #
    ###########

    if my_start != "":
        start = record["start"] - my_start
        end = record["end"] - my_end
        length = record["length"] - my_length
    else:
        start = end = length = "NA"

    # START:
    cells[XLSX_COLS[col] + str(row + nb_records + 3)] = {"text": start, "format": sv_format}
    cells[XLSX_COLS[col + 1] + str(row + nb_records + 3)] = {"text": end, "format": sv_format}
    cells[XLSX_COLS[col + 2] + str(row + nb_records + 3)] = {"text": length,
    # FILTERS:
    if "filters" in record and not no_filter:
        cells[XLSX_COLS[col + 3] + str(row + nb_records + 3)] = {"text": record["filters"],
                                                                 "format": sv_format}

def fill_genotypes_data(row, col, cells_gt, cells_gq, record, my_genotypes, haploid=False):
    """
    Fill cells for a tool and a record (genotype/genotype quality parts)
    :param row: row position
    :param col: column position
    :param cells_gt: cells for genotype sheet (2)
    :param cells_gq: cells for the genotype quality sheet (3)
    :param record: data of the record for a tool
    :param my_genotypes: real genotypes for each individual
    :return: cells_gt and cells_gq updated
    """

    the_genotypes = record["genotypes"]
    for gt in range(0, len(the_genotypes)):
        true_gt = my_genotypes[gt] if my_genotypes is not None else ""
        geno = get_gt(the_genotypes[gt], true_gt)
        if geno == "None":
            geno = "N/N"
        gt_format = {"bg_color": get_quality_color(int(record["qualities"][gt]) if record["qualities"][gt] is not None else 0)}
        if (not haploid and geno != true_gt) or \
                (haploid and ((true_gt == "1" and geno != "1/1") or (true_gt == "0" and geno != "0/0"))):
            gt_format["font_color"] = "#ff0000"

        # Genotype:
        cells_gt[XLSX_COLS[col + gt] + str(row)] = {"text": geno, "format": gt_format}
        cells_gq[XLSX_COLS[col + gt] + str(row)] = {"text": int(record["qualities"][gt]) if record["qualities"][gt] is not None else 0, "format": gt_format}

    return cells_gt, cells_gq


def write_headers(workbook, worksheet, headers, cell_len, rows, filtered_records):
    """
    Write header on the given sheet of the XLSX file
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    :param workbook: the XLSX file workbook
    :param worksheet: th sheet
    :param headers: headers values
    :param cell_len: length of each cell of the header
    :param rows: (list) rows into write the header
    :param filtered_records: (bool) is there filtered records
    """
    for row in rows:
        merge_format = workbook.add_format({'bold': True})
        worksheet.write(XLSX_COLS[1] + str(row + 1) + ":" + XLSX_COLS[cell_len] + str(row + 1), headers[0], merge_format)
    for header in headers[1:]:
        if header not in ["Real data", "Filtered results"]:
            cell_len = orig_cell_len + 1
        else:
            cell_len = orig_cell_len
        if i < len(headers) - 2 or not filtered_records:
            merge_format = workbook.add_format({'align': 'center'})
        else:
            merge_format = workbook.add_format({'align': 'center', 'bg_color': COLOR_COL_FILTER})
            worksheet.merge_range(XLSX_COLS[left_cell_lenth] + str(row) + ":" +
                                  XLSX_COLS[left_cell_lenth + cell_len - 1] + str(row),
                                  header, merge_format)
def create_xls_document(output, genotypes, headers, filtered_records, nb_records, nb_inds, cells, cells_gt, cells_gq,
                        max_col_len):
    """
    Create the XLSX file
    :param output: output directory
    :param genotypes: genotypes file
    :param headers: headers of each sheet
    :param filtered_records: (bool) has filtered records
    :param nb_records: number of records
    :param nb_inds: number of individuals
    :param cells: cells for first sheet (SV description)
    :param cells_gt: cells for second sheet (genotypes)
    :param cells_gq: cells for third sheet (genotype quality)
    :param max_col_len: max content length for each column
    """

    try:
        import xlsxwriter
    except ImportError:
Floreal Cabanettes's avatar
Floreal Cabanettes committed
        print("\nWARN: Excel file not built: xlsxwriter python module not installed")
    with xlsxwriter.Workbook(os.path.join(output, "results.xlsx")) as workbook:

        #################################
        # First sheet (SV description): #
        #################################

        worksheet = workbook.add_worksheet("SVs")
        write_headers(workbook, worksheet, headers, 3, [1, 1+nb_records+3], filtered_records)

        # Body:
        for cell_id, cell_content in cells.items():
            cell_format = workbook.add_format(cell_content["format"])
            worksheet.write(cell_id, cell_content["text"], cell_format)

        # Resize columns:
        for col, max_len in max_col_len.items():
            worksheet.set_column(col, col, max_len+1)


            #############################
            # Second sheet (Genotypes): #
            #############################

            worksheet_gt = workbook.add_worksheet("Genotypes")
            write_headers(workbook, worksheet_gt, headers, nb_inds, [1], filtered_records)

            # Body:
            for cell_id, cell_content in cells_gt.items():
                cell_format = workbook.add_format(cell_content["format"])
                worksheet_gt.write(cell_id, cell_content["text"], cell_format)
                worksheet_gt.freeze_panes(0, 2+nb_inds)

            # Resize columns:
            worksheet_gt.set_column(0, 0, max_col_len[0]+1)

            ####################################
            # Third sheet (Genotypes quality): #
            ####################################

            worksheet_gq = workbook.add_worksheet("Gt quality")
            write_headers(workbook, worksheet_gq, headers, nb_inds, [1], filtered_records)

            # Body
            for cell_id, cell_content in cells_gq.items():
                cell_format = workbook.add_format(cell_content["format"])
                worksheet_gq.write(cell_id, cell_content["text"], cell_format)
                worksheet_gq.freeze_panes(0, 2+nb_inds)

            # Resize columns:
            worksheet_gq.set_column(0, 0, max_col_len[0]+1)
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    return True
def create_tsv_file(filename: str, headers: list, cells: dict, nb_tools: int, nb_per_tool: int, records_range: ()):
    """
    Create tabulated separated values file
    :param filename: filename of the file
    :param headers: headers of each sheet
    :param cells: cells of the table to save
    :param nb_tools: number of tools
    :param nb_per_tool: number per tools
    :param records_range: range of records to treat {tuple(2)}
    :return:
    """
    # Init rows:
    head = ["", headers[0]]
    top_headers = {}
    nb_only_tools = nb_tools # Number of tools, except real data and filtered results
    nb_others = 0
    for header in headers[1:]:
        # Define top headers to each column:
        for i in range(0, nb_per_tool):
            top_headers[h] = header
            head.append("")
            h += 1
        if header not in ["Real data", "Filtered results"]:
            top_headers[h] = header
            head.append("")
            h += 1
        else:
            nb_only_tools -= 1
            nb_others += 1
    rows = [head]
    for i in range(0, records_range[1]-records_range[0]):
        rows.append(["" for x in range(0, (nb_only_tools * (nb_per_tool + 1)) + (nb_others * nb_per_tool) + 2)])

    # Fill content:
    for id_cell, cell in cells.items():
        id_m = re.match(r"^([A-Z]+)(\d+)$", id_cell)
        col = XLSX_COLS.index(id_m.group(1))
        row = int(id_m.group(2))
        if records_range[0] <= row <= records_range[1]:
            r = row - records_range[0]
            if r == 0 and col > 0:
                if col > 1:
                    rows[r][col] = top_headers[col].replace(" ", "_") + "__" + cell["text"]
                else:
                    rows[r][col] = headers[0]
            else:
                rows[r][col] = str(cell["text"])

    # List as text:
    for r in range(0, len(rows)):
        rows[r] = "\t".join(rows[r])
    tsv = "\n".join(rows)
    with open(filename, "w") as tsv_file:
        tsv_file.write(tsv)


def print_results(nb_records, orphans, with_xlsx, output, do_genotype, jupyter_file):
    """
    Print list of outputs
    :param nb_records: number of records {int}
    :param orphans: sv found in tools but not present in real data {dict}
    :param with_xlsx: build xlsx file {bool}
    :param output: output prefix {str}
    :param do_genotype: do the genotype {bool}
    """
    print("")
    print("###########")
    print("# RESULTS #")
    print("###########")
    print("")
    print(str(nb_records) + " Results found")
    print(str(orphans) + " False Positive")
    print("")
    if with_xlsx:
        print("Results saved in :\n\t- " + os.path.join(output, "results.xlsx"))
    else:
        print("Results:")
    print("")
    print("TSV files:")
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    print("\t- " + os.path.join(output, "results_sv_per_tools.tsv"))
    print("\t- " + os.path.join(output, "results_sv_diffs_per_tools.tsv"))
        print("\t- " + os.path.join(output, "results_sv_genotypes_per_tools.tsv"))
        print("\t- " + os.path.join(output, "results_sv_genotypes_quality_per_tools.tsv"))
    print("Summary:")
    print("\t- " + jupyter_file)
    print("")
def fill_cells_no_tools(cells, cells_gt, cells_gq, i, j, g, nb_records, nb_inds, do_genotype):
    """
    Fill cells when a tool does not detect a SV
    :param cells: cells definition
    :param cells_gt: cells definition for genotypes
    :param cells_gq: cells definition for genotypes quality
    :param i: row index
    :param j: column index
    :param g: column index for genotypes and genotypes quality tables
    :param nb_records: total number of records
    :param nb_inds: total number of individuals
    :param do_genotype: do the genotypes
    :return: cells, completed
    """
    for k in range(0, 3):
        # noinspection PyUnresolvedReferences
        cells[XLSX_COLS[j + k] + str(i)] = {"text": "", "format": {"bg_color": COLOR_NOT_FOUND}}
        cells[XLSX_COLS[j + k] + str(i + nb_records + 3)] = {"text": "",
                                                             "format": {"bg_color": COLOR_NOT_FOUND}}
            for gt in range(0, nb_inds):
                # noinspection PyUnresolvedReferences
                cells_gt[XLSX_COLS[g + gt] + str(i)] = cells_gq[XLSX_COLS[g + gt] + str(i)] = \
                    {"text": "", "format": {"bg_color": COLOR_NOT_FOUND_2}}
    return cells, cells_gt, cells_gq
def apply_style_of_filter_cells(cells, cells_gt, cells_gq, i, is_kept, nb_records, nb_inds, nb_tools, filtered_records,
                   do_genotype, rec_id):
    """
    Apply style of cells
    :param cells: cells of the default table {dict}
    :param cells_gt: cells of the genotypes table {dict}
    :param cells_gq: cells of the genotypes quality table {dict}
    :param i: row index {int}
    :param is_kept: is the variant kept after filtering {bool}
    :param nb_records:  number of records {int}
    :param nb_inds: number of individuals {int}
    :param nb_tools: number of tools {int}
    :param filtered_records: file containing filtered records {str}
    :param do_genotype: do the genotype {bool}
    :param rec_id: id of the record {str}
    :return:
    """
    if is_kept:
        # SV is kept: color bg in green in the corresponding tool
        cells[XLSX_COLS[0] + str(i)]["format"]["bg_color"] = \
            cells[XLSX_COLS[0] + str(i + nb_records + 3)]["format"]["bg_color"] = COLOR_IS_KEPT
    elif filtered_records is not None:
        # SV does not pass the filter: color bg in red in the filter column
        cells[XLSX_COLS[2 + ((nb_tools * 4) + 3)] + str(i)] = {"text": "", "format": {"bg_color": COLOR_NOT_FOUND}}
        cells[XLSX_COLS[2 + ((nb_tools * 4) + 3) + 1] + str(i)] = {"text": "",
                                                                   "format": {"bg_color": COLOR_NOT_FOUND}}
        cells[XLSX_COLS[2 + ((nb_tools * 4) + 3) + 2] + str(i)] = {"text": "",
                                                                   "format": {"bg_color": COLOR_NOT_FOUND}}
        cells[XLSX_COLS[2 + ((nb_tools * 4) + 3)] + str(i + nb_records + 3)] = {"text": "",
                                                                                "format": {
                                                                                    "bg_color": COLOR_NOT_FOUND}}
        cells[XLSX_COLS[2 + ((nb_tools * 4) + 3) + 1] + str(i + nb_records + 3)] = {
            "text": "", "format": {"bg_color": COLOR_NOT_FOUND}}
        cells[XLSX_COLS[2 + ((nb_tools * 4) + 3) + 2] + str(i + nb_records + 3)] = {
            "text": "", "format": {"bg_color": COLOR_NOT_FOUND}}
            # Color in gray in the filter column
            for gt in range(0, nb_inds):
                cells_gt[XLSX_COLS[2 + ((nb_tools + 1) * nb_inds) + gt] + str(i)] = \
                    cells_gq[XLSX_COLS[2 + ((nb_tools + 1) * nb_inds) + gt] + str(i)] = {"text": "", "format":
                                                                                         {"bg_color": COLOR_NOT_FOUND_2}
                                                                                         }
    # False positives (orphans) in orange:
    if re.match(r"^orphan_\d+$", rec_id):
        cells[XLSX_COLS[0] + str(i)]["format"]["bg_color"] = \
            cells[XLSX_COLS[0] + str(i + nb_records + 3)]["format"]["bg_color"] = COLOR_FALSE_POSITIVE
def build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells, cells_gt, cells_gq, max_col_len,
                     nb_tools, do_genotype, haploid, filtered_records):
    for rec_id in rec_keys:
        record = records[rec_id]
        my_start = record["start"]
        my_end = record["end"]
        my_length = record["length"]
        my_genotypes = record["genotypes"]
        # Real data of the simulation:
        cells, cells_gt, cells_gq, max_col_len = fill_real_data(i, cells, cells_gt, cells_gq, max_col_len, record,
                                                                rec_id, nb_records, record["chromosome"])
        j = 5
        g = nb_inds + 2
        is_kept = False
        for tool in tools:
            if tool in record["tools"]:

                ###########################
                # IF TOOL DETECTS THE SV: #
                ###########################

                if record["tools"][tool]["filtered"]:

                    ######################
                    # SET FILTERED DATA: #
                    ######################

                    is_kept = True
                    sv_format = {"bg_color": COLOR_IS_KEPT}
                    # SV data (sheet 1):
                    cells, max_col_len = fill_tool_data(i, 2 + ((nb_tools * 4) + 3), cells, max_col_len,
                                                        record["tools"][tool], nb_records, my_start, my_end, my_length,
                                                        {"bg_color": COLOR_COL_FILTER}, True)
                        # Genotype (sheets 2&3):
                        cells_gt, cells_gq = fill_genotypes_data(i, 2 + ((nb_tools + 1) * nb_inds), cells_gt, cells_gq,
                                                                 record["tools"][tool], my_genotypes, haploid)
                else:
                    sv_format = {}

                #################
                # SET TOOL DATA #
                #################

                # SV data (sheet 1):
                cells, max_col_len = fill_tool_data(i, j, cells, max_col_len,
                                                    record["tools"][tool], nb_records, my_start, my_end, my_length,
                                                    sv_format)
                    # Genotype (sheets 2&3):
                    cells_gt, cells_gq = fill_genotypes_data(i, g, cells_gt, cells_gq,
                                                             record["tools"][tool], my_genotypes, haploid)

                ###############################
                # TOOL DOES NOT DETECT THE SV #
                ###############################
                cells, cells_gt, cells_gq = fill_cells_no_tools(cells, cells_gt, cells_gq, i, j, g, nb_records, nb_inds,

        ###############################################################################
        # Until we have filled all tools, check if the record is kept after filtering: #
        ###############################################################################
        if filtered_records is not None:
            cells, cells_gt, cells_gq = apply_style_of_filter_cells(cells, cells_gt, cells_gq, i, is_kept, nb_records,
                                                                    nb_inds, nb_tools, filtered_records, do_genotype,
                                                                    rec_id)
    return cells, cells_gt, cells_gq, max_col_len


Floreal Cabanettes's avatar
Floreal Cabanettes committed
def build_xlsx_cols():
    for alp in ALPHABET:
        for j in ALPHABET:
            XLSX_COLS.append(alp + j)


def _format_cell(content, length):
    content = str(content)
    if len(content) < length:
        content += " " * (length - len(content))
    return content


def add_jupyter_header(records: dict, groups: list, type_v: str, jupyter_file: str):
    summary = ["Type of variants: %s.\n" % type_v.upper(), "\n", "Counts by size:\n", "\n"]
    count_by_group = {group: 0 for group in groups}
    count_by_chromosomes = {}
    total = 0
    for idr, record in records.items():
        if not idr.startswith("orphan_"):
            chromosome = record["chromosome"]
            if chromosome not in count_by_chromosomes:
                count_by_chromosomes[chromosome] = 0
            count_by_chromosomes[chromosome] += 1
            for group in groups:
                if group[0] < record["length"] <= group[1]:
                    count_by_group[group] += 1
                    total += 1
                    break

    summary += ["min size | max size | count | percent\n",
                ":------: | :------: | :---: | :-----:\n"]

    for group in groups:
        summary.append("{start} | {end} | {count} | {percent} %\n".format(
            start=_format_cell(group[0], 8),
            end=_format_cell(group[1], 8),
            count=_format_cell(count_by_group[group], 5),
            percent=_format_cell((count_by_group[group] * 100) // total, 7))
        )

    summary += ["\n", "Counts by chromosome:\n", "\n",
                "chromosome | count | percent\n",
                ":--------: | :---: | :-----:\n"]

    for chromosome, count in count_by_chromosomes.items():
        summary.append("{chr} | {count} | {percent} %\n".format(
            chr=_format_cell(chromosome, 10),
            count=_format_cell(count, 5),
            percent=_format_cell((count * 100) // total, 7)
        ))

    with open(jupyter_file, "r") as jupy_f:
        jupy = json.loads(jupy_f.read())

    jupy["cells"][2]["source"] = summary

    with open(jupyter_file, "w") as jupy_f:
        jupy_f.write(json.dumps(jupy, indent=4))


def init(output, vcf_files, true_vcf, rules, filtered_vcfs=None, type_v="del", overlap_cutoff=0.5,
Floreal Cabanettes's avatar
Floreal Cabanettes committed
         left_precision=sys.maxsize, right_precision=sys.maxsize, no_xls=False, haploid=False):
    if not os.path.exists(output):
        os.makedirs(output)

Floreal Cabanettes's avatar
Floreal Cabanettes committed
    build_xlsx_cols()
    filtered = None
    filtered_all = None
    filtered_records = None
    do_genotype = False
        filtered = {}
        filtered_all = {}
        for filtered_vcf in filtered_vcfs:
            eprint(" Reading file %s" % filtered_vcf)
            vcf.readvariants(filtered_vcf, type_v, filtered, True, filtered_all)
        filtered_records = filtered.keys()
        genotypes, gt_quality, nb_inds = get_genotypes(filtered_vcfs, true_vcf)
        do_genotype = True
        eprint(" Reading file %s" % infile)
        try:
            sv_set += vcf.readvariants(infile, type_v).values()
        except:
            print("Ignoreing file %s" % infile)

    eprint(" Reading file %s" % true_vcf)
    true_variants = vcf.readvariants(true_vcf, type_v)
    sv_set += true_variants.values()

    # Compute connected components:
    eprint("Computing Connected components")
    construct_overlap_graph(sv_set, overlap_cutoff, left_precision, right_precision)

    # Build records:
    records, tools, orphans = build_records(genotypes, sv_set, true_variants.keys(), filtered_records, gt_quality,
                                            filtered_all)

    nb_records = len(records)

    #################################################
    # Define cells of each sheet of the excel file: #
    #################################################

    tools = sorted(tools)
    nb_tools = len(tools)

    max_col_len = {}

    ######################
    # BUILD HEADER CELLS #
    ######################

    headers, cells, cells_gt, cells_gq, max_col_len = build_header(tools, filtered_records is not None, nb_records,
                                                                   max_col_len, nb_inds)

    rec_keys = sorted(records.keys(), key=lambda x: (records[x]["chromosome"], svsort(x, records)))

    ####################
    # BUILD BODY CELLS #
    ####################
    cells, cells_gt, cells_gq, max_col_len = build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells,
                                                              cells_gt, cells_gq, max_col_len, nb_tools, do_genotype,
    # Create document:
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    with_xlsx = False
    if not no_xls:
        with_xlsx = create_xls_document(output, genotypes, headers, filtered_records is not None, nb_records, nb_inds,
                                        cells, cells_gt, cells_gq, max_col_len)
    # Create CSV files:
    create_tsv_file(os.path.join(output, "results_sv_per_tools.tsv"), headers, cells,
                    nb_tools + (2 if filtered_records is not None else 1),
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    create_tsv_file(os.path.join(output, "results_sv_diffs_per_tools.tsv"), headers, cells,
                    nb_tools + (2 if filtered_records is not None else 1),
                    3, (2 + nb_records + 3, nb_records * 2 + 5))
Floreal Cabanettes's avatar
Floreal Cabanettes committed
        create_tsv_file(os.path.join(output, "results_sv_genotypes_per_tools.tsv"), headers, cells_gt,
                        nb_tools + (2 if filtered_records is not None else 1),
                        nb_inds, (2, nb_records + 2))
Floreal Cabanettes's avatar
Floreal Cabanettes committed
        create_tsv_file(os.path.join(output, "results_sv_genotypes_quality_per_tools.tsv"), headers, cells_gq,
                        nb_tools + (2 if filtered_records is not None else 1),
                        nb_inds, (2, nb_records + 2))
    ###############################
    # Build genotypes result file #
    ###############################

    gtres.build(true_genotypes=true_variants,
                pred_genotypes=filtered_all,
                filtered_genotypes=filtered,
                output=os.path.join(output, "results_genotype.tsv"))

    #######################################
    # Build Jupyter HTML notebook summary #
    #######################################