build_results.py

#! /usr/bin/env python3

"""mergeSV  : merge SV on the basis of their intervals

The merging is done by identifiying connected components in a graph.

The graph is constructing taking the SV a node and an edge connects two nodes
if the conresponding intervals exhibits a reciprocal overlap of a least R0%.

Additional constraints on the precision of breakpoints (left and right) can be specified.

Each connected component is termed a CNVR (Copy Number Variation Region)
and is associated to the corresponding CNVs.

"""

import argparse
import string
import os
import re

from pysam import VariantFile

import sys
prg_path = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(prg_path, "lib"))

from svreader.vcfwrapper import VCFReader
from svinterval import construct_overlap_graph, connected_components

ALPHABET = list(string.ascii_uppercase)
XLSX_COLS = ALPHABET.copy()

COLOR_NOT_FOUND = "#FE2E2E"
COLOR_NOT_FOUND_2 = "#dddddd"
COLOR_COL_FILTER = "#BEF781"
COLOR_IS_KEPT = "#81F781"
COLOR_FALSE_POSITIVE = "#FE642E"
COLOR_WRONG_GT = "#B40404"


def get_args():
    """
    Get arguments from argparse
    :return: argparse arguments object
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="\
Build Results \n \
description: Build results of the simulated data detection")
    parser.add_argument('-v', '--vcf', type=str, required=True, help='folder containing all vcf results files')
    parser.add_argument('-t', '--true-vcf', type=str, required=True, help='VCF file containing the simulated deletions')
    parser.add_argument('-f', '--filtered-vcf', type=str, required=False,
                        help='VCF file containing the filtered results', nargs='+')
    parser.add_argument('-g', '--genotypes', type=str, help="VCF file containing genotypes", nargs='+')
    parser.add_argument('--overlap_cutoff',  type=float, default=0.5, help='cutoff for reciprocal overlap')
    parser.add_argument('--left_precision',  type=int, default=-1, help='left breakpoint precision')
    parser.add_argument('--right_precision',  type=int, default=-1, help='right breakpoint precision')
    parser.add_argument('-o', '--output', type=str, default="results", help='output prefix')
    parser.add_argument('--no-xls', action='store_const', const=True, default=False, help='Do not build Excel file')
    parser.add_argument('--haploid', action='store_const', const=True, default=False, help='The organism is haploid')
    # parse the arguments
    args = parser.parse_args()

    if args.left_precision == -1:
        args.left_precision = sys.maxsize

    if args.right_precision == -1:
        args.right_precision = sys.maxsize

    # send back the user input
    return args


def eprint(*args, **kwargs):
    """
    Print to stderr
    """
    print(*args, file=sys.stderr, **kwargs)


def passed_variant(record):
    """
    Did this variant pass?
    :param record: vcf record object
    :return: True if pass, False else
    """
    return record.filter is None or len(record.filter) == 0 or "PASS" in record.filter


def read_vcf_file(infile):
    """
    Read a vcf file
    :param infile: vcf file path
    :return: set or records, list of records ids
    """
    SVSet=[]
    ids = []
    for record in VCFReader(infile):
        if not passed_variant(record):
            continue
        SVSet.append(record)
        ids.append(record.id)
    return SVSet, ids

    
def svsort(sv, records):
    """
    Function to sort regions
    """
    if records[sv]["start"] != "":
        start =  str(records[sv]["start"])
    else:
        first_tool = list(records[sv]["tools"].keys())[0]
        start = str(records[sv]["tools"][first_tool]["start"])
    return start    
    

def get_gt(geno, true_gt):
    """
    Get the genotype: consider 1/0 and 0/1 as equivalent (we can't identify a specific chromosome)
    We want geno equal to true_gt in these cases
    :param geno: the genotype
    :param true_gt: the true genotype
    :return: the genotype
    """
    if true_gt is not None:
        if geno == "1/0" and true_gt == "0/1":
            geno = "0/1"
        elif geno == "0/1" and true_gt == "1/0":
            geno = "1/0"
    return geno


def get_max_len(cell, col, max_col_len):
    """
    Get the max content length of a column
    :param cell: the cell content
    :param col: the column number
    :param max_col_len: current max content length for each column
    :return: the new max content length for the given column
    """
    return max(len(str(cell)), max_col_len[col] if col in max_col_len else 0)


def get_quality_color(quality):
    """
    Returns the color associated to a genotype quality value
    :param quality: the quality value
    :return: the associated color
    """
    color_very_low_quality = "#FE2E2E"
    color_low_quality = "#FE9A2E"
    color_medium_quality = "#FFFF00"
    color_high_quality = "#81F781"
    if quality > 60:
        return color_high_quality
    elif quality > 40:
        return color_medium_quality
    elif quality > 20:
        return color_low_quality
    else:
        return color_very_low_quality


def get_genotypes(genotypes_files, true_vcf_file):
    """
    Get genotype of each individual for each SV
    :param genotypes_file: VCF file containing genotypes
    :param true_vcf_file: VCF file containing real data
    :return: genotypes of each individual for each SV, quality of each genotype, number of individuals
    """
    genotypes = {}
    gt_quality = {}

    # Real data:
    reader_t = VariantFile(true_vcf_file)
    samples_t = None
    for rec_t in reader_t:
        samples_t = rec_t.samples
        genotypes[rec_t.id] = ["/".join(map(str, samples_t[x]["GT"])) for x in samples_t]

    nb_inds = len(list(genotypes.values())[0])

    # Samples:
    for genotypes_file in genotypes_files:
        reader = VariantFile(genotypes_file)
        for rec in reader:
            samples = rec.samples
            genotypes[rec.id] = ["/".join(map(str, samples[x]["GT"])) for x in samples_t.keys()]  # Fixed: use samples keys
            # from real data to keep the same order
            gt_quality[rec.id] = [samples[x]["GQ"] for x in samples_t.keys()]

    return genotypes, gt_quality, nb_inds


def build_records(genotypes, SVSet, true_ones_records, filtered_records, gt_quality):
    """
    Build records for each SV
    :param genotypes: list of genotypes of each individual for each SV
    :param SVSet: set of all SVs
    :param true_ones_records: records of the real data
    :param filtered_records: records of the filtered data
    :param gt_quality: list of qualities of each genotype of individuals for each SV
    :return: records dict, tools set, list of orphans records (associated to None real data)
    """
    number = 1
    records = {}
    tools = set()
    orphans = 0
    for components in connected_components(SVSet):
        names = []
        results = {}
        true_one = None

        chromosome = None

        for node in components:
            names.append(node.id)
            if node.id in true_ones_records:
                true_one = node.id
                records[true_one] = {
                    "start": node.start,
                    "end": node.end,
                    "length": int(node.end) - int(node.start),
                    "tools": {},
                    "orphan": False,
                    "genotypes": genotypes[true_one] if true_one in genotypes else None,
                    "chromosome": node.chrom
                }
            else:
                tool_name = node.id.split("_")[0]
                results[tool_name] = {
                    "start": node.start,
                    "end": node.end,
                    "length": int(node.end) - int(node.start),
                    "filtered": (node.id in filtered_records) if filtered_records is not None else False,
                    "genotypes": genotypes[node.id] if node.id in genotypes else None,
                    "qualities": gt_quality[node.id] if node.id in gt_quality else None
                }
                tools.add(tool_name)
                chromosome = node.chrom

        if true_one is not None:
            records[true_one]["tools"] = results
        else:
            orphans += 1
            records["orphan_" + str(orphans)] = {
                "start": "",
                "end": "",
                "length": "",
                "tools": results,
                "orphan": True,
                "genotypes": None,
                "chromosome": chromosome
            }

        names.sort()

        eprint("Group #%i: %s" % (number, ", ".join(names)))
        number += 1

    return records, tools, orphans


def build_header(tools, filtered_records, nb_records, max_col_len, nb_inds):
    """
    Build tools headers, and header cells for each tool
    :param tools: list of tools
    :param cells: cells of the first sheet (sv description)
    :param cells_gt: cells of the second sheet (genotype)
    :param cells_gq: cells of the third sheet (genotype quality)
    :param filtered_records: (bool) is there filtered records
    :param nb_records: number of records
    :param max_col_len: max content length for each column
    :param nb_inds: number of individuals
    :return: headers (list of tools + read and filtered data) ; cells, cells_gt, cells_gq and max_col_len updated
    """
    # First one (SV description)
    cells = {
        "A1": {"text": "RESULTS", "format": {"bg_color": "#ffe856"}},
        "A2": {"text": "Deletion", "format": {"bold": True}},
        "A" + str(1 + nb_records + 3): {"text": "DIFFS", "format": {"bg_color": "#ffe856"}},
        "A" + str(2 + nb_records + 3): {"text": "Deletion", "format": {"bold": True}}
    }

    # Second one (genotype)
    cells_gt = {
        "A1": {"text": "GENOTYPES", "format": {"bg_color": "#ffe856"}},
        "A2": {"text": "Deletion", "format": {"bold": True}}
    }

    # Third one (genotype quality)
    cells_gq = {
        "A1": {"text": "GT QUALITY", "format": {"bg_color": "#ffe856"}},
        "A2": {"text": "Deletion", "format": {"bold": True}}
    }

    i = 2
    j = 2
    headers = ["Chr"]
    for tool in ["Real data"] + tools + (["Filtered results"] if filtered_records else []):
        headers.append(tool)
        l_format = {"bold": True}
        if tool == "Filtered results":
            l_format["bg_color"] = COLOR_COL_FILTER
        cells[XLSX_COLS[i] + "2"] = cells[XLSX_COLS[i] + str(2 + nb_records + 3)] = {"text": "Start",
                                                                                     "format": l_format}
        max_col_len[i] = get_max_len("Start", i, max_col_len)
        cells[XLSX_COLS[i + 1] + "2"] = cells[XLSX_COLS[i + 1] + str(2 + nb_records + 3)] = {"text": "End",
                                                                                             "format": l_format}
        max_col_len[i + 1] = get_max_len("End", i + 1, max_col_len)
        cells[XLSX_COLS[i + 2] + "2"] = cells[XLSX_COLS[i + 2] + str(2 + nb_records + 3)] = {"text": "Length",
                                                                                             "format": l_format}
        max_col_len[i + 2] = get_max_len("Length", i + 2, max_col_len)

        # Genotypes:
        for k in range(0, nb_inds):
            cells_gt[XLSX_COLS[j + k] + "2"] = cells_gq[XLSX_COLS[j + k] + "2"] = \
                {"text": "indiv" + str(k + 1), "format": {"bold": True}}

        i += 3
        j += nb_inds

    return headers, cells, cells_gt, cells_gq, max_col_len


def fill_real_data(row, cells, cells_gt, cells_gq, max_col_len, record, rec_id, nb_records, chromosome):
    """
    Fill cells of the first data column (real data simulated)
    :param cells: cells of the first sheet (sv description)
    :param cells_gt: cells of the second sheet (genotype)
    :param cells_gq: cells of the third sheet (genotype quality)
    :param max_col_len: max text length for each column
    :param record: data of the record
    :param nb_records: number of records (total)
    :return: cells, cells_gt, cells_gq and max_col_len updated
    """
    # SV ID:
    cells[XLSX_COLS[0] + str(row)] = cells[XLSX_COLS[0] + str(row + nb_records + 3)] = {"text": rec_id, "format": {}}
    max_col_len[0] = get_max_len(rec_id, 0, max_col_len)
    cells_gt[XLSX_COLS[0] + str(row)] = cells_gq[XLSX_COLS[0] + str(row)] = {"text": rec_id, "format": {}}

    # Chromosome:
    cells[XLSX_COLS[1] + str(row)] = cells[XLSX_COLS[1] + str(row + nb_records + 3)] = {"text": chromosome,
                                                                                        "format": {}}
    cells_gt[XLSX_COLS[1] + str(row)] = cells_gq[XLSX_COLS[1] + str(row)] = {"text": chromosome, "format": {}}

    # START:
    cells[XLSX_COLS[2] + str(row)] = cells[XLSX_COLS[2] + str(row + nb_records + 3)] = {"text": record["start"],
                                                                                    "format": {}}
    max_col_len[2] = get_max_len(record["start"], 2, max_col_len)

    # END:
    cells[XLSX_COLS[3] + str(row)] = cells[XLSX_COLS[3] + str(row + nb_records + 3)] = {"text": record["end"],
                                                                                    "format": {}}
    max_col_len[3] = get_max_len(record["end"], 3, max_col_len)

    # LENGTH:
    cells[XLSX_COLS[4] + str(row)] = cells[XLSX_COLS[4] + str(row + nb_records + 3)] = {"text": record["length"],
                                                                                    "format": {}}
    max_col_len[4] = get_max_len(record["length"], 4, max_col_len)

    # GENOTYPES:
    if record["genotypes"] is not None:
        for gt in range(0, len(record["genotypes"])):
            cells_gt[XLSX_COLS[2 + gt] + str(row)] = cells_gq[XLSX_COLS[2 + 0 + gt] + str(row)] = \
                {"text": record["genotypes"][gt], "format": {}}

    return cells, cells_gt, cells_gq, max_col_len


def fill_tool_data(row, col, cells, max_col_len, record, nb_records, my_start, my_end, my_length,
                   sv_format=None):
    """
    Fill cells for a tool and a record
    :param row: row position
    :param col: columns position
    :param cells: cells of the first sheet (sv description)
    :param max_col_len: max text length for each column
    :param record: data of the record for a tool
    :param nb_records: number of records (total)
    :param my_start: real start of the SV
    :param my_end: real end of the SV
    :param my_length: real length of the SV
    :param sv_format: format for the cell
    :return: cells and max_col_len updated
    """

    if sv_format is None:
        sv_format = {}

    #############
    # RAW DATA: #
    #############

    # START:
    cells[XLSX_COLS[col] + str(row)] = {"text": record["start"], "format": sv_format}
    max_col_len[col] = get_max_len(record["start"], col, max_col_len)

    # END:
    cells[XLSX_COLS[col + 1] + str(row)] = {"text": record["end"], "format": sv_format}
    max_col_len[col + 1] = get_max_len(record["end"], col + 1, max_col_len)

    # LENGTH:
    cells[XLSX_COLS[col + 2] + str(row)] = {"text": record["length"], "format": sv_format}
    max_col_len[col + 2] = get_max_len(record["length"], col + 2, max_col_len)

    ###########
    #  DIFFS: #
    ###########

    if my_start != "":
        start = record["start"] - my_start
        end = record["end"] - my_end
        length = record["length"] - my_length
    else:
        start = end = length = "NA"

    # START:
    cells[XLSX_COLS[col] + str(row + nb_records + 3)] = {"text": start, "format": sv_format}

    # END:
    cells[XLSX_COLS[col + 1] + str(row + nb_records + 3)] = {"text": end, "format": sv_format}

    # LENGTH:
    cells[XLSX_COLS[col + 2] + str(row + nb_records + 3)] = {"text": length,
                                                             "format": sv_format}

    return cells, max_col_len


def fill_genotypes_data(row, col, cells_gt, cells_gq, record, my_genotypes, haploid=False):
    """
    Fill cells for a tool and a record (genotype/genotype quality parts)
    :param row: row position
    :param col: column position
    :param cells_gt: cells for genotype sheet (2)
    :param cells_gq: cells for the genotype quality sheet (3)
    :param record: data of the record for a tool
    :param my_genotypes: real genotypes for each individual
    :return: cells_gt and cells_gq updated
    """

    the_genotypes = record["genotypes"]
    for gt in range(0, len(the_genotypes)):
        true_gt = my_genotypes[gt] if my_genotypes is not None else ""
        geno = get_gt(the_genotypes[gt], true_gt)

        # Format:
        gt_format = {"bg_color": get_quality_color(int(record["qualities"][gt]))}
        if (not haploid and geno != true_gt) or \
                (haploid and ((true_gt == "1" and geno != "1/1") or (true_gt == "0" and geno != "0/0"))):
            gt_format["font_color"] = "#ff0000"

        # Genotype:
        cells_gt[XLSX_COLS[col + gt] + str(row)] = {"text": geno, "format": gt_format}
        # Quality:
        cells_gq[XLSX_COLS[col + gt] + str(row)] = {"text": int(record["qualities"][gt]), "format": gt_format}

    return cells_gt, cells_gq


def write_headers(workbook, worksheet, headers, cell_len, rows, filtered_records):
    """
    Write header on the given sheet of the XLSX file
    :param workbook: the XLSX file workbook
    :param worksheet: th sheet
    :param headers: headers values
    :param cell_len: length of each cell of the header
    :param rows: (list) rows into write the header
    :param filtered_records: (bool) is there filtered records
    """
    for row in rows:
        merge_format = workbook.add_format({'bold': True})
        worksheet.write(XLSX_COLS[1] + str(row + 1) + ":" + XLSX_COLS[cell_len] + str(row + 1), headers[0], merge_format)
    i = 0
    for header in headers[1:]:
        if i < len(headers) - 2 or not filtered_records:
            merge_format = workbook.add_format({'align': 'center'})
        else:
            merge_format = workbook.add_format({'align': 'center', 'bg_color': COLOR_COL_FILTER})
        for row in rows:
            worksheet.merge_range(XLSX_COLS[2 + i * cell_len] + str(row) + ":" + XLSX_COLS[1 + cell_len + i * cell_len] + str(row),
                                  header, merge_format)
        i += 1


def create_xls_document(output, genotypes, headers, filtered_records, nb_records, nb_inds, cells, cells_gt, cells_gq,
                        max_col_len):
    """
    Create the XLSX file
    :param output: output directory
    :param genotypes: genotypes file
    :param headers: headers of each sheet
    :param filtered_records: (bool) has filtered records
    :param nb_records: number of records
    :param nb_inds: number of individuals
    :param cells: cells for first sheet (SV description)
    :param cells_gt: cells for second sheet (genotypes)
    :param cells_gq: cells for third sheet (genotype quality)
    :param max_col_len: max content length for each column
    """

    try:
        import xlsxwriter
    except ImportError:
        print("\nWARN: Excel file not built: xlsxwriter python module not installed")
        return False

    with xlsxwriter.Workbook(output + ".xlsx") as workbook:

        #################################
        # First sheet (SV description): #
        #################################

        worksheet = workbook.add_worksheet("SVs")
        write_headers(workbook, worksheet, headers, 3, [1, 1+nb_records+3], filtered_records)

        # Body:
        for cell_id, cell_content in cells.items():
            cell_format = workbook.add_format(cell_content["format"])
            worksheet.write(cell_id, cell_content["text"], cell_format)

        # Resize columns:
        for col, max_len in max_col_len.items():
            worksheet.set_column(col, col, max_len+1)

        if genotypes:

            #############################
            # Second sheet (Genotypes): #
            #############################

            worksheet_gt = workbook.add_worksheet("Genotypes")
            write_headers(workbook, worksheet_gt, headers, nb_inds, [1], filtered_records)

            # Body:
            for cell_id, cell_content in cells_gt.items():
                cell_format = workbook.add_format(cell_content["format"])
                worksheet_gt.write(cell_id, cell_content["text"], cell_format)
                worksheet_gt.freeze_panes(0, 2+nb_inds)

            # Resize columns:
            worksheet_gt.set_column(0, 0, max_col_len[0]+1)

            ####################################
            # Third sheet (Genotypes quality): #
            ####################################

            worksheet_gq = workbook.add_worksheet("Gt quality")
            write_headers(workbook, worksheet_gq, headers, nb_inds, [1], filtered_records)

            # Body
            for cell_id, cell_content in cells_gq.items():
                cell_format = workbook.add_format(cell_content["format"])
                worksheet_gq.write(cell_id, cell_content["text"], cell_format)
                worksheet_gq.freeze_panes(0, 2+nb_inds)

            # Resize columns:
            worksheet_gq.set_column(0, 0, max_col_len[0]+1)
    return True


def create_tsv_file(filename: str, headers: list, cells: dict, nb_tools: int, nb_per_tool: int, records_range: ()):
    """
    Create tabulated separated values file
    :param filename: filename of the file
    :param headers: headers of each sheet
    :param cells: cells of the table to save
    :param nb_tools: number of tools
    :param nb_per_tool: number per tools
    :param records_range: range of records to treat {tuple(2)}
    :return:
    """
    # Init rows:
    head = ["", headers[0]]
    top_headers = {}
    h = 2
    for header in headers[1:]:
        # Define top headers to each column:
        for i in range(0, nb_per_tool):
            top_headers[h] = header
            head.append("")
            h += 1
    rows = [head]
    for i in range(0, records_range[1]-records_range[0]):
        rows.append(["" for x in range(0, (nb_tools * nb_per_tool) + 2)])

    # Fill content:
    for id_cell, cell in cells.items():
        id_m = re.match(r"^([A-Z]+)(\d+)$", id_cell)
        col = XLSX_COLS.index(id_m.group(1))
        row = int(id_m.group(2))
        if records_range[0] <= row <= records_range[1]:
            r = row - records_range[0]
            if r == 0 and col > 0:
                if col > 1:
                    rows[r][col] = top_headers[col].replace(" ", "_") + "__" + cell["text"]
                else:
                    rows[r][col] = headers[0]
            else:
                rows[r][col] = str(cell["text"])

    # List as text:
    for r in range(0, len(rows)):
        rows[r] = "\t".join(rows[r])
    tsv = "\n".join(rows)
    with open(filename, "w") as tsv_file:
        tsv_file.write(tsv)


def search_vcf_files(my_folder):
    """
    Search vcf files recursively (can be replaced by glob, but required python >= 3.5
    (3.4 version in genotoul plateform)
    :param my_folder: folder into search files
    :return: list of vcf files, with absolute paths
    """
    vcf_files = []
    for item in os.listdir(my_folder):
        item_file = os.path.join(my_folder, item)
        if os.path.isfile(item_file) and (item_file.endswith(".vcf") or item_file.endswith(".vcf.gz")):
            vcf_files.append(os.path.abspath(item_file))
        elif os.path.isdir(item_file):
            vcf_files += search_vcf_files(item_file)
    return vcf_files


def print_results(nb_records, orphans, with_xlsx, output, do_genotype):
    """
    Print list of outputs
    :param nb_records: number of records {int}
    :param orphans: sv found in tools but not present in real data {dict}
    :param with_xlsx: build xlsx file {bool}
    :param output: output prefix {str}
    :param do_genotype: do the genotype {bool}
    """
    print("")
    print("###########")
    print("# RESULTS #")
    print("###########")
    print("")
    print(str(nb_records) + " Results found")
    print(str(orphans) + " False Positive")
    print("")
    if with_xlsx:
        print("Results saved in :\n\t- " + output + ".xlsx")
    else:
        print("Results:")
    print("")
    print("TSV files:")
    print("\t- " + output + "_sv_per_tools.tsv")
    print("\t- " + output + "_sv_diffs_per_tools.tsv")
    if do_genotype:
        print("\t- " + output + "_sv_genotypes_per_tools.tsv")
        print("\t- " + output + "_sv_genotypes_quality_per_tools.tsv")
    print("")


def fill_cells_no_tools(cells, cells_gt, cells_gq, i, j, g, nb_records, nb_inds, do_genotype):
    """
    Fill cells when a tool does not detect a SV
    :param cells: cells definition
    :param cells_gt: cells definition for genotypes
    :param cells_gq: cells definition for genotypes quality
    :param i: row index
    :param j: column index
    :param g: column index for genotypes and genotypes quality tables
    :param nb_records: total number of records
    :param nb_inds: total number of individuals
    :param do_genotype: do the genotypes
    :return: cells, completed
    """
    for k in range(0, 3):
        # noinspection PyUnresolvedReferences
        cells[XLSX_COLS[j + k] + str(i)] = {"text": "", "format": {"bg_color": COLOR_NOT_FOUND}}
        cells[XLSX_COLS[j + k] + str(i + nb_records + 3)] = {"text": "",
                                                             "format": {"bg_color": COLOR_NOT_FOUND}}

        # Genotype:
        if do_genotype:
            for gt in range(0, nb_inds):
                # noinspection PyUnresolvedReferences
                cells_gt[XLSX_COLS[g + gt] + str(i)] = cells_gq[XLSX_COLS[g + gt] + str(i)] = \
                    {"text": "", "format": {"bg_color": COLOR_NOT_FOUND_2}}
    return cells, cells_gt, cells_gq


def apply_style_of_filter_cells(cells, cells_gt, cells_gq, i, is_kept, nb_records, nb_inds, nb_tools, filtered_records,
                   do_genotype, rec_id):
    """
    Apply style of cells
    :param cells: cells of the default table {dict}
    :param cells_gt: cells of the genotypes table {dict}
    :param cells_gq: cells of the genotypes quality table {dict}
    :param i: row index {int}
    :param is_kept: is the variant kept after filtering {bool}
    :param nb_records:  number of records {int}
    :param nb_inds: number of individuals {int}
    :param nb_tools: number of tools {int}
    :param filtered_records: file containing filtered records {str}
    :param do_genotype: do the genotype {bool}
    :param rec_id: id of the record {str}
    :return:
    """
    if is_kept:
        # SV is kept: color bg in green in the corresponding tool
        cells[XLSX_COLS[0] + str(i)]["format"]["bg_color"] = \
            cells[XLSX_COLS[0] + str(i + nb_records + 3)]["format"]["bg_color"] = COLOR_IS_KEPT
    elif filtered_records is not None:
        # SV does not pass the filter: color bg in red in the filter column
        cells[XLSX_COLS[2 + ((nb_tools + 1) * 3)] + str(i)] = {"text": "", "format": {"bg_color": COLOR_NOT_FOUND}}
        cells[XLSX_COLS[2 + ((nb_tools + 1) * 3) + 1] + str(i)] = {"text": "",
                                                                   "format": {"bg_color": COLOR_NOT_FOUND}}
        cells[XLSX_COLS[2 + ((nb_tools + 1) * 3) + 2] + str(i)] = {"text": "",
                                                                   "format": {"bg_color": COLOR_NOT_FOUND}}
        cells[XLSX_COLS[2 + ((nb_tools + 1) * 3)] + str(i + nb_records + 3)] = {"text": "",
                                                                                "format": {
                                                                                    "bg_color": COLOR_NOT_FOUND}}
        cells[XLSX_COLS[2 + ((nb_tools + 1) * 3) + 1] + str(i + nb_records + 3)] = {
            "text": "", "format": {"bg_color": COLOR_NOT_FOUND}}
        cells[XLSX_COLS[2 + ((nb_tools + 1) * 3) + 2] + str(i + nb_records + 3)] = {
            "text": "", "format": {"bg_color": COLOR_NOT_FOUND}}

        # Genotype:
        if do_genotype:
            # Color in gray in the filter column
            for gt in range(0, nb_inds):
                cells_gt[XLSX_COLS[2 + ((nb_tools + 1) * nb_inds) + gt] + str(i)] = \
                    cells_gq[XLSX_COLS[2 + ((nb_tools + 1) * nb_inds) + gt] + str(i)] = {"text": "", "format":
                                                                                         {"bg_color": COLOR_NOT_FOUND_2}
                                                                                         }

    # False positives (orphans) in orange:
    if re.match(r"^orphan_\d+$", rec_id):
        cells[XLSX_COLS[0] + str(i)]["format"]["bg_color"] = \
            cells[XLSX_COLS[0] + str(i + nb_records + 3)]["format"]["bg_color"] = COLOR_FALSE_POSITIVE

    return cells, cells_gt, cells_gq


def build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells, cells_gt, cells_gq, max_col_len,
                     nb_tools, do_genotype, haploid, filtered_records):
    i = 3
    for rec_id in rec_keys:
        record = records[rec_id]
        my_start = record["start"]
        my_end = record["end"]
        my_length = record["length"]
        my_genotypes = record["genotypes"]

        # Real data of the simulation:
        cells, cells_gt, cells_gq, max_col_len = fill_real_data(i, cells, cells_gt, cells_gq, max_col_len, record,
                                                                rec_id, nb_records, record["chromosome"])

        j = 5
        g = nb_inds + 2
        is_kept = False
        for tool in tools:
            if tool in record["tools"]:

                ###########################
                # IF TOOL DETECTS THE SV: #
                ###########################

                if record["tools"][tool]["filtered"]:

                    ######################
                    # SET FILTERED DATA: #
                    ######################

                    is_kept = True
                    sv_format = {"bg_color": COLOR_IS_KEPT}

                    # SV data (sheet 1):
                    cells, max_col_len = fill_tool_data(i, 2 + ((nb_tools + 1) * 3), cells, max_col_len,
                                                        record["tools"][tool], nb_records, my_start, my_end, my_length,
                                                        {"bg_color": COLOR_COL_FILTER})

                    if do_genotype:
                        # Genotype (sheets 2&3):
                        cells_gt, cells_gq = fill_genotypes_data(i, 2 + ((nb_tools + 1) * nb_inds), cells_gt, cells_gq,
                                                                 record["tools"][tool], my_genotypes, haploid)
                else:
                    sv_format = {}

                #################
                # SET TOOL DATA #
                #################

                # SV data (sheet 1):
                cells, max_col_len = fill_tool_data(i, j, cells, max_col_len,
                                                    record["tools"][tool], nb_records, my_start, my_end, my_length,
                                                    sv_format)

                if do_genotype:
                    # Genotype (sheets 2&3):
                    cells_gt, cells_gq = fill_genotypes_data(i, g, cells_gt, cells_gq,
                                                             record["tools"][tool], my_genotypes, haploid)
            else:

                ###############################
                # TOOL DOES NOT DETECT THE SV #
                ###############################
                cells, cells_gt, cells_gq = fill_cells_no_tools(cells, cells_gt, cells_gq, i, j, g, nb_records, nb_inds,
                                                                do_genotype)

            j += 3
            g += nb_inds

        ###############################################################################
        # Until we have filled all tools, check if the record is kept after filtering: #
        ###############################################################################
        cells, cells_gt, cells_gq = apply_style_of_filter_cells(cells, cells_gt, cells_gq, i, is_kept, nb_records,
                                                                nb_inds, nb_tools, filtered_records, do_genotype,
                                                                rec_id)

        i += 1

    return cells, cells_gt, cells_gq, max_col_len


def build_xlsx_cols():
    for alp in ALPHABET:
        for j in ALPHABET:
            XLSX_COLS.append(alp + j)


def init(output, vcf_folder, true_vcf, filtered_vcfs=None, genotypes_files=None, overlap_cutoff=0.5,
         left_precision=sys.maxsize, right_precision=sys.maxsize, no_xls=False, haploid=False):
    build_xlsx_cols()

    genotypes = {}
    gt_quality = {}

    nb_inds = 0

    if genotypes_files:
        genotypes, gt_quality, nb_inds = get_genotypes(genotypes_files, true_vcf)

    do_genotype = genotypes_files is not None

    filenames = search_vcf_files(vcf_folder)

    true_ones = true_vcf

    # Reading all the vcf files
    sv_set = []
    for infile in filenames:
        eprint(" Reading file %s" % infile)
        try:
            sv_set += read_vcf_file(infile)[0]
        except:
            print("Ignoreing file %s" % infile)

    eprint(" Reading file %s" % true_ones)
    sv_set_to, true_ones_records = read_vcf_file(true_ones)
    sv_set += sv_set_to
    filtered_records = []

    if filtered_vcfs:
        for filtered_vcf in filtered_vcfs:
            eprint(" Reading file %s" % filtered_vcf)
            filtered_records += read_vcf_file(filtered_vcf)[1]

    # Compute connected components:
    eprint("Computing Connected components")
    construct_overlap_graph(sv_set, overlap_cutoff, left_precision, right_precision)

    # Build records:
    records, tools, orphans = build_records(genotypes, sv_set, true_ones_records, filtered_records, gt_quality)

    nb_records = len(records)

    #################################################
    # Define cells of each sheet of the excel file: #
    #################################################

    tools = sorted(tools)
    nb_tools = len(tools)

    max_col_len = {}

    ######################
    # BUILD HEADER CELLS #
    ######################

    headers, cells, cells_gt, cells_gq, max_col_len = build_header(tools, filtered_records is not None, nb_records,
                                                                   max_col_len, nb_inds)

    rec_keys = sorted(records.keys(), key=lambda x: (records[x]["chromosome"], svsort(x, records)))

    ####################
    # BUILD BODY CELLS #
    ####################
    cells, cells_gt, cells_gq, max_col_len = build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells,
                                                              cells_gt, cells_gq, max_col_len, nb_tools, do_genotype,
                                                              haploid, filtered_records)

    # Create document:
    with_xlsx = False
    if not no_xls:
        with_xlsx = create_xls_document(output, genotypes, headers, filtered_records is not None, nb_records, nb_inds,
                                        cells, cells_gt, cells_gq, max_col_len)

    # Create CSV files:
    create_tsv_file(output + "_sv_per_tools.tsv", headers, cells,
                    nb_tools + (2 if filtered_records is not None else 1),
                    3, (2, nb_records + 2))
    create_tsv_file(output + "_sv_diffs_per_tools.tsv", headers, cells,
                    nb_tools + (2 if filtered_records is not None else 1),
                    3, (2 + nb_records + 3, nb_records * 2 + 5))
    if do_genotype:
        create_tsv_file(output + "_sv_genotypes_per_tools.tsv", headers, cells_gt,
                        nb_tools + (2 if filtered_records is not None else 1),
                        nb_inds, (2, nb_records + 2))
        create_tsv_file(output + "_sv_genotypes_quality_per_tools.tsv", headers, cells_gq,
                        nb_tools + (2 if filtered_records is not None else 1),
                        nb_inds, (2, nb_records + 2))

    print_results(nb_records, orphans, with_xlsx, output, do_genotype)


def main():
    """
    Main function
    """
    # parse the command line args
    args = get_args()
    init(args.output, args.vcf, args.true_vcf, args.filtered_vcf, args.genotypes, args.overlap_cutoff,
         args.left_precision, args.right_precision, args.no_xls, args.haploid)


# initialize the script
if __name__ == '__main__':
    sys.exit(main())