Skip to content
Snippets Groups Projects
build_results.py 32.2 KiB
Newer Older
#! /usr/bin/env python3

"""mergeSV  : merge SV on the basis of their intervals

The merging is done by identifiying connected components in a graph.

The graph is constructing taking the SV a node and an edge connects two nodes
if the conresponding intervals exhibits a reciprocal overlap of a least R0%.

Additional constraints on the precision of breakpoints (left and right) can be specified.

Each connected component is termed a CNVR (Copy Number Variation Region)
and is associated to the corresponding CNVs.

"""

import argparse, sys
import glob
import vcf
import string
import re

import xlsxwriter

import sys
prg_path = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(prg_path, "lib"))

from svreader.vcfwrapper import VCFReader
from svinterval import construct_overlap_graph, connected_components
alphabet = list(string.ascii_uppercase)
xlsx_cols = alphabet.copy()
for alp in alphabet:
    for j in alphabet:
        xlsx_cols.append(alp + j)

color_not_found = "#FE2E2E"
color_not_found_2 = "#dddddd"
color_col_filter = "#BEF781"
color_is_kept = "#81F781"
color_false_positive = "#FE642E"
color_wrong_gt = "#B40404"


def get_args():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="\
Build Results \n \
description: Build results of the simulated data detection")
    parser.add_argument('-v', '--vcf', type=str, required=True, help='folder containing all vcf results files')
    parser.add_argument('-t', '--true-vcf', type=str, required=True, help='VCF file containing the simulated deletions')
    parser.add_argument('-f', '--filtered-vcf', type=str, required=False,
                        help='VCF file containing the filtered results')
    parser.add_argument('-g', '--genotypes', type=str, help="VCF file containing genotypes")
    parser.add_argument('--overlap_cutoff',  type=float, default=0.5, help='cutoff for reciprocal overlap')
    parser.add_argument('--left_precision',  type=int, default=-1, help='left breakpoint precision')
    parser.add_argument('--right_precision',  type=int, default=-1, help='right breakpoint precision')
    parser.add_argument('-o', '--output', type=str, default="results", help='output prefix')
    parser.add_argument('--no-xls', action='store_const', const=True, default=False, help='Do not build Excel file')
    parser.add_argument('--haploid', action='store_const', const=True, default=False, help='The organism is haploid')
    # parse the arguments
    args = parser.parse_args()

    if args.left_precision == -1:
        args.left_precision = sys.maxsize

    if args.right_precision == -1:
        args.right_precision = sys.maxsize

    # send back the user input
    return args

# A small wrapper to print to stderr
def eprint(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)


def to_vcf_record(cnv):

        info = { "SVLEN" : cnv.length(),
                 "SVTYPE" : "DEL",
                 "END" : cnv.end,
                 "NUM_CNV" : cnv.NumCNV(),
                 "CNV":",".join(cnv.CNV()),
                 "CALLERS":",".join(cnv.Callers()),
                 "NUM_CALLERS":cnv.NumCallers(),
                 "CIPOS":cnv.cipos(),
                 "CIEND":cnv.ciend(),
                 "PRECISION":",".join(map(str,cnv.precision())),
                 "CALLERSVSAMPLES":",".join(cnv.CallersVsamples())
                  }
        alt = [vcf.model._SV("DEL")]

        if not cnv.IsPrecise():
            info['IMPRECISE'] = True
        else:
            info['REPR_CNV'] = cnv.repr_cnv()


        vcf_record = vcf.model._Record(cnv.chrom,
                                       cnv.start,
                                       cnv.name,
                                       "N",
                                       alt,
                                       ".",
                                       ".",
                                       info,
                                       "",
                                       [0],
                                       [])
        return vcf_record

def passed_variant(record):
        """Did this variant pass?"""
        return record.filter is None or len(record.filter) == 0 or "PASS" in record.filter

# --------------------------------------
# main function


def read_vcf_file(infile):
    SVSet=[]
    ids = []
    for record in VCFReader(infile):
        if not passed_variant(record):
            continue
        SVSet.append(record)
        ids.append(record.id)
    return SVSet, ids

Thomas Faraut's avatar
Thomas Faraut committed
    
def svsort(sv, records):
    """
    Function to sort regions
    """
    if records[sv]["start"] != "":
        start =  str(records[sv]["start"])
    else:
        first_tool = list(records[sv]["tools"].keys())[0]
        start = str(records[sv]["tools"][first_tool]["start"])
Thomas Faraut's avatar
Thomas Faraut committed
    return start    
    

def get_gt(geno, true_gt):
    """
    Get the genotype: consider 1/0 and 0/1 as equivalent (we can't identify a specific chromosome)
    We want geno equal to true_gt in these cases
    :param geno: the genotype
    :param true_gt: the true genotype
    :return: the genotype
    """
    if true_gt is not None:
        if geno == "1/0" and true_gt == "0/1":
            geno = "0/1"
        elif geno == "0/1" and true_gt == "1/0":
            geno = "1/0"
    return geno


def get_max_len(cell, col, max_col_len):
    """
    Get the max content length of a column
    :param cell: the cell content
    :param col: the column number
    :param max_col_len: current max content length for each column
    :return: the new max content length for the given column
    """
    return max(len(str(cell)), max_col_len[col] if col in max_col_len else 0)


def get_quality_color(quality):
    """
    Returns the color associated to a genotype quality value
    :param quality: the quality value
    :return: the associated color
    """
    color_very_low_quality = "#FE2E2E"
    color_low_quality = "#FE9A2E"
    color_medium_quality = "#FFFF00"
    color_high_quality = "#81F781"
    if quality > 60:
        return color_high_quality
    elif quality > 40:
        return color_medium_quality
    elif quality > 20:
        return color_low_quality
    else:
        return color_very_low_quality


def get_genotypes(genotypes_file, true_vcf_file):
    """
    Get genotype of each individual for each SV
    :param genotypes_file: VCF file containing genotypes
    :param true_vcf_file: VCF file containing real data
    :return: genotypes of each individual for each SV, quality of each genotype, number of individuals
    """
    # Genotypes:
    genotypes_raw = os.popen("zcat " + genotypes_file + " " + true_vcf_file + " | " +
                             os.path.join(prg_path, "vawk") + " '{ print $3,S$*$GT}'").read().split("\n")
    for gt in genotypes_raw:
        if gt != "":
            gt_l = gt.split("\t")
            genotypes[gt_l[0]] = gt_l[1:]
            nb_inds = len(gt_l[1:])

    gt_quality_raw = os.popen("zcat " + genotypes_file + " | " +
                              os.path.join(prg_path, "vawk") + " '{ print $3,S$*$GQ}'").read().split("\n")
    for gq in gt_quality_raw:
        if gq != "":
            gq_l = gq.split("\t")
            gt_quality[gq_l[0]] = gq_l[1:]

    return genotypes, gt_quality, nb_inds


def build_records(genotypes, SVSet, true_ones_records, filtered_records, gt_quality):
    """
    Build records for each SV
    :param genotypes: list of genotypes of each individual for each SV
    :param SVSet: set of all SVs
    :param true_ones_records: records of the real data
    :param filtered_records: records of the filtered data
    :param gt_quality: list of qualities of each genotype of individuals for each SV
    :return: records dict, tools set, list of orphans records (associated to None real data)
    """
    number = 1
    records = {}
    tools = set()
    orphans = 0
    for components in connected_components(SVSet):
        names = []
        results = {}
        true_one = None

        for node in components:
            names.append(node.id)
            if node.id in true_ones_records:
                true_one = node.id
                records[true_one] = {
                    "start": node.start,
                    "end": node.end,
                    "length": int(node.end) - int(node.start),
                    "tools": {},
                    "genotypes": genotypes[true_one] if true_one in genotypes else None,
                    "chromosome": node.chrom
                }
            else:
                tool_name = node.id.split("_")[0]
                results[tool_name] = {
                    "start": node.start,
                    "end": node.end,
                    "length": int(node.end) - int(node.start),
                    "filtered": (node.id in filtered_records) if filtered_records is not None else False,
                    "genotypes": genotypes[node.id] if node.id in genotypes else None,
                    "qualities": gt_quality[node.id] if node.id in gt_quality else None
                }
                tools.add(tool_name)

        if true_one is not None:
            records[true_one]["tools"] = results
        else:
            orphans += 1
            records["orphan_" + str(orphans)] = {
                "start": "",
                "end": "",
                "length": "",
                "tools": results,
                "orphan": True,
                "genotypes": None
            }

        names.sort()

        eprint("Group #%i: %s" % (number, ", ".join(names)))
        number += 1

    return records, tools, orphans


def build_header(tools, cells, cells_gt, cells_gq, filtered_records, nb_records, max_col_len, nb_inds):
    """
    Build tools headers, and header cells for each tool
    :param tools: list of tools
    :param cells: cells of the first sheet (sv description)
    :param cells_gt: cells of the second sheet (genotype)
    :param cells_gq: cells of the third sheet (genotype quality)
    :param filtered_records: (bool) is there filtered records
    :param nb_records: number of records
    :param max_col_len: max content length for each column
    :param nb_inds: number of individuals
    :return: headers (list of tools + read and filtered data) ; cells, cells_gt, cells_gq and max_col_len updated
    """
    i = 2
    j = 2
    headers = ["Chr"]
    for tool in ["Real data"] + tools + (["Filtered results"] if filtered_records else []):
        headers.append(tool)
        l_format = {"bold": True}
        if tool == "Filtered results":
            l_format["bg_color"] = color_col_filter
        cells[xlsx_cols[i] + "2"] = cells[xlsx_cols[i] + str(2 + nb_records + 3)] = {"text": "Start",
                                                                                     "format": l_format}
        max_col_len[i] = get_max_len("Start", i, max_col_len)
        cells[xlsx_cols[i + 1] + "2"] = cells[xlsx_cols[i + 1] + str(2 + nb_records + 3)] = {"text": "End",
                                                                                             "format": l_format}
        max_col_len[i + 1] = get_max_len("End", i + 1, max_col_len)
        cells[xlsx_cols[i + 2] + "2"] = cells[xlsx_cols[i + 2] + str(2 + nb_records + 3)] = {"text": "Length",
                                                                                             "format": l_format}
        max_col_len[i + 2] = get_max_len("Length", i + 2, max_col_len)

        # Genotypes:
        for k in range(0, nb_inds):
            cells_gt[xlsx_cols[j + k] + "2"] = cells_gq[xlsx_cols[j + k] + "2"] = \
                {"text": "INDIV_" + str(k + 1), "format": {"bold": True}}

        i += 3
        j += nb_inds

    return headers, cells, cells_gt, cells_gq, max_col_len


def fill_real_data(row, cells, cells_gt, cells_gq, max_col_len, record, rec_id, nb_records, chromosome):
    """
    Fill cells of the first data column (real data simulated)
    :param cells: cells of the first sheet (sv description)
    :param cells_gt: cells of the second sheet (genotype)
    :param cells_gq: cells of the third sheet (genotype quality)
    :param max_col_len: max text length for each column
    :param record: data of the record
    :param nb_records: number of records (total)
    :return: cells, cells_gt, cells_gq and max_col_len updated
    """
    # SV ID:
    cells[xlsx_cols[0] + str(row)] = cells[xlsx_cols[0] + str(row + nb_records + 3)] = {"text": rec_id, "format": {}}
    max_col_len[0] = get_max_len(rec_id, 0, max_col_len)
    cells_gt[xlsx_cols[0] + str(row)] = cells_gq[xlsx_cols[0] + str(row)] = {"text": rec_id, "format": {}}

    # Chromosome:
    cells[xlsx_cols[1] + str(row)] = cells[xlsx_cols[1] + str(row + nb_records + 3)] = {"text": chromosome,
                                                                                        "format": {}}
    cells_gt[xlsx_cols[1] + str(row)] = cells_gq[xlsx_cols[1] + str(row)] = {"text": chromosome, "format": {}}

    cells[xlsx_cols[2] + str(row)] = cells[xlsx_cols[2] + str(row + nb_records + 3)] = {"text": record["start"],
    max_col_len[2] = get_max_len(record["start"], 2, max_col_len)
    cells[xlsx_cols[3] + str(row)] = cells[xlsx_cols[3] + str(row + nb_records + 3)] = {"text": record["end"],
    max_col_len[3] = get_max_len(record["end"], 3, max_col_len)
    cells[xlsx_cols[4] + str(row)] = cells[xlsx_cols[4] + str(row + nb_records + 3)] = {"text": record["length"],
    max_col_len[4] = get_max_len(record["length"], 4, max_col_len)

    # GENOTYPES:
    if record["genotypes"] is not None:
        for gt in range(0, len(record["genotypes"])):
            cells_gt[xlsx_cols[2 + gt] + str(row)] = cells_gq[xlsx_cols[2 + 0 + gt] + str(row)] = \
                {"text": record["genotypes"][gt], "format": {}}

    return cells, cells_gt, cells_gq, max_col_len


def fill_tool_data(row, col, cells, max_col_len, record, nb_records, my_start, my_end, my_length,
                   sv_format=None):
    """
    Fill cells for a tool and a record
    :param row: row position
    :param col: columns position
    :param cells: cells of the first sheet (sv description)
    :param max_col_len: max text length for each column
    :param record: data of the record for a tool
    :param nb_records: number of records (total)
    :param my_start: real start of the SV
    :param my_end: real end of the SV
    :param my_length: real length of the SV
    :param sv_format: format for the cell
    :return: cells and max_col_len updated
    """

    if sv_format is None:
        sv_format = {}

    #############
    # RAW DATA: #
    #############

    # START:
    cells[xlsx_cols[col] + str(row)] = {"text": record["start"], "format": sv_format}
    max_col_len[col] = get_max_len(record["start"], col, max_col_len)

    # END:
    cells[xlsx_cols[col + 1] + str(row)] = {"text": record["end"], "format": sv_format}
    max_col_len[col + 1] = get_max_len(record["end"], col + 1, max_col_len)

    # LENGTH:
    cells[xlsx_cols[col + 2] + str(row)] = {"text": record["length"], "format": sv_format}
    max_col_len[col + 2] = get_max_len(record["length"], col + 2, max_col_len)

    ###########
    #  DIFFS: #
    ###########

    if my_start != "":
        start = record["start"] - my_start
        end = record["end"] - my_end
        length = record["length"] - my_length
    else:
        start = end = length = "NA"

    # START:
    cells[xlsx_cols[col] + str(row + nb_records + 3)] = {"text": start, "format": sv_format}

    # END:
    cells[xlsx_cols[col + 1] + str(row + nb_records + 3)] = {"text": end, "format": sv_format}

    # LENGTH:
    cells[xlsx_cols[col + 2] + str(row + nb_records + 3)] = {"text": length,
                                                             "format": sv_format}

    return cells, max_col_len


def fill_genotypes_data(row, col, cells_gt, cells_gq, record, my_genotypes, haploid=False):
    """
    Fill cells for a tool and a record (genotype/genotype quality parts)
    :param row: row position
    :param col: column position
    :param cells_gt: cells for genotype sheet (2)
    :param cells_gq: cells for the genotype quality sheet (3)
    :param record: data of the record for a tool
    :param my_genotypes: real genotypes for each individual
    :return: cells_gt and cells_gq updated
    """

    the_genotypes = record["genotypes"]
    for gt in range(0, len(the_genotypes)):
        true_gt = my_genotypes[gt] if my_genotypes is not None else ""
        geno = get_gt(the_genotypes[gt], true_gt)

        # Format:
        gt_format = {"bg_color": get_quality_color(int(record["qualities"][gt]))}
        if (not haploid and geno != true_gt) or \
                (haploid and ((true_gt == "1" and geno != "1/1") or (true_gt == "0" and geno != "0/0"))):
            gt_format["font_color"] = "#ff0000"

        # Genotype:
        cells_gt[xlsx_cols[col + gt] + str(row)] = {"text": geno, "format": gt_format}
        # Quality:
        cells_gq[xlsx_cols[col + gt] + str(row)] = {"text": int(record["qualities"][gt]), "format": gt_format}

    return cells_gt, cells_gq


def write_headers(workbook, worksheet, headers, cell_len, rows, filtered_records):
    """
    Write header on the given sheet of the XLSX file
    :param workbook: the XSLX file workbook
    :param worksheet: th sheet
    :param headers: headers values
    :param cell_len: length of each cell of the header
    :param rows: (list) rows into write the header
    :param filtered_records: (bool) is there filtered records
    """
    for row in rows:
        merge_format = workbook.add_format({'bold': True})
        worksheet.write(xlsx_cols[1] + str(row+1) + ":" + xlsx_cols[cell_len] + str(row+1), headers[0], merge_format)
    for header in headers[1:]:
        if i < len(headers) - 2 or not filtered_records:
            merge_format = workbook.add_format({'align': 'center'})
        else:
            merge_format = workbook.add_format({'align': 'center', 'bg_color': color_col_filter})
        for row in rows:
            worksheet.merge_range(xlsx_cols[2 + i * cell_len] + str(row) + ":" + xlsx_cols[1 + cell_len + i * cell_len] + str(row),
                                  header, merge_format)
        i += 1


def create_xls_document(args, headers, filtered_records, nb_records, nb_inds, cells, cells_gt, cells_gq, max_col_len):
    """
    Create the XLSX file
    :param args: arguments given by the user (argparse)
    :param headers: headers of each sheet
    :param filtered_records: (bool) has filtered records
    :param nb_records: number of records
    :param nb_inds: number of individuals
    :param cells: cells for first sheet (SV description)
    :param cells_gt: cells for second sheet (genotypes)
    :param cells_gq: cells for third sheet (genotype quality)
    :param max_col_len: max content length for each column
    """
    with xlsxwriter.Workbook(args.output + ".xslx") as workbook:

        #################################
        # First sheet (SV description): #
        #################################

        worksheet = workbook.add_worksheet("SVs")
        write_headers(workbook, worksheet, headers, 3, [1, 1+nb_records+3], filtered_records)

        # Body:
        for cell_id, cell_content in cells.items():
            cell_format = workbook.add_format(cell_content["format"])
            worksheet.write(cell_id, cell_content["text"], cell_format)

        # Resize columns:
        for col, max_len in max_col_len.items():
            worksheet.set_column(col, col, max_len+1)

        if args.genotypes:

            #############################
            # Second sheet (Genotypes): #
            #############################

            worksheet_gt = workbook.add_worksheet("Genotypes")
            write_headers(workbook, worksheet_gt, headers, nb_inds, [1], filtered_records)

            # Body:
            for cell_id, cell_content in cells_gt.items():
                cell_format = workbook.add_format(cell_content["format"])
                worksheet_gt.write(cell_id, cell_content["text"], cell_format)
                worksheet_gt.freeze_panes(0, 2+nb_inds)

            # Resize columns:
            worksheet_gt.set_column(0, 0, max_col_len[0]+1)

            ####################################
            # Third sheet (Genotypes quality): #
            ####################################

            worksheet_gq = workbook.add_worksheet("Gt quality")
            write_headers(workbook, worksheet_gq, headers, nb_inds, [1], filtered_records)

            # Body
            for cell_id, cell_content in cells_gq.items():
                cell_format = workbook.add_format(cell_content["format"])
                worksheet_gq.write(cell_id, cell_content["text"], cell_format)
                worksheet_gq.freeze_panes(0, 2+nb_inds)

            # Resize columns:
            worksheet_gq.set_column(0, 0, max_col_len[0]+1)


def create_tsv_file(filename: str, headers: list, cells: dict, nb_tools: int, nb_per_tool: int, records_range: ()):
    # Init rows:
    head = ["", headers[0]]
    top_headers = {}
    h = 2
    for header in headers[1:]:
        # Define top headers to each column:
        for i in range(0, nb_per_tool):
            top_headers[h] = header
            head.append("")
            h += 1
    rows = [head]
    for i in range(0, records_range[1]-records_range[0]):
        rows.append(["" for x in range(0, (nb_tools * nb_per_tool) + 2)])

    # Fill content:
    for id_cell, cell in cells.items():
        id_m = re.match(r"^([A-Z]+)(\d+)$", id_cell)
        col = xlsx_cols.index(id_m.group(1))
        row = int(id_m.group(2))
        if records_range[0] <= row <= records_range[1]:
            r = row - records_range[0]
            if r == 0 and col > 0:
                if col > 1:
                    rows[r][col] = top_headers[col].replace(" ", "_") + "__" + cell["text"]
                else:
                    rows[r][col] = headers[0]
            else:
                rows[r][col] = str(cell["text"])

    # List as text:
    for r in range(0, len(rows)):
        rows[r] = "\t".join(rows[r])
    tsv = "\n".join(rows)
    with open(filename, "w") as tsv_file:
        tsv_file.write(tsv)



# noinspection PyUnresolvedReferences
def main():
    # parse the command line args
    args = get_args()

    genotypes = {}
    gt_quality = {}

    nb_inds = 0

    if args.genotypes:
        genotypes, gt_quality, nb_inds = get_genotypes(args.genotypes, args.true_vcf)

    overlap_cutoff   = args.overlap_cutoff
    left_precision   = args.left_precision
    right_precision  = args.right_precision

    filenames = []

    filenames += glob.glob(args.vcf + "/**/*.vcf", recursive=True)
    filenames += glob.glob(args.vcf + "/**/*.vcf.gz", recursive=True)

    true_ones = args.true_vcf

    # Reading all the vcf files
    SVSet=[]
    for infile in filenames:
        eprint(" Reading file %s" % (infile))
        try:
            SVSet += read_vcf_file(infile)[0]
        except:
            print("Ignoreing file %s" % (infile))

    eprint(" Reading file %s" % (true_ones))
    SVSet_to, true_ones_records = read_vcf_file(true_ones)
    SVSet += SVSet_to
    filtered_records = None

    if args.filtered_vcf:
        eprint(" Reading file %s" % (args.filtered_vcf))
        filtered_records = read_vcf_file(args.filtered_vcf)[1]

    # Compute connected components:
    eprint("Computing Connected components")
    construct_overlap_graph(SVSet,overlap_cutoff,left_precision,right_precision)

    # Build records:
    records, tools, orphans = build_records(genotypes, SVSet, true_ones_records, filtered_records, gt_quality)

    nb_records = len(records)

    #################################################
    # Define cells of each sheet of the excel file: #
    #################################################

    # First one (SV description)
    cells = {
        "A1": {"text": "RESULTS", "format": {"bg_color": "#ffe856"}},
        "A2": {"text": "Deletion", "format": {"bold": True}},
        "A" + str(1+nb_records+3): {"text": "DIFFS", "format": {"bg_color": "#ffe856"}},
        "A" + str(2+nb_records+3): {"text": "Deletion", "format": {"bold": True}}
    }
    # Second one (genotype)
    cells_gt = {
        "A1": {"text": "GENOTYPES", "format": {"bg_color": "#ffe856"}},
        "A2": {"text": "Deletion", "format": {"bold": True}}
    }

    # Third one (genotype quality)
    cells_gq = {
        "A1": {"text": "GT QUALITY", "format": {"bg_color": "#ffe856"}},
        "A2": {"text": "Deletion", "format": {"bold": True}}
    }

    tools = sorted(tools)
    nb_tools = len(tools)

    ######################
    # BUILD HEADER CELLS #
    ######################
    headers, cells, cells_gt, cells_gq, max_col_len = build_header(tools, cells, cells_gt, cells_gq,
                                                                   filtered_records is not None,
                                                                   nb_records, max_col_len, nb_inds)
Thomas Faraut's avatar
Thomas Faraut committed
    rec_keys = sorted(records.keys(), key=lambda x:(records[x]["chromosome"], svsort(x, records)))
    ####################
    # BUILD BODY CELLS #
    ####################

    for rec_id in rec_keys:
        record = records[rec_id]
        my_start = record["start"]
        my_end = record["end"]
        my_length = record["length"]
        my_genotypes = record["genotypes"]
        # Real data of the simulation:
        cells, cells_gt, cells_gq, max_col_len = fill_real_data(i, cells, cells_gt, cells_gq, max_col_len, record,
                                                                rec_id, nb_records, record["chromosome"])
        j = 5
        g = nb_inds + 2
        is_kept = False
        for tool in tools:
            if tool in record["tools"]:

                ###########################
                # IF TOOL DETECTS THE SV: #
                ###########################

                if record["tools"][tool]["filtered"]:

                    ######################
                    # SET FILTERED DATA: #
                    ######################

                    is_kept = True
                    sv_format = {"bg_color": color_is_kept}
                    # SV data (sheet 1):
                    cells, max_col_len = fill_tool_data(i, 2+((nb_tools+1)*3), cells, max_col_len,
                                                        record["tools"][tool], nb_records, my_start, my_end, my_length,
                                                        {"bg_color": color_col_filter})

                    # Genotype (sheets 2&3):
                    cells_gt, cells_gq = fill_genotypes_data(i, 2 + ((nb_tools + 1) * nb_inds), cells_gt, cells_gq,
                                                             record["tools"][tool], my_genotypes, args.haploid)
                else:
                    sv_format = {}

                #################
                # SET TOOL DATA #
                #################

                # SV data (sheet 1):
                cells, max_col_len = fill_tool_data(i, j, cells, max_col_len,
                                                    record["tools"][tool], nb_records, my_start, my_end, my_length,
                                                    sv_format)
                # Genotype (sheets 2&3):
                cells_gt, cells_gq = fill_genotypes_data(i, g, cells_gt, cells_gq,
                                                         record["tools"][tool], my_genotypes, args.haploid)

                ###############################
                # TOOL DOES NOT DETECT THE SV #
                ###############################

                for k in range(0,3):
                    # noinspection PyUnresolvedReferences
                    cells[xlsx_cols[j + k] + str(i)] = {"text": "", "format": {"bg_color": color_not_found}}
                    cells[xlsx_cols[j + k] + str(i+nb_records+3)] = {"text": "", "format": {"bg_color": color_not_found}}

                    # Genotype:
                    for gt in range(0, nb_inds):
                        # noinspection PyUnresolvedReferences
                        cells_gt[xlsx_cols[g + gt] + str(i)] = cells_gq[xlsx_cols[g + gt] + str(i)] = \
                            {"text": "", "format": {"bg_color": color_not_found_2}}

        ###############################################################################
        # Until we have filled all tools, check if the record is kept after filtering: #
        ###############################################################################

        if is_kept:
            # SV is kept: color bg in green in the corresponding tool
            cells[xlsx_cols[0] + str(i)]["format"]["bg_color"] = \
                cells[xlsx_cols[0] + str(i+nb_records+3)]["format"]["bg_color"] = color_is_kept
        elif filtered_records is not None:
            # SV does not pass the filter: color bg in red in the filter column
            cells[xlsx_cols[2 + ((nb_tools + 1) * 3)] + str(i)] = {"text": "", "format": {"bg_color": color_not_found}}
            cells[xlsx_cols[2 + ((nb_tools + 1) * 3) + 1] + str(i)] = {"text": "", "format": {"bg_color": color_not_found}}
            cells[xlsx_cols[2 + ((nb_tools + 1) * 3) + 2] + str(i)] = {"text": "", "format": {"bg_color": color_not_found}}
            cells[xlsx_cols[2 + ((nb_tools + 1) * 3)] + str(i + nb_records + 3)] = {"text": "",
                                                                                    "format": {"bg_color": color_not_found}}
            cells[xlsx_cols[2 + ((nb_tools + 1) * 3) + 1] + str(i + nb_records + 3)] = {
                "text": "", "format": {"bg_color": color_not_found}}
            cells[xlsx_cols[2 + ((nb_tools + 1) * 3) + 2] + str(i + nb_records + 3)] = {
                "text": "", "format": {"bg_color": color_not_found}}

            # Color in gray in the filter column
            for gt in range(0, nb_inds):
                cells_gt[xlsx_cols[2 + ((nb_tools + 1) * nb_inds) + gt] + str(i)] = \
                    cells_gq[xlsx_cols[2 + ((nb_tools + 1) * nb_inds) + gt] + str(i)] = {"text": "", "format":
                                                                                        {"bg_color": color_not_found_2}}
        # False positives (orphans) in orange:
        if re.match(r"^orphan_\d+$", rec_id):
            cells[xlsx_cols[0] + str(i)]["format"]["bg_color"] = \
                cells[xlsx_cols[0] + str(i+nb_records+3)]["format"]["bg_color"] = color_false_positive

        i += 1

    # Create document:
    if not args.no_xls:
        create_xls_document(args, headers, filtered_records is not None, nb_records, nb_inds, cells, cells_gt, cells_gq,
                            max_col_len)
    # Create CSV files:
    create_tsv_file(args.output + "_sv_per_tools.tsv", headers, cells,
                    nb_tools + (2 if filtered_records is not None else 1),
                    3, (2, nb_records+2))
    create_tsv_file(args.output + "_sv_diffs_per_tools.tsv", headers, cells,
                    nb_tools + (2 if filtered_records is not None else 1),
                    3, (2+nb_records+3, nb_records * 2 + 5))
    create_tsv_file(args.output + "_sv_genotypes_per_tools.tsv", headers, cells_gt,
                    nb_tools + (2 if filtered_records is not None else 1),
                    nb_inds, (2, nb_records + 2))
    create_tsv_file(args.output + "_sv_genotypes_quality_per_tools.tsv", headers, cells_gq,
                    nb_tools + (2 if filtered_records is not None else 1),
                    nb_inds, (2, nb_records + 2))

    print("")
    print("###########")
    print("# RESULTS #")
    print("###########")
    print("")
    print(str(nb_records) + " Results found")
    print(str(orphans) + " False Positive")
    print("")
    if not args.no_xls:
        print("Results saved in :\n\t- " + args.output + ".xslx")
    else:
        print("Results:")
    print("")
    print("TSV files:")
    print("\t- " + args.output + "_sv_per_tools.tsv")
    print("\t- " + args.output + "_sv_diffs_per_tools.tsv")
    print("\t- " + args.output + "_sv_genotypes_per_tools.tsv")
    print("\t- " + args.output + "_sv_genotypes_quality_per_tools.tsv")
    print("")

# initialize the script
if __name__ == '__main__':
    try:
        sys.exit(main())
    except:
        raise