build_xls_results.py

#! /usr/bin/env python3

"""mergeSV  : merge SV on the basis of their intervals

The merging is done by identifiying connected components in a graph.

The graph is constructing taking the SV a node and an edge connects two nodes
if the conresponding intervals exhibits a reciprocal overlap of a least R0%.

Additional constraints on the precision of breakpoints (left and right) can be specified.

Each connected component is termed a CNVR (Copy Number Variation Region)
and is associated to the corresponding CNVs.

"""

import argparse, sys
import glob
import vcf
import string
import os
import re

from svreader.vcfwrapper import VCFReader
from svinterval import construct_overlap_graph, connected_components

import xlsxwriter

prg_path = os.path.dirname(os.path.realpath(__file__))

alphabet = list(string.ascii_uppercase)
xlsx_cols = alphabet.copy()
for i in alphabet:
    for j in alphabet:
        xlsx_cols.append(i + j)


def get_args():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="\
mergeIndividualSV \n \
description: Merge SV based on reciprocal overlap")
    #parser.add_argument('-v', '--vcf', type=str, required=True, help='vcf file(s), comma-separated')
    parser.add_argument('-v', '--vcf', type=str, required=True, help='folder containing all vcf results files')
    parser.add_argument('-t', '--true-vcf', type=str, required=True, help='VCF file containing the simulated deletions')
    parser.add_argument('-f', '--filtered-vcf', type=str, required=False,
                        help='VCF file containing the filtered results')
    parser.add_argument('-g', '--genotypes', type=str, help="VCF file containing genotypes")
    parser.add_argument('--overlap_cutoff',  type=float, default=0.5, help='cutoff for reciprocal overlap')
    parser.add_argument('--left_precision',  type=int, default=-1, help='left breakpoint precision')
    parser.add_argument('--right_precision',  type=int, default=-1, help='right breakpoint precision')
    parser.add_argument('-o', '--output', type=str, default="results.xlsx", help='output Excel file')
    # parse the arguments
    args = parser.parse_args()

    if args.left_precision == -1:
        args.left_precision = sys.maxsize

    if args.right_precision == -1:
        args.right_precision = sys.maxsize

    # send back the user input
    return args

# A small wrapper to print to stderr
def eprint(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)


def to_vcf_record(cnv):

        info = { "SVLEN" : cnv.length(),
                 "SVTYPE" : "DEL",
                 "END" : cnv.end,
                 "NUM_CNV" : cnv.NumCNV(),
                 "CNV":",".join(cnv.CNV()),
                 "CALLERS":",".join(cnv.Callers()),
                 "NUM_CALLERS":cnv.NumCallers(),
                 "CIPOS":cnv.cipos(),
                 "CIEND":cnv.ciend(),
                 "PRECISION":",".join(map(str,cnv.precision())),
                 "CALLERSVSAMPLES":",".join(cnv.CallersVsamples())
                  }
        alt = [vcf.model._SV("DEL")]

        if not cnv.IsPrecise():
            info['IMPRECISE'] = True
        else:
            info['REPR_CNV'] = cnv.repr_cnv()


        vcf_record = vcf.model._Record(cnv.chrom,
                                       cnv.start,
                                       cnv.name,
                                       "N",
                                       alt,
                                       ".",
                                       ".",
                                       info,
                                       "",
                                       [0],
                                       [])
        return vcf_record

def passed_variant(record):
        """Did this variant pass?"""
        return record.filter is None or len(record.filter) == 0 or "PASS" in record.filter

# --------------------------------------
# main function


def read_vcf_file(infile):
    SVSet=[]
    ids = []
    for record in VCFReader(infile):
        if not passed_variant(record):
            continue
        SVSet.append(record)
        ids.append(record.id)
    return SVSet, ids


def svsort(sv, records):
    """
    Function to sort regions
    """
    if records[sv]["start"] != "":
        return int(records[sv]["start"])
    first_tool = list(records[sv]["tools"].keys())[0]
    return int(records[sv]["tools"][first_tool]["start"])


def get_gt(geno, true_gt):
    if true_gt is not None:
        if geno == "1/0" and true_gt == "0/1":
            geno = "0/1"
        elif geno == "0/1" and true_gt == "1/0":
            geno = "1/0"
    return geno


def get_max_len(cell, col, max_col_len):
    return max(len(str(cell)), max_col_len[col] if col in max_col_len else 0)


def main():
    # parse the command line args
    args = get_args()

    genotypes = {}

    nb_inds = 0

    if args.genotypes:
        genotypes_raw = os.popen("zcat genotype_chr_chr12.vcf.gz " + args.true_vcf +  " | " +
                                 os.path.join(prg_path, "vawk") + " '{ print $3,S$*$GT}'").read().split("\n")
        for gt in genotypes_raw:
            if gt != "":
                gt_l = gt.split("\t")
                genotypes[gt_l[0]] = gt_l[1:]
                nb_inds = len(gt_l[1:])

    overlap_cutoff   = args.overlap_cutoff
    left_precision   = args.left_precision
    right_precision  = args.right_precision

    color_not_found = "#FE2E2E"
    color_col_filter = "#BEF781"
    color_is_kept = "#81F781"
    color_false_positive = "#FE642E"

    filenames = []

    filenames += glob.glob(args.vcf + "/**/*.vcf", recursive=True)
    filenames += glob.glob(args.vcf + "/**/*.vcf.gz", recursive=True)

    true_ones = args.true_vcf

    # Reading all the vcf files
    SVSet=[]
    for infile in filenames:
        eprint(" Reading file %s" % (infile))
        SVSet += read_vcf_file(infile)[0]

    eprint(" Reading file %s" % (true_ones))
    SVSet_to, true_ones_records = read_vcf_file(true_ones)
    SVSet += SVSet_to
    filtered_records = None

    if args.filtered_vcf:
        eprint(" Reading file %s" % (args.filtered_vcf))
        filtered_records = read_vcf_file(args.filtered_vcf)[1]

    eprint("Computing Connected components")
    construct_overlap_graph(SVSet,overlap_cutoff,left_precision,right_precision)

    number = 1
    records = {}
    tools = set()
    orphans = 0
    for components in connected_components(SVSet):
        names = []
        results = {}
        true_one = None

        for node in components:
            names.append(node.id)
            if node.id in true_ones_records:
                true_one = node.id
                records[true_one] = {
                    "start": node.start,
                    "end": node.end,
                    "length": int(node.end) - int(node.start),
                    "tools": {},
                    "orphan": False,
                    "genotypes": genotypes[true_one] if true_one in genotypes else None
                }
            else:
                tool_name = node.id.split("_")[0]
                results[tool_name] = {
                    "start": node.start,
                    "end": node.end,
                    "length": int(node.end) - int(node.start),
                    "filtered": (node.id in filtered_records) if filtered_records is not None else False,
                    "genotypes": genotypes[node.id] if node.id in genotypes else None
                }
                tools.add(tool_name)

        if true_one is not None:
            records[true_one]["tools"] = results
        else:
            orphans += 1
            records["orphan_" + str(orphans)] = {
                "start": "",
                "end": "",
                "length": "",
                "tools": results,
                "orphan": True,
                "genotypes": None
            }

        names.sort()

        eprint("Group #%i: %s" % (number, ", ".join(names)))
        number += 1

    nb_records = len(records)

    cells = {
        "A1": {"text": "RESULTS", "format": {"bg_color": "#ffe856"}},
        "A2": {"text": "Deletion", "format": {"bold": True}},
        "A" + str(1+nb_records+3): {"text": "DIFFS", "format": {"bg_color": "#ffe856"}},
        "A" + str(2+nb_records+3): {"text": "Deletion", "format": {"bold": True}}
    }

    cells_gt = {
        "A1": {"text": "GENOTYPES", "format": {"bg_color": "#ffe856"}},
        "A2": {"text": "Deletion", "format": {"bold": True}}
    }

    tools = sorted(tools)
    nb_tools = len(tools)

    max_col_len = {}

    i = 1
    j = 1
    headers = []
    for tool in ["Real data"] + tools + (["Filtered results"] if filtered_records is not None else []):
        #cells[xlsx_cols[i] + "1"] = cells[xlsx_cols[i] + str(1+nb_records+3)] = {"text": tool, "format": ""}
        headers.append(tool)
        l_format = {"bold": True}
        if tool == "Filtered results":
            l_format["bg_color"] = color_col_filter
        cells[xlsx_cols[i] + "2"] = cells[xlsx_cols[i] + str(2+nb_records+3)] = {"text": "Start", "format": l_format}
        max_col_len[i] = get_max_len("Start", i, max_col_len)
        cells[xlsx_cols[i + 1] + "2"] = cells[xlsx_cols[i+1] + str(2+nb_records+3)] = {"text": "End",
                                                                                       "format": l_format}
        max_col_len[i+1] = get_max_len("End", i+1, max_col_len)
        cells[xlsx_cols[i + 2] + "2"] = cells[xlsx_cols[i+2] + str(2+nb_records+3)] = {"text": "Length",
                                                                                       "format": l_format}
        max_col_len[i+2] = get_max_len("Length", i+2, max_col_len)

        # Genotypes:
        for k in range(0, nb_inds):
            cells_gt[xlsx_cols[j+k] + "2"] = {"text": "INDIV_" + str(k+1), "format": {"bold": True}}

        i += 3
        j += nb_inds

    rec_keys = sorted(records.keys(), key=lambda x:svsort(x, records))

    i = 3
    for id in rec_keys:
        record = records[id]
        cells[xlsx_cols[0] + str(i)] = cells[xlsx_cols[0] + str(i+nb_records+3)] = {"text": id, "format": {}}
        max_col_len[0] = get_max_len(id, 0, max_col_len)
        print(max_col_len[0])
        cells_gt[xlsx_cols[0] + str(i)] = {"text": id, "format": {}}
        cells[xlsx_cols[1] + str(i)] = cells[xlsx_cols[1] + str(i+nb_records+3)] = {"text": record["start"],
                                                                                    "format": {}}
        max_col_len[1] = get_max_len(record["start"], 1, max_col_len)
        my_start = record["start"]
        cells[xlsx_cols[2] + str(i)] = cells[xlsx_cols[2] + str(i+nb_records+3)] = {"text": record["end"],
                                                                                    "format": {}}
        max_col_len[2] = get_max_len(record["end"], 2, max_col_len)
        my_end = record["end"]
        cells[xlsx_cols[3] + str(i)] = cells[xlsx_cols[3] + str(i+nb_records+3)] = {"text": record["length"],
                                                                                    "format": {}}
        max_col_len[3] = get_max_len(record["length"], 3, max_col_len)
        my_length = record["length"]

        # Genotypes:
        my_genotypes = None
        if record["genotypes"] is not None:
            for gt in range(0, len(record["genotypes"])):
                cells_gt[xlsx_cols[1+gt] + str(i)] = {"text": record["genotypes"][gt], "format": {}}
            my_genotypes = record["genotypes"]

        j = 4
        g = nb_inds + 1
        is_kept = False
        for tool in tools:
            if tool in record["tools"]:
                if record["tools"][tool]["filtered"]:
                    is_kept = True
                    sv_format = {"bg_color": color_is_kept}
                    # Raw values:
                    cells[xlsx_cols[1+((nb_tools+1)*3)] + str(i)] = {"text": record["tools"][tool]["start"],
                                                                     "format": {"bg_color": color_col_filter}}
                    max_col_len[1+((nb_tools+1)*3)] = \
                        get_max_len(record["tools"][tool]["start"], 1 + ((nb_tools + 1) * 3),
                                    max_col_len)
                    cells[xlsx_cols[1+((nb_tools+1)*3)+1] + str(i)] = {"text": record["tools"][tool]["end"],
                                                                       "format": {"bg_color": color_col_filter}}
                    max_col_len[1 + ((nb_tools + 1) * 3)+1] = \
                        get_max_len(record["tools"][tool]["end"], 1 + ((nb_tools + 1) * 3)+1,
                                    max_col_len)
                    cells[xlsx_cols[1+((nb_tools+1)*3)+2] + str(i)] = {"text": record["tools"][tool]["length"],
                                                                       "format": {"bg_color": color_col_filter}}
                    max_col_len[1 + ((nb_tools + 1) * 3) + 2] = \
                        get_max_len(record["tools"][tool]["length"], 1 + ((nb_tools + 1) * 3) + 2,
                                    max_col_len)
                    # Diffs:
                    if my_start != "":
                        cells[xlsx_cols[1+((nb_tools+1)*3)] + str(i+nb_records+3)] = {
                            "text": record["tools"][tool]["start"] - my_start, "format": {"bg_color": color_col_filter}}
                        cells[xlsx_cols[1+((nb_tools+1)*3) + 1] + str(i+nb_records+3)] = {
                            "text": record["tools"][tool]["end"] - my_end, "format": {"bg_color": color_col_filter}}
                        cells[xlsx_cols[1+((nb_tools+1)*3) + 2] + str(i+nb_records+3)] = {
                            "text": record["tools"][tool]["length"] - my_length,
                            "format": {"bg_color": color_col_filter}}
                    else:
                        cells[xlsx_cols[1 + ((nb_tools + 1) * 3)] + str(i + nb_records + 3)] = {
                            "text": "?????", "format": {"bg_color": color_col_filter}}
                        cells[xlsx_cols[1 + ((nb_tools + 1) * 3) + 1] + str(i + nb_records + 3)] = {
                            "text": "?????","format": {"bg_color": color_col_filter}}
                        cells[xlsx_cols[1 + ((nb_tools + 1) * 3) + 2] + str(i + nb_records + 3)] = {
                            "text": "?????", "format": {"bg_color": color_col_filter}}

                    # Genotypes:
                    if record["tools"][tool]["genotypes"] is not None:
                        the_genotypes = record["tools"][tool]["genotypes"]
                        for gt in range(0, len(the_genotypes)):
                            true_gt = my_genotypes[gt]
                            geno = get_gt(the_genotypes[gt], true_gt)
                            gt_format = {"bg_color": color_col_filter}
                            if geno != my_genotypes[gt]:
                                gt_format["font_color"] = "#ff0000"
                            cells_gt[xlsx_cols[1 + ((nb_tools + 1) * nb_inds) + gt] + str(i)] = \
                            {"text": geno, "format": gt_format}

                else:
                    sv_format = {}
                # Raw values:
                cells[xlsx_cols[j] + str(i)] = {"text": record["tools"][tool]["start"], "format": sv_format}
                max_col_len[j] = get_max_len(record["tools"][tool]["start"], j, max_col_len)
                cells[xlsx_cols[j + 1] + str(i)] = {"text": record["tools"][tool]["end"], "format": sv_format}
                max_col_len[j + 1] = get_max_len(record["tools"][tool]["end"], j + 1, max_col_len)
                cells[xlsx_cols[j + 2] + str(i)] = {"text": record["tools"][tool]["length"], "format": sv_format}
                max_col_len[j + 2] = get_max_len(record["tools"][tool]["length"], j + 2, max_col_len)

                # Diffs :
                if my_start != "":
                    cells[xlsx_cols[j] + str(i+nb_records+3)] = {"text": record["tools"][tool]["start"] - my_start,
                                                                 "format": sv_format}
                    cells[xlsx_cols[j + 1] + str(i+nb_records+3)] = {"text": record["tools"][tool]["end"] - my_end,
                                                                     "format": sv_format}
                    cells[xlsx_cols[j + 2] + str(i+nb_records+3)] = {"text": record["tools"][tool]["length"] - my_length,
                                                                 "format": sv_format}
                else:
                    cells[xlsx_cols[j] + str(i + nb_records + 3)] = {"text": "NA", "format": sv_format}
                    cells[xlsx_cols[j + 1] + str(i + nb_records + 3)] = {"text": "NA", "format": sv_format}
                    cells[xlsx_cols[j + 2] + str(i + nb_records + 3)] = {"text": "NA", "format": sv_format}

                # Genotypes:
                if record["tools"][tool]["genotypes"] is not None:
                    the_genotypes = record["tools"][tool]["genotypes"]
                    for gt in range(0, len(the_genotypes)):
                        true_gt = my_genotypes[gt]
                        geno = get_gt(the_genotypes[gt], true_gt)
                        gt_format = sv_format.copy()
                        if geno != true_gt:
                            gt_format["font_color"] = "#ff0000"
                        cells_gt[xlsx_cols[g + gt] + str(i)] = \
                            {"text": geno, "format": gt_format}

            else:
                for k in range(0,3):
                    cells[xlsx_cols[j + k] + str(i)] = {"text": "", "format": {"bg_color": color_not_found}}
                    cells[xlsx_cols[j + k] + str(i+nb_records+3)] = {"text": "", "format": {"bg_color": color_not_found}}

                    # Genotype:
                    for gt in range(0, nb_inds):
                        cells_gt[xlsx_cols[g + gt] + str(i)] = {"text": "", "format": {"bg_color": "#000000"}}
            j += 3
            g += nb_inds
        if is_kept:
            cells[xlsx_cols[0] + str(i)]["format"]["bg_color"] = \
                cells[xlsx_cols[0] + str(i+nb_records+3)]["format"]["bg_color"] = color_is_kept
        elif filtered_records is not None:
            cells[xlsx_cols[1 + ((nb_tools + 1) * 3)] + str(i)] = {"text": "", "format": {"bg_color": color_not_found}}
            cells[xlsx_cols[1 + ((nb_tools + 1) * 3) + 1] + str(i)] = {"text": "", "format": {"bg_color": color_not_found}}
            cells[xlsx_cols[1 + ((nb_tools + 1) * 3) + 2] + str(i)] = {"text": "", "format": {"bg_color": color_not_found}}
            cells[xlsx_cols[1 + ((nb_tools + 1) * 3)] + str(i + nb_records + 3)] = {"text": "",
                                                                                    "format": {"bg_color": color_not_found}}
            cells[xlsx_cols[1 + ((nb_tools + 1) * 3) + 1] + str(i + nb_records + 3)] = {
                "text": "", "format": {"bg_color": color_not_found}}
            cells[xlsx_cols[1 + ((nb_tools + 1) * 3) + 2] + str(i + nb_records + 3)] = {
                "text": "", "format": {"bg_color": color_not_found}}

            # Genotype:
            for gt in range(0, nb_inds):
                cells_gt[xlsx_cols[1 + ((nb_tools + 1) * nb_inds) + gt] + str(i)] = {"text": "", "format":
                                                                                     {"bg_color": "#000000"}}

        # False positives (orphans) in orange:
        if re.match(r"^orphan_\d+$", id):
            cells[xlsx_cols[0] + str(i)]["format"]["bg_color"] = \
                cells[xlsx_cols[0] + str(i+nb_records+3)]["format"]["bg_color"] = color_false_positive

        i += 1

    # Create document:
    with xlsxwriter.Workbook(args.output) as workbook:
        worksheet = workbook.add_worksheet("SVs")
        i=0
        max_header_length = 0
        for header in headers:
            max_header_length = max(len(header), max_header_length)
            if i < len(headers) - 1 or filtered_records is None:
                merge_format = workbook.add_format({'align': 'center'})
            else:
                merge_format = workbook.add_format({'align': 'center', 'bg_color': color_col_filter})
            worksheet.merge_range(xlsx_cols[1 + i * 3] + "1:" + xlsx_cols[3 + i * 3] + "1", header, merge_format)
            diff_line = str(1+nb_records+3)
            worksheet.merge_range(xlsx_cols[1 + i * 3] + diff_line + ":" + xlsx_cols[3 + i * 3] + diff_line, header,
                                  merge_format)
            i += 1
        for cell_id, cell_content in cells.items():
            cell_format = workbook.add_format(cell_content["format"])
            worksheet.write(cell_id, cell_content["text"], cell_format)

        # Resize columns:
        for col, max_len in max_col_len.items():
            print(col,max_len)
            worksheet.set_column(col, col, max_len+1)

        # Genotypes:
        if args.genotypes:
            worksheet_gt = workbook.add_worksheet("Genotypes")
            i = 0
            for header in headers:
                if i < len(headers) - 1 or filtered_records is None:
                    merge_format = workbook.add_format({'align': 'center'})
                else:
                    merge_format = workbook.add_format({'align': 'center', 'bg_color': color_col_filter})
                worksheet_gt.merge_range(xlsx_cols[1 + i * nb_inds] + "1:" + xlsx_cols[nb_inds + i * nb_inds] + "1",
                                         header, merge_format)
                i += 1
            for cell_id, cell_content in cells_gt.items():
                cell_format = workbook.add_format(cell_content["format"])
                worksheet_gt.write(cell_id, cell_content["text"], cell_format)
                worksheet_gt.freeze_panes(0, 9)
            worksheet_gt.set_column(0, 0, max_col_len[0]+1)

    print("")
    print("###########")
    print("# RESULTS #")
    print("###########")
    print("")
    print(str(number-1) + " Results found")
    print(str(orphans) + " False Positive")
    print("")
    print("Results saved in " + args.output)
    print("")

# initialize the script
if __name__ == '__main__':
    try:
        sys.exit(main())
    except:
        raise