Add genotypes comparison + resize columns

a96b9922 · Floreal Cabanettes · 410e18aa · a96b9922
Commit a96b9922 authored 7 years ago by Floreal Cabanettes
--- a/build_xls_results.py
+++ b/build_xls_results.py
@@ -18,6 +18,7 @@ import argparse, sys
 import glob
 import vcf
 import string
+import os
 import re

 from svreader.vcfwrapper import VCFReader
@@ -25,6 +26,8 @@ from svinterval import construct_overlap_graph, connected_components

 import xlsxwriter

+prg_path = os.path.dirname(os.path.realpath(__file__))
+
 alphabet = list(string.ascii_uppercase)
 xlsx_cols = alphabet.copy()
 for i in alphabet:
@@ -41,6 +44,7 @@ description: Merge SV based on reciprocal overlap")
    parser.add_argument('-t', '--true-vcf', type=str, required=True, help='VCF file containing the simulated deletions')
    parser.add_argument('-f', '--filtered-vcf', type=str, required=False,
                        help='VCF file containing the filtered results')
+    parser.add_argument('-g', '--genotypes', type=str, help="VCF file containing genotypes")
    parser.add_argument('--overlap_cutoff',  type=float, default=0.5, help='cutoff for reciprocal overlap')
    parser.add_argument('--left_precision',  type=int, default=-1, help='left breakpoint precision')
    parser.add_argument('--right_precision',  type=int, default=-1, help='right breakpoint precision')
@@ -125,10 +129,37 @@ def svsort(sv, records):
    first_tool = list(records[sv]["tools"].keys())[0]
    return int(records[sv]["tools"][first_tool]["start"])

+
+def get_gt(geno, true_gt):
+    if true_gt is not None:
+        if geno == "1/0" and true_gt == "0/1":
+            geno = "0/1"
+        elif geno == "0/1" and true_gt == "1/0":
+            geno = "1/0"
+    return geno
+
+
+def get_max_len(cell, col, max_col_len):
+    return max(len(str(cell)), max_col_len[col] if col in max_col_len else 0)
+
+
 def main():
    # parse the command line args
    args = get_args()

+    genotypes = {}
+
+    nb_inds = 0
+
+    if args.genotypes:
+        genotypes_raw = os.popen("zcat genotype_chr_chr12.vcf.gz " + args.true_vcf +  " | " +
+                                 os.path.join(prg_path, "vawk") + " '{ print $3,S$*$GT}'").read().split("\n")
+        for gt in genotypes_raw:
+            if gt != "":
+                gt_l = gt.split("\t")
+                genotypes[gt_l[0]] = gt_l[1:]
+                nb_inds = len(gt_l[1:])
+
    overlap_cutoff   = args.overlap_cutoff
    left_precision   = args.left_precision
    right_precision  = args.right_precision
@@ -181,7 +212,8 @@ def main():
                    "end": node.end,
                    "length": int(node.end) - int(node.start),
                    "tools": {},
-                    "orphan": False
+                    "orphan": False,
+                    "genotypes": genotypes[true_one] if true_one in genotypes else None
                }
            else:
                tool_name = node.id.split("_")[0]
@@ -189,7 +221,8 @@ def main():
                    "start": node.start,
                    "end": node.end,
                    "length": int(node.end) - int(node.start),
-                    "filtered": (node.id in filtered_records) if filtered_records is not None else False
+                    "filtered": (node.id in filtered_records) if filtered_records is not None else False,
+                    "genotypes": genotypes[node.id] if node.id in genotypes else None
                }
                tools.add(tool_name)

@@ -202,7 +235,8 @@ def main():
                "end": "",
                "length": "",
                "tools": results,
-                "orphan": True
+                "orphan": True,
+                "genotypes": None
            }

        names.sort()
@@ -218,10 +252,19 @@ def main():
        "A" + str(1+nb_records+3): {"text": "DIFFS", "format": {"bg_color": "#ffe856"}},
        "A" + str(2+nb_records+3): {"text": "Deletion", "format": {"bold": True}}
    }
+
+    cells_gt = {
+        "A1": {"text": "GENOTYPES", "format": {"bg_color": "#ffe856"}},
+        "A2": {"text": "Deletion", "format": {"bold": True}}
+    }
+
    tools = sorted(tools)
    nb_tools = len(tools)

+    max_col_len = {}
+
    i = 1
+    j = 1
    headers = []
    for tool in ["Real data"] + tools + (["Filtered results"] if filtered_records is not None else []):
        #cells[xlsx_cols[i] + "1"] = cells[xlsx_cols[i] + str(1+nb_records+3)] = {"text": tool, "format": ""}
@@ -230,11 +273,20 @@ def main():
        if tool == "Filtered results":
            l_format["bg_color"] = color_col_filter
        cells[xlsx_cols[i] + "2"] = cells[xlsx_cols[i] + str(2+nb_records+3)] = {"text": "Start", "format": l_format}
+        max_col_len[i] = get_max_len("Start", i, max_col_len)
        cells[xlsx_cols[i + 1] + "2"] = cells[xlsx_cols[i+1] + str(2+nb_records+3)] = {"text": "End",
                                                                                       "format": l_format}
+        max_col_len[i+1] = get_max_len("End", i+1, max_col_len)
        cells[xlsx_cols[i + 2] + "2"] = cells[xlsx_cols[i+2] + str(2+nb_records+3)] = {"text": "Length",
                                                                                       "format": l_format}
+        max_col_len[i+2] = get_max_len("Length", i+2, max_col_len)
+
+        # Genotypes:
+        for k in range(0, nb_inds):
+            cells_gt[xlsx_cols[j+k] + "2"] = {"text": "INDIV_" + str(k+1), "format": {"bold": True}}
+
        i += 3
+        j += nb_inds

    rec_keys = sorted(records.keys(), key=lambda x:svsort(x, records))

@@ -242,17 +294,31 @@ def main():
    for id in rec_keys:
        record = records[id]
        cells[xlsx_cols[0] + str(i)] = cells[xlsx_cols[0] + str(i+nb_records+3)] = {"text": id, "format": {}}
+        max_col_len[0] = get_max_len(id, 0, max_col_len)
+        print(max_col_len[0])
+        cells_gt[xlsx_cols[0] + str(i)] = {"text": id, "format": {}}
        cells[xlsx_cols[1] + str(i)] = cells[xlsx_cols[1] + str(i+nb_records+3)] = {"text": record["start"],
                                                                                    "format": {}}
+        max_col_len[1] = get_max_len(record["start"], 1, max_col_len)
        my_start = record["start"]
        cells[xlsx_cols[2] + str(i)] = cells[xlsx_cols[2] + str(i+nb_records+3)] = {"text": record["end"],
                                                                                    "format": {}}
+        max_col_len[2] = get_max_len(record["end"], 2, max_col_len)
        my_end = record["end"]
        cells[xlsx_cols[3] + str(i)] = cells[xlsx_cols[3] + str(i+nb_records+3)] = {"text": record["length"],
                                                                                    "format": {}}
+        max_col_len[3] = get_max_len(record["length"], 3, max_col_len)
        my_length = record["length"]

+        # Genotypes:
+        my_genotypes = None
+        if record["genotypes"] is not None:
+            for gt in range(0, len(record["genotypes"])):
+                cells_gt[xlsx_cols[1+gt] + str(i)] = {"text": record["genotypes"][gt], "format": {}}
+            my_genotypes = record["genotypes"]
+
        j = 4
+        g = nb_inds + 1
        is_kept = False
        for tool in tools:
            if tool in record["tools"]:
@@ -262,10 +328,19 @@ def main():
                    # Raw values:
                    cells[xlsx_cols[1+((nb_tools+1)*3)] + str(i)] = {"text": record["tools"][tool]["start"],
                                                                     "format": {"bg_color": color_col_filter}}
+                    max_col_len[1+((nb_tools+1)*3)] = \
+                        get_max_len(record["tools"][tool]["start"], 1 + ((nb_tools + 1) * 3),
+                                    max_col_len)
                    cells[xlsx_cols[1+((nb_tools+1)*3)+1] + str(i)] = {"text": record["tools"][tool]["end"],
                                                                       "format": {"bg_color": color_col_filter}}
+                    max_col_len[1 + ((nb_tools + 1) * 3)+1] = \
+                        get_max_len(record["tools"][tool]["end"], 1 + ((nb_tools + 1) * 3)+1,
+                                    max_col_len)
                    cells[xlsx_cols[1+((nb_tools+1)*3)+2] + str(i)] = {"text": record["tools"][tool]["length"],
                                                                       "format": {"bg_color": color_col_filter}}
+                    max_col_len[1 + ((nb_tools + 1) * 3) + 2] = \
+                        get_max_len(record["tools"][tool]["length"], 1 + ((nb_tools + 1) * 3) + 2,
+                                    max_col_len)
                    # Diffs:
                    if my_start != "":
                        cells[xlsx_cols[1+((nb_tools+1)*3)] + str(i+nb_records+3)] = {
@@ -283,12 +358,27 @@ def main():
                        cells[xlsx_cols[1 + ((nb_tools + 1) * 3) + 2] + str(i + nb_records + 3)] = {
                            "text": "?????", "format": {"bg_color": color_col_filter}}

+                    # Genotypes:
+                    if record["tools"][tool]["genotypes"] is not None:
+                        the_genotypes = record["tools"][tool]["genotypes"]
+                        for gt in range(0, len(the_genotypes)):
+                            true_gt = my_genotypes[gt]
+                            geno = get_gt(the_genotypes[gt], true_gt)
+                            gt_format = {"bg_color": color_col_filter}
+                            if geno != my_genotypes[gt]:
+                                gt_format["font_color"] = "#ff0000"
+                            cells_gt[xlsx_cols[1 + ((nb_tools + 1) * nb_inds) + gt] + str(i)] = \
+                            {"text": geno, "format": gt_format}
+
                else:
                    sv_format = {}
                # Raw values:
                cells[xlsx_cols[j] + str(i)] = {"text": record["tools"][tool]["start"], "format": sv_format}
+                max_col_len[j] = get_max_len(record["tools"][tool]["start"], j, max_col_len)
                cells[xlsx_cols[j + 1] + str(i)] = {"text": record["tools"][tool]["end"], "format": sv_format}
+                max_col_len[j + 1] = get_max_len(record["tools"][tool]["end"], j + 1, max_col_len)
                cells[xlsx_cols[j + 2] + str(i)] = {"text": record["tools"][tool]["length"], "format": sv_format}
+                max_col_len[j + 2] = get_max_len(record["tools"][tool]["length"], j + 2, max_col_len)

                # Diffs :
                if my_start != "":
@@ -299,19 +389,36 @@ def main():
                    cells[xlsx_cols[j + 2] + str(i+nb_records+3)] = {"text": record["tools"][tool]["length"] - my_length,
                                                                 "format": sv_format}
                else:
-                    cells[xlsx_cols[j] + str(i + nb_records + 3)] = {"text": "?????", "format": sv_format}
-                    cells[xlsx_cols[j + 1] + str(i + nb_records + 3)] = {"text": "?????", "format": sv_format}
-                    cells[xlsx_cols[j + 2] + str(i + nb_records + 3)] = {"text": "?????", "format": sv_format}
+                    cells[xlsx_cols[j] + str(i + nb_records + 3)] = {"text": "NA", "format": sv_format}
+                    cells[xlsx_cols[j + 1] + str(i + nb_records + 3)] = {"text": "NA", "format": sv_format}
+                    cells[xlsx_cols[j + 2] + str(i + nb_records + 3)] = {"text": "NA", "format": sv_format}
+
+                # Genotypes:
+                if record["tools"][tool]["genotypes"] is not None:
+                    the_genotypes = record["tools"][tool]["genotypes"]
+                    for gt in range(0, len(the_genotypes)):
+                        true_gt = my_genotypes[gt]
+                        geno = get_gt(the_genotypes[gt], true_gt)
+                        gt_format = sv_format.copy()
+                        if geno != true_gt:
+                            gt_format["font_color"] = "#ff0000"
+                        cells_gt[xlsx_cols[g + gt] + str(i)] = \
+                            {"text": geno, "format": gt_format}

            else:
                for k in range(0,3):
                    cells[xlsx_cols[j + k] + str(i)] = {"text": "", "format": {"bg_color": color_not_found}}
                    cells[xlsx_cols[j + k] + str(i+nb_records+3)] = {"text": "", "format": {"bg_color": color_not_found}}
+
+                    # Genotype:
+                    for gt in range(0, nb_inds):
+                        cells_gt[xlsx_cols[g + gt] + str(i)] = {"text": "", "format": {"bg_color": "#000000"}}
            j += 3
+            g += nb_inds
        if is_kept:
            cells[xlsx_cols[0] + str(i)]["format"]["bg_color"] = \
                cells[xlsx_cols[0] + str(i+nb_records+3)]["format"]["bg_color"] = color_is_kept
-        else:
+        elif filtered_records is not None:
            cells[xlsx_cols[1 + ((nb_tools + 1) * 3)] + str(i)] = {"text": "", "format": {"bg_color": color_not_found}}
            cells[xlsx_cols[1 + ((nb_tools + 1) * 3) + 1] + str(i)] = {"text": "", "format": {"bg_color": color_not_found}}
            cells[xlsx_cols[1 + ((nb_tools + 1) * 3) + 2] + str(i)] = {"text": "", "format": {"bg_color": color_not_found}}
@@ -322,6 +429,11 @@ def main():
            cells[xlsx_cols[1 + ((nb_tools + 1) * 3) + 2] + str(i + nb_records + 3)] = {
                "text": "", "format": {"bg_color": color_not_found}}

+            # Genotype:
+            for gt in range(0, nb_inds):
+                cells_gt[xlsx_cols[1 + ((nb_tools + 1) * nb_inds) + gt] + str(i)] = {"text": "", "format":
+                                                                                     {"bg_color": "#000000"}}
+
        # False positives (orphans) in orange:
        if re.match(r"^orphan_\d+$", id):
            cells[xlsx_cols[0] + str(i)]["format"]["bg_color"] = \
@@ -331,9 +443,11 @@ def main():

    # Create document:
    with xlsxwriter.Workbook(args.output) as workbook:
-        worksheet = workbook.add_worksheet()
+        worksheet = workbook.add_worksheet("SVs")
        i=0
+        max_header_length = 0
        for header in headers:
+            max_header_length = max(len(header), max_header_length)
            if i < len(headers) - 1 or filtered_records is None:
                merge_format = workbook.add_format({'align': 'center'})
            else:
@@ -347,6 +461,29 @@ def main():
            cell_format = workbook.add_format(cell_content["format"])
            worksheet.write(cell_id, cell_content["text"], cell_format)

+        # Resize columns:
+        for col, max_len in max_col_len.items():
+            print(col,max_len)
+            worksheet.set_column(col, col, max_len+1)
+
+        # Genotypes:
+        if args.genotypes:
+            worksheet_gt = workbook.add_worksheet("Genotypes")
+            i = 0
+            for header in headers:
+                if i < len(headers) - 1 or filtered_records is None:
+                    merge_format = workbook.add_format({'align': 'center'})
+                else:
+                    merge_format = workbook.add_format({'align': 'center', 'bg_color': color_col_filter})
+                worksheet_gt.merge_range(xlsx_cols[1 + i * nb_inds] + "1:" + xlsx_cols[nb_inds + i * nb_inds] + "1",
+                                         header, merge_format)
+                i += 1
+            for cell_id, cell_content in cells_gt.items():
+                cell_format = workbook.add_format(cell_content["format"])
+                worksheet_gt.write(cell_id, cell_content["text"], cell_format)
+                worksheet_gt.freeze_panes(0, 9)
+            worksheet_gt.set_column(0, 0, max_col_len[0]+1)
+
    print("")
    print("###########")
    print("# RESULTS #")