Allow several filter and genotype files

09249c14 · Floreal Cabanettes · 4461fed1 · 09249c14
Commit 09249c14 authored 7 years ago by Floreal Cabanettes
--- a/build_results.py
+++ b/build_results.py
@@ -50,8 +50,8 @@ description: Build results of the simulated data detection")
    parser.add_argument('-v', '--vcf', type=str, required=True, help='folder containing all vcf results files')
    parser.add_argument('-t', '--true-vcf', type=str, required=True, help='VCF file containing the simulated deletions')
    parser.add_argument('-f', '--filtered-vcf', type=str, required=False,
-                        help='VCF file containing the filtered results')
-    parser.add_argument('-g', '--genotypes', type=str, help="VCF file containing genotypes")
+                        help='VCF file containing the filtered results', nargs='+')
+    parser.add_argument('-g', '--genotypes', type=str, help="VCF file containing genotypes", nargs='+')
    parser.add_argument('--overlap_cutoff',  type=float, default=0.5, help='cutoff for reciprocal overlap')
    parser.add_argument('--left_precision',  type=int, default=-1, help='left breakpoint precision')
    parser.add_argument('--right_precision',  type=int, default=-1, help='right breakpoint precision')
@@ -162,7 +162,7 @@ def get_quality_color(quality):
        return color_very_low_quality


-def get_genotypes(genotypes_file, true_vcf_file):
+def get_genotypes(genotypes_files, true_vcf_file):
    """
    Get genotype of each individual for each SV
    :param genotypes_file: VCF file containing genotypes
@@ -182,12 +182,13 @@ def get_genotypes(genotypes_file, true_vcf_file):
    nb_inds = len(list(genotypes.values())[0])

    # Samples:
-    reader = VariantFile(genotypes_file)
-    for rec in reader:
-        samples = rec.samples
-        genotypes[rec.id] = ["/".join(map(str, samples[x]["GT"])) for x in samples_t.keys()]  # Fixed: use samples keys
-        # from real data to keep the same order
-        gt_quality[rec.id] = [samples[x]["GQ"] for x in samples_t.keys()]
+    for genotypes_file in genotypes_files:
+        reader = VariantFile(genotypes_file)
+        for rec in reader:
+            samples = rec.samples
+            genotypes[rec.id] = ["/".join(map(str, samples[x]["GT"])) for x in samples_t.keys()]  # Fixed: use samples keys
+            # from real data to keep the same order
+            gt_quality[rec.id] = [samples[x]["GQ"] for x in samples_t.keys()]

    return genotypes, gt_quality, nb_inds

@@ -625,14 +626,14 @@ def search_vcf_files(my_folder):
    return vcf_files


-def print_results(nb_records, orphans, with_xlsx, output, genotypes_file):
+def print_results(nb_records, orphans, with_xlsx, output, do_genotype):
    """
    Print list of outputs
    :param nb_records: number of records {int}
    :param orphans: sv found in tools but not present in real data {dict}
    :param with_xlsx: build xlsx file {bool}
    :param output: output prefix {str}
-    :param genotypes_file: genotypes file {str}
+    :param do_genotype: do the genotype {bool}
    """
    print("")
    print("###########")
@@ -650,13 +651,13 @@ def print_results(nb_records, orphans, with_xlsx, output, genotypes_file):
    print("TSV files:")
    print("\t- " + output + "_sv_per_tools.tsv")
    print("\t- " + output + "_sv_diffs_per_tools.tsv")
-    if genotypes_file:
+    if do_genotype:
        print("\t- " + output + "_sv_genotypes_per_tools.tsv")
        print("\t- " + output + "_sv_genotypes_quality_per_tools.tsv")
    print("")


-def fill_cells_no_tools(cells, cells_gt, cells_gq, i, j, g, nb_records, nb_inds, genotypes_file):
+def fill_cells_no_tools(cells, cells_gt, cells_gq, i, j, g, nb_records, nb_inds, do_genotype):
    """
    Fill cells when a tool does not detect a SV
    :param cells: cells definition
@@ -667,7 +668,7 @@ def fill_cells_no_tools(cells, cells_gt, cells_gq, i, j, g, nb_records, nb_inds,
    :param g: column index for genotypes and genotypes quality tables
    :param nb_records: total number of records
    :param nb_inds: total number of individuals
-    :param genotypes_file: VCF file containing genotypes
+    :param do_genotype: do the genotypes
    :return: cells, completed
    """
    for k in range(0, 3):
@@ -677,7 +678,7 @@ def fill_cells_no_tools(cells, cells_gt, cells_gq, i, j, g, nb_records, nb_inds,
                                                             "format": {"bg_color": COLOR_NOT_FOUND}}

        # Genotype:
-        if genotypes_file:
+        if do_genotype:
            for gt in range(0, nb_inds):
                # noinspection PyUnresolvedReferences
                cells_gt[XLSX_COLS[g + gt] + str(i)] = cells_gq[XLSX_COLS[g + gt] + str(i)] = \
@@ -686,7 +687,7 @@ def fill_cells_no_tools(cells, cells_gt, cells_gq, i, j, g, nb_records, nb_inds,


 def apply_style_of_filter_cells(cells, cells_gt, cells_gq, i, is_kept, nb_records, nb_inds, nb_tools, filtered_records,
-                   genotypes_file, rec_id):
+                   do_genotype, rec_id):
    """
    Apply style of cells
    :param cells: cells of the default table {dict}
@@ -698,7 +699,7 @@ def apply_style_of_filter_cells(cells, cells_gt, cells_gq, i, is_kept, nb_record
    :param nb_inds: number of individuals {int}
    :param nb_tools: number of tools {int}
    :param filtered_records: file containing filtered records {str}
-    :param genotypes_file: file containing genotypes {str}
+    :param do_genotype: do the genotype {bool}
    :param rec_id: id of the record {str}
    :return:
    """
@@ -722,7 +723,7 @@ def apply_style_of_filter_cells(cells, cells_gt, cells_gq, i, is_kept, nb_record
            "text": "", "format": {"bg_color": COLOR_NOT_FOUND}}

        # Genotype:
-        if genotypes_file:
+        if do_genotype:
            # Color in gray in the filter column
            for gt in range(0, nb_inds):
                cells_gt[XLSX_COLS[2 + ((nb_tools + 1) * nb_inds) + gt] + str(i)] = \
@@ -739,7 +740,7 @@ def apply_style_of_filter_cells(cells, cells_gt, cells_gq, i, is_kept, nb_record


 def build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells, cells_gt, cells_gq, max_col_len,
-                     nb_tools, genotypes_file, haploid, filtered_records):
+                     nb_tools, do_genotype, haploid, filtered_records):
    i = 3
    for rec_id in rec_keys:
        record = records[rec_id]
@@ -776,7 +777,7 @@ def build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells, cells
                                                        record["tools"][tool], nb_records, my_start, my_end, my_length,
                                                        {"bg_color": COLOR_COL_FILTER})

-                    if genotypes_file:
+                    if do_genotype:
                        # Genotype (sheets 2&3):
                        cells_gt, cells_gq = fill_genotypes_data(i, 2 + ((nb_tools + 1) * nb_inds), cells_gt, cells_gq,
                                                                 record["tools"][tool], my_genotypes, haploid)
@@ -792,7 +793,7 @@ def build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells, cells
                                                    record["tools"][tool], nb_records, my_start, my_end, my_length,
                                                    sv_format)

-                if genotypes_file:
+                if do_genotype:
                    # Genotype (sheets 2&3):
                    cells_gt, cells_gq = fill_genotypes_data(i, g, cells_gt, cells_gq,
                                                             record["tools"][tool], my_genotypes, haploid)
@@ -802,7 +803,7 @@ def build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells, cells
                # TOOL DOES NOT DETECT THE SV #
                ###############################
                cells, cells_gt, cells_gq = fill_cells_no_tools(cells, cells_gt, cells_gq, i, j, g, nb_records, nb_inds,
-                                                                genotypes_file)
+                                                                do_genotype)

            j += 3
            g += nb_inds
@@ -811,7 +812,7 @@ def build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells, cells
        # Until we have filled all tools, check if the record is kept after filtering: #
        ###############################################################################
        cells, cells_gt, cells_gq = apply_style_of_filter_cells(cells, cells_gt, cells_gq, i, is_kept, nb_records,
-                                                                nb_inds, nb_tools, filtered_records, genotypes_file,
+                                                                nb_inds, nb_tools, filtered_records, do_genotype,
                                                                rec_id)

        i += 1
@@ -825,7 +826,7 @@ def build_xlsx_cols():
            XLSX_COLS.append(alp + j)


-def init(output, vcf_folder, true_vcf, filtered_vcf=None, genotypes_file=None, overlap_cutoff=0.5,
+def init(output, vcf_folder, true_vcf, filtered_vcfs=None, genotypes_files=None, overlap_cutoff=0.5,
         left_precision=sys.maxsize, right_precision=sys.maxsize, no_xls=False, haploid=False):
    build_xlsx_cols()

@@ -834,8 +835,10 @@ def init(output, vcf_folder, true_vcf, filtered_vcf=None, genotypes_file=None, o

    nb_inds = 0

-    if genotypes_file:
-        genotypes, gt_quality, nb_inds = get_genotypes(genotypes_file, true_vcf)
+    if genotypes_files:
+        genotypes, gt_quality, nb_inds = get_genotypes(genotypes_files, true_vcf)
+
+    do_genotype = genotypes_files is not None

    filenames = search_vcf_files(vcf_folder)

@@ -853,11 +856,12 @@ def init(output, vcf_folder, true_vcf, filtered_vcf=None, genotypes_file=None, o
    eprint(" Reading file %s" % true_ones)
    sv_set_to, true_ones_records = read_vcf_file(true_ones)
    sv_set += sv_set_to
-    filtered_records = None
+    filtered_records = []

-    if filtered_vcf:
-        eprint(" Reading file %s" % filtered_vcf)
-        filtered_records = read_vcf_file(filtered_vcf)[1]
+    if filtered_vcfs:
+        for filtered_vcf in filtered_vcfs:
+            eprint(" Reading file %s" % filtered_vcf)
+            filtered_records += read_vcf_file(filtered_vcf)[1]

    # Compute connected components:
    eprint("Computing Connected components")
@@ -890,7 +894,7 @@ def init(output, vcf_folder, true_vcf, filtered_vcf=None, genotypes_file=None, o
    # BUILD BODY CELLS #
    ####################
    cells, cells_gt, cells_gq, max_col_len = build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells,
-                                                              cells_gt, cells_gq, max_col_len, nb_tools, genotypes_file,
+                                                              cells_gt, cells_gq, max_col_len, nb_tools, do_genotype,
                                                              haploid, filtered_records)

    # Create document:
@@ -906,7 +910,7 @@ def init(output, vcf_folder, true_vcf, filtered_vcf=None, genotypes_file=None, o
    create_tsv_file(output + "_sv_diffs_per_tools.tsv", headers, cells,
                    nb_tools + (2 if filtered_records is not None else 1),
                    3, (2 + nb_records + 3, nb_records * 2 + 5))
-    if genotypes_file:
+    if do_genotype:
        create_tsv_file(output + "_sv_genotypes_per_tools.tsv", headers, cells_gt,
                        nb_tools + (2 if filtered_records is not None else 1),
                        nb_inds, (2, nb_records + 2))
@@ -914,7 +918,7 @@ def init(output, vcf_folder, true_vcf, filtered_vcf=None, genotypes_file=None, o
                        nb_tools + (2 if filtered_records is not None else 1),
                        nb_inds, (2, nb_records + 2))

-    print_results(nb_records, orphans, with_xlsx, output, genotypes_file)
+    print_results(nb_records, orphans, with_xlsx, output, do_genotype)


 def main():