#! /usr/bin/env python3 """mergeSV : merge SV on the basis of their intervals The merging is done by identifiying connected components in a graph. The graph is constructing taking the SV a node and an edge connects two nodes if the conresponding intervals exhibits a reciprocal overlap of a least R0%. Additional constraints on the precision of breakpoints (left and right) can be specified. Each connected component is termed a CNVR (Copy Number Variation Region) and is associated to the corresponding CNVs. """ import argparse import string import os import re from pysam import VariantFile import sys prg_path = os.path.dirname(os.path.realpath(__file__)) sys.path.insert(0, os.path.join(prg_path, "lib")) from svreader.vcfwrapper import VCFReader from svinterval import construct_overlap_graph, connected_components ALPHABET = list(string.ascii_uppercase) XLSX_COLS = ALPHABET.copy() COLOR_NOT_FOUND = "#FE2E2E" COLOR_NOT_FOUND_2 = "#dddddd" COLOR_COL_FILTER = "#BEF781" COLOR_IS_KEPT = "#81F781" COLOR_FALSE_POSITIVE = "#FE642E" COLOR_WRONG_GT = "#B40404" def get_args(): """ Get arguments from argparse :return: argparse arguments object """ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="\ Build Results \n \ description: Build results of the simulated data detection") parser.add_argument('-v', '--vcf', type=str, required=True, help='folder containing all vcf results files') parser.add_argument('-t', '--true-vcf', type=str, required=True, help='VCF file containing the simulated deletions') parser.add_argument('-f', '--filtered-vcf', type=str, required=False, help='VCF file containing the filtered results') parser.add_argument('-g', '--genotypes', type=str, help="VCF file containing genotypes") parser.add_argument('--overlap_cutoff', type=float, default=0.5, help='cutoff for reciprocal overlap') parser.add_argument('--left_precision', type=int, default=-1, help='left breakpoint precision') parser.add_argument('--right_precision', type=int, default=-1, help='right breakpoint precision') parser.add_argument('-o', '--output', type=str, default="results", help='output prefix') parser.add_argument('--no-xls', action='store_const', const=True, default=False, help='Do not build Excel file') parser.add_argument('--haploid', action='store_const', const=True, default=False, help='The organism is haploid') # parse the arguments args = parser.parse_args() if args.left_precision == -1: args.left_precision = sys.maxsize if args.right_precision == -1: args.right_precision = sys.maxsize # send back the user input return args def eprint(*args, **kwargs): """ Print to stderr """ print(*args, file=sys.stderr, **kwargs) def passed_variant(record): """ Did this variant pass? :param record: vcf record object :return: True if pass, False else """ return record.filter is None or len(record.filter) == 0 or "PASS" in record.filter def read_vcf_file(infile): """ Read a vcf file :param infile: vcf file path :return: set or records, list of records ids """ SVSet=[] ids = [] for record in VCFReader(infile): if not passed_variant(record): continue SVSet.append(record) ids.append(record.id) return SVSet, ids def svsort(sv, records): """ Function to sort regions """ if records[sv]["start"] != "": start = str(records[sv]["start"]) else: first_tool = list(records[sv]["tools"].keys())[0] start = str(records[sv]["tools"][first_tool]["start"]) return start def get_gt(geno, true_gt): """ Get the genotype: consider 1/0 and 0/1 as equivalent (we can't identify a specific chromosome) We want geno equal to true_gt in these cases :param geno: the genotype :param true_gt: the true genotype :return: the genotype """ if true_gt is not None: if geno == "1/0" and true_gt == "0/1": geno = "0/1" elif geno == "0/1" and true_gt == "1/0": geno = "1/0" return geno def get_max_len(cell, col, max_col_len): """ Get the max content length of a column :param cell: the cell content :param col: the column number :param max_col_len: current max content length for each column :return: the new max content length for the given column """ return max(len(str(cell)), max_col_len[col] if col in max_col_len else 0) def get_quality_color(quality): """ Returns the color associated to a genotype quality value :param quality: the quality value :return: the associated color """ color_very_low_quality = "#FE2E2E" color_low_quality = "#FE9A2E" color_medium_quality = "#FFFF00" color_high_quality = "#81F781" if quality > 60: return color_high_quality elif quality > 40: return color_medium_quality elif quality > 20: return color_low_quality else: return color_very_low_quality def get_genotypes(genotypes_file, true_vcf_file): """ Get genotype of each individual for each SV :param genotypes_file: VCF file containing genotypes :param true_vcf_file: VCF file containing real data :return: genotypes of each individual for each SV, quality of each genotype, number of individuals """ genotypes = {} gt_quality = {} # Real data: reader_t = VariantFile(true_vcf_file) for rec_t in reader_t: samples = rec_t.samples genotypes[rec_t.id] = ["/".join(map(str, samples[x]["GT"])) for x in samples] nb_inds = len(list(genotypes.values())[0]) # Samples: reader = VariantFile(genotypes_file) for rec in reader: samples = rec.samples genotypes[rec.id] = ["/".join(map(str, samples[x]["GT"])) for x in samples] gt_quality[rec.id] = [samples[x]["GQ"] for x in samples] return genotypes, gt_quality, nb_inds def build_records(genotypes, SVSet, true_ones_records, filtered_records, gt_quality): """ Build records for each SV :param genotypes: list of genotypes of each individual for each SV :param SVSet: set of all SVs :param true_ones_records: records of the real data :param filtered_records: records of the filtered data :param gt_quality: list of qualities of each genotype of individuals for each SV :return: records dict, tools set, list of orphans records (associated to None real data) """ number = 1 records = {} tools = set() orphans = 0 for components in connected_components(SVSet): names = [] results = {} true_one = None chromosome = None for node in components: names.append(node.id) if node.id in true_ones_records: true_one = node.id records[true_one] = { "start": node.start, "end": node.end, "length": int(node.end) - int(node.start), "tools": {}, "orphan": False, "genotypes": genotypes[true_one] if true_one in genotypes else None, "chromosome": node.chrom } else: tool_name = node.id.split("_")[0] results[tool_name] = { "start": node.start, "end": node.end, "length": int(node.end) - int(node.start), "filtered": (node.id in filtered_records) if filtered_records is not None else False, "genotypes": genotypes[node.id] if node.id in genotypes else None, "qualities": gt_quality[node.id] if node.id in gt_quality else None } tools.add(tool_name) chromosome = node.chrom if true_one is not None: records[true_one]["tools"] = results else: orphans += 1 records["orphan_" + str(orphans)] = { "start": "", "end": "", "length": "", "tools": results, "orphan": True, "genotypes": None, "chromosome": chromosome } names.sort() eprint("Group #%i: %s" % (number, ", ".join(names))) number += 1 return records, tools, orphans def build_header(tools, filtered_records, nb_records, max_col_len, nb_inds): """ Build tools headers, and header cells for each tool :param tools: list of tools :param cells: cells of the first sheet (sv description) :param cells_gt: cells of the second sheet (genotype) :param cells_gq: cells of the third sheet (genotype quality) :param filtered_records: (bool) is there filtered records :param nb_records: number of records :param max_col_len: max content length for each column :param nb_inds: number of individuals :return: headers (list of tools + read and filtered data) ; cells, cells_gt, cells_gq and max_col_len updated """ # First one (SV description) cells = { "A1": {"text": "RESULTS", "format": {"bg_color": "#ffe856"}}, "A2": {"text": "Deletion", "format": {"bold": True}}, "A" + str(1 + nb_records + 3): {"text": "DIFFS", "format": {"bg_color": "#ffe856"}}, "A" + str(2 + nb_records + 3): {"text": "Deletion", "format": {"bold": True}} } # Second one (genotype) cells_gt = { "A1": {"text": "GENOTYPES", "format": {"bg_color": "#ffe856"}}, "A2": {"text": "Deletion", "format": {"bold": True}} } # Third one (genotype quality) cells_gq = { "A1": {"text": "GT QUALITY", "format": {"bg_color": "#ffe856"}}, "A2": {"text": "Deletion", "format": {"bold": True}} } i = 2 j = 2 headers = ["Chr"] for tool in ["Real data"] + tools + (["Filtered results"] if filtered_records else []): headers.append(tool) l_format = {"bold": True} if tool == "Filtered results": l_format["bg_color"] = COLOR_COL_FILTER cells[XLSX_COLS[i] + "2"] = cells[XLSX_COLS[i] + str(2 + nb_records + 3)] = {"text": "Start", "format": l_format} max_col_len[i] = get_max_len("Start", i, max_col_len) cells[XLSX_COLS[i + 1] + "2"] = cells[XLSX_COLS[i + 1] + str(2 + nb_records + 3)] = {"text": "End", "format": l_format} max_col_len[i + 1] = get_max_len("End", i + 1, max_col_len) cells[XLSX_COLS[i + 2] + "2"] = cells[XLSX_COLS[i + 2] + str(2 + nb_records + 3)] = {"text": "Length", "format": l_format} max_col_len[i + 2] = get_max_len("Length", i + 2, max_col_len) # Genotypes: for k in range(0, nb_inds): cells_gt[XLSX_COLS[j + k] + "2"] = cells_gq[XLSX_COLS[j + k] + "2"] = \ {"text": "indiv" + str(k + 1), "format": {"bold": True}} i += 3 j += nb_inds return headers, cells, cells_gt, cells_gq, max_col_len def fill_real_data(row, cells, cells_gt, cells_gq, max_col_len, record, rec_id, nb_records, chromosome): """ Fill cells of the first data column (real data simulated) :param cells: cells of the first sheet (sv description) :param cells_gt: cells of the second sheet (genotype) :param cells_gq: cells of the third sheet (genotype quality) :param max_col_len: max text length for each column :param record: data of the record :param nb_records: number of records (total) :return: cells, cells_gt, cells_gq and max_col_len updated """ # SV ID: cells[XLSX_COLS[0] + str(row)] = cells[XLSX_COLS[0] + str(row + nb_records + 3)] = {"text": rec_id, "format": {}} max_col_len[0] = get_max_len(rec_id, 0, max_col_len) cells_gt[XLSX_COLS[0] + str(row)] = cells_gq[XLSX_COLS[0] + str(row)] = {"text": rec_id, "format": {}} # Chromosome: cells[XLSX_COLS[1] + str(row)] = cells[XLSX_COLS[1] + str(row + nb_records + 3)] = {"text": chromosome, "format": {}} cells_gt[XLSX_COLS[1] + str(row)] = cells_gq[XLSX_COLS[1] + str(row)] = {"text": chromosome, "format": {}} # START: cells[XLSX_COLS[2] + str(row)] = cells[XLSX_COLS[2] + str(row + nb_records + 3)] = {"text": record["start"], "format": {}} max_col_len[2] = get_max_len(record["start"], 2, max_col_len) # END: cells[XLSX_COLS[3] + str(row)] = cells[XLSX_COLS[3] + str(row + nb_records + 3)] = {"text": record["end"], "format": {}} max_col_len[3] = get_max_len(record["end"], 3, max_col_len) # LENGTH: cells[XLSX_COLS[4] + str(row)] = cells[XLSX_COLS[4] + str(row + nb_records + 3)] = {"text": record["length"], "format": {}} max_col_len[4] = get_max_len(record["length"], 4, max_col_len) # GENOTYPES: if record["genotypes"] is not None: for gt in range(0, len(record["genotypes"])): cells_gt[XLSX_COLS[2 + gt] + str(row)] = cells_gq[XLSX_COLS[2 + 0 + gt] + str(row)] = \ {"text": record["genotypes"][gt], "format": {}} return cells, cells_gt, cells_gq, max_col_len def fill_tool_data(row, col, cells, max_col_len, record, nb_records, my_start, my_end, my_length, sv_format=None): """ Fill cells for a tool and a record :param row: row position :param col: columns position :param cells: cells of the first sheet (sv description) :param max_col_len: max text length for each column :param record: data of the record for a tool :param nb_records: number of records (total) :param my_start: real start of the SV :param my_end: real end of the SV :param my_length: real length of the SV :param sv_format: format for the cell :return: cells and max_col_len updated """ if sv_format is None: sv_format = {} ############# # RAW DATA: # ############# # START: cells[XLSX_COLS[col] + str(row)] = {"text": record["start"], "format": sv_format} max_col_len[col] = get_max_len(record["start"], col, max_col_len) # END: cells[XLSX_COLS[col + 1] + str(row)] = {"text": record["end"], "format": sv_format} max_col_len[col + 1] = get_max_len(record["end"], col + 1, max_col_len) # LENGTH: cells[XLSX_COLS[col + 2] + str(row)] = {"text": record["length"], "format": sv_format} max_col_len[col + 2] = get_max_len(record["length"], col + 2, max_col_len) ########### # DIFFS: # ########### if my_start != "": start = record["start"] - my_start end = record["end"] - my_end length = record["length"] - my_length else: start = end = length = "NA" # START: cells[XLSX_COLS[col] + str(row + nb_records + 3)] = {"text": start, "format": sv_format} # END: cells[XLSX_COLS[col + 1] + str(row + nb_records + 3)] = {"text": end, "format": sv_format} # LENGTH: cells[XLSX_COLS[col + 2] + str(row + nb_records + 3)] = {"text": length, "format": sv_format} return cells, max_col_len def fill_genotypes_data(row, col, cells_gt, cells_gq, record, my_genotypes, haploid=False): """ Fill cells for a tool and a record (genotype/genotype quality parts) :param row: row position :param col: column position :param cells_gt: cells for genotype sheet (2) :param cells_gq: cells for the genotype quality sheet (3) :param record: data of the record for a tool :param my_genotypes: real genotypes for each individual :return: cells_gt and cells_gq updated """ the_genotypes = record["genotypes"] for gt in range(0, len(the_genotypes)): true_gt = my_genotypes[gt] if my_genotypes is not None else "" geno = get_gt(the_genotypes[gt], true_gt) # Format: gt_format = {"bg_color": get_quality_color(int(record["qualities"][gt]))} if (not haploid and geno != true_gt) or \ (haploid and ((true_gt == "1" and geno != "1/1") or (true_gt == "0" and geno != "0/0"))): gt_format["font_color"] = "#ff0000" # Genotype: cells_gt[XLSX_COLS[col + gt] + str(row)] = {"text": geno, "format": gt_format} # Quality: cells_gq[XLSX_COLS[col + gt] + str(row)] = {"text": int(record["qualities"][gt]), "format": gt_format} return cells_gt, cells_gq def write_headers(workbook, worksheet, headers, cell_len, rows, filtered_records): """ Write header on the given sheet of the XLSX file :param workbook: the XLSX file workbook :param worksheet: th sheet :param headers: headers values :param cell_len: length of each cell of the header :param rows: (list) rows into write the header :param filtered_records: (bool) is there filtered records """ for row in rows: merge_format = workbook.add_format({'bold': True}) worksheet.write(XLSX_COLS[1] + str(row + 1) + ":" + XLSX_COLS[cell_len] + str(row + 1), headers[0], merge_format) i = 0 for header in headers[1:]: if i < len(headers) - 2 or not filtered_records: merge_format = workbook.add_format({'align': 'center'}) else: merge_format = workbook.add_format({'align': 'center', 'bg_color': COLOR_COL_FILTER}) for row in rows: worksheet.merge_range(XLSX_COLS[2 + i * cell_len] + str(row) + ":" + XLSX_COLS[1 + cell_len + i * cell_len] + str(row), header, merge_format) i += 1 def create_xls_document(output, genotypes, headers, filtered_records, nb_records, nb_inds, cells, cells_gt, cells_gq, max_col_len): """ Create the XLSX file :param output: output directory :param genotypes: genotypes file :param headers: headers of each sheet :param filtered_records: (bool) has filtered records :param nb_records: number of records :param nb_inds: number of individuals :param cells: cells for first sheet (SV description) :param cells_gt: cells for second sheet (genotypes) :param cells_gq: cells for third sheet (genotype quality) :param max_col_len: max content length for each column """ try: import xlsxwriter except ImportError: print("\nWARN: Excel file not built: xlsxwriter python module not installed") return False with xlsxwriter.Workbook(output + ".xlsx") as workbook: ################################# # First sheet (SV description): # ################################# worksheet = workbook.add_worksheet("SVs") write_headers(workbook, worksheet, headers, 3, [1, 1+nb_records+3], filtered_records) # Body: for cell_id, cell_content in cells.items(): cell_format = workbook.add_format(cell_content["format"]) worksheet.write(cell_id, cell_content["text"], cell_format) # Resize columns: for col, max_len in max_col_len.items(): worksheet.set_column(col, col, max_len+1) if genotypes: ############################# # Second sheet (Genotypes): # ############################# worksheet_gt = workbook.add_worksheet("Genotypes") write_headers(workbook, worksheet_gt, headers, nb_inds, [1], filtered_records) # Body: for cell_id, cell_content in cells_gt.items(): cell_format = workbook.add_format(cell_content["format"]) worksheet_gt.write(cell_id, cell_content["text"], cell_format) worksheet_gt.freeze_panes(0, 2+nb_inds) # Resize columns: worksheet_gt.set_column(0, 0, max_col_len[0]+1) #################################### # Third sheet (Genotypes quality): # #################################### worksheet_gq = workbook.add_worksheet("Gt quality") write_headers(workbook, worksheet_gq, headers, nb_inds, [1], filtered_records) # Body for cell_id, cell_content in cells_gq.items(): cell_format = workbook.add_format(cell_content["format"]) worksheet_gq.write(cell_id, cell_content["text"], cell_format) worksheet_gq.freeze_panes(0, 2+nb_inds) # Resize columns: worksheet_gq.set_column(0, 0, max_col_len[0]+1) return True def create_tsv_file(filename: str, headers: list, cells: dict, nb_tools: int, nb_per_tool: int, records_range: ()): """ Create tabulated separated values file :param filename: filename of the file :param headers: headers of each sheet :param cells: cells of the table to save :param nb_tools: number of tools :param nb_per_tool: number per tools :param records_range: range of records to treat {tuple(2)} :return: """ # Init rows: head = ["", headers[0]] top_headers = {} h = 2 for header in headers[1:]: # Define top headers to each column: for i in range(0, nb_per_tool): top_headers[h] = header head.append("") h += 1 rows = [head] for i in range(0, records_range[1]-records_range[0]): rows.append(["" for x in range(0, (nb_tools * nb_per_tool) + 2)]) # Fill content: for id_cell, cell in cells.items(): id_m = re.match(r"^([A-Z]+)(\d+)$", id_cell) col = XLSX_COLS.index(id_m.group(1)) row = int(id_m.group(2)) if records_range[0] <= row <= records_range[1]: r = row - records_range[0] if r == 0 and col > 0: if col > 1: rows[r][col] = top_headers[col].replace(" ", "_") + "__" + cell["text"] else: rows[r][col] = headers[0] else: rows[r][col] = str(cell["text"]) # List as text: for r in range(0, len(rows)): rows[r] = "\t".join(rows[r]) tsv = "\n".join(rows) with open(filename, "w") as tsv_file: tsv_file.write(tsv) def search_vcf_files(my_folder): """ Search vcf files recursively (can be replaced by glob, but required python >= 3.5 (3.4 version in genotoul plateform) :param my_folder: folder into search files :return: list of vcf files, with absolute paths """ vcf_files = [] for item in os.listdir(my_folder): item_file = os.path.join(my_folder, item) if os.path.isfile(item_file) and (item_file.endswith(".vcf") or item_file.endswith(".vcf.gz")): vcf_files.append(os.path.abspath(item_file)) elif os.path.isdir(item_file): vcf_files += search_vcf_files(item_file) return vcf_files def print_results(nb_records, orphans, with_xlsx, output, genotypes_file): """ Print list of outputs :param nb_records: number of records {int} :param orphans: sv found in tools but not present in real data {dict} :param with_xlsx: build xlsx file {bool} :param output: output prefix {str} :param genotypes_file: genotypes file {str} """ print("") print("###########") print("# RESULTS #") print("###########") print("") print(str(nb_records) + " Results found") print(str(orphans) + " False Positive") print("") if with_xlsx: print("Results saved in :\n\t- " + output + ".xlsx") else: print("Results:") print("") print("TSV files:") print("\t- " + output + "_sv_per_tools.tsv") print("\t- " + output + "_sv_diffs_per_tools.tsv") if genotypes_file: print("\t- " + output + "_sv_genotypes_per_tools.tsv") print("\t- " + output + "_sv_genotypes_quality_per_tools.tsv") print("") def fill_cells_no_tools(cells, cells_gt, cells_gq, i, j, g, nb_records, nb_inds, genotypes_file): """ Fill cells when a tool does not detect a SV :param cells: cells definition :param cells_gt: cells definition for genotypes :param cells_gq: cells definition for genotypes quality :param i: row index :param j: column index :param g: column index for genotypes and genotypes quality tables :param nb_records: total number of records :param nb_inds: total number of individuals :param genotypes_file: VCF file containing genotypes :return: cells, completed """ for k in range(0, 3): # noinspection PyUnresolvedReferences cells[XLSX_COLS[j + k] + str(i)] = {"text": "", "format": {"bg_color": COLOR_NOT_FOUND}} cells[XLSX_COLS[j + k] + str(i + nb_records + 3)] = {"text": "", "format": {"bg_color": COLOR_NOT_FOUND}} # Genotype: if genotypes_file: for gt in range(0, nb_inds): # noinspection PyUnresolvedReferences cells_gt[XLSX_COLS[g + gt] + str(i)] = cells_gq[XLSX_COLS[g + gt] + str(i)] = \ {"text": "", "format": {"bg_color": COLOR_NOT_FOUND_2}} return cells, cells_gt, cells_gq def apply_style_of_filter_cells(cells, cells_gt, cells_gq, i, is_kept, nb_records, nb_inds, nb_tools, filtered_records, genotypes_file, rec_id): """ Apply style of cells :param cells: cells of the default table {dict} :param cells_gt: cells of the genotypes table {dict} :param cells_gq: cells of the genotypes quality table {dict} :param i: row index {int} :param is_kept: is the variant kept after filtering {bool} :param nb_records: number of records {int} :param nb_inds: number of individuals {int} :param nb_tools: number of tools {int} :param filtered_records: file containing filtered records {str} :param genotypes_file: file containing genotypes {str} :param rec_id: id of the record {str} :return: """ if is_kept: # SV is kept: color bg in green in the corresponding tool cells[XLSX_COLS[0] + str(i)]["format"]["bg_color"] = \ cells[XLSX_COLS[0] + str(i + nb_records + 3)]["format"]["bg_color"] = COLOR_IS_KEPT elif filtered_records is not None: # SV does not pass the filter: color bg in red in the filter column cells[XLSX_COLS[2 + ((nb_tools + 1) * 3)] + str(i)] = {"text": "", "format": {"bg_color": COLOR_NOT_FOUND}} cells[XLSX_COLS[2 + ((nb_tools + 1) * 3) + 1] + str(i)] = {"text": "", "format": {"bg_color": COLOR_NOT_FOUND}} cells[XLSX_COLS[2 + ((nb_tools + 1) * 3) + 2] + str(i)] = {"text": "", "format": {"bg_color": COLOR_NOT_FOUND}} cells[XLSX_COLS[2 + ((nb_tools + 1) * 3)] + str(i + nb_records + 3)] = {"text": "", "format": { "bg_color": COLOR_NOT_FOUND}} cells[XLSX_COLS[2 + ((nb_tools + 1) * 3) + 1] + str(i + nb_records + 3)] = { "text": "", "format": {"bg_color": COLOR_NOT_FOUND}} cells[XLSX_COLS[2 + ((nb_tools + 1) * 3) + 2] + str(i + nb_records + 3)] = { "text": "", "format": {"bg_color": COLOR_NOT_FOUND}} # Genotype: if genotypes_file: # Color in gray in the filter column for gt in range(0, nb_inds): cells_gt[XLSX_COLS[2 + ((nb_tools + 1) * nb_inds) + gt] + str(i)] = \ cells_gq[XLSX_COLS[2 + ((nb_tools + 1) * nb_inds) + gt] + str(i)] = {"text": "", "format": {"bg_color": COLOR_NOT_FOUND_2} } # False positives (orphans) in orange: if re.match(r"^orphan_\d+$", rec_id): cells[XLSX_COLS[0] + str(i)]["format"]["bg_color"] = \ cells[XLSX_COLS[0] + str(i + nb_records + 3)]["format"]["bg_color"] = COLOR_FALSE_POSITIVE return cells, cells_gt, cells_gq def build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells, cells_gt, cells_gq, max_col_len, nb_tools, genotypes_file, haploid, filtered_records): i = 3 for rec_id in rec_keys: record = records[rec_id] my_start = record["start"] my_end = record["end"] my_length = record["length"] my_genotypes = record["genotypes"] # Real data of the simulation: cells, cells_gt, cells_gq, max_col_len = fill_real_data(i, cells, cells_gt, cells_gq, max_col_len, record, rec_id, nb_records, record["chromosome"]) j = 5 g = nb_inds + 2 is_kept = False for tool in tools: if tool in record["tools"]: ########################### # IF TOOL DETECTS THE SV: # ########################### if record["tools"][tool]["filtered"]: ###################### # SET FILTERED DATA: # ###################### is_kept = True sv_format = {"bg_color": COLOR_IS_KEPT} # SV data (sheet 1): cells, max_col_len = fill_tool_data(i, 2 + ((nb_tools + 1) * 3), cells, max_col_len, record["tools"][tool], nb_records, my_start, my_end, my_length, {"bg_color": COLOR_COL_FILTER}) if genotypes_file: # Genotype (sheets 2&3): cells_gt, cells_gq = fill_genotypes_data(i, 2 + ((nb_tools + 1) * nb_inds), cells_gt, cells_gq, record["tools"][tool], my_genotypes, haploid) else: sv_format = {} ################# # SET TOOL DATA # ################# # SV data (sheet 1): cells, max_col_len = fill_tool_data(i, j, cells, max_col_len, record["tools"][tool], nb_records, my_start, my_end, my_length, sv_format) if genotypes_file: # Genotype (sheets 2&3): cells_gt, cells_gq = fill_genotypes_data(i, g, cells_gt, cells_gq, record["tools"][tool], my_genotypes, haploid) else: ############################### # TOOL DOES NOT DETECT THE SV # ############################### cells, cells_gt, cells_gq = fill_cells_no_tools(cells, cells_gt, cells_gq, i, j, g, nb_records, nb_inds, genotypes_file) j += 3 g += nb_inds ############################################################################### # Until we have filled all tools, check if the record is kept after filtering: # ############################################################################### cells, cells_gt, cells_gq = apply_style_of_filter_cells(cells, cells_gt, cells_gq, i, is_kept, nb_records, nb_inds, nb_tools, filtered_records, genotypes_file, rec_id) i += 1 return cells, cells_gt, cells_gq, max_col_len def build_xlsx_cols(): for alp in ALPHABET: for j in ALPHABET: XLSX_COLS.append(alp + j) def init(output, vcf_folder, true_vcf, filtered_vcf=None, genotypes_file=None, overlap_cutoff=0.5, left_precision=sys.maxsize, right_precision=sys.maxsize, no_xls=False, haploid=False): build_xlsx_cols() genotypes = {} gt_quality = {} nb_inds = 0 if genotypes_file: genotypes, gt_quality, nb_inds = get_genotypes(genotypes_file, true_vcf) filenames = search_vcf_files(vcf_folder) true_ones = true_vcf # Reading all the vcf files sv_set = [] for infile in filenames: eprint(" Reading file %s" % infile) try: sv_set += read_vcf_file(infile)[0] except: print("Ignoreing file %s" % infile) eprint(" Reading file %s" % true_ones) sv_set_to, true_ones_records = read_vcf_file(true_ones) sv_set += sv_set_to filtered_records = None if filtered_vcf: eprint(" Reading file %s" % filtered_vcf) filtered_records = read_vcf_file(filtered_vcf)[1] # Compute connected components: eprint("Computing Connected components") construct_overlap_graph(sv_set, overlap_cutoff, left_precision, right_precision) # Build records: records, tools, orphans = build_records(genotypes, sv_set, true_ones_records, filtered_records, gt_quality) nb_records = len(records) ################################################# # Define cells of each sheet of the excel file: # ################################################# tools = sorted(tools) nb_tools = len(tools) max_col_len = {} ###################### # BUILD HEADER CELLS # ###################### headers, cells, cells_gt, cells_gq, max_col_len = build_header(tools, filtered_records is not None, nb_records, max_col_len, nb_inds) rec_keys = sorted(records.keys(), key=lambda x: (records[x]["chromosome"], svsort(x, records))) #################### # BUILD BODY CELLS # #################### cells, cells_gt, cells_gq, max_col_len = build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells, cells_gt, cells_gq, max_col_len, nb_tools, genotypes_file, haploid, filtered_records) # Create document: with_xlsx = False if not no_xls: with_xlsx = create_xls_document(output, genotypes, headers, filtered_records is not None, nb_records, nb_inds, cells, cells_gt, cells_gq, max_col_len) # Create CSV files: create_tsv_file(output + "_sv_per_tools.tsv", headers, cells, nb_tools + (2 if filtered_records is not None else 1), 3, (2, nb_records + 2)) create_tsv_file(output + "_sv_diffs_per_tools.tsv", headers, cells, nb_tools + (2 if filtered_records is not None else 1), 3, (2 + nb_records + 3, nb_records * 2 + 5)) if genotypes_file: create_tsv_file(output + "_sv_genotypes_per_tools.tsv", headers, cells_gt, nb_tools + (2 if filtered_records is not None else 1), nb_inds, (2, nb_records + 2)) create_tsv_file(output + "_sv_genotypes_quality_per_tools.tsv", headers, cells_gq, nb_tools + (2 if filtered_records is not None else 1), nb_inds, (2, nb_records + 2)) print_results(nb_records, orphans, with_xlsx, output, genotypes_file) def main(): """ Main function """ # parse the command line args args = get_args() init(args.output, args.vcf, args.true_vcf, args.filtered_vcf, args.genotypes, args.overlap_cutoff, args.left_precision, args.right_precision, args.no_xls, args.haploid) # initialize the script if __name__ == '__main__': sys.exit(main())