Skip to content
Snippets Groups Projects
Commit a96b9922 authored by Floreal Cabanettes's avatar Floreal Cabanettes
Browse files

Add genotypes comparison + resize columns

parent 410e18aa
No related branches found
No related tags found
No related merge requests found
......@@ -18,6 +18,7 @@ import argparse, sys
import glob
import vcf
import string
import os
import re
from svreader.vcfwrapper import VCFReader
......@@ -25,6 +26,8 @@ from svinterval import construct_overlap_graph, connected_components
import xlsxwriter
prg_path = os.path.dirname(os.path.realpath(__file__))
alphabet = list(string.ascii_uppercase)
xlsx_cols = alphabet.copy()
for i in alphabet:
......@@ -41,6 +44,7 @@ description: Merge SV based on reciprocal overlap")
parser.add_argument('-t', '--true-vcf', type=str, required=True, help='VCF file containing the simulated deletions')
parser.add_argument('-f', '--filtered-vcf', type=str, required=False,
help='VCF file containing the filtered results')
parser.add_argument('-g', '--genotypes', type=str, help="VCF file containing genotypes")
parser.add_argument('--overlap_cutoff', type=float, default=0.5, help='cutoff for reciprocal overlap')
parser.add_argument('--left_precision', type=int, default=-1, help='left breakpoint precision')
parser.add_argument('--right_precision', type=int, default=-1, help='right breakpoint precision')
......@@ -125,10 +129,37 @@ def svsort(sv, records):
first_tool = list(records[sv]["tools"].keys())[0]
return int(records[sv]["tools"][first_tool]["start"])
def get_gt(geno, true_gt):
if true_gt is not None:
if geno == "1/0" and true_gt == "0/1":
geno = "0/1"
elif geno == "0/1" and true_gt == "1/0":
geno = "1/0"
return geno
def get_max_len(cell, col, max_col_len):
return max(len(str(cell)), max_col_len[col] if col in max_col_len else 0)
def main():
# parse the command line args
args = get_args()
genotypes = {}
nb_inds = 0
if args.genotypes:
genotypes_raw = os.popen("zcat genotype_chr_chr12.vcf.gz " + args.true_vcf + " | " +
os.path.join(prg_path, "vawk") + " '{ print $3,S$*$GT}'").read().split("\n")
for gt in genotypes_raw:
if gt != "":
gt_l = gt.split("\t")
genotypes[gt_l[0]] = gt_l[1:]
nb_inds = len(gt_l[1:])
overlap_cutoff = args.overlap_cutoff
left_precision = args.left_precision
right_precision = args.right_precision
......@@ -181,7 +212,8 @@ def main():
"end": node.end,
"length": int(node.end) - int(node.start),
"tools": {},
"orphan": False
"orphan": False,
"genotypes": genotypes[true_one] if true_one in genotypes else None
}
else:
tool_name = node.id.split("_")[0]
......@@ -189,7 +221,8 @@ def main():
"start": node.start,
"end": node.end,
"length": int(node.end) - int(node.start),
"filtered": (node.id in filtered_records) if filtered_records is not None else False
"filtered": (node.id in filtered_records) if filtered_records is not None else False,
"genotypes": genotypes[node.id] if node.id in genotypes else None
}
tools.add(tool_name)
......@@ -202,7 +235,8 @@ def main():
"end": "",
"length": "",
"tools": results,
"orphan": True
"orphan": True,
"genotypes": None
}
names.sort()
......@@ -218,10 +252,19 @@ def main():
"A" + str(1+nb_records+3): {"text": "DIFFS", "format": {"bg_color": "#ffe856"}},
"A" + str(2+nb_records+3): {"text": "Deletion", "format": {"bold": True}}
}
cells_gt = {
"A1": {"text": "GENOTYPES", "format": {"bg_color": "#ffe856"}},
"A2": {"text": "Deletion", "format": {"bold": True}}
}
tools = sorted(tools)
nb_tools = len(tools)
max_col_len = {}
i = 1
j = 1
headers = []
for tool in ["Real data"] + tools + (["Filtered results"] if filtered_records is not None else []):
#cells[xlsx_cols[i] + "1"] = cells[xlsx_cols[i] + str(1+nb_records+3)] = {"text": tool, "format": ""}
......@@ -230,11 +273,20 @@ def main():
if tool == "Filtered results":
l_format["bg_color"] = color_col_filter
cells[xlsx_cols[i] + "2"] = cells[xlsx_cols[i] + str(2+nb_records+3)] = {"text": "Start", "format": l_format}
max_col_len[i] = get_max_len("Start", i, max_col_len)
cells[xlsx_cols[i + 1] + "2"] = cells[xlsx_cols[i+1] + str(2+nb_records+3)] = {"text": "End",
"format": l_format}
max_col_len[i+1] = get_max_len("End", i+1, max_col_len)
cells[xlsx_cols[i + 2] + "2"] = cells[xlsx_cols[i+2] + str(2+nb_records+3)] = {"text": "Length",
"format": l_format}
max_col_len[i+2] = get_max_len("Length", i+2, max_col_len)
# Genotypes:
for k in range(0, nb_inds):
cells_gt[xlsx_cols[j+k] + "2"] = {"text": "INDIV_" + str(k+1), "format": {"bold": True}}
i += 3
j += nb_inds
rec_keys = sorted(records.keys(), key=lambda x:svsort(x, records))
......@@ -242,17 +294,31 @@ def main():
for id in rec_keys:
record = records[id]
cells[xlsx_cols[0] + str(i)] = cells[xlsx_cols[0] + str(i+nb_records+3)] = {"text": id, "format": {}}
max_col_len[0] = get_max_len(id, 0, max_col_len)
print(max_col_len[0])
cells_gt[xlsx_cols[0] + str(i)] = {"text": id, "format": {}}
cells[xlsx_cols[1] + str(i)] = cells[xlsx_cols[1] + str(i+nb_records+3)] = {"text": record["start"],
"format": {}}
max_col_len[1] = get_max_len(record["start"], 1, max_col_len)
my_start = record["start"]
cells[xlsx_cols[2] + str(i)] = cells[xlsx_cols[2] + str(i+nb_records+3)] = {"text": record["end"],
"format": {}}
max_col_len[2] = get_max_len(record["end"], 2, max_col_len)
my_end = record["end"]
cells[xlsx_cols[3] + str(i)] = cells[xlsx_cols[3] + str(i+nb_records+3)] = {"text": record["length"],
"format": {}}
max_col_len[3] = get_max_len(record["length"], 3, max_col_len)
my_length = record["length"]
# Genotypes:
my_genotypes = None
if record["genotypes"] is not None:
for gt in range(0, len(record["genotypes"])):
cells_gt[xlsx_cols[1+gt] + str(i)] = {"text": record["genotypes"][gt], "format": {}}
my_genotypes = record["genotypes"]
j = 4
g = nb_inds + 1
is_kept = False
for tool in tools:
if tool in record["tools"]:
......@@ -262,10 +328,19 @@ def main():
# Raw values:
cells[xlsx_cols[1+((nb_tools+1)*3)] + str(i)] = {"text": record["tools"][tool]["start"],
"format": {"bg_color": color_col_filter}}
max_col_len[1+((nb_tools+1)*3)] = \
get_max_len(record["tools"][tool]["start"], 1 + ((nb_tools + 1) * 3),
max_col_len)
cells[xlsx_cols[1+((nb_tools+1)*3)+1] + str(i)] = {"text": record["tools"][tool]["end"],
"format": {"bg_color": color_col_filter}}
max_col_len[1 + ((nb_tools + 1) * 3)+1] = \
get_max_len(record["tools"][tool]["end"], 1 + ((nb_tools + 1) * 3)+1,
max_col_len)
cells[xlsx_cols[1+((nb_tools+1)*3)+2] + str(i)] = {"text": record["tools"][tool]["length"],
"format": {"bg_color": color_col_filter}}
max_col_len[1 + ((nb_tools + 1) * 3) + 2] = \
get_max_len(record["tools"][tool]["length"], 1 + ((nb_tools + 1) * 3) + 2,
max_col_len)
# Diffs:
if my_start != "":
cells[xlsx_cols[1+((nb_tools+1)*3)] + str(i+nb_records+3)] = {
......@@ -283,12 +358,27 @@ def main():
cells[xlsx_cols[1 + ((nb_tools + 1) * 3) + 2] + str(i + nb_records + 3)] = {
"text": "?????", "format": {"bg_color": color_col_filter}}
# Genotypes:
if record["tools"][tool]["genotypes"] is not None:
the_genotypes = record["tools"][tool]["genotypes"]
for gt in range(0, len(the_genotypes)):
true_gt = my_genotypes[gt]
geno = get_gt(the_genotypes[gt], true_gt)
gt_format = {"bg_color": color_col_filter}
if geno != my_genotypes[gt]:
gt_format["font_color"] = "#ff0000"
cells_gt[xlsx_cols[1 + ((nb_tools + 1) * nb_inds) + gt] + str(i)] = \
{"text": geno, "format": gt_format}
else:
sv_format = {}
# Raw values:
cells[xlsx_cols[j] + str(i)] = {"text": record["tools"][tool]["start"], "format": sv_format}
max_col_len[j] = get_max_len(record["tools"][tool]["start"], j, max_col_len)
cells[xlsx_cols[j + 1] + str(i)] = {"text": record["tools"][tool]["end"], "format": sv_format}
max_col_len[j + 1] = get_max_len(record["tools"][tool]["end"], j + 1, max_col_len)
cells[xlsx_cols[j + 2] + str(i)] = {"text": record["tools"][tool]["length"], "format": sv_format}
max_col_len[j + 2] = get_max_len(record["tools"][tool]["length"], j + 2, max_col_len)
# Diffs :
if my_start != "":
......@@ -299,19 +389,36 @@ def main():
cells[xlsx_cols[j + 2] + str(i+nb_records+3)] = {"text": record["tools"][tool]["length"] - my_length,
"format": sv_format}
else:
cells[xlsx_cols[j] + str(i + nb_records + 3)] = {"text": "?????", "format": sv_format}
cells[xlsx_cols[j + 1] + str(i + nb_records + 3)] = {"text": "?????", "format": sv_format}
cells[xlsx_cols[j + 2] + str(i + nb_records + 3)] = {"text": "?????", "format": sv_format}
cells[xlsx_cols[j] + str(i + nb_records + 3)] = {"text": "NA", "format": sv_format}
cells[xlsx_cols[j + 1] + str(i + nb_records + 3)] = {"text": "NA", "format": sv_format}
cells[xlsx_cols[j + 2] + str(i + nb_records + 3)] = {"text": "NA", "format": sv_format}
# Genotypes:
if record["tools"][tool]["genotypes"] is not None:
the_genotypes = record["tools"][tool]["genotypes"]
for gt in range(0, len(the_genotypes)):
true_gt = my_genotypes[gt]
geno = get_gt(the_genotypes[gt], true_gt)
gt_format = sv_format.copy()
if geno != true_gt:
gt_format["font_color"] = "#ff0000"
cells_gt[xlsx_cols[g + gt] + str(i)] = \
{"text": geno, "format": gt_format}
else:
for k in range(0,3):
cells[xlsx_cols[j + k] + str(i)] = {"text": "", "format": {"bg_color": color_not_found}}
cells[xlsx_cols[j + k] + str(i+nb_records+3)] = {"text": "", "format": {"bg_color": color_not_found}}
# Genotype:
for gt in range(0, nb_inds):
cells_gt[xlsx_cols[g + gt] + str(i)] = {"text": "", "format": {"bg_color": "#000000"}}
j += 3
g += nb_inds
if is_kept:
cells[xlsx_cols[0] + str(i)]["format"]["bg_color"] = \
cells[xlsx_cols[0] + str(i+nb_records+3)]["format"]["bg_color"] = color_is_kept
else:
elif filtered_records is not None:
cells[xlsx_cols[1 + ((nb_tools + 1) * 3)] + str(i)] = {"text": "", "format": {"bg_color": color_not_found}}
cells[xlsx_cols[1 + ((nb_tools + 1) * 3) + 1] + str(i)] = {"text": "", "format": {"bg_color": color_not_found}}
cells[xlsx_cols[1 + ((nb_tools + 1) * 3) + 2] + str(i)] = {"text": "", "format": {"bg_color": color_not_found}}
......@@ -322,6 +429,11 @@ def main():
cells[xlsx_cols[1 + ((nb_tools + 1) * 3) + 2] + str(i + nb_records + 3)] = {
"text": "", "format": {"bg_color": color_not_found}}
# Genotype:
for gt in range(0, nb_inds):
cells_gt[xlsx_cols[1 + ((nb_tools + 1) * nb_inds) + gt] + str(i)] = {"text": "", "format":
{"bg_color": "#000000"}}
# False positives (orphans) in orange:
if re.match(r"^orphan_\d+$", id):
cells[xlsx_cols[0] + str(i)]["format"]["bg_color"] = \
......@@ -331,9 +443,11 @@ def main():
# Create document:
with xlsxwriter.Workbook(args.output) as workbook:
worksheet = workbook.add_worksheet()
worksheet = workbook.add_worksheet("SVs")
i=0
max_header_length = 0
for header in headers:
max_header_length = max(len(header), max_header_length)
if i < len(headers) - 1 or filtered_records is None:
merge_format = workbook.add_format({'align': 'center'})
else:
......@@ -347,6 +461,29 @@ def main():
cell_format = workbook.add_format(cell_content["format"])
worksheet.write(cell_id, cell_content["text"], cell_format)
# Resize columns:
for col, max_len in max_col_len.items():
print(col,max_len)
worksheet.set_column(col, col, max_len+1)
# Genotypes:
if args.genotypes:
worksheet_gt = workbook.add_worksheet("Genotypes")
i = 0
for header in headers:
if i < len(headers) - 1 or filtered_records is None:
merge_format = workbook.add_format({'align': 'center'})
else:
merge_format = workbook.add_format({'align': 'center', 'bg_color': color_col_filter})
worksheet_gt.merge_range(xlsx_cols[1 + i * nb_inds] + "1:" + xlsx_cols[nb_inds + i * nb_inds] + "1",
header, merge_format)
i += 1
for cell_id, cell_content in cells_gt.items():
cell_format = workbook.add_format(cell_content["format"])
worksheet_gt.write(cell_id, cell_content["text"], cell_format)
worksheet_gt.freeze_panes(0, 9)
worksheet_gt.set_column(0, 0, max_col_len[0]+1)
print("")
print("###########")
print("# RESULTS #")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment