Skip to content
Snippets Groups Projects
Commit 31b9c3bb authored by Floreal Cabanettes's avatar Floreal Cabanettes
Browse files

Add filter information on recall and precision first graphs

parent 2e1554a4
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags:
# CNV detection benchmark
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
from collections import OrderedDict
import math
import re
import os
import json
import pylab as P
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid", color_codes=True)
%matplotlib inline
from pysam import VariantFile
from collections import defaultdict
from collections import Counter
from IPython.display import display, HTML
# Read tsv file
results_df = pd.read_table("results_sv_per_tools.tsv", header=0, index_col=0)
# Retrieve list of tools
tools = set()
for col in results_df.columns:
if col.endswith("__Start") and col != "Real_data__Start" and col != "Filtered_results__Start":
tools.add(col.split("__")[0])
```
%% Cell type:markdown id: tags:
### Recall & Precision
%% Cell type:code id: tags:
``` python
def compute_tp_fp_fn(svs, tool):
def compute_tp_fp_fn(svs, tool, only_filter=False, include_duplicates=True):
tp = 0
fp = 0
fn = 0
start_values = svs["{0}__Start".format(tool)]
for i in range(0, len(start_values)):
if math.isnan(start_values[i]) and not math.isnan(svs["Real_data__Start"][i]):
if not only_filter or (svs["{0}__Filters".format(tool)][i] == "PASS" or (include_duplicates
and svs["{0}__Filters".format(tool)][i] == "DUPLICATE")):
if math.isnan(start_values[i]) and not math.isnan(svs["Real_data__Start"][i]):
fn += 1
elif not math.isnan(start_values[i]) and not math.isnan(svs["Real_data__Start"][i]):
tp += 1
elif not math.isnan(start_values[i]) and math.isnan(svs["Real_data__Start"][i]):
fp += 1
elif not math.isnan(svs["Real_data__Start"][i]):
fn += 1
elif not math.isnan(start_values[i]) and not math.isnan(svs["Real_data__Start"][i]):
tp += 1
elif not math.isnan(start_values[i]) and math.isnan(svs["Real_data__Start"][i]):
fp += 1
return tp, fp, fn
recall = OrderedDict()
recall_f = OrderedDict()
precision = OrderedDict()
precision_f = OrderedDict()
all_colors = ["#405f93", "#468a56", "#be4447", "#6553a0", "#DF3A01", "#868A08"]
all_colors_f = ["#072352", "#0a3614", "#450a0b", "#20124b", "#461707", "#2d2e05"]
colors = {}
i = 0
for tool in tools:
if tool + "__Start" in results_df:
tp, fp, fn = compute_tp_fp_fn(results_df, tool)
recall[tool] = [tp / (tp+fn) * 100]
precision[tool] = [tp / (tp+fp) * 100]
colors[tool] = all_colors[i]
# Filtered ones:
tp, fp, fn = compute_tp_fp_fn(results_df, tool, True)
recall_f[tool] = [tp / (tp+fn) * 100]
precision_f[tool] = [tp / (tp+fp) * 100]
i += 1
plt.figure(1, figsize=(20,10))
# clrs = ['grey' if (x < max(values)) else 'red' for x in values ]
# Plot recall
plt.subplot(121)
recall_df = pd.DataFrame.from_dict(recall, orient="columns")
plot = sns.barplot(data=recall_df)
plot = sns.barplot(data=recall_df, palette=all_colors)
recall_f_df = pd.DataFrame.from_dict(recall_f, orient="columns")
plot = sns.barplot(data=recall_f_df, palette=all_colors_f)
plot.set_title("Recall", fontsize=30)
plot.set_ylabel("Recall (%)", fontsize=20)
plot.set_xlabel("Tool", fontsize=20)
plot.tick_params(labelsize=14)
plot.set_ylim([0,100])
# Plot precision
plt.subplot(122)
precision_f_df = pd.DataFrame.from_dict(precision_f, orient="columns")
plot2 = sns.barplot(data=precision_f_df, palette=all_colors_f)
precision_df = pd.DataFrame.from_dict(precision, orient="columns")
plot2 = sns.barplot(data=precision_df)
plot2 = sns.barplot(data=precision_df, palette=all_colors)
plot2.set_title("Precision", fontsize=30)
plot2.set_ylabel("Precision (%)", fontsize=20)
plot2.set_xlabel("Tool", fontsize=20)
plot2.tick_params(labelsize=14)
plt.show()
```
%% Cell type:markdown id: tags:
### Influence of variant size on recall
%% Cell type:code id: tags:
``` python
groups = []
with open("rules.sim", "r") as rules:
for line in rules:
line = line.rstrip()
if line != "":
parts = re.split(r"\s+", line)
groups.append((int(parts[1]), int(parts[2])))
groups.sort(key=lambda x: x[0])
```
%% Cell type:markdown id: tags:
#### By tool
%% Cell type:code id: tags:
``` python
nrows = math.ceil(len(tools)/2)
ncols = min(2, len(tools))
palettes = ["Blues_d", "Greens_d", "Reds_d", "Purples_d", "YlOrBr_d", "PuBu_d"]
plt.figure(1, figsize=(20,nrows * 8))
nplot=0
for tool in tools:
npalette = nplot
while npalette >= len(palettes):
npalette -= len(palettes)
nplot += 1
results_by_group = OrderedDict()
for group in groups:
tmp_res = results_df[(results_df.Real_data__Length >= group[0]) & (results_df.Real_data__Length < group[1])]
tp, fp, fn = compute_tp_fp_fn(tmp_res, tool)
results_by_group["-".join(map(str,group))] = [tp / (tp+fn) * 100 if tp+fn >0 else 0]
recall_df = pd.DataFrame.from_dict(results_by_group, orient="columns")
plt.subplot(nrows, ncols, nplot)
plot = sns.barplot(data=recall_df, palette=palettes[npalette])
plot.set_title(tool, fontsize=25)
plot.set_ylabel("Recall (%)", fontsize=20)
plot.set_xlabel("Variant size (bp)", fontsize=20)
plot.tick_params(labelsize=14)
plot.set_ylim([0,100])
plt.show()
```
%% Cell type:markdown id: tags:
#### Global
%% Cell type:code id: tags:
``` python
results_by_group = OrderedDict()
for group in groups:
tmp_res = results_df[(results_df.Real_data__Length >= group[0]) & (results_df.Real_data__Length < group[1])]
tp, fp, fn = compute_tp_fp_fn(tmp_res, "Filtered_results")
results_by_group["-".join(map(str,group))] = [tp / (tp+fn) * 100 if tp+fn >0 else 0]
plt.figure(1, figsize=(15,8))
recall_df = pd.DataFrame.from_dict(results_by_group, orient="columns")
plot = sns.barplot(data=recall_df, color='black')
plot.set_title("Recall", fontsize=30)
plot.set_ylabel("Recall (%)", fontsize=20)
plot.set_xlabel("Variant size (bp)", fontsize=20)
plot.tick_params(labelsize=14)
plot.set_ylim([0,100])
plt.show()
```
%% Cell type:markdown id: tags:
### Shared variant by tool
%% Cell type:code id: tags:
``` python
def compute_found_sv(svs: pd.DataFrame, tool: str):
variants = []
start_values = svs["{0}__Start".format(tool)]
for idx in svs.index:
if not math.isnan(start_values[idx]) and not math.isnan(svs["Real_data__Start"][idx]):
variants.append(idx)
return variants
variants_by_tool = []
for tool in tools:
variants_by_tool.append({"name": tool, "data": compute_found_sv(results_df, tool)})
```
%% Cell type:code id: tags:
``` python
html=HTML("<script type='text/javascript' src='http://jvenn.toulouse.inra.fr/app/js/canvas2svg.js'></script> \
<script type='text/javascript' src='http://jvenn.toulouse.inra.fr/app/js/jvenn.min.js'></script> \
<div id='draw'></div> \
<script type='text/javascript'> \
$(document).ready(function(){ \
$('#draw').jvenn({ \
series: " + json.dumps(variants_by_tool) + " \
}); \
}); \
</script>")
display(html)
```
%% Cell type:markdown id: tags:
### Breakpoints precision
%% Cell type:code id: tags:
``` python
plt.figure(1, figsize=(20,8))
# Start precision
df_diffs=pd.read_table("results_sv_diffs_per_tools.tsv", header=0, index_col=0)
all_diffs_soft_start = pd.DataFrame()
for tool in tools:
all_diffs_soft_start[tool] = df_diffs[tool + "__Start"].abs()
plt.subplot(121)
plot = sns.stripplot(data=all_diffs_soft_start, jitter=True)
plot.set_ylim([0,150])
plot.tick_params(labelsize=15)
plot.set_title("Start position", fontsize=28, y=1.04)
plot.set_ylabel("Diff from real data (abs)", fontsize=20)
# End precision
df_diffs=pd.read_table("results_sv_diffs_per_tools.tsv", header=0, index_col=0)
all_diffs_soft_end = pd.DataFrame()
for tool in tools:
all_diffs_soft_end[tool] = df_diffs[tool + "__End"].abs()
plt.subplot(122)
plot = sns.stripplot(data=all_diffs_soft_end, jitter=True)
plot.set_ylim([0,150])
plot.tick_params(labelsize=15)
plot.set_title("End position", fontsize=28, y=1.04)
plot.set_ylabel("Diff from real data (abs)", fontsize=20)
plt.show()
```
%% Cell type:markdown id: tags:
## 2. CNV Genotyping
We compare quality of genotyping
%% Cell type:code id: tags:
``` python
# Simulated deletions
total = len(results_df[results_df.Real_data__Start.notnull()])
```
%% Cell type:code id: tags:
``` python
# Predicted deletions genotyped
df_svtyper=pd.read_csv("results_genotype.tsv",sep='\t',index_col=False,names=['del','software','delsize','recall','left','right'])
df_svtyper['precision'] = [ "precise" if (x+y)<20 else "unprecise" for (x,y) in zip(df_svtyper.left,df_svtyper.right)]
df_svtyper['deltype'] = ['small' if x<200 else 'medium' for x in df_svtyper.delsize]
counts_svtyper = np.unique(df_svtyper.software,return_counts=True)[1]
```
%% Cell type:markdown id: tags:
#### Genotype recall for each software prediction
%% Cell type:code id: tags:
``` python
gt_tools = list(tools) + ["pass"]
plt.figure(1, figsize=(8,5))
sns.stripplot(x="software", y="recall", jitter=True, palette="Set2", dodge=True, linewidth=1, edgecolor='gray', order=gt_tools,data=df_svtyper)
axes = sns.boxplot(x="software", y="recall", palette="Set2", order=gt_tools,data=df_svtyper)
axes.title.set_position([.5, 1.2])
axes.set_title(str(total)+" simulated variants",size=25)
axes.axes.xaxis.label.set_size(20)
axes.axes.yaxis.label.set_size(20)
axes.tick_params(labelsize=15)
ymax = axes.get_ylim()[1]
for i in range(0, len(counts_svtyper)):
t1=axes.text(-0.1 + (i * 1), ymax+ymax/100, counts_svtyper[i], fontsize=15)
```
%% Cell type:markdown id: tags:
#### Inluence of precision : precise means less than 20bp
%% Cell type:code id: tags:
``` python
plt.figure(1, figsize=(8,5))
sns.stripplot(x="software", y="recall", jitter=True, hue="precision", palette="Set2", dodge=True, linewidth=1, edgecolor='gray', order=gt_tools,data=df_svtyper)
axes = sns.boxplot(x="software", y="recall",hue="precision",palette="Set2", order=gt_tools,data=df_svtyper)
axes.title.set_position([.5, 1.2])
axes.set_ylim(0.15, 1.05)
axes.set_title(str(total)+" simulated variants",size=25)
axes.axes.xaxis.label.set_size(20)
axes.axes.yaxis.label.set_size(20)
axes.tick_params(labelsize=15)
ymax = axes.get_ylim()[1]
for i in range(0, len(counts_svtyper)):
t1=axes.text(-0.1 + (i * 1), ymax+ymax/100, counts_svtyper[i], fontsize=15)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
```
%% Cell type:markdown id: tags:
#### Inluence of deletion size : small means less than 200bp
%% Cell type:code id: tags:
``` python
plt.figure(1, figsize=(8,5))
sns.stripplot(x="software", y="recall", jitter=True, hue="deltype", palette="Set2", dodge=True, linewidth=1, edgecolor='gray', order=gt_tools,data=df_svtyper)
axes = sns.boxplot(x="software", y="recall",hue="deltype",palette="Set2", order=gt_tools,data=df_svtyper)
axes.title.set_position([.5, 1.2])
axes.set_ylim(0.15, 1.05)
axes.set_title(str(total)+" simulated variants",size=25)
axes.axes.xaxis.label.set_size(20)
axes.axes.yaxis.label.set_size(20)
axes.tick_params(labelsize=15)
ymax = axes.get_ylim()[1]
for i in range(0, len(counts_svtyper)):
t1=axes.text(-0.1 + (i * 1), ymax+ymax/100, counts_svtyper[i], fontsize=15)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
```
%% Cell type:markdown id: tags:
##### Size VS Precision
%% Cell type:code id: tags:
``` python
plt.figure(1, figsize=(8,5))
sns.stripplot(x="precision", y="delsize", jitter=True, palette="Set2", dodge=True, linewidth=1, edgecolor='gray', order=['precise', 'unprecise'],data=df_svtyper)
axes = sns.boxplot(x="precision", y="delsize",palette="Set2", order=['precise', 'unprecise'],data=df_svtyper)
axes.axes.xaxis.label.set_size(20)
axes.axes.yaxis.label.set_size(20)
axes.tick_params(labelsize=15)
```
......
......@@ -175,7 +175,7 @@ def get_genotypes(genotypes_files, true_vcf_file):
return genotypes, gt_quality, nb_inds
def build_records(genotypes, SVSet, true_ones_records, filtered_records, gt_quality):
def build_records(genotypes, SVSet, true_ones_records, filtered_records, gt_quality, all_variants_filter):
"""
Build records for each SV
:param genotypes: list of genotypes of each individual for each SV
......@@ -216,9 +216,13 @@ def build_records(genotypes, SVSet, true_ones_records, filtered_records, gt_qual
"end": node.end,
"length": int(node.end) - int(node.start),
"filtered": (node.id in filtered_records) if filtered_records is not None else False,
"filters": ",".join([x for x in all_variants_filter[node.id].filter]) if
node.id in all_variants_filter else "",
"genotypes": genotypes[node.id] if node.id in genotypes else None,
"qualities": gt_quality[node.id] if node.id in gt_quality else None
}
if results[tool_name]["filters"] == "":
results[tool_name]["filters"] = "PASS"
tools.add(tool_name)
chromosome = node.chrom
......@@ -295,6 +299,12 @@ def build_header(tools, filtered_records, nb_records, max_col_len, nb_inds):
"format": l_format}
max_col_len[i + 2] = get_max_len("Length", i + 2, max_col_len)
if tool not in ["Real data", "Filtered results"]:
cells[XLSX_COLS[i + 3] + "2"] = cells[XLSX_COLS[i + 3] + str(2 + nb_records + 3)] = {"text": "Filters",
"format": l_format}
max_col_len[i + 3] = get_max_len("Filters", i + 3, max_col_len)
i += 1
# Genotypes:
for k in range(0, nb_inds):
cells_gt[XLSX_COLS[j + k] + "2"] = cells_gq[XLSX_COLS[j + k] + "2"] = \
......@@ -352,7 +362,7 @@ def fill_real_data(row, cells, cells_gt, cells_gq, max_col_len, record, rec_id,
def fill_tool_data(row, col, cells, max_col_len, record, nb_records, my_start, my_end, my_length,
sv_format=None):
sv_format=None, no_filter=False):
"""
Fill cells for a tool and a record
:param row: row position
......@@ -387,6 +397,10 @@ def fill_tool_data(row, col, cells, max_col_len, record, nb_records, my_start, m
cells[XLSX_COLS[col + 2] + str(row)] = {"text": record["length"], "format": sv_format}
max_col_len[col + 2] = get_max_len(record["length"], col + 2, max_col_len)
if "filters" in record and not no_filter:
cells[XLSX_COLS[col + 3] + str(row)] = {"text": record["filters"], "format": sv_format}
max_col_len[col + 3] = get_max_len(record["filters"], col + 3, max_col_len)
###########
# DIFFS: #
###########
......@@ -408,6 +422,11 @@ def fill_tool_data(row, col, cells, max_col_len, record, nb_records, my_start, m
cells[XLSX_COLS[col + 2] + str(row + nb_records + 3)] = {"text": length,
"format": sv_format}
# FILTERS:
if "filters" in record and not no_filter:
cells[XLSX_COLS[col + 3] + str(row + nb_records + 3)] = {"text": record["filters"],
"format": sv_format}
return cells, max_col_len
......@@ -454,18 +473,26 @@ def write_headers(workbook, worksheet, headers, cell_len, rows, filtered_records
:param rows: (list) rows into write the header
:param filtered_records: (bool) is there filtered records
"""
orig_cell_len = cell_len
for row in rows:
merge_format = workbook.add_format({'bold': True})
worksheet.write(XLSX_COLS[1] + str(row + 1) + ":" + XLSX_COLS[cell_len] + str(row + 1), headers[0], merge_format)
i = 0
left_cell_lenth = 2
for header in headers[1:]:
if header not in ["Real data", "Filtered results"]:
cell_len = orig_cell_len + 1
else:
cell_len = orig_cell_len
if i < len(headers) - 2 or not filtered_records:
merge_format = workbook.add_format({'align': 'center'})
else:
merge_format = workbook.add_format({'align': 'center', 'bg_color': COLOR_COL_FILTER})
for row in rows:
worksheet.merge_range(XLSX_COLS[2 + i * cell_len] + str(row) + ":" + XLSX_COLS[1 + cell_len + i * cell_len] + str(row),
worksheet.merge_range(XLSX_COLS[left_cell_lenth] + str(row) + ":" +
XLSX_COLS[left_cell_lenth + cell_len - 1] + str(row),
header, merge_format)
left_cell_lenth += cell_len
i += 1
......@@ -560,15 +587,24 @@ def create_tsv_file(filename: str, headers: list, cells: dict, nb_tools: int, nb
head = ["", headers[0]]
top_headers = {}
h = 2
nb_only_tools = nb_tools # Number of tools, except real data and filtered results
nb_others = 0
for header in headers[1:]:
# Define top headers to each column:
for i in range(0, nb_per_tool):
top_headers[h] = header
head.append("")
h += 1
if header not in ["Real data", "Filtered results"]:
top_headers[h] = header
head.append("")
h += 1
else:
nb_only_tools -= 1
nb_others += 1
rows = [head]
for i in range(0, records_range[1]-records_range[0]):
rows.append(["" for x in range(0, (nb_tools * nb_per_tool) + 2)])
rows.append(["" for x in range(0, (nb_only_tools * (nb_per_tool + 1)) + (nb_others * nb_per_tool) + 2)])
# Fill content:
for id_cell, cell in cells.items():
......@@ -679,17 +715,17 @@ def apply_style_of_filter_cells(cells, cells_gt, cells_gq, i, is_kept, nb_record
cells[XLSX_COLS[0] + str(i + nb_records + 3)]["format"]["bg_color"] = COLOR_IS_KEPT
elif filtered_records is not None:
# SV does not pass the filter: color bg in red in the filter column
cells[XLSX_COLS[2 + ((nb_tools + 1) * 3)] + str(i)] = {"text": "", "format": {"bg_color": COLOR_NOT_FOUND}}
cells[XLSX_COLS[2 + ((nb_tools + 1) * 3) + 1] + str(i)] = {"text": "",
cells[XLSX_COLS[2 + ((nb_tools * 4) + 3)] + str(i)] = {"text": "", "format": {"bg_color": COLOR_NOT_FOUND}}
cells[XLSX_COLS[2 + ((nb_tools * 4) + 3) + 1] + str(i)] = {"text": "",
"format": {"bg_color": COLOR_NOT_FOUND}}
cells[XLSX_COLS[2 + ((nb_tools + 1) * 3) + 2] + str(i)] = {"text": "",
cells[XLSX_COLS[2 + ((nb_tools * 4) + 3) + 2] + str(i)] = {"text": "",
"format": {"bg_color": COLOR_NOT_FOUND}}
cells[XLSX_COLS[2 + ((nb_tools + 1) * 3)] + str(i + nb_records + 3)] = {"text": "",
cells[XLSX_COLS[2 + ((nb_tools * 4) + 3)] + str(i + nb_records + 3)] = {"text": "",
"format": {
"bg_color": COLOR_NOT_FOUND}}
cells[XLSX_COLS[2 + ((nb_tools + 1) * 3) + 1] + str(i + nb_records + 3)] = {
cells[XLSX_COLS[2 + ((nb_tools * 4) + 3) + 1] + str(i + nb_records + 3)] = {
"text": "", "format": {"bg_color": COLOR_NOT_FOUND}}
cells[XLSX_COLS[2 + ((nb_tools + 1) * 3) + 2] + str(i + nb_records + 3)] = {
cells[XLSX_COLS[2 + ((nb_tools * 4) + 3) + 2] + str(i + nb_records + 3)] = {
"text": "", "format": {"bg_color": COLOR_NOT_FOUND}}
# Genotype:
......@@ -743,9 +779,9 @@ def build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells, cells
sv_format = {"bg_color": COLOR_IS_KEPT}
# SV data (sheet 1):
cells, max_col_len = fill_tool_data(i, 2 + ((nb_tools + 1) * 3), cells, max_col_len,
cells, max_col_len = fill_tool_data(i, 2 + ((nb_tools * 4) + 3), cells, max_col_len,
record["tools"][tool], nb_records, my_start, my_end, my_length,
{"bg_color": COLOR_COL_FILTER})
{"bg_color": COLOR_COL_FILTER}, True)
if do_genotype:
# Genotype (sheets 2&3):
......@@ -775,7 +811,7 @@ def build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells, cells
cells, cells_gt, cells_gq = fill_cells_no_tools(cells, cells_gt, cells_gq, i, j, g, nb_records, nb_inds,
do_genotype)
j += 3
j += 4
g += nb_inds
###############################################################################
......@@ -843,7 +879,8 @@ def init(output, vcf_files, true_vcf, rules, filtered_vcfs=None, type_v="del", o
construct_overlap_graph(sv_set, overlap_cutoff, left_precision, right_precision)
# Build records:
records, tools, orphans = build_records(genotypes, sv_set, true_variants.keys(), filtered_records, gt_quality)
records, tools, orphans = build_records(genotypes, sv_set, true_variants.keys(), filtered_records, gt_quality,
filtered_all)
nb_records = len(records)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment