Skip to content
Snippets Groups Projects
Commit d2c794ae authored by Floreal Cabanettes's avatar Floreal Cabanettes
Browse files

Add build of jupyter notebook summary

parent 57d82513
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags:
# CNV detection benchmark
%% Cell type:code id: tags:
``` python
%%html
<script>
function code_toggle() {
if (code_shown){
$('div.input').hide('500');
$('#toggleButton').val('Show Code')
} else {
$("div.input:not(:first)").show('500');
$('#toggleButton').val('Hide Code')
}
code_shown = !code_shown
}
$( document ).ready(function(){
code_shown=false;
$('div.input').hide()
});
</script>
<form action="javascript:code_toggle()" style="position:fixed;left:10px;top:10px"><input type="submit" id="toggleButton" value="Show Code"></form>
```
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
from collections import OrderedDict
import math
import re
import os
import json
import pylab as P
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid", color_codes=True)
%matplotlib inline
from pysam import VariantFile
from collections import defaultdict
from collections import Counter
from IPython.display import display, HTML
# Read tsv file
results_df = pd.read_table("results_sv_per_tools.tsv", header=0, index_col=0)
# Retrieve list of tools
tools = set()
for col in results_df.columns:
if col.endswith("__Start") and col != "Real_data__Start" and col != "Filtered_results__Start":
tools.add(col.split("__")[0])
```
%% Cell type:markdown id: tags:
### Recall & Precision
%% Cell type:code id: tags:
``` python
def compute_tp_fp_fn(svs, tool):
tp = 0
fp = 0
fn = 0
start_values = svs["{0}__Start".format(tool)]
for i in range(0, len(start_values)):
if math.isnan(start_values[i]) and not math.isnan(svs["Real_data__Start"][i]):
fn += 1
elif not math.isnan(start_values[i]) and not math.isnan(svs["Real_data__Start"][i]):
tp += 1
elif not math.isnan(start_values[i]) and math.isnan(svs["Real_data__Start"][i]):
fp += 1
return tp, fp, fn
recall = OrderedDict()
precision = OrderedDict()
for tool in tools:
if tool + "__Start" in results_df:
tp, fp, fn = compute_tp_fp_fn(results_df, tool)
recall[tool] = [tp / (tp+fn) * 100]
precision[tool] = [tp / (tp+fp) * 100]
plt.figure(1, figsize=(20,10))
# Plot recall
plt.subplot(121)
recall_df = pd.DataFrame.from_dict(recall, orient="columns")
plot = sns.barplot(data=recall_df)
plot.set_title("Recall", fontsize=30)
plot.set_ylabel("Recall (%)", fontsize=20)
plot.set_xlabel("Tool", fontsize=20)
plot.tick_params(labelsize=14)
plot.set_ylim([0,100])
# Plot precision
plt.subplot(122)
precision_df = pd.DataFrame.from_dict(precision, orient="columns")
plot2 = sns.barplot(data=precision_df)
plot2.set_title("Precision", fontsize=30)
plot2.set_ylabel("Precision (%)", fontsize=20)
plot2.set_xlabel("Tool", fontsize=20)
plot2.tick_params(labelsize=14)
plt.show()
```
%% Cell type:markdown id: tags:
### Influence of variant size on recall
%% Cell type:code id: tags:
``` python
groups = []
with open("rules.sim", "r") as rules:
for line in rules:
line = line.rstrip()
if line != "":
parts = re.split(r"\s+", line)
groups.append((int(parts[1]), int(parts[2])))
groups.sort(key=lambda x: x[0])
```
%% Cell type:markdown id: tags:
#### By tool
%% Cell type:code id: tags:
``` python
nrows = math.ceil(len(tools)/2)
ncols = min(2, len(tools))
palettes = ["Blues_d", "Greens_d", "Reds_d", "Purples_d", "YlOrBr_d", "PuBu_d"]
plt.figure(1, figsize=(20,nrows * 8))
nplot=0
for tool in tools:
npalette = nplot
while npalette >= len(palettes):
npalette -= len(palettes)
nplot += 1
results_by_group = OrderedDict()
for group in groups:
tmp_res = results_df[(results_df.Real_data__Length >= group[0]) & (results_df.Real_data__Length < group[1])]
tp, fp, fn = compute_tp_fp_fn(tmp_res, tool)
results_by_group["-".join(map(str,group))] = [tp / (tp+fn) * 100]
recall_df = pd.DataFrame.from_dict(results_by_group, orient="columns")
plt.subplot(nrows, ncols, nplot)
plot = sns.barplot(data=recall_df, palette=palettes[npalette])
plot.set_title(tool, fontsize=25)
plot.set_ylabel("Recall (%)", fontsize=20)
plot.set_xlabel("Variant size (bp)", fontsize=20)
plot.tick_params(labelsize=14)
plot.set_ylim([0,100])
plt.show()
```
%% Cell type:markdown id: tags:
#### Global
%% Cell type:code id: tags:
``` python
results_by_group = OrderedDict()
for group in groups:
tmp_res = results_df[(results_df.Real_data__Length >= group[0]) & (results_df.Real_data__Length < group[1])]
tp, fp, fn = compute_tp_fp_fn(tmp_res, "Filtered_results")
results_by_group["-".join(map(str,group))] = [tp / (tp+fn) * 100]
plt.figure(1, figsize=(15,8))
recall_df = pd.DataFrame.from_dict(results_by_group, orient="columns")
plot = sns.barplot(data=recall_df, color='black')
plot.set_title("Recall", fontsize=30)
plot.set_ylabel("Recall (%)", fontsize=20)
plot.set_xlabel("Variant size (bp)", fontsize=20)
plot.tick_params(labelsize=14)
plot.set_ylim([0,100])
plt.show()
```
%% Cell type:markdown id: tags:
### Shared variant by tool
%% Cell type:code id: tags:
``` python
def compute_found_sv(svs: pd.DataFrame, tool: str):
variants = []
start_values = svs["{0}__Start".format(tool)]
for idx in svs.index:
if not math.isnan(start_values[idx]) and not math.isnan(svs["Real_data__Start"][idx]):
variants.append(idx)
return variants
variants_by_tool = []
for tool in tools:
variants_by_tool.append({"name": tool, "data": compute_found_sv(results_df, tool)})
```
%% Cell type:code id: tags:
``` python
html=HTML("<script type='text/javascript' src='http://jvenn.toulouse.inra.fr/app/js/canvas2svg.js'></script> \
<script type='text/javascript' src='http://jvenn.toulouse.inra.fr/app/js/jvenn.min.js'></script> \
<div id='draw'></div> \
<script type='text/javascript'> \
$(document).ready(function(){ \
$('#draw').jvenn({ \
series: " + json.dumps(variants_by_tool) + " \
}); \
}); \
</script>")
display(html)
```
%% Cell type:markdown id: tags:
### Breakpoints precision
%% Cell type:code id: tags:
``` python
plt.figure(1, figsize=(20,8))
# Start precision
df_diffs=pd.read_table("results_sv_diffs_per_tools.tsv", header=0, index_col=0)
all_diffs_soft_start = pd.DataFrame()
for tool in tools:
all_diffs_soft_start[tool] = df_diffs[tool + "__Start"].abs()
plt.subplot(121)
plot = sns.stripplot(data=all_diffs_soft_start, jitter=True)
plot.set_ylim([0,150])
plot.tick_params(labelsize=15)
plot.set_title("Start position", fontsize=28, y=1.04)
plot.set_ylabel("Diff from real data (abs)", fontsize=20)
# End precision
df_diffs=pd.read_table("results_sv_diffs_per_tools.tsv", header=0, index_col=0)
all_diffs_soft_end = pd.DataFrame()
for tool in tools:
all_diffs_soft_end[tool] = df_diffs[tool + "__End"].abs()
plt.subplot(122)
plot = sns.stripplot(data=all_diffs_soft_end, jitter=True)
plot.set_ylim([0,150])
plot.tick_params(labelsize=15)
plot.set_title("End position", fontsize=28, y=1.04)
plot.set_ylabel("Diff from real data (abs)", fontsize=20)
plt.show()
```
%% Cell type:markdown id: tags:
## 2. CNV Genotyping
We compare quality of genotyping
%% Cell type:code id: tags:
``` python
# Simulated deletions
total = len(results_df[results_df.Real_data__Start.notnull()])
```
%% Cell type:code id: tags:
``` python
# Predicted deletions genotyped
df_svtyper=pd.read_csv("results_genotype.tsv",sep='\t',index_col=False,names=['del','software','delsize','recall','left','right'])
df_svtyper['precision'] = [ "precise" if (x+y)<20 else "unprecise" for (x,y) in zip(df_svtyper.left,df_svtyper.right)]
df_svtyper['deltype'] = ['small' if x<200 else 'medium' for x in df_svtyper.delsize]
counts_svtyper = np.unique(df_svtyper.software,return_counts=True)[1]
```
%% Cell type:markdown id: tags:
#### Genotype recall for each software prediction
%% Cell type:code id: tags:
``` python
gt_tools = list(tools) + ["pass"]
plt.figure(1, figsize=(8,5))
sns.stripplot(x="software", y="recall", jitter=True, palette="Set2", dodge=True, linewidth=1, edgecolor='gray', order=gt_tools,data=df_svtyper)
axes = sns.boxplot(x="software", y="recall", palette="Set2", order=gt_tools,data=df_svtyper)
axes.title.set_position([.5, 1.2])
axes.set_title(str(total)+" simulated variants",size=25)
axes.axes.xaxis.label.set_size(20)
axes.axes.yaxis.label.set_size(20)
axes.tick_params(labelsize=15)
ymax = axes.get_ylim()[1]
for i in range(0, len(counts_svtyper)):
t1=axes.text(-0.1 + (i * 1), ymax+ymax/100, counts_svtyper[i], fontsize=15)
```
%% Cell type:markdown id: tags:
#### Inluence of precision : precise means less than 20bp
%% Cell type:code id: tags:
``` python
plt.figure(1, figsize=(8,5))
sns.stripplot(x="software", y="recall", jitter=True, hue="precision", palette="Set2", dodge=True, linewidth=1, edgecolor='gray', order=gt_tools,data=df_svtyper)
axes = sns.boxplot(x="software", y="recall",hue="precision",palette="Set2", order=gt_tools,data=df_svtyper)
axes.title.set_position([.5, 1.2])
axes.set_ylim(0.15, 1.05)
axes.set_title(str(total)+" simulated variants",size=25)
axes.axes.xaxis.label.set_size(20)
axes.axes.yaxis.label.set_size(20)
axes.tick_params(labelsize=15)
ymax = axes.get_ylim()[1]
for i in range(0, len(counts_svtyper)):
t1=axes.text(-0.1 + (i * 1), ymax+ymax/100, counts_svtyper[i], fontsize=15)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
```
%% Cell type:markdown id: tags:
#### Inluence of deletion size : small means less than 200bp
%% Cell type:code id: tags:
``` python
plt.figure(1, figsize=(8,5))
sns.stripplot(x="software", y="recall", jitter=True, hue="deltype", palette="Set2", dodge=True, linewidth=1, edgecolor='gray', order=gt_tools,data=df_svtyper)
axes = sns.boxplot(x="software", y="recall",hue="deltype",palette="Set2", order=gt_tools,data=df_svtyper)
axes.title.set_position([.5, 1.2])
axes.set_ylim(0.15, 1.05)
axes.set_title(str(total)+" simulated variants",size=25)
axes.axes.xaxis.label.set_size(20)
axes.axes.yaxis.label.set_size(20)
axes.tick_params(labelsize=15)
ymax = axes.get_ylim()[1]
for i in range(0, len(counts_svtyper)):
t1=axes.text(-0.1 + (i * 1), ymax+ymax/100, counts_svtyper[i], fontsize=15)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
```
%% Cell type:markdown id: tags:
##### Size VS Precision
%% Cell type:code id: tags:
``` python
plt.figure(1, figsize=(8,5))
sns.stripplot(x="precision", y="delsize", jitter=True, palette="Set2", dodge=True, linewidth=1, edgecolor='gray', order=['precise', 'unprecise'],data=df_svtyper)
axes = sns.boxplot(x="precision", y="delsize",palette="Set2", order=['precise', 'unprecise'],data=df_svtyper)
axes.axes.xaxis.label.set_size(20)
axes.axes.yaxis.label.set_size(20)
axes.tick_params(labelsize=15)
```
......@@ -18,9 +18,13 @@ import argparse
import string
import os
import re
import shutil
from pysam import VariantFile
import lib.genotype_results as gtres
import lib.vcf as vcf
import sys
prg_path = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(prg_path, "svlib"))
......@@ -38,8 +42,6 @@ COLOR_IS_KEPT = "#81F781"
COLOR_FALSE_POSITIVE = "#FE642E"
COLOR_WRONG_GT = "#B40404"
ALLOW_VARIANTS = ['del', 'inv']
def get_args():
"""
......@@ -51,15 +53,16 @@ Build Results \n \
description: Build results of the simulated data detection")
parser.add_argument('-v', '--vcfs', type=str, required=True, help='File listing vcf files for each detection tool')
parser.add_argument('-t', '--true-vcf', type=str, required=True, help='VCF file containing the simulated deletions')
parser.add_argument('-f', '--filtered-vcf', type=str, required=False,
parser.add_argument('-f', '--filtered-vcf', type=str, required=True,
help='File listing VCF files containing the filtered results')
parser.add_argument('-y', '--type', required=True, type=str, choices=ALLOW_VARIANTS, help="Type of variant")
parser.add_argument('-y', '--type', required=True, type=str, choices=vcf.ALLOW_VARIANTS, help="Type of variant")
parser.add_argument('--overlap_cutoff', type=float, default=0.5, help='cutoff for reciprocal overlap')
parser.add_argument('--left_precision', type=int, default=-1, help='left breakpoint precision')
parser.add_argument('--right_precision', type=int, default=-1, help='right breakpoint precision')
parser.add_argument('-o', '--output', type=str, default="results", help='output folder')
parser.add_argument('--no-xls', action='store_const', const=True, default=False, help='Do not build Excel file')
parser.add_argument('--haploid', action='store_const', const=True, default=False, help='The organism is haploid')
parser.add_argument('-r', '--rules', type=str, required=False, help="Simulation rule file")
# parse the arguments
args = parser.parse_args()
......@@ -69,6 +72,9 @@ description: Build results of the simulated data detection")
if args.right_precision == -1:
args.right_precision = sys.maxsize
if args.rules is None:
args.rules = os.path.join(prg_path, "defaults.rules")
# send back the user input
return args
......@@ -79,35 +85,6 @@ def eprint(*args, **kwargs):
"""
print(*args, file=sys.stderr, **kwargs)
def passed_variant(record):
"""
Did this variant pass?
:param record: vcf record object
:return: True if pass, False else
"""
return record.filter is None or len(record.filter) == 0 or "PASS" in record.filter
def read_vcf_file(infile, type_v):
"""
Read a vcf file
:param infile: vcf file path
:param type_v: type of variant ("del" or "inv")
:return: set or records, list of records ids
"""
if type_v.lower() not in ALLOW_VARIANTS:
raise ValueError("Invalid variant type: %s" % type_v)
SVSet=[]
ids = []
for record in VCFReader(infile):
if record.sv_type.lower() == type_v:
if not passed_variant(record):
continue
SVSet.append(record)
ids.append(record.id)
return SVSet, ids
def svsort(sv, records):
"""
......@@ -818,7 +795,7 @@ def build_xlsx_cols():
XLSX_COLS.append(alp + j)
def init(output, vcf_files, true_vcf, filtered_vcfs=None, type_v="del", overlap_cutoff=0.5,
def init(output, vcf_files, true_vcf, rules, filtered_vcfs=None, type_v="del", overlap_cutoff=0.5,
left_precision=sys.maxsize, right_precision=sys.maxsize, no_xls=False, haploid=False):
if not os.path.exists(output):
......@@ -831,15 +808,18 @@ def init(output, vcf_files, true_vcf, filtered_vcfs=None, type_v="del", overlap_
nb_inds = 0
filtered = None
filtered_all = None
filtered_records = None
do_genotype = False
if filtered_vcfs:
filtered_records = []
filtered = {}
filtered_all = {}
for filtered_vcf in filtered_vcfs:
eprint(" Reading file %s" % filtered_vcf)
filtered_records += read_vcf_file(filtered_vcf, type_v)[1]
vcf.readvariants(filtered_vcf, type_v, filtered, True, filtered_all)
filtered_records = filtered.keys()
genotypes, gt_quality, nb_inds = get_genotypes(filtered_vcfs, true_vcf)
do_genotype = True
......@@ -848,20 +828,20 @@ def init(output, vcf_files, true_vcf, filtered_vcfs=None, type_v="del", overlap_
for infile in vcf_files:
eprint(" Reading file %s" % infile)
try:
sv_set += read_vcf_file(infile, type_v)[0]
sv_set += vcf.readvariants(infile, type_v).values()
except:
print("Ignoreing file %s" % infile)
eprint(" Reading file %s" % true_vcf)
sv_set_to, true_ones_records = read_vcf_file(true_vcf, type_v)
sv_set += sv_set_to
true_variants = vcf.readvariants(true_vcf, type_v)
sv_set += true_variants.values()
# Compute connected components:
eprint("Computing Connected components")
construct_overlap_graph(sv_set, overlap_cutoff, left_precision, right_precision)
# Build records:
records, tools, orphans = build_records(genotypes, sv_set, true_ones_records, filtered_records, gt_quality)
records, tools, orphans = build_records(genotypes, sv_set, true_variants.values(), filtered_records, gt_quality)
nb_records = len(records)
......@@ -911,6 +891,29 @@ def init(output, vcf_files, true_vcf, filtered_vcfs=None, type_v="del", overlap_
nb_tools + (2 if filtered_records is not None else 1),
nb_inds, (2, nb_records + 2))
###############################
# Build genotypes result file #
###############################
gtres.build(true_genotypes=true_variants,
pred_genotypes=filtered_all,
filtered_genotypes=filtered,
output=os.path.join(output, "results_genotype.tsv"))
#######################################
# Build Jupyter HTML notebook summary #
#######################################
# Copy necessary files:
files_to_copy = [rules, "Summarized_results.ipynb"]
for file in files_to_copy:
shutil.copy(file, os.path.join(output, os.path.basename(file)))
# Build HTML summary:
ipynb = os.path.join(output, "Summarized_results.ipynb")
os.popen("jupyter nbconvert --to html --template basic --execute %s" % ipynb)
print_results(nb_records, orphans, with_xlsx, output, do_genotype)
......@@ -944,7 +947,8 @@ def main():
left_precision=args.left_precision,
right_precision=args.right_precision,
no_xls=args.no_xls,
haploid=args.haploid)
haploid=args.haploid,
rules=args.rules)
# initialize the script
......
......@@ -2,7 +2,7 @@
from pybedtools import BedTool
from pybedtools import create_interval_from_list
from pysam import VariantFile
import lib.vcf as vcf
def variants_to_pybed(variants):
......@@ -17,23 +17,6 @@ def variants_to_pybed(variants):
return BedTool(intervals).sort()
def passed_variant(record):
"""
Did this variant pass?
:param record: vcf record object
:return: True if pass, False else
"""
return record.filter is None or len(record.filter) == 0 or "PASS" in record.filter
def readgenotypes(vcffile: str, variants: dict, type_v, dofilter: bool=False):
vcfin = VariantFile(vcffile)
for r in vcfin:
if (not dofilter or passed_variant(r)) and r.alts[0][1:-1].lower() == type_v:
variants[r.id] = r
return variants
def canonize(geno):
if geno == (1, 0):
return 0, 1
......@@ -65,7 +48,7 @@ def getvarsize(variant):
return variant.stop - variant.start + 1
def build(true_genotypes, pred_genotypes, filtered_genotypes, output):
def build(true_genotypes: dict, pred_genotypes: dict, filtered_genotypes: dict, output: str):
true_pybed = variants_to_pybed(true_genotypes)
pred_pybed = variants_to_pybed(pred_genotypes)
......@@ -89,23 +72,27 @@ def build(true_genotypes, pred_genotypes, filtered_genotypes, output):
def main(genotypes, predicted, filtered, output, type_v="del", verbose=False):
true_genotypes = readgenotypes(vcffile=genotypes,
variants={},
type_v=type_v)
true_genotypes = vcf.readvariants(vcffile=genotypes,
variants={},
type_v=type_v)
pred_genotypes = {}
filtered_genotypes = {}
with open(predicted, "r") as preds:
for pred in preds:
pred = pred.rstrip()
if pred != "":
readgenotypes(vcffile=pred,
variants=pred_genotypes,
type_v=type_v)
if filtered:
readgenotypes(vcffile=pred,
variants=filtered_genotypes,
type_v=type_v,
dofilter=True)
vcf.readvariants(vcffile=pred,
variants=filtered_genotypes,
type_v=type_v,
dofilter=True,
all_variants=pred_genotypes)
else:
if filtered:
vcf.readvariants(vcffile=pred,
variants=pred_genotypes,
type_v=type_v,
dofilter=False)
pred_with_true = build(true_genotypes=true_genotypes,
pred_genotypes=pred_genotypes,
......
from collections import OrderedDict
from pysam import VariantFile
ALLOW_VARIANTS = ['del', 'inv']
def passed_variant(record):
"""
Did this variant pass?
:param record: vcf record object
:return: True if pass, False else
"""
return record.filter is None or len(record.filter) == 0 or "PASS" in record.filter
def readvariants(vcffile: str, type_v, variants: dict=None, dofilter: bool=False, all_variants: dict=None):
"""
Read variants from a VCF file
:param vcffile: input vcf file
:param type_v: type of variant to get
:param variants: dict containing variants (only filtered if dofilter)
:param dofilter: filter to get only PASS variants
:param all_variants: all variants (ignored if is None or if dofilter is False)
:return:
"""
if variants is None:
variants = {}
if type_v.lower() not in ALLOW_VARIANTS:
raise ValueError("Invalid variant type: %s" % type_v)
vcfin = VariantFile(vcffile)
for r in vcfin:
if r.alts[0][1:-1].lower() == type_v:
if not dofilter or passed_variant(r):
variants[r.id] = r
if dofilter and all_variants is not None:
all_variants[r.id] = r
if dofilter and all_variants is not None:
return variants, all_variants
return variants
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment