Skip to content
Snippets Groups Projects
Summarized_results.ipynb 14.4 KiB
Newer Older
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CNV detection benchmark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from collections import OrderedDict\n",
    "\n",
    "import math\n",
    "\n",
    "import re\n",
    "import os\n",
    "import json\n",
    "import pylab as P\n",
    "import matplotlib as mpl\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "sns.set(style=\"whitegrid\", color_codes=True)\n",
    "%matplotlib inline\n",
    "\n",
    "from pysam import VariantFile\n",
    "from collections import defaultdict\n",
    "from collections import Counter\n",
    "\n",
    "from IPython.display import display, HTML\n",
    "\n",
    "# Read tsv file\n",
    "results_df = pd.read_table(\"results_sv_per_tools.tsv\", header=0, index_col=0)\n",
    "\n",
    "# Retrieve list of tools\n",
    "tools = set()\n",
    "for col in results_df.columns:\n",
    "    if col.endswith(\"__Start\") and col != \"Real_data__Start\" and col != \"Filtered_results__Start\":\n",
    "        tools.add(col.split(\"__\")[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Recall & Precision"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_tp_fp_fn(svs, tool):\n",
    "    tp = 0\n",
    "    fp = 0\n",
    "    fn = 0\n",
    "    start_values = svs[\"{0}__Start\".format(tool)]\n",
    "    for i in range(0, len(start_values)):\n",
    "        if math.isnan(start_values[i]) and not math.isnan(svs[\"Real_data__Start\"][i]):\n",
    "            fn += 1\n",
    "        elif not math.isnan(start_values[i]) and not math.isnan(svs[\"Real_data__Start\"][i]):\n",
    "            tp += 1\n",
    "        elif not math.isnan(start_values[i]) and math.isnan(svs[\"Real_data__Start\"][i]):\n",
    "            fp += 1\n",
    "    return tp, fp, fn\n",
    "\n",
    "recall = OrderedDict()\n",
    "precision = OrderedDict()\n",
    "for tool in tools:\n",
    "    if tool + \"__Start\" in results_df:\n",
    "        tp, fp, fn = compute_tp_fp_fn(results_df, tool)\n",
    "        recall[tool] = [tp / (tp+fn) * 100]\n",
    "        precision[tool] = [tp / (tp+fp) * 100]\n",
    "        \n",
    "plt.figure(1, figsize=(20,10))\n",
    "\n",
    "# Plot recall\n",
    "plt.subplot(121)\n",
    "recall_df = pd.DataFrame.from_dict(recall, orient=\"columns\")\n",
    "plot = sns.barplot(data=recall_df)\n",
    "plot.set_title(\"Recall\", fontsize=30)\n",
    "plot.set_ylabel(\"Recall (%)\", fontsize=20)\n",
    "plot.set_xlabel(\"Tool\", fontsize=20)\n",
    "plot.tick_params(labelsize=14)\n",
    "plot.set_ylim([0,100])\n",
    "\n",
    "# Plot precision\n",
    "plt.subplot(122)\n",
    "precision_df = pd.DataFrame.from_dict(precision, orient=\"columns\")\n",
    "plot2 = sns.barplot(data=precision_df)\n",
    "plot2.set_title(\"Precision\", fontsize=30)\n",
    "plot2.set_ylabel(\"Precision (%)\", fontsize=20)\n",
    "plot2.set_xlabel(\"Tool\", fontsize=20)\n",
    "plot2.tick_params(labelsize=14)\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Influence of variant size on recall"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "groups = []\n",
    "with open(\"rules.sim\", \"r\") as rules:\n",
    "    for line in rules:\n",
    "        line = line.rstrip()\n",
    "        if line != \"\":\n",
    "            parts = re.split(r\"\\s+\", line)\n",
    "            groups.append((int(parts[1]), int(parts[2])))\n",
    "\n",
    "groups.sort(key=lambda x: x[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### By tool"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nrows = math.ceil(len(tools)/2)\n",
    "ncols = min(2, len(tools))\n",
    "\n",
    "palettes = [\"Blues_d\", \"Greens_d\", \"Reds_d\", \"Purples_d\", \"YlOrBr_d\", \"PuBu_d\"]\n",
    "\n",
    "plt.figure(1, figsize=(20,nrows * 8))\n",
    "\n",
    "nplot=0\n",
    "\n",
    "for tool in tools:         \n",
    "    npalette = nplot\n",
    "    while npalette >= len(palettes):\n",
    "        npalette -= len(palettes)\n",
    "    nplot += 1\n",
    "    results_by_group = OrderedDict()\n",
    "    for group in groups:\n",
    "        tmp_res = results_df[(results_df.Real_data__Length >= group[0]) & (results_df.Real_data__Length < group[1])]\n",
    "        tp, fp, fn = compute_tp_fp_fn(tmp_res, tool)\n",
    "        results_by_group[\"-\".join(map(str,group))] = [tp / (tp+fn) * 100 if tp+fn >0 else 0]\n",
    "\n",
    "    recall_df = pd.DataFrame.from_dict(results_by_group, orient=\"columns\")\n",
    "    \n",
    "    plt.subplot(nrows, ncols, nplot)\n",
    "    plot = sns.barplot(data=recall_df, palette=palettes[npalette])\n",
    "    plot.set_title(tool, fontsize=25)\n",
    "    plot.set_ylabel(\"Recall (%)\", fontsize=20)\n",
    "    plot.set_xlabel(\"Variant size (bp)\", fontsize=20)\n",
    "    plot.tick_params(labelsize=14)\n",
    "    plot.set_ylim([0,100])\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Global"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results_by_group = OrderedDict()\n",
    "            \n",
    "for group in groups:\n",
    "    tmp_res = results_df[(results_df.Real_data__Length >= group[0]) & (results_df.Real_data__Length < group[1])]\n",
    "    tp, fp, fn = compute_tp_fp_fn(tmp_res, \"Filtered_results\")\n",
    "    results_by_group[\"-\".join(map(str,group))] = [tp / (tp+fn) * 100 if tp+fn >0 else 0]\n",
    "    \n",
    "plt.figure(1, figsize=(15,8))\n",
    "    \n",
    "recall_df = pd.DataFrame.from_dict(results_by_group, orient=\"columns\")\n",
    "plot = sns.barplot(data=recall_df, color='black')\n",
    "plot.set_title(\"Recall\", fontsize=30)\n",
    "plot.set_ylabel(\"Recall (%)\", fontsize=20)\n",
    "plot.set_xlabel(\"Variant size (bp)\", fontsize=20)\n",
    "plot.tick_params(labelsize=14)\n",
    "plot.set_ylim([0,100])\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Shared variant by tool"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_found_sv(svs: pd.DataFrame, tool: str):\n",
    "    variants = []\n",
    "    start_values = svs[\"{0}__Start\".format(tool)]\n",
    "    for idx in svs.index:\n",
    "        if not math.isnan(start_values[idx]) and not math.isnan(svs[\"Real_data__Start\"][idx]):\n",
    "            variants.append(idx)\n",
    "    return variants\n",
    "\n",
    "variants_by_tool = []\n",
    "\n",
    "for tool in tools:\n",
    "    variants_by_tool.append({\"name\": tool, \"data\": compute_found_sv(results_df, tool)})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "html=HTML(\"<script type='text/javascript' src='http://jvenn.toulouse.inra.fr/app/js/canvas2svg.js'></script> \\\n",
    "<script type='text/javascript' src='http://jvenn.toulouse.inra.fr/app/js/jvenn.min.js'></script> \\\n",
    "<div id='draw'></div> \\\n",
    "<script type='text/javascript'> \\\n",
    "  $(document).ready(function(){ \\\n",
    "     $('#draw').jvenn({ \\\n",
    "       series: \" + json.dumps(variants_by_tool) + \" \\\n",
    "     }); \\\n",
    "  }); \\\n",
    "</script>\")\n",
    "\n",
    "display(html)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Breakpoints precision"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(1, figsize=(20,8))\n",
    "\n",
    "# Start precision\n",
    "df_diffs=pd.read_table(\"results_sv_diffs_per_tools.tsv\", header=0, index_col=0)\n",
    "all_diffs_soft_start = pd.DataFrame()\n",
    "for tool in tools:\n",
    "    all_diffs_soft_start[tool] = df_diffs[tool + \"__Start\"].abs()\n",
    "\n",
    "plt.subplot(121)\n",
    "plot = sns.stripplot(data=all_diffs_soft_start, jitter=True)\n",
    "plot.set_ylim([0,150])\n",
    "plot.tick_params(labelsize=15)\n",
    "plot.set_title(\"Start position\", fontsize=28, y=1.04)\n",
    "plot.set_ylabel(\"Diff from real data (abs)\", fontsize=20)\n",
    "\n",
    "# End precision\n",
    "df_diffs=pd.read_table(\"results_sv_diffs_per_tools.tsv\", header=0, index_col=0)\n",
    "all_diffs_soft_end = pd.DataFrame()\n",
    "for tool in tools:\n",
    "    all_diffs_soft_end[tool] = df_diffs[tool + \"__End\"].abs()\n",
    "\n",
    "plt.subplot(122)\n",
    "plot = sns.stripplot(data=all_diffs_soft_end, jitter=True)\n",
    "plot.set_ylim([0,150])\n",
    "plot.tick_params(labelsize=15)\n",
    "plot.set_title(\"End position\", fontsize=28, y=1.04)\n",
    "plot.set_ylabel(\"Diff from real data (abs)\", fontsize=20)\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. CNV Genotyping\n",
    "\n",
    "We compare quality of genotyping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Simulated deletions\n",
    "total = len(results_df[results_df.Real_data__Start.notnull()])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predicted deletions genotyped\n",
    "\n",
    "df_svtyper=pd.read_csv(\"results_genotype.tsv\",sep='\\t',index_col=False,names=['del','software','delsize','recall','left','right'])\n",
    "df_svtyper['precision'] = [ \"precise\" if (x+y)<20 else \"unprecise\" for (x,y) in zip(df_svtyper.left,df_svtyper.right)]\n",
    "df_svtyper['deltype'] = ['small' if x<200 else 'medium' for x in df_svtyper.delsize]\n",
    "counts_svtyper = np.unique(df_svtyper.software,return_counts=True)[1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Genotype recall for each software prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "gt_tools = list(tools) + [\"pass\"]\n",
    "plt.figure(1, figsize=(8,5))\n",
    "sns.stripplot(x=\"software\", y=\"recall\", jitter=True, palette=\"Set2\", dodge=True, linewidth=1, edgecolor='gray', order=gt_tools,data=df_svtyper)\n",
    "axes = sns.boxplot(x=\"software\", y=\"recall\", palette=\"Set2\", order=gt_tools,data=df_svtyper)\n",
    "axes.title.set_position([.5, 1.2])\n",
    "axes.set_title(str(total)+\" simulated variants\",size=25)\n",
    "axes.axes.xaxis.label.set_size(20)\n",
    "axes.axes.yaxis.label.set_size(20)\n",
    "axes.tick_params(labelsize=15)\n",
    "ymax = axes.get_ylim()[1]\n",
    "for i in range(0, len(counts_svtyper)):\n",
    "    t1=axes.text(-0.1 + (i * 1), ymax+ymax/100, counts_svtyper[i], fontsize=15)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Inluence of precision : precise means less than 20bp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(1, figsize=(8,5))\n",
    "sns.stripplot(x=\"software\", y=\"recall\", jitter=True, hue=\"precision\", palette=\"Set2\", dodge=True, linewidth=1, edgecolor='gray', order=gt_tools,data=df_svtyper)\n",
    "axes = sns.boxplot(x=\"software\", y=\"recall\",hue=\"precision\",palette=\"Set2\", order=gt_tools,data=df_svtyper)\n",
    "axes.title.set_position([.5, 1.2])\n",
    "axes.set_ylim(0.15, 1.05)\n",
    "axes.set_title(str(total)+\" simulated variants\",size=25)\n",
    "axes.axes.xaxis.label.set_size(20)\n",
    "axes.axes.yaxis.label.set_size(20)\n",
    "axes.tick_params(labelsize=15)\n",
    "ymax = axes.get_ylim()[1]\n",
    "for i in range(0, len(counts_svtyper)):\n",
    "    t1=axes.text(-0.1 + (i * 1), ymax+ymax/100, counts_svtyper[i], fontsize=15)\n",
    "plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Inluence of deletion size : small means less than 200bp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(1, figsize=(8,5))\n",
    "sns.stripplot(x=\"software\", y=\"recall\", jitter=True, hue=\"deltype\", palette=\"Set2\", dodge=True, linewidth=1, edgecolor='gray', order=gt_tools,data=df_svtyper)\n",
    "axes = sns.boxplot(x=\"software\", y=\"recall\",hue=\"deltype\",palette=\"Set2\", order=gt_tools,data=df_svtyper)\n",
    "axes.title.set_position([.5, 1.2])\n",
    "axes.set_ylim(0.15, 1.05)\n",
    "axes.set_title(str(total)+\" simulated variants\",size=25)\n",
    "axes.axes.xaxis.label.set_size(20)\n",
    "axes.axes.yaxis.label.set_size(20)\n",
    "axes.tick_params(labelsize=15)\n",
    "ymax = axes.get_ylim()[1]\n",
    "for i in range(0, len(counts_svtyper)):\n",
    "    t1=axes.text(-0.1 + (i * 1), ymax+ymax/100, counts_svtyper[i], fontsize=15)\n",
    "plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Size VS Precision"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(1, figsize=(8,5))\n",
    "sns.stripplot(x=\"precision\", y=\"delsize\", jitter=True, palette=\"Set2\", dodge=True, linewidth=1, edgecolor='gray', order=['precise', 'unprecise'],data=df_svtyper)\n",
    "axes = sns.boxplot(x=\"precision\", y=\"delsize\",palette=\"Set2\", order=['precise', 'unprecise'],data=df_svtyper)\n",
    "axes.axes.xaxis.label.set_size(20)\n",
    "axes.axes.yaxis.label.set_size(20)\n",
    "axes.tick_params(labelsize=15)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}