Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
popsim
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
SVdetection
popsim
Commits
31b9c3bb
Commit
31b9c3bb
authored
6 years ago
by
Floreal Cabanettes
Browse files
Options
Downloads
Patches
Plain Diff
Add filter information on recall and precision first graphs
parent
2e1554a4
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
Summarized_results.ipynb
+30
-8
30 additions, 8 deletions
Summarized_results.ipynb
build_results.py
+51
-14
51 additions, 14 deletions
build_results.py
with
81 additions
and
22 deletions
Summarized_results.ipynb
+
30
−
8
View file @
31b9c3bb
...
...
@@ -58,34 +58,54 @@
"metadata": {},
"outputs": [],
"source": [
"def compute_tp_fp_fn(svs, tool):\n",
"def compute_tp_fp_fn(svs, tool
, only_filter=False, include_duplicates=True
):\n",
" tp = 0\n",
" fp = 0\n",
" fn = 0\n",
" start_values = svs[\"{0}__Start\".format(tool)]\n",
" for i in range(0, len(start_values)):\n",
" if math.isnan(start_values[i]) and not math.isnan(svs[\"Real_data__Start\"][i]):\n",
" if not only_filter or (svs[\"{0}__Filters\".format(tool)][i] == \"PASS\" or (include_duplicates \n",
" and svs[\"{0}__Filters\".format(tool)][i] == \"DUPLICATE\")):\n",
" if math.isnan(start_values[i]) and not math.isnan(svs[\"Real_data__Start\"][i]):\n",
" fn += 1\n",
" elif not math.isnan(start_values[i]) and not math.isnan(svs[\"Real_data__Start\"][i]):\n",
" tp += 1\n",
" elif not math.isnan(start_values[i]) and math.isnan(svs[\"Real_data__Start\"][i]):\n",
" fp += 1\n",
" elif not math.isnan(svs[\"Real_data__Start\"][i]):\n",
" fn += 1\n",
" elif not math.isnan(start_values[i]) and not math.isnan(svs[\"Real_data__Start\"][i]):\n",
" tp += 1\n",
" elif not math.isnan(start_values[i]) and math.isnan(svs[\"Real_data__Start\"][i]):\n",
" fp += 1\n",
" return tp, fp, fn\n",
"\n",
"recall = OrderedDict()\n",
"recall_f = OrderedDict()\n",
"precision = OrderedDict()\n",
"precision_f = OrderedDict()\n",
"all_colors = [\"#405f93\", \"#468a56\", \"#be4447\", \"#6553a0\", \"#DF3A01\", \"#868A08\"]\n",
"all_colors_f = [\"#072352\", \"#0a3614\", \"#450a0b\", \"#20124b\", \"#461707\", \"#2d2e05\"]\n",
"colors = {}\n",
"i = 0\n",
"for tool in tools:\n",
" if tool + \"__Start\" in results_df:\n",
" tp, fp, fn = compute_tp_fp_fn(results_df, tool)\n",
" recall[tool] = [tp / (tp+fn) * 100]\n",
" precision[tool] = [tp / (tp+fp) * 100]\n",
" colors[tool] = all_colors[i]\n",
" # Filtered ones:\n",
" tp, fp, fn = compute_tp_fp_fn(results_df, tool, True)\n",
" recall_f[tool] = [tp / (tp+fn) * 100]\n",
" precision_f[tool] = [tp / (tp+fp) * 100]\n",
" i += 1\n",
" \n",
"plt.figure(1, figsize=(20,10))\n",
"\n",
"# clrs = ['grey' if (x < max(values)) else 'red' for x in values ]\n",
"\n",
"# Plot recall\n",
"plt.subplot(121)\n",
"recall_df = pd.DataFrame.from_dict(recall, orient=\"columns\")\n",
"plot = sns.barplot(data=recall_df)\n",
"plot = sns.barplot(data=recall_df, palette=all_colors)\n",
"recall_f_df = pd.DataFrame.from_dict(recall_f, orient=\"columns\")\n",
"plot = sns.barplot(data=recall_f_df, palette=all_colors_f)\n",
"plot.set_title(\"Recall\", fontsize=30)\n",
"plot.set_ylabel(\"Recall (%)\", fontsize=20)\n",
"plot.set_xlabel(\"Tool\", fontsize=20)\n",
...
...
@@ -94,8 +114,10 @@
"\n",
"# Plot precision\n",
"plt.subplot(122)\n",
"precision_f_df = pd.DataFrame.from_dict(precision_f, orient=\"columns\")\n",
"plot2 = sns.barplot(data=precision_f_df, palette=all_colors_f)\n",
"precision_df = pd.DataFrame.from_dict(precision, orient=\"columns\")\n",
"plot2 = sns.barplot(data=precision_df)\n",
"plot2 = sns.barplot(data=precision_df
, palette=all_colors
)\n",
"plot2.set_title(\"Precision\", fontsize=30)\n",
"plot2.set_ylabel(\"Precision (%)\", fontsize=20)\n",
"plot2.set_xlabel(\"Tool\", fontsize=20)\n",
...
...
%% Cell type:markdown id: tags:
# CNV detection benchmark
%% Cell type:code id: tags:
```
python
import
pandas
as
pd
import
numpy
as
np
from
collections
import
OrderedDict
import
math
import
re
import
os
import
json
import
pylab
as
P
import
matplotlib
as
mpl
import
matplotlib.pyplot
as
plt
import
seaborn
as
sns
sns
.
set
(
style
=
"
whitegrid
"
,
color_codes
=
True
)
%
matplotlib
inline
from
pysam
import
VariantFile
from
collections
import
defaultdict
from
collections
import
Counter
from
IPython.display
import
display
,
HTML
# Read tsv file
results_df
=
pd
.
read_table
(
"
results_sv_per_tools.tsv
"
,
header
=
0
,
index_col
=
0
)
# Retrieve list of tools
tools
=
set
()
for
col
in
results_df
.
columns
:
if
col
.
endswith
(
"
__Start
"
)
and
col
!=
"
Real_data__Start
"
and
col
!=
"
Filtered_results__Start
"
:
tools
.
add
(
col
.
split
(
"
__
"
)[
0
])
```
%% Cell type:markdown id: tags:
### Recall & Precision
%% Cell type:code id: tags:
```
python
def
compute_tp_fp_fn
(
svs
,
tool
):
def
compute_tp_fp_fn
(
svs
,
tool
,
only_filter
=
False
,
include_duplicates
=
True
):
tp
=
0
fp
=
0
fn
=
0
start_values
=
svs
[
"
{0}__Start
"
.
format
(
tool
)]
for
i
in
range
(
0
,
len
(
start_values
)):
if
math
.
isnan
(
start_values
[
i
])
and
not
math
.
isnan
(
svs
[
"
Real_data__Start
"
][
i
]):
if
not
only_filter
or
(
svs
[
"
{0}__Filters
"
.
format
(
tool
)][
i
]
==
"
PASS
"
or
(
include_duplicates
and
svs
[
"
{0}__Filters
"
.
format
(
tool
)][
i
]
==
"
DUPLICATE
"
)):
if
math
.
isnan
(
start_values
[
i
])
and
not
math
.
isnan
(
svs
[
"
Real_data__Start
"
][
i
]):
fn
+=
1
elif
not
math
.
isnan
(
start_values
[
i
])
and
not
math
.
isnan
(
svs
[
"
Real_data__Start
"
][
i
]):
tp
+=
1
elif
not
math
.
isnan
(
start_values
[
i
])
and
math
.
isnan
(
svs
[
"
Real_data__Start
"
][
i
]):
fp
+=
1
elif
not
math
.
isnan
(
svs
[
"
Real_data__Start
"
][
i
]):
fn
+=
1
elif
not
math
.
isnan
(
start_values
[
i
])
and
not
math
.
isnan
(
svs
[
"
Real_data__Start
"
][
i
]):
tp
+=
1
elif
not
math
.
isnan
(
start_values
[
i
])
and
math
.
isnan
(
svs
[
"
Real_data__Start
"
][
i
]):
fp
+=
1
return
tp
,
fp
,
fn
recall
=
OrderedDict
()
recall_f
=
OrderedDict
()
precision
=
OrderedDict
()
precision_f
=
OrderedDict
()
all_colors
=
[
"
#405f93
"
,
"
#468a56
"
,
"
#be4447
"
,
"
#6553a0
"
,
"
#DF3A01
"
,
"
#868A08
"
]
all_colors_f
=
[
"
#072352
"
,
"
#0a3614
"
,
"
#450a0b
"
,
"
#20124b
"
,
"
#461707
"
,
"
#2d2e05
"
]
colors
=
{}
i
=
0
for
tool
in
tools
:
if
tool
+
"
__Start
"
in
results_df
:
tp
,
fp
,
fn
=
compute_tp_fp_fn
(
results_df
,
tool
)
recall
[
tool
]
=
[
tp
/
(
tp
+
fn
)
*
100
]
precision
[
tool
]
=
[
tp
/
(
tp
+
fp
)
*
100
]
colors
[
tool
]
=
all_colors
[
i
]
# Filtered ones:
tp
,
fp
,
fn
=
compute_tp_fp_fn
(
results_df
,
tool
,
True
)
recall_f
[
tool
]
=
[
tp
/
(
tp
+
fn
)
*
100
]
precision_f
[
tool
]
=
[
tp
/
(
tp
+
fp
)
*
100
]
i
+=
1
plt
.
figure
(
1
,
figsize
=
(
20
,
10
))
# clrs = ['grey' if (x < max(values)) else 'red' for x in values ]
# Plot recall
plt
.
subplot
(
121
)
recall_df
=
pd
.
DataFrame
.
from_dict
(
recall
,
orient
=
"
columns
"
)
plot
=
sns
.
barplot
(
data
=
recall_df
)
plot
=
sns
.
barplot
(
data
=
recall_df
,
palette
=
all_colors
)
recall_f_df
=
pd
.
DataFrame
.
from_dict
(
recall_f
,
orient
=
"
columns
"
)
plot
=
sns
.
barplot
(
data
=
recall_f_df
,
palette
=
all_colors_f
)
plot
.
set_title
(
"
Recall
"
,
fontsize
=
30
)
plot
.
set_ylabel
(
"
Recall (%)
"
,
fontsize
=
20
)
plot
.
set_xlabel
(
"
Tool
"
,
fontsize
=
20
)
plot
.
tick_params
(
labelsize
=
14
)
plot
.
set_ylim
([
0
,
100
])
# Plot precision
plt
.
subplot
(
122
)
precision_f_df
=
pd
.
DataFrame
.
from_dict
(
precision_f
,
orient
=
"
columns
"
)
plot2
=
sns
.
barplot
(
data
=
precision_f_df
,
palette
=
all_colors_f
)
precision_df
=
pd
.
DataFrame
.
from_dict
(
precision
,
orient
=
"
columns
"
)
plot2
=
sns
.
barplot
(
data
=
precision_df
)
plot2
=
sns
.
barplot
(
data
=
precision_df
,
palette
=
all_colors
)
plot2
.
set_title
(
"
Precision
"
,
fontsize
=
30
)
plot2
.
set_ylabel
(
"
Precision (%)
"
,
fontsize
=
20
)
plot2
.
set_xlabel
(
"
Tool
"
,
fontsize
=
20
)
plot2
.
tick_params
(
labelsize
=
14
)
plt
.
show
()
```
%% Cell type:markdown id: tags:
### Influence of variant size on recall
%% Cell type:code id: tags:
```
python
groups
=
[]
with
open
(
"
rules.sim
"
,
"
r
"
)
as
rules
:
for
line
in
rules
:
line
=
line
.
rstrip
()
if
line
!=
""
:
parts
=
re
.
split
(
r
"
\s+
"
,
line
)
groups
.
append
((
int
(
parts
[
1
]),
int
(
parts
[
2
])))
groups
.
sort
(
key
=
lambda
x
:
x
[
0
])
```
%% Cell type:markdown id: tags:
#### By tool
%% Cell type:code id: tags:
```
python
nrows
=
math
.
ceil
(
len
(
tools
)
/
2
)
ncols
=
min
(
2
,
len
(
tools
))
palettes
=
[
"
Blues_d
"
,
"
Greens_d
"
,
"
Reds_d
"
,
"
Purples_d
"
,
"
YlOrBr_d
"
,
"
PuBu_d
"
]
plt
.
figure
(
1
,
figsize
=
(
20
,
nrows
*
8
))
nplot
=
0
for
tool
in
tools
:
npalette
=
nplot
while
npalette
>=
len
(
palettes
):
npalette
-=
len
(
palettes
)
nplot
+=
1
results_by_group
=
OrderedDict
()
for
group
in
groups
:
tmp_res
=
results_df
[(
results_df
.
Real_data__Length
>=
group
[
0
])
&
(
results_df
.
Real_data__Length
<
group
[
1
])]
tp
,
fp
,
fn
=
compute_tp_fp_fn
(
tmp_res
,
tool
)
results_by_group
[
"
-
"
.
join
(
map
(
str
,
group
))]
=
[
tp
/
(
tp
+
fn
)
*
100
if
tp
+
fn
>
0
else
0
]
recall_df
=
pd
.
DataFrame
.
from_dict
(
results_by_group
,
orient
=
"
columns
"
)
plt
.
subplot
(
nrows
,
ncols
,
nplot
)
plot
=
sns
.
barplot
(
data
=
recall_df
,
palette
=
palettes
[
npalette
])
plot
.
set_title
(
tool
,
fontsize
=
25
)
plot
.
set_ylabel
(
"
Recall (%)
"
,
fontsize
=
20
)
plot
.
set_xlabel
(
"
Variant size (bp)
"
,
fontsize
=
20
)
plot
.
tick_params
(
labelsize
=
14
)
plot
.
set_ylim
([
0
,
100
])
plt
.
show
()
```
%% Cell type:markdown id: tags:
#### Global
%% Cell type:code id: tags:
```
python
results_by_group
=
OrderedDict
()
for
group
in
groups
:
tmp_res
=
results_df
[(
results_df
.
Real_data__Length
>=
group
[
0
])
&
(
results_df
.
Real_data__Length
<
group
[
1
])]
tp
,
fp
,
fn
=
compute_tp_fp_fn
(
tmp_res
,
"
Filtered_results
"
)
results_by_group
[
"
-
"
.
join
(
map
(
str
,
group
))]
=
[
tp
/
(
tp
+
fn
)
*
100
if
tp
+
fn
>
0
else
0
]
plt
.
figure
(
1
,
figsize
=
(
15
,
8
))
recall_df
=
pd
.
DataFrame
.
from_dict
(
results_by_group
,
orient
=
"
columns
"
)
plot
=
sns
.
barplot
(
data
=
recall_df
,
color
=
'
black
'
)
plot
.
set_title
(
"
Recall
"
,
fontsize
=
30
)
plot
.
set_ylabel
(
"
Recall (%)
"
,
fontsize
=
20
)
plot
.
set_xlabel
(
"
Variant size (bp)
"
,
fontsize
=
20
)
plot
.
tick_params
(
labelsize
=
14
)
plot
.
set_ylim
([
0
,
100
])
plt
.
show
()
```
%% Cell type:markdown id: tags:
### Shared variant by tool
%% Cell type:code id: tags:
```
python
def
compute_found_sv
(
svs
:
pd
.
DataFrame
,
tool
:
str
):
variants
=
[]
start_values
=
svs
[
"
{0}__Start
"
.
format
(
tool
)]
for
idx
in
svs
.
index
:
if
not
math
.
isnan
(
start_values
[
idx
])
and
not
math
.
isnan
(
svs
[
"
Real_data__Start
"
][
idx
]):
variants
.
append
(
idx
)
return
variants
variants_by_tool
=
[]
for
tool
in
tools
:
variants_by_tool
.
append
({
"
name
"
:
tool
,
"
data
"
:
compute_found_sv
(
results_df
,
tool
)})
```
%% Cell type:code id: tags:
```
python
html
=
HTML
(
"
<script type=
'
text/javascript
'
src=
'
http://jvenn.toulouse.inra.fr/app/js/canvas2svg.js
'
></script>
\
<script type=
'
text/javascript
'
src=
'
http://jvenn.toulouse.inra.fr/app/js/jvenn.min.js
'
></script>
\
<div id=
'
draw
'
></div>
\
<script type=
'
text/javascript
'
>
\
$(document).ready(function(){
\
$(
'
#draw
'
).jvenn({
\
series:
"
+
json
.
dumps
(
variants_by_tool
)
+
"
\
});
\
});
\
</script>
"
)
display
(
html
)
```
%% Cell type:markdown id: tags:
### Breakpoints precision
%% Cell type:code id: tags:
```
python
plt
.
figure
(
1
,
figsize
=
(
20
,
8
))
# Start precision
df_diffs
=
pd
.
read_table
(
"
results_sv_diffs_per_tools.tsv
"
,
header
=
0
,
index_col
=
0
)
all_diffs_soft_start
=
pd
.
DataFrame
()
for
tool
in
tools
:
all_diffs_soft_start
[
tool
]
=
df_diffs
[
tool
+
"
__Start
"
].
abs
()
plt
.
subplot
(
121
)
plot
=
sns
.
stripplot
(
data
=
all_diffs_soft_start
,
jitter
=
True
)
plot
.
set_ylim
([
0
,
150
])
plot
.
tick_params
(
labelsize
=
15
)
plot
.
set_title
(
"
Start position
"
,
fontsize
=
28
,
y
=
1.04
)
plot
.
set_ylabel
(
"
Diff from real data (abs)
"
,
fontsize
=
20
)
# End precision
df_diffs
=
pd
.
read_table
(
"
results_sv_diffs_per_tools.tsv
"
,
header
=
0
,
index_col
=
0
)
all_diffs_soft_end
=
pd
.
DataFrame
()
for
tool
in
tools
:
all_diffs_soft_end
[
tool
]
=
df_diffs
[
tool
+
"
__End
"
].
abs
()
plt
.
subplot
(
122
)
plot
=
sns
.
stripplot
(
data
=
all_diffs_soft_end
,
jitter
=
True
)
plot
.
set_ylim
([
0
,
150
])
plot
.
tick_params
(
labelsize
=
15
)
plot
.
set_title
(
"
End position
"
,
fontsize
=
28
,
y
=
1.04
)
plot
.
set_ylabel
(
"
Diff from real data (abs)
"
,
fontsize
=
20
)
plt
.
show
()
```
%% Cell type:markdown id: tags:
## 2. CNV Genotyping
We compare quality of genotyping
%% Cell type:code id: tags:
```
python
# Simulated deletions
total
=
len
(
results_df
[
results_df
.
Real_data__Start
.
notnull
()])
```
%% Cell type:code id: tags:
```
python
# Predicted deletions genotyped
df_svtyper
=
pd
.
read_csv
(
"
results_genotype.tsv
"
,
sep
=
'
\t
'
,
index_col
=
False
,
names
=
[
'
del
'
,
'
software
'
,
'
delsize
'
,
'
recall
'
,
'
left
'
,
'
right
'
])
df_svtyper
[
'
precision
'
]
=
[
"
precise
"
if
(
x
+
y
)
<
20
else
"
unprecise
"
for
(
x
,
y
)
in
zip
(
df_svtyper
.
left
,
df_svtyper
.
right
)]
df_svtyper
[
'
deltype
'
]
=
[
'
small
'
if
x
<
200
else
'
medium
'
for
x
in
df_svtyper
.
delsize
]
counts_svtyper
=
np
.
unique
(
df_svtyper
.
software
,
return_counts
=
True
)[
1
]
```
%% Cell type:markdown id: tags:
#### Genotype recall for each software prediction
%% Cell type:code id: tags:
```
python
gt_tools
=
list
(
tools
)
+
[
"
pass
"
]
plt
.
figure
(
1
,
figsize
=
(
8
,
5
))
sns
.
stripplot
(
x
=
"
software
"
,
y
=
"
recall
"
,
jitter
=
True
,
palette
=
"
Set2
"
,
dodge
=
True
,
linewidth
=
1
,
edgecolor
=
'
gray
'
,
order
=
gt_tools
,
data
=
df_svtyper
)
axes
=
sns
.
boxplot
(
x
=
"
software
"
,
y
=
"
recall
"
,
palette
=
"
Set2
"
,
order
=
gt_tools
,
data
=
df_svtyper
)
axes
.
title
.
set_position
([.
5
,
1.2
])
axes
.
set_title
(
str
(
total
)
+
"
simulated variants
"
,
size
=
25
)
axes
.
axes
.
xaxis
.
label
.
set_size
(
20
)
axes
.
axes
.
yaxis
.
label
.
set_size
(
20
)
axes
.
tick_params
(
labelsize
=
15
)
ymax
=
axes
.
get_ylim
()[
1
]
for
i
in
range
(
0
,
len
(
counts_svtyper
)):
t1
=
axes
.
text
(
-
0.1
+
(
i
*
1
),
ymax
+
ymax
/
100
,
counts_svtyper
[
i
],
fontsize
=
15
)
```
%% Cell type:markdown id: tags:
#### Inluence of precision : precise means less than 20bp
%% Cell type:code id: tags:
```
python
plt
.
figure
(
1
,
figsize
=
(
8
,
5
))
sns
.
stripplot
(
x
=
"
software
"
,
y
=
"
recall
"
,
jitter
=
True
,
hue
=
"
precision
"
,
palette
=
"
Set2
"
,
dodge
=
True
,
linewidth
=
1
,
edgecolor
=
'
gray
'
,
order
=
gt_tools
,
data
=
df_svtyper
)
axes
=
sns
.
boxplot
(
x
=
"
software
"
,
y
=
"
recall
"
,
hue
=
"
precision
"
,
palette
=
"
Set2
"
,
order
=
gt_tools
,
data
=
df_svtyper
)
axes
.
title
.
set_position
([.
5
,
1.2
])
axes
.
set_ylim
(
0.15
,
1.05
)
axes
.
set_title
(
str
(
total
)
+
"
simulated variants
"
,
size
=
25
)
axes
.
axes
.
xaxis
.
label
.
set_size
(
20
)
axes
.
axes
.
yaxis
.
label
.
set_size
(
20
)
axes
.
tick_params
(
labelsize
=
15
)
ymax
=
axes
.
get_ylim
()[
1
]
for
i
in
range
(
0
,
len
(
counts_svtyper
)):
t1
=
axes
.
text
(
-
0.1
+
(
i
*
1
),
ymax
+
ymax
/
100
,
counts_svtyper
[
i
],
fontsize
=
15
)
plt
.
legend
(
bbox_to_anchor
=
(
1.05
,
1
),
loc
=
2
,
borderaxespad
=
0.
)
```
%% Cell type:markdown id: tags:
#### Inluence of deletion size : small means less than 200bp
%% Cell type:code id: tags:
```
python
plt
.
figure
(
1
,
figsize
=
(
8
,
5
))
sns
.
stripplot
(
x
=
"
software
"
,
y
=
"
recall
"
,
jitter
=
True
,
hue
=
"
deltype
"
,
palette
=
"
Set2
"
,
dodge
=
True
,
linewidth
=
1
,
edgecolor
=
'
gray
'
,
order
=
gt_tools
,
data
=
df_svtyper
)
axes
=
sns
.
boxplot
(
x
=
"
software
"
,
y
=
"
recall
"
,
hue
=
"
deltype
"
,
palette
=
"
Set2
"
,
order
=
gt_tools
,
data
=
df_svtyper
)
axes
.
title
.
set_position
([.
5
,
1.2
])
axes
.
set_ylim
(
0.15
,
1.05
)
axes
.
set_title
(
str
(
total
)
+
"
simulated variants
"
,
size
=
25
)
axes
.
axes
.
xaxis
.
label
.
set_size
(
20
)
axes
.
axes
.
yaxis
.
label
.
set_size
(
20
)
axes
.
tick_params
(
labelsize
=
15
)
ymax
=
axes
.
get_ylim
()[
1
]
for
i
in
range
(
0
,
len
(
counts_svtyper
)):
t1
=
axes
.
text
(
-
0.1
+
(
i
*
1
),
ymax
+
ymax
/
100
,
counts_svtyper
[
i
],
fontsize
=
15
)
plt
.
legend
(
bbox_to_anchor
=
(
1.05
,
1
),
loc
=
2
,
borderaxespad
=
0.
)
```
%% Cell type:markdown id: tags:
##### Size VS Precision
%% Cell type:code id: tags:
```
python
plt
.
figure
(
1
,
figsize
=
(
8
,
5
))
sns
.
stripplot
(
x
=
"
precision
"
,
y
=
"
delsize
"
,
jitter
=
True
,
palette
=
"
Set2
"
,
dodge
=
True
,
linewidth
=
1
,
edgecolor
=
'
gray
'
,
order
=
[
'
precise
'
,
'
unprecise
'
],
data
=
df_svtyper
)
axes
=
sns
.
boxplot
(
x
=
"
precision
"
,
y
=
"
delsize
"
,
palette
=
"
Set2
"
,
order
=
[
'
precise
'
,
'
unprecise
'
],
data
=
df_svtyper
)
axes
.
axes
.
xaxis
.
label
.
set_size
(
20
)
axes
.
axes
.
yaxis
.
label
.
set_size
(
20
)
axes
.
tick_params
(
labelsize
=
15
)
```
...
...
This diff is collapsed.
Click to expand it.
build_results.py
+
51
−
14
View file @
31b9c3bb
...
...
@@ -175,7 +175,7 @@ def get_genotypes(genotypes_files, true_vcf_file):
return
genotypes
,
gt_quality
,
nb_inds
def
build_records
(
genotypes
,
SVSet
,
true_ones_records
,
filtered_records
,
gt_quality
):
def
build_records
(
genotypes
,
SVSet
,
true_ones_records
,
filtered_records
,
gt_quality
,
all_variants_filter
):
"""
Build records for each SV
:param genotypes: list of genotypes of each individual for each SV
...
...
@@ -216,9 +216,13 @@ def build_records(genotypes, SVSet, true_ones_records, filtered_records, gt_qual
"
end
"
:
node
.
end
,
"
length
"
:
int
(
node
.
end
)
-
int
(
node
.
start
),
"
filtered
"
:
(
node
.
id
in
filtered_records
)
if
filtered_records
is
not
None
else
False
,
"
filters
"
:
"
,
"
.
join
([
x
for
x
in
all_variants_filter
[
node
.
id
].
filter
])
if
node
.
id
in
all_variants_filter
else
""
,
"
genotypes
"
:
genotypes
[
node
.
id
]
if
node
.
id
in
genotypes
else
None
,
"
qualities
"
:
gt_quality
[
node
.
id
]
if
node
.
id
in
gt_quality
else
None
}
if
results
[
tool_name
][
"
filters
"
]
==
""
:
results
[
tool_name
][
"
filters
"
]
=
"
PASS
"
tools
.
add
(
tool_name
)
chromosome
=
node
.
chrom
...
...
@@ -295,6 +299,12 @@ def build_header(tools, filtered_records, nb_records, max_col_len, nb_inds):
"
format
"
:
l_format
}
max_col_len
[
i
+
2
]
=
get_max_len
(
"
Length
"
,
i
+
2
,
max_col_len
)
if
tool
not
in
[
"
Real data
"
,
"
Filtered results
"
]:
cells
[
XLSX_COLS
[
i
+
3
]
+
"
2
"
]
=
cells
[
XLSX_COLS
[
i
+
3
]
+
str
(
2
+
nb_records
+
3
)]
=
{
"
text
"
:
"
Filters
"
,
"
format
"
:
l_format
}
max_col_len
[
i
+
3
]
=
get_max_len
(
"
Filters
"
,
i
+
3
,
max_col_len
)
i
+=
1
# Genotypes:
for
k
in
range
(
0
,
nb_inds
):
cells_gt
[
XLSX_COLS
[
j
+
k
]
+
"
2
"
]
=
cells_gq
[
XLSX_COLS
[
j
+
k
]
+
"
2
"
]
=
\
...
...
@@ -352,7 +362,7 @@ def fill_real_data(row, cells, cells_gt, cells_gq, max_col_len, record, rec_id,
def
fill_tool_data
(
row
,
col
,
cells
,
max_col_len
,
record
,
nb_records
,
my_start
,
my_end
,
my_length
,
sv_format
=
None
):
sv_format
=
None
,
no_filter
=
False
):
"""
Fill cells for a tool and a record
:param row: row position
...
...
@@ -387,6 +397,10 @@ def fill_tool_data(row, col, cells, max_col_len, record, nb_records, my_start, m
cells
[
XLSX_COLS
[
col
+
2
]
+
str
(
row
)]
=
{
"
text
"
:
record
[
"
length
"
],
"
format
"
:
sv_format
}
max_col_len
[
col
+
2
]
=
get_max_len
(
record
[
"
length
"
],
col
+
2
,
max_col_len
)
if
"
filters
"
in
record
and
not
no_filter
:
cells
[
XLSX_COLS
[
col
+
3
]
+
str
(
row
)]
=
{
"
text
"
:
record
[
"
filters
"
],
"
format
"
:
sv_format
}
max_col_len
[
col
+
3
]
=
get_max_len
(
record
[
"
filters
"
],
col
+
3
,
max_col_len
)
###########
# DIFFS: #
###########
...
...
@@ -408,6 +422,11 @@ def fill_tool_data(row, col, cells, max_col_len, record, nb_records, my_start, m
cells
[
XLSX_COLS
[
col
+
2
]
+
str
(
row
+
nb_records
+
3
)]
=
{
"
text
"
:
length
,
"
format
"
:
sv_format
}
# FILTERS:
if
"
filters
"
in
record
and
not
no_filter
:
cells
[
XLSX_COLS
[
col
+
3
]
+
str
(
row
+
nb_records
+
3
)]
=
{
"
text
"
:
record
[
"
filters
"
],
"
format
"
:
sv_format
}
return
cells
,
max_col_len
...
...
@@ -454,18 +473,26 @@ def write_headers(workbook, worksheet, headers, cell_len, rows, filtered_records
:param rows: (list) rows into write the header
:param filtered_records: (bool) is there filtered records
"""
orig_cell_len
=
cell_len
for
row
in
rows
:
merge_format
=
workbook
.
add_format
({
'
bold
'
:
True
})
worksheet
.
write
(
XLSX_COLS
[
1
]
+
str
(
row
+
1
)
+
"
:
"
+
XLSX_COLS
[
cell_len
]
+
str
(
row
+
1
),
headers
[
0
],
merge_format
)
i
=
0
left_cell_lenth
=
2
for
header
in
headers
[
1
:]:
if
header
not
in
[
"
Real data
"
,
"
Filtered results
"
]:
cell_len
=
orig_cell_len
+
1
else
:
cell_len
=
orig_cell_len
if
i
<
len
(
headers
)
-
2
or
not
filtered_records
:
merge_format
=
workbook
.
add_format
({
'
align
'
:
'
center
'
})
else
:
merge_format
=
workbook
.
add_format
({
'
align
'
:
'
center
'
,
'
bg_color
'
:
COLOR_COL_FILTER
})
for
row
in
rows
:
worksheet
.
merge_range
(
XLSX_COLS
[
2
+
i
*
cell_len
]
+
str
(
row
)
+
"
:
"
+
XLSX_COLS
[
1
+
cell_len
+
i
*
cell_len
]
+
str
(
row
),
worksheet
.
merge_range
(
XLSX_COLS
[
left_cell_lenth
]
+
str
(
row
)
+
"
:
"
+
XLSX_COLS
[
left_cell_lenth
+
cell_len
-
1
]
+
str
(
row
),
header
,
merge_format
)
left_cell_lenth
+=
cell_len
i
+=
1
...
...
@@ -560,15 +587,24 @@ def create_tsv_file(filename: str, headers: list, cells: dict, nb_tools: int, nb
head
=
[
""
,
headers
[
0
]]
top_headers
=
{}
h
=
2
nb_only_tools
=
nb_tools
# Number of tools, except real data and filtered results
nb_others
=
0
for
header
in
headers
[
1
:]:
# Define top headers to each column:
for
i
in
range
(
0
,
nb_per_tool
):
top_headers
[
h
]
=
header
head
.
append
(
""
)
h
+=
1
if
header
not
in
[
"
Real data
"
,
"
Filtered results
"
]:
top_headers
[
h
]
=
header
head
.
append
(
""
)
h
+=
1
else
:
nb_only_tools
-=
1
nb_others
+=
1
rows
=
[
head
]
for
i
in
range
(
0
,
records_range
[
1
]
-
records_range
[
0
]):
rows
.
append
([
""
for
x
in
range
(
0
,
(
nb_
tool
s
*
nb_per_tool
)
+
2
)])
rows
.
append
([
""
for
x
in
range
(
0
,
(
nb_
only_tools
*
(
nb_per_tool
+
1
))
+
(
nb_other
s
*
nb_per_tool
)
+
2
)])
# Fill content:
for
id_cell
,
cell
in
cells
.
items
():
...
...
@@ -679,17 +715,17 @@ def apply_style_of_filter_cells(cells, cells_gt, cells_gq, i, is_kept, nb_record
cells
[
XLSX_COLS
[
0
]
+
str
(
i
+
nb_records
+
3
)][
"
format
"
][
"
bg_color
"
]
=
COLOR_IS_KEPT
elif
filtered_records
is
not
None
:
# SV does not pass the filter: color bg in red in the filter column
cells
[
XLSX_COLS
[
2
+
((
nb_tools
+
1
)
*
3
)]
+
str
(
i
)]
=
{
"
text
"
:
""
,
"
format
"
:
{
"
bg_color
"
:
COLOR_NOT_FOUND
}}
cells
[
XLSX_COLS
[
2
+
((
nb_tools
+
1
)
*
3
)
+
1
]
+
str
(
i
)]
=
{
"
text
"
:
""
,
cells
[
XLSX_COLS
[
2
+
((
nb_tools
*
4
)
+
3
)]
+
str
(
i
)]
=
{
"
text
"
:
""
,
"
format
"
:
{
"
bg_color
"
:
COLOR_NOT_FOUND
}}
cells
[
XLSX_COLS
[
2
+
((
nb_tools
*
4
)
+
3
)
+
1
]
+
str
(
i
)]
=
{
"
text
"
:
""
,
"
format
"
:
{
"
bg_color
"
:
COLOR_NOT_FOUND
}}
cells
[
XLSX_COLS
[
2
+
((
nb_tools
+
1
)
*
3
)
+
2
]
+
str
(
i
)]
=
{
"
text
"
:
""
,
cells
[
XLSX_COLS
[
2
+
((
nb_tools
*
4
)
+
3
)
+
2
]
+
str
(
i
)]
=
{
"
text
"
:
""
,
"
format
"
:
{
"
bg_color
"
:
COLOR_NOT_FOUND
}}
cells
[
XLSX_COLS
[
2
+
((
nb_tools
+
1
)
*
3
)]
+
str
(
i
+
nb_records
+
3
)]
=
{
"
text
"
:
""
,
cells
[
XLSX_COLS
[
2
+
((
nb_tools
*
4
)
+
3
)]
+
str
(
i
+
nb_records
+
3
)]
=
{
"
text
"
:
""
,
"
format
"
:
{
"
bg_color
"
:
COLOR_NOT_FOUND
}}
cells
[
XLSX_COLS
[
2
+
((
nb_tools
+
1
)
*
3
)
+
1
]
+
str
(
i
+
nb_records
+
3
)]
=
{
cells
[
XLSX_COLS
[
2
+
((
nb_tools
*
4
)
+
3
)
+
1
]
+
str
(
i
+
nb_records
+
3
)]
=
{
"
text
"
:
""
,
"
format
"
:
{
"
bg_color
"
:
COLOR_NOT_FOUND
}}
cells
[
XLSX_COLS
[
2
+
((
nb_tools
+
1
)
*
3
)
+
2
]
+
str
(
i
+
nb_records
+
3
)]
=
{
cells
[
XLSX_COLS
[
2
+
((
nb_tools
*
4
)
+
3
)
+
2
]
+
str
(
i
+
nb_records
+
3
)]
=
{
"
text
"
:
""
,
"
format
"
:
{
"
bg_color
"
:
COLOR_NOT_FOUND
}}
# Genotype:
...
...
@@ -743,9 +779,9 @@ def build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells, cells
sv_format
=
{
"
bg_color
"
:
COLOR_IS_KEPT
}
# SV data (sheet 1):
cells
,
max_col_len
=
fill_tool_data
(
i
,
2
+
((
nb_tools
+
1
)
*
3
),
cells
,
max_col_len
,
cells
,
max_col_len
=
fill_tool_data
(
i
,
2
+
((
nb_tools
*
4
)
+
3
),
cells
,
max_col_len
,
record
[
"
tools
"
][
tool
],
nb_records
,
my_start
,
my_end
,
my_length
,
{
"
bg_color
"
:
COLOR_COL_FILTER
})
{
"
bg_color
"
:
COLOR_COL_FILTER
}
,
True
)
if
do_genotype
:
# Genotype (sheets 2&3):
...
...
@@ -775,7 +811,7 @@ def build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells, cells
cells
,
cells_gt
,
cells_gq
=
fill_cells_no_tools
(
cells
,
cells_gt
,
cells_gq
,
i
,
j
,
g
,
nb_records
,
nb_inds
,
do_genotype
)
j
+=
3
j
+=
4
g
+=
nb_inds
###############################################################################
...
...
@@ -843,7 +879,8 @@ def init(output, vcf_files, true_vcf, rules, filtered_vcfs=None, type_v="del", o
construct_overlap_graph
(
sv_set
,
overlap_cutoff
,
left_precision
,
right_precision
)
# Build records:
records
,
tools
,
orphans
=
build_records
(
genotypes
,
sv_set
,
true_variants
.
keys
(),
filtered_records
,
gt_quality
)
records
,
tools
,
orphans
=
build_records
(
genotypes
,
sv_set
,
true_variants
.
keys
(),
filtered_records
,
gt_quality
,
filtered_all
)
nb_records
=
len
(
records
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment