Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
popsim
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
SVdetection
popsim
Commits
a96b9922
Commit
a96b9922
authored
7 years ago
by
Floreal Cabanettes
Browse files
Options
Downloads
Patches
Plain Diff
Add genotypes comparison + resize columns
parent
410e18aa
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
build_xls_results.py
+145
-8
145 additions, 8 deletions
build_xls_results.py
with
145 additions
and
8 deletions
build_xls_results.py
+
145
−
8
View file @
a96b9922
...
...
@@ -18,6 +18,7 @@ import argparse, sys
import
glob
import
vcf
import
string
import
os
import
re
from
svreader.vcfwrapper
import
VCFReader
...
...
@@ -25,6 +26,8 @@ from svinterval import construct_overlap_graph, connected_components
import
xlsxwriter
prg_path
=
os
.
path
.
dirname
(
os
.
path
.
realpath
(
__file__
))
alphabet
=
list
(
string
.
ascii_uppercase
)
xlsx_cols
=
alphabet
.
copy
()
for
i
in
alphabet
:
...
...
@@ -41,6 +44,7 @@ description: Merge SV based on reciprocal overlap")
parser
.
add_argument
(
'
-t
'
,
'
--true-vcf
'
,
type
=
str
,
required
=
True
,
help
=
'
VCF file containing the simulated deletions
'
)
parser
.
add_argument
(
'
-f
'
,
'
--filtered-vcf
'
,
type
=
str
,
required
=
False
,
help
=
'
VCF file containing the filtered results
'
)
parser
.
add_argument
(
'
-g
'
,
'
--genotypes
'
,
type
=
str
,
help
=
"
VCF file containing genotypes
"
)
parser
.
add_argument
(
'
--overlap_cutoff
'
,
type
=
float
,
default
=
0.5
,
help
=
'
cutoff for reciprocal overlap
'
)
parser
.
add_argument
(
'
--left_precision
'
,
type
=
int
,
default
=-
1
,
help
=
'
left breakpoint precision
'
)
parser
.
add_argument
(
'
--right_precision
'
,
type
=
int
,
default
=-
1
,
help
=
'
right breakpoint precision
'
)
...
...
@@ -125,10 +129,37 @@ def svsort(sv, records):
first_tool
=
list
(
records
[
sv
][
"
tools
"
].
keys
())[
0
]
return
int
(
records
[
sv
][
"
tools
"
][
first_tool
][
"
start
"
])
def
get_gt
(
geno
,
true_gt
):
if
true_gt
is
not
None
:
if
geno
==
"
1/0
"
and
true_gt
==
"
0/1
"
:
geno
=
"
0/1
"
elif
geno
==
"
0/1
"
and
true_gt
==
"
1/0
"
:
geno
=
"
1/0
"
return
geno
def
get_max_len
(
cell
,
col
,
max_col_len
):
return
max
(
len
(
str
(
cell
)),
max_col_len
[
col
]
if
col
in
max_col_len
else
0
)
def
main
():
# parse the command line args
args
=
get_args
()
genotypes
=
{}
nb_inds
=
0
if
args
.
genotypes
:
genotypes_raw
=
os
.
popen
(
"
zcat genotype_chr_chr12.vcf.gz
"
+
args
.
true_vcf
+
"
|
"
+
os
.
path
.
join
(
prg_path
,
"
vawk
"
)
+
"
'
{ print $3,S$*$GT}
'"
).
read
().
split
(
"
\n
"
)
for
gt
in
genotypes_raw
:
if
gt
!=
""
:
gt_l
=
gt
.
split
(
"
\t
"
)
genotypes
[
gt_l
[
0
]]
=
gt_l
[
1
:]
nb_inds
=
len
(
gt_l
[
1
:])
overlap_cutoff
=
args
.
overlap_cutoff
left_precision
=
args
.
left_precision
right_precision
=
args
.
right_precision
...
...
@@ -181,7 +212,8 @@ def main():
"
end
"
:
node
.
end
,
"
length
"
:
int
(
node
.
end
)
-
int
(
node
.
start
),
"
tools
"
:
{},
"
orphan
"
:
False
"
orphan
"
:
False
,
"
genotypes
"
:
genotypes
[
true_one
]
if
true_one
in
genotypes
else
None
}
else
:
tool_name
=
node
.
id
.
split
(
"
_
"
)[
0
]
...
...
@@ -189,7 +221,8 @@ def main():
"
start
"
:
node
.
start
,
"
end
"
:
node
.
end
,
"
length
"
:
int
(
node
.
end
)
-
int
(
node
.
start
),
"
filtered
"
:
(
node
.
id
in
filtered_records
)
if
filtered_records
is
not
None
else
False
"
filtered
"
:
(
node
.
id
in
filtered_records
)
if
filtered_records
is
not
None
else
False
,
"
genotypes
"
:
genotypes
[
node
.
id
]
if
node
.
id
in
genotypes
else
None
}
tools
.
add
(
tool_name
)
...
...
@@ -202,7 +235,8 @@ def main():
"
end
"
:
""
,
"
length
"
:
""
,
"
tools
"
:
results
,
"
orphan
"
:
True
"
orphan
"
:
True
,
"
genotypes
"
:
None
}
names
.
sort
()
...
...
@@ -218,10 +252,19 @@ def main():
"
A
"
+
str
(
1
+
nb_records
+
3
):
{
"
text
"
:
"
DIFFS
"
,
"
format
"
:
{
"
bg_color
"
:
"
#ffe856
"
}},
"
A
"
+
str
(
2
+
nb_records
+
3
):
{
"
text
"
:
"
Deletion
"
,
"
format
"
:
{
"
bold
"
:
True
}}
}
cells_gt
=
{
"
A1
"
:
{
"
text
"
:
"
GENOTYPES
"
,
"
format
"
:
{
"
bg_color
"
:
"
#ffe856
"
}},
"
A2
"
:
{
"
text
"
:
"
Deletion
"
,
"
format
"
:
{
"
bold
"
:
True
}}
}
tools
=
sorted
(
tools
)
nb_tools
=
len
(
tools
)
max_col_len
=
{}
i
=
1
j
=
1
headers
=
[]
for
tool
in
[
"
Real data
"
]
+
tools
+
([
"
Filtered results
"
]
if
filtered_records
is
not
None
else
[]):
#cells[xlsx_cols[i] + "1"] = cells[xlsx_cols[i] + str(1+nb_records+3)] = {"text": tool, "format": ""}
...
...
@@ -230,11 +273,20 @@ def main():
if
tool
==
"
Filtered results
"
:
l_format
[
"
bg_color
"
]
=
color_col_filter
cells
[
xlsx_cols
[
i
]
+
"
2
"
]
=
cells
[
xlsx_cols
[
i
]
+
str
(
2
+
nb_records
+
3
)]
=
{
"
text
"
:
"
Start
"
,
"
format
"
:
l_format
}
max_col_len
[
i
]
=
get_max_len
(
"
Start
"
,
i
,
max_col_len
)
cells
[
xlsx_cols
[
i
+
1
]
+
"
2
"
]
=
cells
[
xlsx_cols
[
i
+
1
]
+
str
(
2
+
nb_records
+
3
)]
=
{
"
text
"
:
"
End
"
,
"
format
"
:
l_format
}
max_col_len
[
i
+
1
]
=
get_max_len
(
"
End
"
,
i
+
1
,
max_col_len
)
cells
[
xlsx_cols
[
i
+
2
]
+
"
2
"
]
=
cells
[
xlsx_cols
[
i
+
2
]
+
str
(
2
+
nb_records
+
3
)]
=
{
"
text
"
:
"
Length
"
,
"
format
"
:
l_format
}
max_col_len
[
i
+
2
]
=
get_max_len
(
"
Length
"
,
i
+
2
,
max_col_len
)
# Genotypes:
for
k
in
range
(
0
,
nb_inds
):
cells_gt
[
xlsx_cols
[
j
+
k
]
+
"
2
"
]
=
{
"
text
"
:
"
INDIV_
"
+
str
(
k
+
1
),
"
format
"
:
{
"
bold
"
:
True
}}
i
+=
3
j
+=
nb_inds
rec_keys
=
sorted
(
records
.
keys
(),
key
=
lambda
x
:
svsort
(
x
,
records
))
...
...
@@ -242,17 +294,31 @@ def main():
for
id
in
rec_keys
:
record
=
records
[
id
]
cells
[
xlsx_cols
[
0
]
+
str
(
i
)]
=
cells
[
xlsx_cols
[
0
]
+
str
(
i
+
nb_records
+
3
)]
=
{
"
text
"
:
id
,
"
format
"
:
{}}
max_col_len
[
0
]
=
get_max_len
(
id
,
0
,
max_col_len
)
print
(
max_col_len
[
0
])
cells_gt
[
xlsx_cols
[
0
]
+
str
(
i
)]
=
{
"
text
"
:
id
,
"
format
"
:
{}}
cells
[
xlsx_cols
[
1
]
+
str
(
i
)]
=
cells
[
xlsx_cols
[
1
]
+
str
(
i
+
nb_records
+
3
)]
=
{
"
text
"
:
record
[
"
start
"
],
"
format
"
:
{}}
max_col_len
[
1
]
=
get_max_len
(
record
[
"
start
"
],
1
,
max_col_len
)
my_start
=
record
[
"
start
"
]
cells
[
xlsx_cols
[
2
]
+
str
(
i
)]
=
cells
[
xlsx_cols
[
2
]
+
str
(
i
+
nb_records
+
3
)]
=
{
"
text
"
:
record
[
"
end
"
],
"
format
"
:
{}}
max_col_len
[
2
]
=
get_max_len
(
record
[
"
end
"
],
2
,
max_col_len
)
my_end
=
record
[
"
end
"
]
cells
[
xlsx_cols
[
3
]
+
str
(
i
)]
=
cells
[
xlsx_cols
[
3
]
+
str
(
i
+
nb_records
+
3
)]
=
{
"
text
"
:
record
[
"
length
"
],
"
format
"
:
{}}
max_col_len
[
3
]
=
get_max_len
(
record
[
"
length
"
],
3
,
max_col_len
)
my_length
=
record
[
"
length
"
]
# Genotypes:
my_genotypes
=
None
if
record
[
"
genotypes
"
]
is
not
None
:
for
gt
in
range
(
0
,
len
(
record
[
"
genotypes
"
])):
cells_gt
[
xlsx_cols
[
1
+
gt
]
+
str
(
i
)]
=
{
"
text
"
:
record
[
"
genotypes
"
][
gt
],
"
format
"
:
{}}
my_genotypes
=
record
[
"
genotypes
"
]
j
=
4
g
=
nb_inds
+
1
is_kept
=
False
for
tool
in
tools
:
if
tool
in
record
[
"
tools
"
]:
...
...
@@ -262,10 +328,19 @@ def main():
# Raw values:
cells
[
xlsx_cols
[
1
+
((
nb_tools
+
1
)
*
3
)]
+
str
(
i
)]
=
{
"
text
"
:
record
[
"
tools
"
][
tool
][
"
start
"
],
"
format
"
:
{
"
bg_color
"
:
color_col_filter
}}
max_col_len
[
1
+
((
nb_tools
+
1
)
*
3
)]
=
\
get_max_len
(
record
[
"
tools
"
][
tool
][
"
start
"
],
1
+
((
nb_tools
+
1
)
*
3
),
max_col_len
)
cells
[
xlsx_cols
[
1
+
((
nb_tools
+
1
)
*
3
)
+
1
]
+
str
(
i
)]
=
{
"
text
"
:
record
[
"
tools
"
][
tool
][
"
end
"
],
"
format
"
:
{
"
bg_color
"
:
color_col_filter
}}
max_col_len
[
1
+
((
nb_tools
+
1
)
*
3
)
+
1
]
=
\
get_max_len
(
record
[
"
tools
"
][
tool
][
"
end
"
],
1
+
((
nb_tools
+
1
)
*
3
)
+
1
,
max_col_len
)
cells
[
xlsx_cols
[
1
+
((
nb_tools
+
1
)
*
3
)
+
2
]
+
str
(
i
)]
=
{
"
text
"
:
record
[
"
tools
"
][
tool
][
"
length
"
],
"
format
"
:
{
"
bg_color
"
:
color_col_filter
}}
max_col_len
[
1
+
((
nb_tools
+
1
)
*
3
)
+
2
]
=
\
get_max_len
(
record
[
"
tools
"
][
tool
][
"
length
"
],
1
+
((
nb_tools
+
1
)
*
3
)
+
2
,
max_col_len
)
# Diffs:
if
my_start
!=
""
:
cells
[
xlsx_cols
[
1
+
((
nb_tools
+
1
)
*
3
)]
+
str
(
i
+
nb_records
+
3
)]
=
{
...
...
@@ -283,12 +358,27 @@ def main():
cells
[
xlsx_cols
[
1
+
((
nb_tools
+
1
)
*
3
)
+
2
]
+
str
(
i
+
nb_records
+
3
)]
=
{
"
text
"
:
"
?????
"
,
"
format
"
:
{
"
bg_color
"
:
color_col_filter
}}
# Genotypes:
if
record
[
"
tools
"
][
tool
][
"
genotypes
"
]
is
not
None
:
the_genotypes
=
record
[
"
tools
"
][
tool
][
"
genotypes
"
]
for
gt
in
range
(
0
,
len
(
the_genotypes
)):
true_gt
=
my_genotypes
[
gt
]
geno
=
get_gt
(
the_genotypes
[
gt
],
true_gt
)
gt_format
=
{
"
bg_color
"
:
color_col_filter
}
if
geno
!=
my_genotypes
[
gt
]:
gt_format
[
"
font_color
"
]
=
"
#ff0000
"
cells_gt
[
xlsx_cols
[
1
+
((
nb_tools
+
1
)
*
nb_inds
)
+
gt
]
+
str
(
i
)]
=
\
{
"
text
"
:
geno
,
"
format
"
:
gt_format
}
else
:
sv_format
=
{}
# Raw values:
cells
[
xlsx_cols
[
j
]
+
str
(
i
)]
=
{
"
text
"
:
record
[
"
tools
"
][
tool
][
"
start
"
],
"
format
"
:
sv_format
}
max_col_len
[
j
]
=
get_max_len
(
record
[
"
tools
"
][
tool
][
"
start
"
],
j
,
max_col_len
)
cells
[
xlsx_cols
[
j
+
1
]
+
str
(
i
)]
=
{
"
text
"
:
record
[
"
tools
"
][
tool
][
"
end
"
],
"
format
"
:
sv_format
}
max_col_len
[
j
+
1
]
=
get_max_len
(
record
[
"
tools
"
][
tool
][
"
end
"
],
j
+
1
,
max_col_len
)
cells
[
xlsx_cols
[
j
+
2
]
+
str
(
i
)]
=
{
"
text
"
:
record
[
"
tools
"
][
tool
][
"
length
"
],
"
format
"
:
sv_format
}
max_col_len
[
j
+
2
]
=
get_max_len
(
record
[
"
tools
"
][
tool
][
"
length
"
],
j
+
2
,
max_col_len
)
# Diffs :
if
my_start
!=
""
:
...
...
@@ -299,19 +389,36 @@ def main():
cells
[
xlsx_cols
[
j
+
2
]
+
str
(
i
+
nb_records
+
3
)]
=
{
"
text
"
:
record
[
"
tools
"
][
tool
][
"
length
"
]
-
my_length
,
"
format
"
:
sv_format
}
else
:
cells
[
xlsx_cols
[
j
]
+
str
(
i
+
nb_records
+
3
)]
=
{
"
text
"
:
"
?????
"
,
"
format
"
:
sv_format
}
cells
[
xlsx_cols
[
j
+
1
]
+
str
(
i
+
nb_records
+
3
)]
=
{
"
text
"
:
"
?????
"
,
"
format
"
:
sv_format
}
cells
[
xlsx_cols
[
j
+
2
]
+
str
(
i
+
nb_records
+
3
)]
=
{
"
text
"
:
"
?????
"
,
"
format
"
:
sv_format
}
cells
[
xlsx_cols
[
j
]
+
str
(
i
+
nb_records
+
3
)]
=
{
"
text
"
:
"
NA
"
,
"
format
"
:
sv_format
}
cells
[
xlsx_cols
[
j
+
1
]
+
str
(
i
+
nb_records
+
3
)]
=
{
"
text
"
:
"
NA
"
,
"
format
"
:
sv_format
}
cells
[
xlsx_cols
[
j
+
2
]
+
str
(
i
+
nb_records
+
3
)]
=
{
"
text
"
:
"
NA
"
,
"
format
"
:
sv_format
}
# Genotypes:
if
record
[
"
tools
"
][
tool
][
"
genotypes
"
]
is
not
None
:
the_genotypes
=
record
[
"
tools
"
][
tool
][
"
genotypes
"
]
for
gt
in
range
(
0
,
len
(
the_genotypes
)):
true_gt
=
my_genotypes
[
gt
]
geno
=
get_gt
(
the_genotypes
[
gt
],
true_gt
)
gt_format
=
sv_format
.
copy
()
if
geno
!=
true_gt
:
gt_format
[
"
font_color
"
]
=
"
#ff0000
"
cells_gt
[
xlsx_cols
[
g
+
gt
]
+
str
(
i
)]
=
\
{
"
text
"
:
geno
,
"
format
"
:
gt_format
}
else
:
for
k
in
range
(
0
,
3
):
cells
[
xlsx_cols
[
j
+
k
]
+
str
(
i
)]
=
{
"
text
"
:
""
,
"
format
"
:
{
"
bg_color
"
:
color_not_found
}}
cells
[
xlsx_cols
[
j
+
k
]
+
str
(
i
+
nb_records
+
3
)]
=
{
"
text
"
:
""
,
"
format
"
:
{
"
bg_color
"
:
color_not_found
}}
# Genotype:
for
gt
in
range
(
0
,
nb_inds
):
cells_gt
[
xlsx_cols
[
g
+
gt
]
+
str
(
i
)]
=
{
"
text
"
:
""
,
"
format
"
:
{
"
bg_color
"
:
"
#000000
"
}}
j
+=
3
g
+=
nb_inds
if
is_kept
:
cells
[
xlsx_cols
[
0
]
+
str
(
i
)][
"
format
"
][
"
bg_color
"
]
=
\
cells
[
xlsx_cols
[
0
]
+
str
(
i
+
nb_records
+
3
)][
"
format
"
][
"
bg_color
"
]
=
color_is_kept
el
s
e
:
el
if
filtered_records
is
not
Non
e
:
cells
[
xlsx_cols
[
1
+
((
nb_tools
+
1
)
*
3
)]
+
str
(
i
)]
=
{
"
text
"
:
""
,
"
format
"
:
{
"
bg_color
"
:
color_not_found
}}
cells
[
xlsx_cols
[
1
+
((
nb_tools
+
1
)
*
3
)
+
1
]
+
str
(
i
)]
=
{
"
text
"
:
""
,
"
format
"
:
{
"
bg_color
"
:
color_not_found
}}
cells
[
xlsx_cols
[
1
+
((
nb_tools
+
1
)
*
3
)
+
2
]
+
str
(
i
)]
=
{
"
text
"
:
""
,
"
format
"
:
{
"
bg_color
"
:
color_not_found
}}
...
...
@@ -322,6 +429,11 @@ def main():
cells
[
xlsx_cols
[
1
+
((
nb_tools
+
1
)
*
3
)
+
2
]
+
str
(
i
+
nb_records
+
3
)]
=
{
"
text
"
:
""
,
"
format
"
:
{
"
bg_color
"
:
color_not_found
}}
# Genotype:
for
gt
in
range
(
0
,
nb_inds
):
cells_gt
[
xlsx_cols
[
1
+
((
nb_tools
+
1
)
*
nb_inds
)
+
gt
]
+
str
(
i
)]
=
{
"
text
"
:
""
,
"
format
"
:
{
"
bg_color
"
:
"
#000000
"
}}
# False positives (orphans) in orange:
if
re
.
match
(
r
"
^orphan_\d+$
"
,
id
):
cells
[
xlsx_cols
[
0
]
+
str
(
i
)][
"
format
"
][
"
bg_color
"
]
=
\
...
...
@@ -331,9 +443,11 @@ def main():
# Create document:
with
xlsxwriter
.
Workbook
(
args
.
output
)
as
workbook
:
worksheet
=
workbook
.
add_worksheet
()
worksheet
=
workbook
.
add_worksheet
(
"
SVs
"
)
i
=
0
max_header_length
=
0
for
header
in
headers
:
max_header_length
=
max
(
len
(
header
),
max_header_length
)
if
i
<
len
(
headers
)
-
1
or
filtered_records
is
None
:
merge_format
=
workbook
.
add_format
({
'
align
'
:
'
center
'
})
else
:
...
...
@@ -347,6 +461,29 @@ def main():
cell_format
=
workbook
.
add_format
(
cell_content
[
"
format
"
])
worksheet
.
write
(
cell_id
,
cell_content
[
"
text
"
],
cell_format
)
# Resize columns:
for
col
,
max_len
in
max_col_len
.
items
():
print
(
col
,
max_len
)
worksheet
.
set_column
(
col
,
col
,
max_len
+
1
)
# Genotypes:
if
args
.
genotypes
:
worksheet_gt
=
workbook
.
add_worksheet
(
"
Genotypes
"
)
i
=
0
for
header
in
headers
:
if
i
<
len
(
headers
)
-
1
or
filtered_records
is
None
:
merge_format
=
workbook
.
add_format
({
'
align
'
:
'
center
'
})
else
:
merge_format
=
workbook
.
add_format
({
'
align
'
:
'
center
'
,
'
bg_color
'
:
color_col_filter
})
worksheet_gt
.
merge_range
(
xlsx_cols
[
1
+
i
*
nb_inds
]
+
"
1:
"
+
xlsx_cols
[
nb_inds
+
i
*
nb_inds
]
+
"
1
"
,
header
,
merge_format
)
i
+=
1
for
cell_id
,
cell_content
in
cells_gt
.
items
():
cell_format
=
workbook
.
add_format
(
cell_content
[
"
format
"
])
worksheet_gt
.
write
(
cell_id
,
cell_content
[
"
text
"
],
cell_format
)
worksheet_gt
.
freeze_panes
(
0
,
9
)
worksheet_gt
.
set_column
(
0
,
0
,
max_col_len
[
0
]
+
1
)
print
(
""
)
print
(
"
###########
"
)
print
(
"
# RESULTS #
"
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment