Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
popsim
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
SVdetection
popsim
Commits
09249c14
Commit
09249c14
authored
7 years ago
by
Floreal Cabanettes
Browse files
Options
Downloads
Patches
Plain Diff
Allow several filter and genotype files
parent
4461fed1
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
build_results.py
+37
-33
37 additions, 33 deletions
build_results.py
with
37 additions
and
33 deletions
build_results.py
+
37
−
33
View file @
09249c14
...
...
@@ -50,8 +50,8 @@ description: Build results of the simulated data detection")
parser
.
add_argument
(
'
-v
'
,
'
--vcf
'
,
type
=
str
,
required
=
True
,
help
=
'
folder containing all vcf results files
'
)
parser
.
add_argument
(
'
-t
'
,
'
--true-vcf
'
,
type
=
str
,
required
=
True
,
help
=
'
VCF file containing the simulated deletions
'
)
parser
.
add_argument
(
'
-f
'
,
'
--filtered-vcf
'
,
type
=
str
,
required
=
False
,
help
=
'
VCF file containing the filtered results
'
)
parser
.
add_argument
(
'
-g
'
,
'
--genotypes
'
,
type
=
str
,
help
=
"
VCF file containing genotypes
"
)
help
=
'
VCF file containing the filtered results
'
,
nargs
=
'
+
'
)
parser
.
add_argument
(
'
-g
'
,
'
--genotypes
'
,
type
=
str
,
help
=
"
VCF file containing genotypes
"
,
nargs
=
'
+
'
)
parser
.
add_argument
(
'
--overlap_cutoff
'
,
type
=
float
,
default
=
0.5
,
help
=
'
cutoff for reciprocal overlap
'
)
parser
.
add_argument
(
'
--left_precision
'
,
type
=
int
,
default
=-
1
,
help
=
'
left breakpoint precision
'
)
parser
.
add_argument
(
'
--right_precision
'
,
type
=
int
,
default
=-
1
,
help
=
'
right breakpoint precision
'
)
...
...
@@ -162,7 +162,7 @@ def get_quality_color(quality):
return
color_very_low_quality
def
get_genotypes
(
genotypes_file
,
true_vcf_file
):
def
get_genotypes
(
genotypes_file
s
,
true_vcf_file
):
"""
Get genotype of each individual for each SV
:param genotypes_file: VCF file containing genotypes
...
...
@@ -182,12 +182,13 @@ def get_genotypes(genotypes_file, true_vcf_file):
nb_inds
=
len
(
list
(
genotypes
.
values
())[
0
])
# Samples:
reader
=
VariantFile
(
genotypes_file
)
for
rec
in
reader
:
samples
=
rec
.
samples
genotypes
[
rec
.
id
]
=
[
"
/
"
.
join
(
map
(
str
,
samples
[
x
][
"
GT
"
]))
for
x
in
samples_t
.
keys
()]
# Fixed: use samples keys
# from real data to keep the same order
gt_quality
[
rec
.
id
]
=
[
samples
[
x
][
"
GQ
"
]
for
x
in
samples_t
.
keys
()]
for
genotypes_file
in
genotypes_files
:
reader
=
VariantFile
(
genotypes_file
)
for
rec
in
reader
:
samples
=
rec
.
samples
genotypes
[
rec
.
id
]
=
[
"
/
"
.
join
(
map
(
str
,
samples
[
x
][
"
GT
"
]))
for
x
in
samples_t
.
keys
()]
# Fixed: use samples keys
# from real data to keep the same order
gt_quality
[
rec
.
id
]
=
[
samples
[
x
][
"
GQ
"
]
for
x
in
samples_t
.
keys
()]
return
genotypes
,
gt_quality
,
nb_inds
...
...
@@ -625,14 +626,14 @@ def search_vcf_files(my_folder):
return
vcf_files
def
print_results
(
nb_records
,
orphans
,
with_xlsx
,
output
,
genotype
s_file
):
def
print_results
(
nb_records
,
orphans
,
with_xlsx
,
output
,
do_
genotype
):
"""
Print list of outputs
:param nb_records: number of records {int}
:param orphans: sv found in tools but not present in real data {dict}
:param with_xlsx: build xlsx file {bool}
:param output: output prefix {str}
:param genotype
s_file: genotypes file {str
}
:param
do_
genotype
: do the genotype {bool
}
"""
print
(
""
)
print
(
"
###########
"
)
...
...
@@ -650,13 +651,13 @@ def print_results(nb_records, orphans, with_xlsx, output, genotypes_file):
print
(
"
TSV files:
"
)
print
(
"
\t
-
"
+
output
+
"
_sv_per_tools.tsv
"
)
print
(
"
\t
-
"
+
output
+
"
_sv_diffs_per_tools.tsv
"
)
if
genotype
s_file
:
if
do_
genotype
:
print
(
"
\t
-
"
+
output
+
"
_sv_genotypes_per_tools.tsv
"
)
print
(
"
\t
-
"
+
output
+
"
_sv_genotypes_quality_per_tools.tsv
"
)
print
(
""
)
def
fill_cells_no_tools
(
cells
,
cells_gt
,
cells_gq
,
i
,
j
,
g
,
nb_records
,
nb_inds
,
genotype
s_file
):
def
fill_cells_no_tools
(
cells
,
cells_gt
,
cells_gq
,
i
,
j
,
g
,
nb_records
,
nb_inds
,
do_
genotype
):
"""
Fill cells when a tool does not detect a SV
:param cells: cells definition
...
...
@@ -667,7 +668,7 @@ def fill_cells_no_tools(cells, cells_gt, cells_gq, i, j, g, nb_records, nb_inds,
:param g: column index for genotypes and genotypes quality tables
:param nb_records: total number of records
:param nb_inds: total number of individuals
:param genotype
s_file: VCF file containing
genotypes
:param
do_
genotype
: do the
genotypes
:return: cells, completed
"""
for
k
in
range
(
0
,
3
):
...
...
@@ -677,7 +678,7 @@ def fill_cells_no_tools(cells, cells_gt, cells_gq, i, j, g, nb_records, nb_inds,
"
format
"
:
{
"
bg_color
"
:
COLOR_NOT_FOUND
}}
# Genotype:
if
genotype
s_file
:
if
do_
genotype
:
for
gt
in
range
(
0
,
nb_inds
):
# noinspection PyUnresolvedReferences
cells_gt
[
XLSX_COLS
[
g
+
gt
]
+
str
(
i
)]
=
cells_gq
[
XLSX_COLS
[
g
+
gt
]
+
str
(
i
)]
=
\
...
...
@@ -686,7 +687,7 @@ def fill_cells_no_tools(cells, cells_gt, cells_gq, i, j, g, nb_records, nb_inds,
def
apply_style_of_filter_cells
(
cells
,
cells_gt
,
cells_gq
,
i
,
is_kept
,
nb_records
,
nb_inds
,
nb_tools
,
filtered_records
,
genotype
s_file
,
rec_id
):
do_
genotype
,
rec_id
):
"""
Apply style of cells
:param cells: cells of the default table {dict}
...
...
@@ -698,7 +699,7 @@ def apply_style_of_filter_cells(cells, cells_gt, cells_gq, i, is_kept, nb_record
:param nb_inds: number of individuals {int}
:param nb_tools: number of tools {int}
:param filtered_records: file containing filtered records {str}
:param genotype
s_file: file containing
genotype
s
{
str
}
:param
do_
genotype
: do the
genotype {
bool
}
:param rec_id: id of the record {str}
:return:
"""
...
...
@@ -722,7 +723,7 @@ def apply_style_of_filter_cells(cells, cells_gt, cells_gq, i, is_kept, nb_record
"
text
"
:
""
,
"
format
"
:
{
"
bg_color
"
:
COLOR_NOT_FOUND
}}
# Genotype:
if
genotype
s_file
:
if
do_
genotype
:
# Color in gray in the filter column
for
gt
in
range
(
0
,
nb_inds
):
cells_gt
[
XLSX_COLS
[
2
+
((
nb_tools
+
1
)
*
nb_inds
)
+
gt
]
+
str
(
i
)]
=
\
...
...
@@ -739,7 +740,7 @@ def apply_style_of_filter_cells(cells, cells_gt, cells_gq, i, is_kept, nb_record
def
build_body_cells
(
rec_keys
,
records
,
nb_records
,
nb_inds
,
tools
,
cells
,
cells_gt
,
cells_gq
,
max_col_len
,
nb_tools
,
genotype
s_file
,
haploid
,
filtered_records
):
nb_tools
,
do_
genotype
,
haploid
,
filtered_records
):
i
=
3
for
rec_id
in
rec_keys
:
record
=
records
[
rec_id
]
...
...
@@ -776,7 +777,7 @@ def build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells, cells
record
[
"
tools
"
][
tool
],
nb_records
,
my_start
,
my_end
,
my_length
,
{
"
bg_color
"
:
COLOR_COL_FILTER
})
if
genotype
s_file
:
if
do_
genotype
:
# Genotype (sheets 2&3):
cells_gt
,
cells_gq
=
fill_genotypes_data
(
i
,
2
+
((
nb_tools
+
1
)
*
nb_inds
),
cells_gt
,
cells_gq
,
record
[
"
tools
"
][
tool
],
my_genotypes
,
haploid
)
...
...
@@ -792,7 +793,7 @@ def build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells, cells
record
[
"
tools
"
][
tool
],
nb_records
,
my_start
,
my_end
,
my_length
,
sv_format
)
if
genotype
s_file
:
if
do_
genotype
:
# Genotype (sheets 2&3):
cells_gt
,
cells_gq
=
fill_genotypes_data
(
i
,
g
,
cells_gt
,
cells_gq
,
record
[
"
tools
"
][
tool
],
my_genotypes
,
haploid
)
...
...
@@ -802,7 +803,7 @@ def build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells, cells
# TOOL DOES NOT DETECT THE SV #
###############################
cells
,
cells_gt
,
cells_gq
=
fill_cells_no_tools
(
cells
,
cells_gt
,
cells_gq
,
i
,
j
,
g
,
nb_records
,
nb_inds
,
genotype
s_file
)
do_
genotype
)
j
+=
3
g
+=
nb_inds
...
...
@@ -811,7 +812,7 @@ def build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells, cells
# Until we have filled all tools, check if the record is kept after filtering: #
###############################################################################
cells
,
cells_gt
,
cells_gq
=
apply_style_of_filter_cells
(
cells
,
cells_gt
,
cells_gq
,
i
,
is_kept
,
nb_records
,
nb_inds
,
nb_tools
,
filtered_records
,
genotype
s_file
,
nb_inds
,
nb_tools
,
filtered_records
,
do_
genotype
,
rec_id
)
i
+=
1
...
...
@@ -825,7 +826,7 @@ def build_xlsx_cols():
XLSX_COLS
.
append
(
alp
+
j
)
def
init
(
output
,
vcf_folder
,
true_vcf
,
filtered_vcf
=
None
,
genotypes_file
=
None
,
overlap_cutoff
=
0.5
,
def
init
(
output
,
vcf_folder
,
true_vcf
,
filtered_vcf
s
=
None
,
genotypes_file
s
=
None
,
overlap_cutoff
=
0.5
,
left_precision
=
sys
.
maxsize
,
right_precision
=
sys
.
maxsize
,
no_xls
=
False
,
haploid
=
False
):
build_xlsx_cols
()
...
...
@@ -834,8 +835,10 @@ def init(output, vcf_folder, true_vcf, filtered_vcf=None, genotypes_file=None, o
nb_inds
=
0
if
genotypes_file
:
genotypes
,
gt_quality
,
nb_inds
=
get_genotypes
(
genotypes_file
,
true_vcf
)
if
genotypes_files
:
genotypes
,
gt_quality
,
nb_inds
=
get_genotypes
(
genotypes_files
,
true_vcf
)
do_genotype
=
genotypes_files
is
not
None
filenames
=
search_vcf_files
(
vcf_folder
)
...
...
@@ -853,11 +856,12 @@ def init(output, vcf_folder, true_vcf, filtered_vcf=None, genotypes_file=None, o
eprint
(
"
Reading file %s
"
%
true_ones
)
sv_set_to
,
true_ones_records
=
read_vcf_file
(
true_ones
)
sv_set
+=
sv_set_to
filtered_records
=
None
filtered_records
=
[]
if
filtered_vcf
:
eprint
(
"
Reading file %s
"
%
filtered_vcf
)
filtered_records
=
read_vcf_file
(
filtered_vcf
)[
1
]
if
filtered_vcfs
:
for
filtered_vcf
in
filtered_vcfs
:
eprint
(
"
Reading file %s
"
%
filtered_vcf
)
filtered_records
+=
read_vcf_file
(
filtered_vcf
)[
1
]
# Compute connected components:
eprint
(
"
Computing Connected components
"
)
...
...
@@ -890,7 +894,7 @@ def init(output, vcf_folder, true_vcf, filtered_vcf=None, genotypes_file=None, o
# BUILD BODY CELLS #
####################
cells
,
cells_gt
,
cells_gq
,
max_col_len
=
build_body_cells
(
rec_keys
,
records
,
nb_records
,
nb_inds
,
tools
,
cells
,
cells_gt
,
cells_gq
,
max_col_len
,
nb_tools
,
genotype
s_file
,
cells_gt
,
cells_gq
,
max_col_len
,
nb_tools
,
do_
genotype
,
haploid
,
filtered_records
)
# Create document:
...
...
@@ -906,7 +910,7 @@ def init(output, vcf_folder, true_vcf, filtered_vcf=None, genotypes_file=None, o
create_tsv_file
(
output
+
"
_sv_diffs_per_tools.tsv
"
,
headers
,
cells
,
nb_tools
+
(
2
if
filtered_records
is
not
None
else
1
),
3
,
(
2
+
nb_records
+
3
,
nb_records
*
2
+
5
))
if
genotype
s_file
:
if
do_
genotype
:
create_tsv_file
(
output
+
"
_sv_genotypes_per_tools.tsv
"
,
headers
,
cells_gt
,
nb_tools
+
(
2
if
filtered_records
is
not
None
else
1
),
nb_inds
,
(
2
,
nb_records
+
2
))
...
...
@@ -914,7 +918,7 @@ def init(output, vcf_folder, true_vcf, filtered_vcf=None, genotypes_file=None, o
nb_tools
+
(
2
if
filtered_records
is
not
None
else
1
),
nb_inds
,
(
2
,
nb_records
+
2
))
print_results
(
nb_records
,
orphans
,
with_xlsx
,
output
,
genotype
s_file
)
print_results
(
nb_records
,
orphans
,
with_xlsx
,
output
,
do_
genotype
)
def
main
():
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment