Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
svlib
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Admin message
A compter du 1er avril, attention à vos pipelines :
Nouvelles limitations de Docker Hub
Show more breadcrumbs
SVdetection
svlib
Commits
b0069bcf
Commit
b0069bcf
authored
5 years ago
by
Thomas Faraut
Browse files
Options
Downloads
Patches
Plain Diff
a more pep8 compliant code
parent
28cadf8a
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
svreader/annotation.py
+57
-59
57 additions, 59 deletions
svreader/annotation.py
with
57 additions
and
59 deletions
svreader/annotation.py
+
57
−
59
View file @
b0069bcf
...
@@ -53,6 +53,7 @@ class AnnotateRecord(VCFRecord):
...
@@ -53,6 +53,7 @@ class AnnotateRecord(VCFRecord):
"""
"""
A lightweight object to annotated the final records
A lightweight object to annotated the final records
"""
"""
def
__init__
(
self
,
record
):
def
__init__
(
self
,
record
):
"""
"""
A pysam VariantRecord wrapper
A pysam VariantRecord wrapper
...
@@ -128,7 +129,7 @@ class AnnotateRecord(VCFRecord):
...
@@ -128,7 +129,7 @@ class AnnotateRecord(VCFRecord):
def
maxGQ
(
self
):
def
maxGQ
(
self
):
return
max
(
self
.
GQ_samples
())
return
max
(
self
.
GQ_samples
())
def
set
Q
ual
(
self
):
def
set
_q
ual
(
self
):
self
.
record
.
qual
=
self
.
qual
()
self
.
record
.
qual
=
self
.
qual
()
def
numdiffGenotypes
(
self
):
def
numdiffGenotypes
(
self
):
...
@@ -138,10 +139,10 @@ class AnnotateRecord(VCFRecord):
...
@@ -138,10 +139,10 @@ class AnnotateRecord(VCFRecord):
genotypes
[
coded_geno
(
s
.
get
(
'
GT
'
))]
=
1
genotypes
[
coded_geno
(
s
.
get
(
'
GT
'
))]
=
1
return
len
(
genotypes
.
keys
())
return
len
(
genotypes
.
keys
())
def
P
olymorph
(
self
):
def
p
olymorph
(
self
):
return
self
.
numdiffGenotypes
()
>
1
return
self
.
numdiffGenotypes
()
>
1
def
A
dd
S
upport
I
nfos
(
self
):
def
a
dd
_s
upport
ing_i
nfos
(
self
):
supp_reads
=
self
.
variant_read_support
()
supp_reads
=
self
.
variant_read_support
()
num_supp_samples
=
self
.
num_variant_samples
()
num_supp_samples
=
self
.
num_variant_samples
()
try
:
try
:
...
@@ -151,25 +152,25 @@ class AnnotateRecord(VCFRecord):
...
@@ -151,25 +152,25 @@ class AnnotateRecord(VCFRecord):
eprint
(
"
SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys
"
)
eprint
(
"
SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys
"
)
sys
.
exit
(
1
)
sys
.
exit
(
1
)
def
C
all
R
ate
(
self
,
cutoff
):
def
c
all
_r
ate
(
self
,
cutoff
):
call_qual
=
[]
call_qual
=
[]
for
s
in
self
.
samples
.
values
():
for
s
in
self
.
samples
.
values
():
if
s
.
get
(
'
GQ
'
)
is
not
None
:
if
s
.
get
(
'
GQ
'
)
is
not
None
:
call_qual
.
append
(
s
.
get
(
'
GQ
'
))
call_qual
.
append
(
s
.
get
(
'
GQ
'
))
num_qual_call
=
sum
([(
qual
>
cutoff
)
for
qual
in
call_qual
])
num_qual_call
=
sum
([(
qual
>
cutoff
)
for
qual
in
call_qual
])
return
num_qual_call
/
self
.
num_samples
return
num_qual_call
/
self
.
num_samples
def
V
ariant
C
all
R
ate
(
self
,
cutoff
):
def
v
ariant
_c
all
_r
ate
(
self
,
cutoff
):
samples
=
self
.
samples
.
values
()
samples
=
self
.
samples
.
values
()
num_qual_var
=
0
num_qual_var
=
0
for
s
in
samples
:
for
s
in
samples
:
if
s
.
get
(
"
GQ
"
)
is
not
None
and
s
.
get
(
"
GQ
"
)
>
cutoff
and
Variant
(
s
):
if
s
.
get
(
"
GQ
"
)
is
not
None
and
s
.
get
(
"
GQ
"
)
>
cutoff
and
Variant
(
s
):
num_qual_var
+=
1
num_qual_var
+=
1
num_var_samples
=
self
.
num_variant_samples
()
num_var_samples
=
self
.
num_variant_samples
()
var_call_rate
=
num_qual_var
/
num_var_samples
if
num_var_samples
else
0
var_call_rate
=
num_qual_var
/
num_var_samples
if
num_var_samples
else
0
return
var_call_rate
return
var_call_rate
def
U
nif
iedPass
(
self
):
def
u
nif
y_pass_filtertag
(
self
):
"""
"""
All records passing the filters (PASS, .) ar now labelled PASS
All records passing the filters (PASS, .) ar now labelled PASS
"""
"""
...
@@ -203,17 +204,6 @@ class AnnotateReader(VCFReader):
...
@@ -203,17 +204,6 @@ class AnnotateReader(VCFReader):
def
getHeader
(
self
):
def
getHeader
(
self
):
return
self
.
vcf_reader
.
header
return
self
.
vcf_reader
.
header
# def addInfo(self, name, number, type, description):
# self.vcf_reader.header.info.add(id=name,
# number=number,
# type=type,
# description=description)
#
# def addFilter(self, name, description):
# self.vcf_reader.header.filters.add(id=name,
# number=None,
# type=None,
# description=description)
def
add_annotation_metadata
(
self
):
def
add_annotation_metadata
(
self
):
self
.
addInfo
(
"
SOURCEID
"
,
1
,
"
String
"
,
self
.
addInfo
(
"
SOURCEID
"
,
1
,
"
String
"
,
...
@@ -290,7 +280,7 @@ def probas(likelihoods):
...
@@ -290,7 +280,7 @@ def probas(likelihoods):
def
getprobas
(
sample
):
def
getprobas
(
sample
):
# Transforming likelihods into probabilities
# Transforming likelihods into probabilities
return
probas
(
getlikelihoods
(
sample
))
/
np
.
sum
(
probas
(
getlikelihoods
(
sample
)))
return
probas
(
getlikelihoods
(
sample
))
/
np
.
sum
(
probas
(
getlikelihoods
(
sample
)))
def
ondiagonal
(
u_s
,
v_s
):
def
ondiagonal
(
u_s
,
v_s
):
...
@@ -301,7 +291,7 @@ def ondiagonal(u_s, v_s):
...
@@ -301,7 +291,7 @@ def ondiagonal(u_s, v_s):
q
=
getprobas
(
v_s
)
q
=
getprobas
(
v_s
)
proba
=
0
proba
=
0
for
a
,
b
in
zip
(
p
,
q
):
for
a
,
b
in
zip
(
p
,
q
):
proba
+=
a
*
b
proba
+=
a
*
b
# print("Proba on %3.5f" %(proba))
# print("Proba on %3.5f" %(proba))
return
proba
return
proba
...
@@ -315,7 +305,7 @@ def offdiagonal(u_s, v_s):
...
@@ -315,7 +305,7 @@ def offdiagonal(u_s, v_s):
for
i
,
a
in
enumerate
(
p
):
for
i
,
a
in
enumerate
(
p
):
for
j
,
b
in
enumerate
(
q
):
for
j
,
b
in
enumerate
(
q
):
if
i
!=
j
:
if
i
!=
j
:
proba
+=
a
*
b
proba
+=
a
*
b
# print("Proba off %3.2f" %(proba))
# print("Proba off %3.2f" %(proba))
return
proba
return
proba
...
@@ -345,35 +335,41 @@ def duplicatescore(u, v):
...
@@ -345,35 +335,41 @@ def duplicatescore(u, v):
if
offdiago
>
max_disc
:
if
offdiago
>
max_disc
:
max_disc
=
offdiago
max_disc
=
offdiago
if
max_disc
>
0
and
max_disc
<
1
:
if
max_disc
>
0
and
max_disc
<
1
:
ratio
=
(
1
-
max_disc
)
/
max_disc
ratio
=
(
1
-
max_disc
)
/
max_disc
computed
=
np
.
log
(
ratio
)
computed
=
np
.
log
(
ratio
)
return
computed
return
computed
def
gstrength
(
u
):
def
gstrength
(
u
):
# Sum of phred-like genotype qualities provides a measure of the
"""
# combined genotype quality of the site
Sum of phred-like genotype qualities provides a measure of the
# np.sum([s['GQ'] if s['GQ'] is not None else 0 for s in u.samples.values()])
combined genotype quality of the site
np.sum([s[
'
GQ
'
] if s[
'
GQ
'
] is not None else 0 for s in u.samples.values()])
"""
return
u
.
GQ_sum_score
()
return
u
.
GQ_sum_score
()
def
variantstrength
(
u
):
def
variantstrength
(
u
):
# maximum SQ, where SQ stands for
"""
# Phred-scaled probability that this site is variant (non-reference)
maximum SQ, where SQ stands for
# in this sample)
Phred-scaled probability that this site is variant (non-reference)
# QUAL = -10 * log(P(locus is reference in all samples)), which is
in this sample)
# equal to the sum of the SQ scores.
QUAL = -10 * log(P(locus is reference in all samples)), which is
# see https://github.com/hall-lab/svtyper/issues/10
equal to the sum of the SQ scores.
# sum([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
see https://github.com/hall-lab/svtyper/issues/10
sum([s[
'
SQ
'
] if s[
'
SQ
'
] is not None else 0 for s in u.samples.values()])
"""
return
u
.
qual
()
return
u
.
qual
()
# max([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
# max([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
def
getduplicates_GQ
(
u
,
v
):
def
getduplicates_GQ
(
u
,
v
):
# select the prefered duplicate on the basis of the
"""
# Sum of phred-like genotype qualities
select the prefered duplicate on the basis of the
# see gstrength
Sum of phred-like genotype qualities
# returns prefered, discarded, strength of both
see gstrength
returns prefered, discarded, strength of both
"""
if
gstrength
(
u
)
>
gstrength
(
v
):
if
gstrength
(
u
)
>
gstrength
(
v
):
return
(
u
,
v
,
gstrength
(
u
),
gstrength
(
v
))
return
(
u
,
v
,
gstrength
(
u
),
gstrength
(
v
))
else
:
else
:
...
@@ -381,10 +377,12 @@ def getduplicates_GQ(u, v):
...
@@ -381,10 +377,12 @@ def getduplicates_GQ(u, v):
def
getduplicates_QUAL
(
u
,
v
):
def
getduplicates_QUAL
(
u
,
v
):
# select the prefered duplicate on the basis of
"""
# Phred-scaled probability that this site is a variant
select the prefered duplicate on the basis of
# see variantstrength
Phred-scaled probability that this site is a variant
# returns prefered, discarded, strength of both
see variantstrength
returns prefered, discarded, strength of both
"""
if
variantstrength
(
u
)
>
variantstrength
(
v
):
if
variantstrength
(
u
)
>
variantstrength
(
v
):
return
(
u
,
v
,
variantstrength
(
u
),
variantstrength
(
v
))
return
(
u
,
v
,
variantstrength
(
u
),
variantstrength
(
v
))
else
:
else
:
...
@@ -393,7 +391,7 @@ def getduplicates_QUAL(u, v):
...
@@ -393,7 +391,7 @@ def getduplicates_QUAL(u, v):
def
getoverlap
(
u
,
osize
):
def
getoverlap
(
u
,
osize
):
# percentage overlap given the size of the overlap
# percentage overlap given the size of the overlap
return
100
*
osize
/
u
.
svlen
return
100
*
osize
/
u
.
svlen
def
add_redundancy_infos_header
(
reader
):
def
add_redundancy_infos_header
(
reader
):
...
@@ -418,9 +416,8 @@ def add_redundancy_infos_header(reader):
...
@@ -418,9 +416,8 @@ def add_redundancy_infos_header(reader):
"
Tools supporting (detecting) the sv
"
)
"
Tools supporting (detecting) the sv
"
)
def
GenomeSTRIPLikeRedundancyAnnotator
(
SVSet
,
reader
,
def
redundancy_annotator
(
SVSet
,
reader
,
duplicatescore_threshold
=-
2
,
duplicatescore_threshold
=-
2
,
genotyper
=
"
svtyper
"
):
genotyper
=
"
svtyper
"
):
"""
Annotating duplicate candidates based on the genotype likelihoods
"""
Annotating duplicate candidates based on the genotype likelihoods
- genotype likelihoods can be provided by svtyper or genomestrip
- genotype likelihoods can be provided by svtyper or genomestrip
"""
"""
...
@@ -452,7 +449,7 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
...
@@ -452,7 +449,7 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
score
=
duplicatescore
(
u
,
v
)
score
=
duplicatescore
(
u
,
v
)
# print("Comparing %s and %s : %3.8f" % (u.id, v.id, score))
# print("Comparing %s and %s : %3.8f" % (u.id, v.id, score))
if
score
>
duplicatescore_threshold
:
if
score
>
duplicatescore_threshold
:
ref
,
dupli
,
s1
,
s2
=
getduplicates_GQ
(
u
,
v
)
ref
,
dupli
,
_
,
_
=
getduplicates_GQ
(
u
,
v
)
# print("%s prefered to %s %3.8f" % (ref.id, dupli.id, score))
# print("%s prefered to %s %3.8f" % (ref.id, dupli.id, score))
reference
[
ref
]
=
1
reference
[
ref
]
=
1
overlap_size
=
int
(
o
[
-
1
])
overlap_size
=
int
(
o
[
-
1
])
...
@@ -523,7 +520,7 @@ def add_filter_infos_header(reader):
...
@@ -523,7 +520,7 @@ def add_filter_infos_header(reader):
reader
.
addFilter
(
"
ABFREQ
"
,
"
AB frequency <0.3 for >50% heterosamples
"
)
reader
.
addFilter
(
"
ABFREQ
"
,
"
AB frequency <0.3 for >50% heterosamples
"
)
def
GenomeSTRIPLikefiltering
(
SVSet
,
reader
):
def
variant_filtration
(
SVSet
,
reader
):
"""
Filtering the candidate CNVs according to the following criteria
"""
Filtering the candidate CNVs according to the following criteria
- non duplicate sites
- non duplicate sites
- variant sites
- variant sites
...
@@ -539,11 +536,11 @@ def GenomeSTRIPLikefiltering(SVSet, reader):
...
@@ -539,11 +536,11 @@ def GenomeSTRIPLikefiltering(SVSet, reader):
for
sv
in
SVSet
:
for
sv
in
SVSet
:
info
=
sv
.
record
.
info
info
=
sv
.
record
.
info
sv
.
record
.
info
[
'
CALLRATE
'
]
=
sv
.
C
all
R
ate
(
13
)
sv
.
record
.
info
[
'
CALLRATE
'
]
=
sv
.
c
all
_r
ate
(
13
)
sv
.
record
.
info
[
'
VARIANTCALLRATE
'
]
=
sv
.
V
ariant
C
all
R
ate
(
13
)
sv
.
record
.
info
[
'
VARIANTCALLRATE
'
]
=
sv
.
v
ariant
_c
all
_r
ate
(
13
)
if
sv
.
C
all
R
ate
(
13
)
<
0.75
:
if
sv
.
c
all
_r
ate
(
13
)
<
0.75
:
sv
.
filter
.
add
(
"
CALLRATE
"
)
sv
.
filter
.
add
(
"
CALLRATE
"
)
if
not
sv
.
P
olymorph
():
if
not
sv
.
p
olymorph
():
sv
.
filter
.
add
(
"
MONOMORPH
"
)
sv
.
filter
.
add
(
"
MONOMORPH
"
)
if
'
NONDUPLICATEOVERLAP
'
in
info
and
info
[
'
NONDUPLICATEOVERLAP
'
]
>
0.7
:
if
'
NONDUPLICATEOVERLAP
'
in
info
and
info
[
'
NONDUPLICATEOVERLAP
'
]
>
0.7
:
sv
.
filter
.
add
(
"
OVERLAP
"
)
sv
.
filter
.
add
(
"
OVERLAP
"
)
...
@@ -551,21 +548,22 @@ def GenomeSTRIPLikefiltering(SVSet, reader):
...
@@ -551,21 +548,22 @@ def GenomeSTRIPLikefiltering(SVSet, reader):
sv
.
filter
.
add
(
"
DUPLICATE
"
)
sv
.
filter
.
add
(
"
DUPLICATE
"
)
def
AB
FreqF
iltering
(
SVS
et
):
def
AB
_f
iltering
(
variant_s
et
):
"""
Filtering the candidate CNVs according to the following criteria
"""
Filtering the candidate CNVs according to the following criteria
- more than 50% of variant samples should have AB freq > 0.3
- more than 50% of variant samples should have AB freq > 0.3
"""
"""
for
sv
in
SVS
et
:
for
sv
in
variant_s
et
:
ABfreq
OK
=
[]
valid_
AB
_
freq
=
[]
for
s
in
sv
.
record
.
samples
.
values
():
for
s
in
sv
.
record
.
samples
.
values
():
if
Heterozygote
(
s
):
if
Heterozygote
(
s
):
ABfreqOK
.
append
((
s
.
get
(
'
AB
'
)[
0
]
>
0.3
))
valid_AB_freq
.
append
((
s
.
get
(
'
AB
'
)[
0
]
>
0.3
))
if
len
(
ABfreqOK
)
>
0
and
sum
(
ABfreqOK
)
<
len
(
ABfreqOK
)
/
2
:
if
(
len
(
valid_AB_freq
)
>
0
and
sum
(
valid_AB_freq
)
<
len
(
valid_AB_freq
)
/
2
):
sv
.
filter
.
add
(
"
ABFREQ
"
)
sv
.
filter
.
add
(
"
ABFREQ
"
)
def
G
et
C
onnected
D
uplicates
(
SVSet
):
def
g
et
_c
onnected
_d
uplicates
(
SVSet
):
"""
"""
Construct connected components of duplicates and rename the variants
Construct connected components of duplicates and rename the variants
"""
"""
...
@@ -606,7 +604,7 @@ def get_tool_name(sv_ident):
...
@@ -606,7 +604,7 @@ def get_tool_name(sv_ident):
return
sv_ident
.
split
(
"
_
"
)[
0
]
return
sv_ident
.
split
(
"
_
"
)[
0
]
def
S
et
S
upporting
T
ools
(
SVSet
):
def
s
et
_s
upporting
_t
ools
(
SVSet
):
for
sv
in
SVSet
:
for
sv
in
SVSet
:
tools
=
{
get_tool_name
(
sv
.
id
)}
tools
=
{
get_tool_name
(
sv
.
id
)}
if
"
DUPLICATES
"
in
sv
.
record
.
info
:
if
"
DUPLICATES
"
in
sv
.
record
.
info
:
...
@@ -642,7 +640,7 @@ def rename_info_field(sv, key, sv_dict):
...
@@ -642,7 +640,7 @@ def rename_info_field(sv, key, sv_dict):
sv
.
record
.
info
[
key
]
=
info_newid
sv
.
record
.
info
[
key
]
=
info_newid
def
R
ename
SV
(
SVSet
):
def
r
ename
_variants
(
SVSet
):
sv_dict
=
defaultdict
()
sv_dict
=
defaultdict
()
for
sv
in
SVSet
:
for
sv
in
SVSet
:
new_id
=
new_id_str
(
sv
)
new_id
=
new_id_str
(
sv
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment