Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
svlib
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Admin message
A compter du 1er avril, attention à vos pipelines :
Nouvelles limitations de Docker Hub
Show more breadcrumbs
SVdetection
svlib
Commits
b0069bcf
Commit
b0069bcf
authored
5 years ago
by
Thomas Faraut
Browse files
Options
Downloads
Patches
Plain Diff
a more pep8 compliant code
parent
28cadf8a
No related branches found
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
svreader/annotation.py
+57
-59
57 additions, 59 deletions
svreader/annotation.py
with
57 additions
and
59 deletions
svreader/annotation.py
+
57
−
59
View file @
b0069bcf
...
...
@@ -53,6 +53,7 @@ class AnnotateRecord(VCFRecord):
"""
A lightweight object to annotate the final records
"""
def
__init__
(
self
,
record
):
"""
A pysam VariantRecord wrapper
...
...
@@ -128,7 +129,7 @@ class AnnotateRecord(VCFRecord):
def
maxGQ
(
self
):
return
max
(
self
.
GQ_samples
())
def
set
Q
ual
(
self
):
def
set
_q
ual
(
self
):
self
.
record
.
qual
=
self
.
qual
()
def
numdiffGenotypes
(
self
):
...
...
@@ -138,10 +139,10 @@ class AnnotateRecord(VCFRecord):
genotypes
[
coded_geno
(
s
.
get
(
'
GT
'
))]
=
1
return
len
(
genotypes
.
keys
())
def
P
olymorph
(
self
):
def
p
olymorph
(
self
):
return
self
.
numdiffGenotypes
()
>
1
def
A
dd
S
upport
I
nfos
(
self
):
def
a
dd
_s
upport
ing_i
nfos
(
self
):
supp_reads
=
self
.
variant_read_support
()
num_supp_samples
=
self
.
num_variant_samples
()
try
:
...
...
@@ -151,25 +152,25 @@ class AnnotateRecord(VCFRecord):
eprint
(
"
SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys
"
)
sys
.
exit
(
1
)
def
C
all
R
ate
(
self
,
cutoff
):
def
c
all
_r
ate
(
self
,
cutoff
):
call_qual
=
[]
for
s
in
self
.
samples
.
values
():
if
s
.
get
(
'
GQ
'
)
is
not
None
:
call_qual
.
append
(
s
.
get
(
'
GQ
'
))
num_qual_call
=
sum
([(
qual
>
cutoff
)
for
qual
in
call_qual
])
return
num_qual_call
/
self
.
num_samples
return
num_qual_call
/
self
.
num_samples
def
V
ariant
C
all
R
ate
(
self
,
cutoff
):
def
v
ariant
_c
all
_r
ate
(
self
,
cutoff
):
samples
=
self
.
samples
.
values
()
num_qual_var
=
0
for
s
in
samples
:
if
s
.
get
(
"
GQ
"
)
is
not
None
and
s
.
get
(
"
GQ
"
)
>
cutoff
and
Variant
(
s
):
num_qual_var
+=
1
num_var_samples
=
self
.
num_variant_samples
()
var_call_rate
=
num_qual_var
/
num_var_samples
if
num_var_samples
else
0
var_call_rate
=
num_qual_var
/
num_var_samples
if
num_var_samples
else
0
return
var_call_rate
def
U
nif
iedPass
(
self
):
def
u
nif
y_pass_filtertag
(
self
):
"""
All records passing the filters (PASS, .) are now labelled PASS
"""
...
...
@@ -203,17 +204,6 @@ class AnnotateReader(VCFReader):
def
getHeader
(
self
):
return
self
.
vcf_reader
.
header
# def addInfo(self, name, number, type, description):
# self.vcf_reader.header.info.add(id=name,
# number=number,
# type=type,
# description=description)
#
# def addFilter(self, name, description):
# self.vcf_reader.header.filters.add(id=name,
# number=None,
# type=None,
# description=description)
def
add_annotation_metadata
(
self
):
self
.
addInfo
(
"
SOURCEID
"
,
1
,
"
String
"
,
...
...
@@ -290,7 +280,7 @@ def probas(likelihoods):
def
getprobas
(
sample
):
# Transforming likelihoods into probabilities
return
probas
(
getlikelihoods
(
sample
))
/
np
.
sum
(
probas
(
getlikelihoods
(
sample
)))
return
probas
(
getlikelihoods
(
sample
))
/
np
.
sum
(
probas
(
getlikelihoods
(
sample
)))
def
ondiagonal
(
u_s
,
v_s
):
...
...
@@ -301,7 +291,7 @@ def ondiagonal(u_s, v_s):
q
=
getprobas
(
v_s
)
proba
=
0
for
a
,
b
in
zip
(
p
,
q
):
proba
+=
a
*
b
proba
+=
a
*
b
# print("Proba on %3.5f" %(proba))
return
proba
...
...
@@ -315,7 +305,7 @@ def offdiagonal(u_s, v_s):
for
i
,
a
in
enumerate
(
p
):
for
j
,
b
in
enumerate
(
q
):
if
i
!=
j
:
proba
+=
a
*
b
proba
+=
a
*
b
# print("Proba off %3.2f" %(proba))
return
proba
...
...
@@ -345,35 +335,41 @@ def duplicatescore(u, v):
if
offdiago
>
max_disc
:
max_disc
=
offdiago
if
max_disc
>
0
and
max_disc
<
1
:
ratio
=
(
1
-
max_disc
)
/
max_disc
ratio
=
(
1
-
max_disc
)
/
max_disc
computed
=
np
.
log
(
ratio
)
return
computed
def
gstrength
(
u
):
# Sum of phred-like genotype qualities provides a measure of the
# combined genotype quality of the site
# np.sum([s['GQ'] if s['GQ'] is not None else 0 for s in u.samples.values()])
"""
Sum of phred-like genotype qualities provides a measure of the
combined genotype quality of the site
np.sum([s[
'
GQ
'
] if s[
'
GQ
'
] is not None else 0 for s in u.samples.values()])
"""
return
u
.
GQ_sum_score
()
def
variantstrength
(
u
):
# maximum SQ, where SQ stands for
# Phred-scaled probability that this site is variant (non-reference)
# in this sample)
# QUAL = -10 * log(P(locus is reference in all samples)), which is
# equal to the sum of the SQ scores.
# see https://github.com/hall-lab/svtyper/issues/10
# sum([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
"""
maximum SQ, where SQ stands for
Phred-scaled probability that this site is variant (non-reference)
in this sample)
QUAL = -10 * log(P(locus is reference in all samples)), which is
equal to the sum of the SQ scores.
see https://github.com/hall-lab/svtyper/issues/10
sum([s[
'
SQ
'
] if s[
'
SQ
'
] is not None else 0 for s in u.samples.values()])
"""
return
u
.
qual
()
# max([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
def
getduplicates_GQ
(
u
,
v
):
# select the preferred duplicate on the basis of the
# Sum of phred-like genotype qualities
# see gstrength
# returns preferred, discarded, strength of both
"""
select the preferred duplicate on the basis of the
Sum of phred-like genotype qualities
see gstrength
returns preferred, discarded, strength of both
"""
if
gstrength
(
u
)
>
gstrength
(
v
):
return
(
u
,
v
,
gstrength
(
u
),
gstrength
(
v
))
else
:
...
...
@@ -381,10 +377,12 @@ def getduplicates_GQ(u, v):
def
getduplicates_QUAL
(
u
,
v
):
# select the preferred duplicate on the basis of
# Phred-scaled probability that this site is a variant
# see variantstrength
# returns preferred, discarded, strength of both
"""
select the preferred duplicate on the basis of
Phred-scaled probability that this site is a variant
see variantstrength
returns preferred, discarded, strength of both
"""
if
variantstrength
(
u
)
>
variantstrength
(
v
):
return
(
u
,
v
,
variantstrength
(
u
),
variantstrength
(
v
))
else
:
...
...
@@ -393,7 +391,7 @@ def getduplicates_QUAL(u, v):
def
getoverlap
(
u
,
osize
):
# percentage overlap given the size of the overlap
return
100
*
osize
/
u
.
svlen
return
100
*
osize
/
u
.
svlen
def
add_redundancy_infos_header
(
reader
):
...
...
@@ -418,9 +416,8 @@ def add_redundancy_infos_header(reader):
"
Tools supporting (detecting) the sv
"
)
def
GenomeSTRIPLikeRedundancyAnnotator
(
SVSet
,
reader
,
duplicatescore_threshold
=-
2
,
genotyper
=
"
svtyper
"
):
def
redundancy_annotator
(
SVSet
,
reader
,
duplicatescore_threshold
=-
2
,
genotyper
=
"
svtyper
"
):
"""
Annotating duplicate candidates based on the genotype likelihoods
- genotype likelihoods can be provided by svtyper or genomestrip
"""
...
...
@@ -452,7 +449,7 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
score
=
duplicatescore
(
u
,
v
)
# print("Comparing %s and %s : %3.8f" % (u.id, v.id, score))
if
score
>
duplicatescore_threshold
:
ref
,
dupli
,
s1
,
s2
=
getduplicates_GQ
(
u
,
v
)
ref
,
dupli
,
_
,
_
=
getduplicates_GQ
(
u
,
v
)
# print("%s prefered to %s %3.8f" % (ref.id, dupli.id, score))
reference
[
ref
]
=
1
overlap_size
=
int
(
o
[
-
1
])
...
...
@@ -523,7 +520,7 @@ def add_filter_infos_header(reader):
reader
.
addFilter
(
"
ABFREQ
"
,
"
AB frequency <0.3 for >50% heterosamples
"
)
def
GenomeSTRIPLikefiltering
(
SVSet
,
reader
):
def
variant_filtration
(
SVSet
,
reader
):
"""
Filtering the candidate CNVs according to the following criteria
- non duplicate sites
- variant sites
...
...
@@ -539,11 +536,11 @@ def GenomeSTRIPLikefiltering(SVSet, reader):
for
sv
in
SVSet
:
info
=
sv
.
record
.
info
sv
.
record
.
info
[
'
CALLRATE
'
]
=
sv
.
C
all
R
ate
(
13
)
sv
.
record
.
info
[
'
VARIANTCALLRATE
'
]
=
sv
.
V
ariant
C
all
R
ate
(
13
)
if
sv
.
C
all
R
ate
(
13
)
<
0.75
:
sv
.
record
.
info
[
'
CALLRATE
'
]
=
sv
.
c
all
_r
ate
(
13
)
sv
.
record
.
info
[
'
VARIANTCALLRATE
'
]
=
sv
.
v
ariant
_c
all
_r
ate
(
13
)
if
sv
.
c
all
_r
ate
(
13
)
<
0.75
:
sv
.
filter
.
add
(
"
CALLRATE
"
)
if
not
sv
.
P
olymorph
():
if
not
sv
.
p
olymorph
():
sv
.
filter
.
add
(
"
MONOMORPH
"
)
if
'
NONDUPLICATEOVERLAP
'
in
info
and
info
[
'
NONDUPLICATEOVERLAP
'
]
>
0.7
:
sv
.
filter
.
add
(
"
OVERLAP
"
)
...
...
@@ -551,21 +548,22 @@ def GenomeSTRIPLikefiltering(SVSet, reader):
sv
.
filter
.
add
(
"
DUPLICATE
"
)
def
AB
FreqF
iltering
(
SVS
et
):
def
AB
_f
iltering
(
variant_s
et
):
"""
Filtering the candidate CNVs according to the following criteria
- more than 50% of variant samples should have AB freq > 0.3
"""
for
sv
in
SVS
et
:
ABfreq
OK
=
[]
for
sv
in
variant_s
et
:
valid_
AB
_
freq
=
[]
for
s
in
sv
.
record
.
samples
.
values
():
if
Heterozygote
(
s
):
ABfreqOK
.
append
((
s
.
get
(
'
AB
'
)[
0
]
>
0.3
))
if
len
(
ABfreqOK
)
>
0
and
sum
(
ABfreqOK
)
<
len
(
ABfreqOK
)
/
2
:
valid_AB_freq
.
append
((
s
.
get
(
'
AB
'
)[
0
]
>
0.3
))
if
(
len
(
valid_AB_freq
)
>
0
and
sum
(
valid_AB_freq
)
<
len
(
valid_AB_freq
)
/
2
):
sv
.
filter
.
add
(
"
ABFREQ
"
)
def
G
et
C
onnected
D
uplicates
(
SVSet
):
def
g
et
_c
onnected
_d
uplicates
(
SVSet
):
"""
Construct connected components of duplicates and rename the variants
"""
...
...
@@ -606,7 +604,7 @@ def get_tool_name(sv_ident):
return
sv_ident
.
split
(
"
_
"
)[
0
]
def
S
et
S
upporting
T
ools
(
SVSet
):
def
s
et
_s
upporting
_t
ools
(
SVSet
):
for
sv
in
SVSet
:
tools
=
{
get_tool_name
(
sv
.
id
)}
if
"
DUPLICATES
"
in
sv
.
record
.
info
:
...
...
@@ -642,7 +640,7 @@ def rename_info_field(sv, key, sv_dict):
sv
.
record
.
info
[
key
]
=
info_newid
def
R
ename
SV
(
SVSet
):
def
r
ename
_variants
(
SVSet
):
sv_dict
=
defaultdict
()
for
sv
in
SVSet
:
new_id
=
new_id_str
(
sv
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment