Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
svlib
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Admin message
A compter du 1er avril, attention à vos pipelines :
Nouvelles limitations de Docker Hub
Show more breadcrumbs
SVdetection
svlib
Commits
a47a5d83
Commit
a47a5d83
authored
5 years ago
by
Thomas Faraut
Browse files
Options
Downloads
Patches
Plain Diff
adding duplicate informations with the new identifiers
parent
4f118e1d
No related branches found
No related tags found
Tags containing commit
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
svreader/annotation.py
+87
-67
87 additions, 67 deletions
svreader/annotation.py
svreader/vcfwrapper.py
+24
-8
24 additions, 8 deletions
svreader/vcfwrapper.py
with
111 additions
and
75 deletions
svreader/annotation.py
+
87
−
67
View file @
a47a5d83
...
...
@@ -8,18 +8,31 @@ from math import isnan
from
pysam
import
VariantFile
from
svrunner_utils
import
eprint
,
warn
,
vcf_to_pybed
from
svreader.vcfwrapper
import
VCFRecord
,
S
VReader
,
S
VWriter
from
svreader.vcfwrapper
import
VCFRecord
,
V
CF
Reader
,
V
CF
Writer
HOM_REF
=
(
0
,
0
)
HET_VAR
=
(
0
,
1
)
HOM_VAR
=
(
1
,
1
)
VALID_GENOTYPES
=
[
HOM_REF
,
HET_VAR
,
HOM_VAR
]
SVTYPER_GENO_LIKELI_TAG
=
"
GL
"
GENOMESTRIP_GENO_LIKELI_TAG
=
"
GP
"
Geno_likeli_tag
=
SVTYPER_GENO_LIKELI_TAG
def
coded_geno
(
geno
):
if
geno
==
HOM_REF
:
return
"
HOMO_REF
"
elif
geno
==
HET_VAR
:
return
"
HET_VAR
"
elif
geno
==
HOM_VAR
:
return
"
HOMO_VAR
"
else
:
return
"
UNDEF
"
def
Variant
(
sample
):
if
sample
.
get
(
'
GT
'
)
in
[
HET_VAR
,
HOM_VAR
]:
return
True
...
...
@@ -27,7 +40,7 @@ def Variant(sample):
return
False
class
Variant
Record
(
VCFRecord
):
class
Annotate
Record
(
VCFRecord
):
"""
A lightweight object to annotated the final records
"""
...
...
@@ -35,7 +48,8 @@ class VariantRecord(VCFRecord):
"""
A pysam VariantRecord wrapper
"""
super
(
VariantRecord
,
self
).
__init__
(
record
)
super
(
AnnotateRecord
,
self
).
__init__
(
record
)
self
.
new_id
=
None
@property
def
start
(
self
):
...
...
@@ -60,13 +74,20 @@ class VariantRecord(VCFRecord):
else
:
return
False
@property
def
num_samples
(
self
):
return
len
(
self
.
record
.
samples
.
keys
())
def
setNewId
(
self
,
identifier
):
self
.
new_id
=
identifier
def
rename
(
self
):
try
:
self
.
record
.
info
[
'
SOURCEID
'
]
=
self
.
id
except
KeyError
:
eprint
(
"
SOURCEID absent from record info keys,
"
)
sys
.
exit
(
1
)
self
.
id
=
identifier
self
.
id
=
self
.
new_id
def
num_variant_samples
(
self
):
return
sum
([
Variant
(
s
)
for
s
in
self
.
samples
.
values
()])
...
...
@@ -78,11 +99,24 @@ class VariantRecord(VCFRecord):
return
sum
([
s
.
get
(
'
SQ
'
)
for
s
in
self
.
samples
.
values
()])
def
GQ_sum_score
(
self
):
return
sum
([
s
.
get
(
'
SQ
'
)
for
s
in
self
.
samples
.
values
()])
return
sum
([
s
.
get
(
'
GQ
'
)
for
s
in
self
.
samples
.
values
()])
def
maxGQ
(
self
):
return
max
([
s
.
get
(
'
GQ
'
)
for
s
in
self
.
samples
.
values
()])
def
setQual
(
self
):
self
.
record
.
qual
=
self
.
qual
()
def
numdiffGenotypes
(
self
):
genotypes
=
defaultdict
()
for
s
in
self
.
samples
.
values
():
if
s
.
get
(
'
GT
'
)
in
VALID_GENOTYPES
:
genotypes
[
coded_geno
(
s
.
get
(
'
GT
'
))]
=
1
return
len
(
genotypes
.
keys
())
def
Polymorph
(
self
):
return
self
.
numdiffGenotypes
()
>
1
def
AddSupportInfos
(
self
):
supp_reads
=
self
.
variant_read_support
()
num_supp_samples
=
self
.
num_variant_samples
()
...
...
@@ -93,6 +127,14 @@ class VariantRecord(VCFRecord):
eprint
(
"
SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys
"
)
sys
.
exit
(
1
)
def
CallRate
(
self
,
cutoff
):
num_qual_call
=
sum
([(
s
.
get
(
'
GQ
'
)
>
cutoff
)
for
s
in
self
.
samples
.
values
()])
return
num_qual_call
/
self
.
num_samples
def
VariantCallRate
(
self
,
cutoff
):
num_qual_call
=
sum
([(
s
.
get
(
'
GQ
'
)
>
cutoff
)
for
s
in
self
.
samples
.
values
()
if
Variant
(
s
)])
return
num_qual_call
/
self
.
num_variant_samples
()
def
UnifiedPass
(
self
):
"""
All records passing the filters (PASS, .) ar now labelled PASS
...
...
@@ -106,51 +148,14 @@ class VariantRecord(VCFRecord):
record
.
filter
.
add
(
"
PASS
"
)
class
VariantRecordSample
(
object
):
"""
A lightweight object to VariantRecordSample
"""
def
__init__
(
self
,
variantsample
):
self
.
variantsample
=
variantsample
def
genotype
(
self
):
return
self
.
variantsample
.
get
(
'
GT
'
)
def
SQ_score
(
self
):
# Phred-scaled probability that this site is variant
# (non-reference in this sample)
return
self
.
variantsample
.
get
(
'
SQ
'
)
def
GQ_score
(
self
):
# Genotype quality
return
self
.
variantsample
.
get
(
'
GQ
'
)
def
GL_score
(
self
):
# Genotype Likelihood, log10-scaled likelihoods of the data given the
# called genotype for each possible genotype generated from the
# reference and alternate alleles given the sample ploidy
return
self
.
variantsample
.
get
(
'
GL
'
)
def
AltSupport
(
self
):
return
self
.
variantsample
.
get
(
'
AO
'
)[
0
]
@property
def
isVariant
(
self
):
if
self
.
genotype
()
in
[
HET_VAR
,
HOM_VAR
]:
return
True
else
:
return
False
class
VCFReader
(
SVReader
):
class
AnnotateReader
(
VCFReader
):
def
__init__
(
self
,
file_name
,
sv_to_report
=
None
):
super
(
VCF
Reader
,
self
).
__init__
(
file_name
)
super
(
Annotate
Reader
,
self
).
__init__
(
file_name
)
self
.
filename
=
file_name
self
.
sv_to_report
=
sv_to_report
self
.
vcf_reader
=
VariantFile
(
file_name
)
self
.
add_metadata
()
self
.
add_
annotation_
metadata
()
def
__iter__
(
self
):
return
self
...
...
@@ -158,25 +163,25 @@ class VCFReader(SVReader):
def
__next__
(
self
):
while
True
:
raw_record
=
next
(
self
.
vcf_reader
)
record
=
Variant
Record
(
raw_record
)
record
=
Annotate
Record
(
raw_record
)
return
record
def
getHeader
(
self
):
return
self
.
vcf_reader
.
header
def
addInfo
(
self
,
name
,
number
,
type
,
description
):
self
.
vcf_reader
.
header
.
info
.
add
(
id
=
name
,
number
=
number
,
type
=
type
,
description
=
description
)
def
addFilter
(
self
,
name
,
description
):
self
.
vcf_reader
.
header
.
filters
.
add
(
id
=
name
,
number
=
None
,
type
=
None
,
description
=
description
)
def
add_metadata
(
self
):
#
def addInfo(self, name, number, type, description):
#
self.vcf_reader.header.info.add(id=name,
#
number=number,
#
type=type,
#
description=description)
#
#
def addFilter(self, name, description):
#
self.vcf_reader.header.filters.add(id=name,
#
number=None,
#
type=None,
#
description=description)
def
add_
annotation_
metadata
(
self
):
self
.
addInfo
(
"
SOURCEID
"
,
1
,
"
String
"
,
"
The source sv identifier
"
)
self
.
addInfo
(
"
MAX_SUPP_READS
"
,
1
,
"
Integer
"
,
...
...
@@ -195,7 +200,7 @@ class VCFReader(SVReader):
return
len
(
self
.
vcf_reader
.
header
.
samples
)
class
VCF
Writer
(
S
VWriter
):
class
Annotate
Writer
(
V
CF
Writer
):
def
__init__
(
self
,
file_name
,
template_reader
,
index
=
True
):
...
...
@@ -365,9 +370,12 @@ def add_redundancy_infos_header(reader):
# Adding DUPLICATES info (equivalent to GSDUPLICATEOVERLAP)
reader
.
addInfo
(
"
DUPLICATEOVERLAP
"
,
1
,
"
Float
"
,
"
Highest overlap with a duplicate event
"
)
# Adding DUPLICATEO
VERLAP
info (
equivalent to GSDUPLICATEOVERLAP
)
reader
.
addInfo
(
"
DUPLICATE
S
"
,
"
.
"
,
"
String
"
,
# Adding DUPLICATEO
F
info (
list of variant prefered to this one
)
reader
.
addInfo
(
"
DUPLICATE
OF
"
,
"
.
"
,
"
String
"
,
"
List of duplicate events preferred to this one
"
)
# Adding DUPLICATES info (list of variants duplicates of this one)
reader
.
addInfo
(
"
DUPLICATES
"
,
"
.
"
,
"
String
"
,
"
List of duplicate events represented by the current sv
"
)
# Adding NONDUPLICATEOVERLAP
reader
.
addInfo
(
"
NONDUPLICATEOVERLAP
"
,
1
,
"
Float
"
,
"
Amount of overlap with a non-duplicate
"
)
...
...
@@ -396,7 +404,6 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
overlapping
=
defaultdict
(
tuple
)
reference
=
defaultdict
()
for
o
in
self_overlap
:
print
(
"
Here ?
"
,
o
)
if
o
[
3
]
==
o
[
7
]:
continue
a
,
b
=
ordered
(
o
[
3
],
o
[
7
])
...
...
@@ -413,7 +420,7 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
reference
[
ref
]
=
1
overlap_size
=
int
(
o
[
-
1
])
overlap
=
getoverlap
(
dupli
,
overlap_size
)
if
maxGQ
(
ref
)
>
0
and
passed
(
dupli
)
:
if
ref
.
maxGQ
()
>
0
and
dupli
.
passed
:
# are dismissed
# - reference variant with 0 genotype quality for all markers
# - duplicate that are already tagged as filtered out
...
...
@@ -445,7 +452,7 @@ def add_duplicate_info_sv(sv, duplicateoverlap, duplicatescore, duplicates):
else
:
sv
.
record
.
info
[
'
DUPLICATESCORE
'
]
=
duplicatescore
sv
.
record
.
info
[
'
DUPLICATEOVERLAP
'
]
=
duplicateoverlap
sv
.
record
.
info
[
'
DUPLICATE
S
'
]
=
duplicates
sv
.
record
.
info
[
'
DUPLICATE
OF
'
]
=
duplicates
def
add_overlap_info_sv
(
sv
,
overlap
,
duplicatescore
):
...
...
@@ -459,10 +466,20 @@ def add_overlap_info_sv(sv, overlap, duplicatescore):
sv
.
record
.
info
[
'
NONDUPLICATEOVERLAP
'
]
=
overlap
def
add_callrate_infos_header
(
reader
):
# Adding CALLRATE info
reader
.
addInfo
(
"
CALLRATE
"
,
1
,
"
Float
"
,
"
Percentage of samples called with a GQ>13
"
)
# Adding VARIANTCALLRATE info
reader
.
addInfo
(
"
VARIANTCALLRATE
"
,
1
,
"
Float
"
,
"
Percentage of variant samples called with a GQ>13
"
)
def
add_filter_infos_header
(
reader
):
# FILTERS
# Adding specific filters
reader
.
addFilter
(
"
CALLRATE
"
,
"
Call rate <0.75
"
)
reader
.
addFilter
(
"
VARIANTCALLRATE
"
,
"
Variant Call rate <0.75
"
)
reader
.
addFilter
(
"
MONOMORPH
"
,
"
All samples have the same genotype
"
)
reader
.
addFilter
(
"
DUPLICATE
"
,
"
GSDUPLICATESCORE>0
"
)
reader
.
addFilter
(
"
OVERLAP
"
,
"
NONDUPLICATEOVERLAP>0.7
"
)
...
...
@@ -479,13 +496,16 @@ def GenomeSTRIPLikefiltering(SVSet, reader):
(use NONVARIANTSCORE in both cases)
"""
add_callrate_infos_header
(
reader
)
add_filter_infos_header
(
reader
)
for
sv
in
SVSet
:
info
=
sv
.
record
.
info
if
callRate
(
sv
)
<
0.75
:
sv
.
record
.
info
[
'
CALLRATE
'
]
=
sv
.
CallRate
(
13
)
sv
.
record
.
info
[
'
VARIANTCALLRATE
'
]
=
sv
.
VariantCallRate
(
13
)
if
sv
.
CallRate
(
13
)
<
0.75
:
sv
.
filter
.
add
(
"
CALLRATE
"
)
if
not
Polymorph
(
sv
):
if
not
sv
.
Polymorph
():
sv
.
filter
.
add
(
"
MONOMORPH
"
)
if
'
NONDUPLICATEOVERLAP
'
in
info
and
info
[
'
NONDUPLICATEOVERLAP
'
]
>
0.7
:
sv
.
filter
.
add
(
"
OVERLAP
"
)
...
...
This diff is collapsed.
Click to expand it.
svreader/vcfwrapper.py
+
24
−
8
View file @
a47a5d83
...
...
@@ -39,6 +39,10 @@ class VCFRecord(SVRecord):
def
id
(
self
):
return
self
.
record
.
id
@id.setter
def
id
(
self
,
id
):
self
.
record
.
id
=
id
@property
def
pos
(
self
):
return
self
.
record
.
pos
...
...
@@ -51,6 +55,10 @@ class VCFRecord(SVRecord):
def
stop
(
self
):
return
self
.
record
.
stop
@property
def
svtype
(
self
):
return
self
.
_sv_type
@property
def
filter
(
self
):
return
self
.
record
.
filter
...
...
@@ -84,16 +92,24 @@ class VCFReader(SVReader):
self
.
vcf_reader
.
header
.
info
.
remove_header
(
key
)
def
addInfo
(
self
,
name
,
number
,
type
,
description
):
self
.
vcf_reader
.
header
.
info
.
add
(
id
=
name
,
number
=
number
,
type
=
type
,
description
=
description
)
# we add info only if absent from the current header
if
name
in
self
.
vcf_reader
.
header
.
info
:
return
else
:
self
.
vcf_reader
.
header
.
info
.
add
(
id
=
name
,
number
=
number
,
type
=
type
,
description
=
description
)
def
addFilter
(
self
,
name
,
description
):
self
.
vcf_reader
.
header
.
filters
.
add
(
id
=
name
,
number
=
None
,
type
=
None
,
description
=
description
)
# we add filter info only if absent from the current header
if
name
in
self
.
vcf_reader
.
header
.
filters
:
return
else
:
self
.
vcf_reader
.
header
.
filters
.
add
(
id
=
name
,
number
=
None
,
type
=
None
,
description
=
description
)
def
getOrderedSamples
(
self
):
samples
=
self
.
vcf_reader
.
header
.
samples
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment