Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
svlib
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Admin message
A compter du 1er avril, attention à vos pipelines :
Nouvelles limitations de Docker Hub
Show more breadcrumbs
SVdetection
svlib
Commits
093720f3
Commit
093720f3
authored
5 years ago
by
Thomas Faraut
Browse files
Options
Downloads
Plain Diff
Merge branch 'newfilter' of forgemia.inra.fr:svdetection/svlib into newfilter
parents
affa74ae
aa96035d
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
svreader/annotation.py
+232
-72
232 additions, 72 deletions
svreader/annotation.py
svreader/pindel.py
+1
-0
1 addition, 0 deletions
svreader/pindel.py
svreader/vcfwrapper.py
+24
-8
24 additions, 8 deletions
svreader/vcfwrapper.py
with
257 additions
and
80 deletions
svreader/annotation.py
+
232
−
72
View file @
093720f3
...
@@ -2,24 +2,39 @@
...
@@ -2,24 +2,39 @@
import
sys
import
sys
from
collections
import
defaultdict
from
collections
import
defaultdict
from
networkx
import
Graph
,
connected_components
import
numpy
as
np
import
numpy
as
np
from
math
import
isnan
from
math
import
isnan
from
pysam
import
VariantFile
from
pysam
import
VariantFile
from
svrunner_utils
import
eprint
,
warn
,
vcf_to_pybed
from
svrunner_utils
import
eprint
,
warn
,
vcf_to_pybed
from
svreader.vcfwrapper
import
VCFRecord
,
S
VReader
,
S
VWriter
from
svreader.vcfwrapper
import
VCFRecord
,
V
CF
Reader
,
V
CF
Writer
HOM_REF
=
(
0
,
0
)
HOM_REF
=
(
0
,
0
)
HET_VAR
=
(
0
,
1
)
HET_VAR
=
(
0
,
1
)
HOM_VAR
=
(
1
,
1
)
HOM_VAR
=
(
1
,
1
)
VALID_GENOTYPES
=
[
HOM_REF
,
HET_VAR
,
HOM_VAR
]
SVTYPER_GENO_LIKELI_TAG
=
"
GL
"
SVTYPER_GENO_LIKELI_TAG
=
"
GL
"
GENOMESTRIP_GENO_LIKELI_TAG
=
"
GP
"
GENOMESTRIP_GENO_LIKELI_TAG
=
"
GP
"
Geno_likeli_tag
=
SVTYPER_GENO_LIKELI_TAG
Geno_likeli_tag
=
SVTYPER_GENO_LIKELI_TAG
def
coded_geno
(
geno
):
if
geno
==
HOM_REF
:
return
"
HOMO_REF
"
elif
geno
==
HET_VAR
:
return
"
HET_VAR
"
elif
geno
==
HOM_VAR
:
return
"
HOMO_VAR
"
else
:
return
"
UNDEF
"
def
Variant
(
sample
):
def
Variant
(
sample
):
if
sample
.
get
(
'
GT
'
)
in
[
HET_VAR
,
HOM_VAR
]:
if
sample
.
get
(
'
GT
'
)
in
[
HET_VAR
,
HOM_VAR
]:
return
True
return
True
...
@@ -27,7 +42,14 @@ def Variant(sample):
...
@@ -27,7 +42,14 @@ def Variant(sample):
return
False
return
False
class
VariantRecord
(
VCFRecord
):
def
Heterozygote
(
sample
):
if
sample
.
get
(
'
GT
'
)
in
[
HET_VAR
]:
return
True
else
:
return
False
class
AnnotateRecord
(
VCFRecord
):
"""
"""
A lightweight object to annotated the final records
A lightweight object to annotated the final records
"""
"""
...
@@ -35,7 +57,8 @@ class VariantRecord(VCFRecord):
...
@@ -35,7 +57,8 @@ class VariantRecord(VCFRecord):
"""
"""
A pysam VariantRecord wrapper
A pysam VariantRecord wrapper
"""
"""
super
(
VariantRecord
,
self
).
__init__
(
record
)
super
(
AnnotateRecord
,
self
).
__init__
(
record
)
self
.
new_id
=
None
@property
@property
def
start
(
self
):
def
start
(
self
):
...
@@ -60,29 +83,64 @@ class VariantRecord(VCFRecord):
...
@@ -60,29 +83,64 @@ class VariantRecord(VCFRecord):
else
:
else
:
return
False
return
False
def
setNewId
(
self
,
identifier
):
@property
def
num_samples
(
self
):
return
len
(
self
.
record
.
samples
.
keys
())
def
setNewId
(
self
,
new_id
):
self
.
new_id
=
new_id
def
rename
(
self
):
try
:
try
:
self
.
record
.
info
[
'
SOURCEID
'
]
=
self
.
id
self
.
record
.
info
[
'
SOURCEID
'
]
=
self
.
id
except
KeyError
:
except
KeyError
:
eprint
(
"
SOURCEID absent from record info keys,
"
)
eprint
(
"
SOURCEID absent from record info keys,
"
)
sys
.
exit
(
1
)
sys
.
exit
(
1
)
self
.
id
=
identifier
self
.
id
=
self
.
new_id
def
num_variant_samples
(
self
):
def
num_variant_samples
(
self
):
return
sum
([
Variant
(
s
)
for
s
in
self
.
samples
.
values
()])
return
sum
([
Variant
(
s
)
for
s
in
self
.
samples
.
values
()])
def
variant_read_support
(
self
):
def
variant_read_support
(
self
):
return
max
([
s
.
get
(
'
AO
'
)[
0
]
for
s
in
self
.
samples
.
values
()])
support
=
[]
for
s
in
self
.
samples
.
values
():
if
s
.
get
(
'
AO
'
)
is
not
None
:
support
.
append
(
s
.
get
(
'
AO
'
)[
0
])
return
max
(
support
)
def
qual
(
self
):
def
qual
(
self
):
return
sum
([
s
.
get
(
'
SQ
'
)
for
s
in
self
.
samples
.
values
()])
variant_qual
=
[]
for
s
in
self
.
samples
.
values
():
if
s
.
get
(
'
SQ
'
)
is
not
None
:
variant_qual
.
append
(
s
.
get
(
'
SQ
'
))
return
sum
(
variant_qual
)
def
GQ_samples
(
self
):
genotype_qual
=
[]
for
s
in
self
.
samples
.
values
():
if
s
.
get
(
'
GQ
'
)
is
not
None
:
genotype_qual
.
append
(
s
.
get
(
'
GQ
'
))
return
genotype_qual
def
GQ_sum_score
(
self
):
def
GQ_sum_score
(
self
):
return
sum
([
s
.
get
(
'
SQ
'
)
for
s
in
self
.
samples
.
values
()])
return
sum
(
self
.
GQ_samples
())
def
maxGQ
(
self
):
return
max
(
self
.
GQ_samples
())
def
setQual
(
self
):
def
setQual
(
self
):
self
.
record
.
qual
=
self
.
qual
()
self
.
record
.
qual
=
self
.
qual
()
def
numdiffGenotypes
(
self
):
genotypes
=
defaultdict
()
for
s
in
self
.
samples
.
values
():
if
s
.
get
(
'
GT
'
)
in
VALID_GENOTYPES
:
genotypes
[
coded_geno
(
s
.
get
(
'
GT
'
))]
=
1
return
len
(
genotypes
.
keys
())
def
Polymorph
(
self
):
return
self
.
numdiffGenotypes
()
>
1
def
AddSupportInfos
(
self
):
def
AddSupportInfos
(
self
):
supp_reads
=
self
.
variant_read_support
()
supp_reads
=
self
.
variant_read_support
()
num_supp_samples
=
self
.
num_variant_samples
()
num_supp_samples
=
self
.
num_variant_samples
()
...
@@ -93,6 +151,24 @@ class VariantRecord(VCFRecord):
...
@@ -93,6 +151,24 @@ class VariantRecord(VCFRecord):
eprint
(
"
SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys
"
)
eprint
(
"
SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys
"
)
sys
.
exit
(
1
)
sys
.
exit
(
1
)
def
CallRate
(
self
,
cutoff
):
call_qual
=
[]
for
s
in
self
.
samples
.
values
():
if
s
.
get
(
'
GQ
'
)
is
not
None
:
call_qual
.
append
(
s
.
get
(
'
GQ
'
))
num_qual_call
=
sum
([(
qual
>
cutoff
)
for
qual
in
call_qual
])
return
num_qual_call
/
self
.
num_samples
def
VariantCallRate
(
self
,
cutoff
):
samples
=
self
.
samples
.
values
()
num_qual_var
=
0
for
s
in
samples
:
if
s
.
get
(
"
GQ
"
)
is
not
None
and
s
.
get
(
"
GQ
"
)
>
cutoff
and
Variant
(
s
):
num_qual_var
+=
1
num_var_samples
=
self
.
num_variant_samples
()
var_call_rate
=
num_qual_var
/
num_var_samples
if
num_var_samples
else
0
return
var_call_rate
def
UnifiedPass
(
self
):
def
UnifiedPass
(
self
):
"""
"""
All records passing the filters (PASS, .) ar now labelled PASS
All records passing the filters (PASS, .) ar now labelled PASS
...
@@ -106,52 +182,14 @@ class VariantRecord(VCFRecord):
...
@@ -106,52 +182,14 @@ class VariantRecord(VCFRecord):
record
.
filter
.
add
(
"
PASS
"
)
record
.
filter
.
add
(
"
PASS
"
)
class
VariantRecordSample
(
object
):
class
AnnotateReader
(
VCFReader
):
"""
A lightweight object to VariantRecordSample
"""
def
__init__
(
self
,
variantsample
):
self
.
variantsample
=
variantsample
def
genotype
(
self
):
return
self
.
variantsample
.
get
(
'
GT
'
)
def
SQ_score
(
self
):
# Phred-scaled probability that this site is variant
# (non-reference in this sample)
return
self
.
variantsample
.
get
(
'
SQ
'
)
def
GQ_score
(
self
):
# Genotype quality
return
self
.
variantsample
.
get
(
'
GQ
'
)
def
GL_score
(
self
):
# Genotype Likelihood, log10-scaled likelihoods of the data given the
# called genotype for each possible genotype generated from the
# reference and alternate alleles given the sample ploidy
return
self
.
variantsample
.
get
(
'
GL
'
)
def
AltSupport
(
self
):
return
self
.
variantsample
.
get
(
'
AO
'
)[
0
]
@property
def
isVariant
(
self
):
if
self
.
genotype
()
in
[
HET_VAR
,
HOM_VAR
]:
return
True
else
:
return
False
class
VCFReader
(
SVReader
):
def
__init__
(
self
,
file_name
,
sv_to_report
=
None
):
def
__init__
(
self
,
file_name
,
sv_to_report
=
None
):
super
(
VCFReader
,
self
).
__init__
(
file_name
)
super
(
AnnotateReader
,
self
).
__init__
(
file_name
)
self
.
vcf_reader
=
VariantFile
(
file_name
)
self
.
filename
=
file_name
self
.
filename
=
file_name
self
.
sv_to_report
=
sv_to_report
self
.
sv_to_report
=
sv_to_report
self
.
vcf_reader
=
VariantFile
(
file_name
)
self
.
vcf_reader
=
VariantFile
(
file_name
)
self
.
add_metadata
()
self
.
add_
annotation_
metadata
()
def
__iter__
(
self
):
def
__iter__
(
self
):
return
self
return
self
...
@@ -159,25 +197,25 @@ class VCFReader(SVReader):
...
@@ -159,25 +197,25 @@ class VCFReader(SVReader):
def
__next__
(
self
):
def
__next__
(
self
):
while
True
:
while
True
:
raw_record
=
next
(
self
.
vcf_reader
)
raw_record
=
next
(
self
.
vcf_reader
)
record
=
Variant
Record
(
raw_record
)
record
=
Annotate
Record
(
raw_record
)
return
record
return
record
def
getHeader
(
self
):
def
getHeader
(
self
):
return
self
.
vcf_reader
.
header
return
self
.
vcf_reader
.
header
def
addInfo
(
self
,
name
,
number
,
type
,
description
):
#
def addInfo(self, name, number, type, description):
self
.
vcf_reader
.
header
.
info
.
add
(
id
=
name
,
#
self.vcf_reader.header.info.add(id=name,
number
=
number
,
#
number=number,
type
=
type
,
#
type=type,
description
=
description
)
#
description=description)
#
def
addFilter
(
self
,
name
,
description
):
#
def addFilter(self, name, description):
self
.
vcf_reader
.
header
.
filters
.
add
(
id
=
name
,
#
self.vcf_reader.header.filters.add(id=name,
number
=
None
,
#
number=None,
type
=
None
,
#
type=None,
description
=
description
)
#
description=description)
def
add_metadata
(
self
):
def
add_
annotation_
metadata
(
self
):
self
.
addInfo
(
"
SOURCEID
"
,
1
,
"
String
"
,
self
.
addInfo
(
"
SOURCEID
"
,
1
,
"
String
"
,
"
The source sv identifier
"
)
"
The source sv identifier
"
)
self
.
addInfo
(
"
MAX_SUPP_READS
"
,
1
,
"
Integer
"
,
self
.
addInfo
(
"
MAX_SUPP_READS
"
,
1
,
"
Integer
"
,
...
@@ -187,7 +225,6 @@ class VCFReader(SVReader):
...
@@ -187,7 +225,6 @@ class VCFReader(SVReader):
self
.
addFilter
(
"
LOWSUPPORT
"
,
self
.
addFilter
(
"
LOWSUPPORT
"
,
"
total supp reads < 5 or supp samples < 2
"
)
"
total supp reads < 5 or supp samples < 2
"
)
def
getOrderedSamples
(
self
):
def
getOrderedSamples
(
self
):
samples
=
self
.
vcf_reader
.
header
.
samples
samples
=
self
.
vcf_reader
.
header
.
samples
sample_names
=
[
sample
.
rsplit
(
'
.
'
)[
0
]
for
sample
in
samples
]
sample_names
=
[
sample
.
rsplit
(
'
.
'
)[
0
]
for
sample
in
samples
]
...
@@ -197,7 +234,7 @@ class VCFReader(SVReader):
...
@@ -197,7 +234,7 @@ class VCFReader(SVReader):
return
len
(
self
.
vcf_reader
.
header
.
samples
)
return
len
(
self
.
vcf_reader
.
header
.
samples
)
class
VCF
Writer
(
S
VWriter
):
class
Annotate
Writer
(
V
CF
Writer
):
def
__init__
(
self
,
file_name
,
template_reader
,
index
=
True
):
def
__init__
(
self
,
file_name
,
template_reader
,
index
=
True
):
...
@@ -367,12 +404,18 @@ def add_redundancy_infos_header(reader):
...
@@ -367,12 +404,18 @@ def add_redundancy_infos_header(reader):
# Adding DUPLICATES info (equivalent to GSDUPLICATEOVERLAP)
# Adding DUPLICATES info (equivalent to GSDUPLICATEOVERLAP)
reader
.
addInfo
(
"
DUPLICATEOVERLAP
"
,
1
,
"
Float
"
,
reader
.
addInfo
(
"
DUPLICATEOVERLAP
"
,
1
,
"
Float
"
,
"
Highest overlap with a duplicate event
"
)
"
Highest overlap with a duplicate event
"
)
# Adding DUPLICATEO
VERLAP
info (
equivalent to GSDUPLICATEOVERLAP
)
# Adding DUPLICATEO
F
info (
list of variant prefered to this one
)
reader
.
addInfo
(
"
DUPLICATE
S
"
,
"
.
"
,
"
String
"
,
reader
.
addInfo
(
"
DUPLICATE
OF
"
,
"
.
"
,
"
String
"
,
"
List of duplicate events preferred to this one
"
)
"
List of duplicate events preferred to this one
"
)
# Adding DUPLICATES info (list of variants duplicates of this one)
reader
.
addInfo
(
"
DUPLICATES
"
,
"
.
"
,
"
String
"
,
"
List of duplicate events represented by the current sv
"
)
# Adding NONDUPLICATEOVERLAP
# Adding NONDUPLICATEOVERLAP
reader
.
addInfo
(
"
NONDUPLICATEOVERLAP
"
,
1
,
"
Float
"
,
reader
.
addInfo
(
"
NONDUPLICATEOVERLAP
"
,
1
,
"
Float
"
,
"
Amount of overlap with a non-duplicate
"
)
"
Amount of overlap with a non-duplicate
"
)
# Adding TOOLSUPPORT
reader
.
addInfo
(
"
TOOLSUPPORT
"
,
"
.
"
,
"
String
"
,
"
Tools supporting (detecting) the sv
"
)
def
GenomeSTRIPLikeRedundancyAnnotator
(
SVSet
,
reader
,
def
GenomeSTRIPLikeRedundancyAnnotator
(
SVSet
,
reader
,
...
@@ -398,7 +441,6 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
...
@@ -398,7 +441,6 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
overlapping
=
defaultdict
(
tuple
)
overlapping
=
defaultdict
(
tuple
)
reference
=
defaultdict
()
reference
=
defaultdict
()
for
o
in
self_overlap
:
for
o
in
self_overlap
:
print
(
"
Here ?
"
,
o
)
if
o
[
3
]
==
o
[
7
]:
if
o
[
3
]
==
o
[
7
]:
continue
continue
a
,
b
=
ordered
(
o
[
3
],
o
[
7
])
a
,
b
=
ordered
(
o
[
3
],
o
[
7
])
...
@@ -415,7 +457,7 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
...
@@ -415,7 +457,7 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
reference
[
ref
]
=
1
reference
[
ref
]
=
1
overlap_size
=
int
(
o
[
-
1
])
overlap_size
=
int
(
o
[
-
1
])
overlap
=
getoverlap
(
dupli
,
overlap_size
)
overlap
=
getoverlap
(
dupli
,
overlap_size
)
if
maxGQ
(
ref
)
>
0
and
passed
(
dupli
)
:
if
ref
.
maxGQ
()
>
0
and
dupli
.
passed
:
# are dismissed
# are dismissed
# - reference variant with 0 genotype quality for all markers
# - reference variant with 0 genotype quality for all markers
# - duplicate that are already tagged as filtered out
# - duplicate that are already tagged as filtered out
...
@@ -447,7 +489,7 @@ def add_duplicate_info_sv(sv, duplicateoverlap, duplicatescore, duplicates):
...
@@ -447,7 +489,7 @@ def add_duplicate_info_sv(sv, duplicateoverlap, duplicatescore, duplicates):
else
:
else
:
sv
.
record
.
info
[
'
DUPLICATESCORE
'
]
=
duplicatescore
sv
.
record
.
info
[
'
DUPLICATESCORE
'
]
=
duplicatescore
sv
.
record
.
info
[
'
DUPLICATEOVERLAP
'
]
=
duplicateoverlap
sv
.
record
.
info
[
'
DUPLICATEOVERLAP
'
]
=
duplicateoverlap
sv
.
record
.
info
[
'
DUPLICATE
S
'
]
=
duplicates
sv
.
record
.
info
[
'
DUPLICATE
OF
'
]
=
duplicates
def
add_overlap_info_sv
(
sv
,
overlap
,
duplicatescore
):
def
add_overlap_info_sv
(
sv
,
overlap
,
duplicatescore
):
...
@@ -461,13 +503,24 @@ def add_overlap_info_sv(sv, overlap, duplicatescore):
...
@@ -461,13 +503,24 @@ def add_overlap_info_sv(sv, overlap, duplicatescore):
sv
.
record
.
info
[
'
NONDUPLICATEOVERLAP
'
]
=
overlap
sv
.
record
.
info
[
'
NONDUPLICATEOVERLAP
'
]
=
overlap
def
add_callrate_infos_header
(
reader
):
# Adding CALLRATE info
reader
.
addInfo
(
"
CALLRATE
"
,
1
,
"
Float
"
,
"
Percentage of samples called with a GQ>13
"
)
# Adding VARIANTCALLRATE info
reader
.
addInfo
(
"
VARIANTCALLRATE
"
,
1
,
"
Float
"
,
"
Percentage of variant samples called with a GQ>13
"
)
def
add_filter_infos_header
(
reader
):
def
add_filter_infos_header
(
reader
):
# FILTERS
# FILTERS
# Adding specific filters
# Adding specific filters
reader
.
addFilter
(
"
CALLRATE
"
,
"
Call rate <0.75
"
)
reader
.
addFilter
(
"
CALLRATE
"
,
"
Call rate <0.75
"
)
reader
.
addFilter
(
"
VARIANTCALLRATE
"
,
"
Variant Call rate <0.75
"
)
reader
.
addFilter
(
"
MONOMORPH
"
,
"
All samples have the same genotype
"
)
reader
.
addFilter
(
"
MONOMORPH
"
,
"
All samples have the same genotype
"
)
reader
.
addFilter
(
"
DUPLICATE
"
,
"
GSDUPLICATESCORE>0
"
)
reader
.
addFilter
(
"
DUPLICATE
"
,
"
GSDUPLICATESCORE>0
"
)
reader
.
addFilter
(
"
OVERLAP
"
,
"
NONDUPLICATEOVERLAP>0.7
"
)
reader
.
addFilter
(
"
OVERLAP
"
,
"
NONDUPLICATEOVERLAP>0.7
"
)
reader
.
addFilter
(
"
ABFREQ
"
,
"
AB frequency <0.3 for >50% heterosamples
"
)
def
GenomeSTRIPLikefiltering
(
SVSet
,
reader
):
def
GenomeSTRIPLikefiltering
(
SVSet
,
reader
):
...
@@ -481,15 +534,122 @@ def GenomeSTRIPLikefiltering(SVSet, reader):
...
@@ -481,15 +534,122 @@ def GenomeSTRIPLikefiltering(SVSet, reader):
(use NONVARIANTSCORE in both cases)
(use NONVARIANTSCORE in both cases)
"""
"""
add_callrate_infos_header
(
reader
)
add_filter_infos_header
(
reader
)
add_filter_infos_header
(
reader
)
for
sv
in
SVSet
:
for
sv
in
SVSet
:
info
=
sv
.
record
.
info
info
=
sv
.
record
.
info
if
callRate
(
sv
)
<
0.75
:
sv
.
record
.
info
[
'
CALLRATE
'
]
=
sv
.
CallRate
(
13
)
sv
.
record
.
info
[
'
VARIANTCALLRATE
'
]
=
sv
.
VariantCallRate
(
13
)
if
sv
.
CallRate
(
13
)
<
0.75
:
sv
.
filter
.
add
(
"
CALLRATE
"
)
sv
.
filter
.
add
(
"
CALLRATE
"
)
if
not
Polymorph
(
sv
):
if
not
sv
.
Polymorph
():
sv
.
filter
.
add
(
"
MONOMORPH
"
)
sv
.
filter
.
add
(
"
MONOMORPH
"
)
if
'
NONDUPLICATEOVERLAP
'
in
info
and
info
[
'
NONDUPLICATEOVERLAP
'
]
>
0.7
:
if
'
NONDUPLICATEOVERLAP
'
in
info
and
info
[
'
NONDUPLICATEOVERLAP
'
]
>
0.7
:
sv
.
filter
.
add
(
"
OVERLAP
"
)
sv
.
filter
.
add
(
"
OVERLAP
"
)
if
"
DUPLICATESCORE
"
in
info
is
not
None
and
info
[
'
DUPLICATESCORE
'
]
>
-
2
:
if
"
DUPLICATESCORE
"
in
info
is
not
None
and
info
[
'
DUPLICATESCORE
'
]
>
-
2
:
sv
.
filter
.
add
(
"
DUPLICATE
"
)
sv
.
filter
.
add
(
"
DUPLICATE
"
)
def
ABFreqFiltering
(
SVSet
):
"""
Filtering the candidate CNVs according to the following criteria
- more than 50% of variant samples should have AB freq > 0.3
"""
for
sv
in
SVSet
:
ABfreqOK
=
[]
for
s
in
sv
.
record
.
samples
.
values
():
if
Heterozygote
(
s
):
ABfreqOK
.
append
((
s
.
get
(
'
AB
'
)[
0
]
>
0.3
))
if
len
(
ABfreqOK
)
>
0
and
sum
(
ABfreqOK
)
<
len
(
ABfreqOK
)
/
2
:
sv
.
filter
.
add
(
"
ABFREQ
"
)
def
GetConnectedDuplicates
(
SVSet
):
"""
Construct connected components of duplicates and rename the variants
"""
undirected
=
Graph
()
variant_dict
=
defaultdict
()
representatives
=
defaultdict
()
for
s
in
SVSet
:
variant_dict
[
s
.
id
]
=
s
if
"
DUPLICATE
"
in
s
.
filter
:
for
dupli_repr
in
s
.
record
.
info
[
"
DUPLICATEOF
"
]:
undirected
.
add_edge
(
s
.
id
,
dupli_repr
)
for
component
in
connected_components
(
undirected
):
for
c
in
component
:
if
"
DUPLICATEOF
"
in
variant_dict
[
c
].
record
.
info
:
# the current variant is a duplicate
continue
rep
=
c
# the representative of the equivalence class
break
duplicates
=
component
duplicates
.
remove
(
rep
)
representatives
[
rep
]
=
duplicates
add_duplicate_infos
(
representatives
,
variant_dict
)
def
add_duplicate_infos
(
representatives
,
sv_dict
):
for
rep
,
elements
in
representatives
.
items
():
for
d
in
elements
:
sv_dict
[
d
].
record
.
info
[
'
DUPLICATEOF
'
]
=
rep
duplicates
=
list
(
elements
)
if
'
DUPLICATES
'
in
sv_dict
[
rep
].
record
.
info
:
print
(
sv_dict
[
rep
].
record
.
info
[
'
DUPLICATES
'
])
duplicates
.
extend
(
sv_dict
[
rep
].
record
.
info
[
'
DUPLICATES
'
])
if
len
(
duplicates
)
>
0
:
sv_dict
[
rep
].
record
.
info
[
'
DUPLICATES
'
]
=
duplicates
def
get_tool_name
(
sv_ident
):
return
sv_ident
.
split
(
"
_
"
)[
0
]
def
SetSupportingTools
(
SVSet
):
for
sv
in
SVSet
:
tools
=
{
get_tool_name
(
sv
.
id
)}
if
"
DUPLICATES
"
in
sv
.
record
.
info
:
duplicates
=
sv
.
record
.
info
[
'
DUPLICATES
'
]
# print(duplicates)
for
dupli
in
duplicates
:
tools
.
add
(
get_tool_name
(
dupli
))
if
'
TOOLSUPPORT
'
in
sv
.
record
.
info
:
supporting
=
set
(
sv
.
record
.
info
[
'
TOOLSUPPORT
'
])
tools
.
union
(
supporting
)
sv
.
record
.
info
[
'
TOOLSUPPORT
'
]
=
list
(
tools
)
def
static_vars
(
**
kwargs
):
def
decorate
(
func
):
for
k
in
kwargs
:
setattr
(
func
,
k
,
kwargs
[
k
])
return
func
return
decorate
@static_vars
(
counter
=
0
)
def
new_id_str
(
sv
):
new_id_str
.
counter
+=
1
return
"
_
"
.
join
([
"
cnvpipeline
"
,
sv
.
chrom
,
sv
.
svtype
,
str
(
new_id_str
.
counter
)])
def
rename_info_field
(
sv
,
key
,
sv_dict
):
if
key
in
sv
.
record
.
info
:
info_oldid
=
sv
.
record
.
info
[
key
]
info_newid
=
[
sv_dict
[
id
]
for
id
in
info_oldid
]
sv
.
record
.
info
[
key
]
=
info_newid
def
RenameSV
(
SVSet
):
sv_dict
=
defaultdict
()
for
sv
in
SVSet
:
new_id
=
new_id_str
(
sv
)
sv
.
setNewId
(
new_id
)
sv_dict
[
sv
.
id
]
=
new_id
for
sv
in
SVSet
:
rename_info_field
(
sv
,
"
DUPLICATEOF
"
,
sv_dict
)
rename_info_field
(
sv
,
"
DUPLICATES
"
,
sv_dict
)
sv
.
record
.
info
[
'
SOURCEID
'
]
=
sv
.
id
sv
.
rename
()
This diff is collapsed.
Click to expand it.
svreader/pindel.py
+
1
−
0
View file @
093720f3
...
@@ -388,6 +388,7 @@ class PindelReader(SVReader):
...
@@ -388,6 +388,7 @@ class PindelReader(SVReader):
# # Sudmant et al 2015 SuppInfo
# # Sudmant et al 2015 SuppInfo
# return (record.length() > 60)
# return (record.length() > 60)
def
SpecificFilterPass
(
self
,
record
):
def
SpecificFilterPass
(
self
,
record
):
# new pindel specific filter
if
record
.
length
()
>=
800
or
record
.
length
()
<=
60
:
if
record
.
length
()
>=
800
or
record
.
length
()
<=
60
:
return
False
return
False
elif
record
.
MaxIndividualSupport
()
<=
3
:
elif
record
.
MaxIndividualSupport
()
<=
3
:
...
...
This diff is collapsed.
Click to expand it.
svreader/vcfwrapper.py
+
24
−
8
View file @
093720f3
...
@@ -39,6 +39,10 @@ class VCFRecord(SVRecord):
...
@@ -39,6 +39,10 @@ class VCFRecord(SVRecord):
def
id
(
self
):
def
id
(
self
):
return
self
.
record
.
id
return
self
.
record
.
id
@id.setter
def
id
(
self
,
id
):
self
.
record
.
id
=
id
@property
@property
def
pos
(
self
):
def
pos
(
self
):
return
self
.
record
.
pos
return
self
.
record
.
pos
...
@@ -51,6 +55,10 @@ class VCFRecord(SVRecord):
...
@@ -51,6 +55,10 @@ class VCFRecord(SVRecord):
def
stop
(
self
):
def
stop
(
self
):
return
self
.
record
.
stop
return
self
.
record
.
stop
@property
def
svtype
(
self
):
return
self
.
_sv_type
@property
@property
def
filter
(
self
):
def
filter
(
self
):
return
self
.
record
.
filter
return
self
.
record
.
filter
...
@@ -84,16 +92,24 @@ class VCFReader(SVReader):
...
@@ -84,16 +92,24 @@ class VCFReader(SVReader):
self
.
vcf_reader
.
header
.
info
.
remove_header
(
key
)
self
.
vcf_reader
.
header
.
info
.
remove_header
(
key
)
def
addInfo
(
self
,
name
,
number
,
type
,
description
):
def
addInfo
(
self
,
name
,
number
,
type
,
description
):
self
.
vcf_reader
.
header
.
info
.
add
(
id
=
name
,
# we add info only if absent from the current header
number
=
number
,
if
name
in
self
.
vcf_reader
.
header
.
info
:
type
=
type
,
return
description
=
description
)
else
:
self
.
vcf_reader
.
header
.
info
.
add
(
id
=
name
,
number
=
number
,
type
=
type
,
description
=
description
)
def
addFilter
(
self
,
name
,
description
):
def
addFilter
(
self
,
name
,
description
):
self
.
vcf_reader
.
header
.
filters
.
add
(
id
=
name
,
# we add filter info only if absent from the current header
number
=
None
,
if
name
in
self
.
vcf_reader
.
header
.
filters
:
type
=
None
,
return
description
=
description
)
else
:
self
.
vcf_reader
.
header
.
filters
.
add
(
id
=
name
,
number
=
None
,
type
=
None
,
description
=
description
)
def
getOrderedSamples
(
self
):
def
getOrderedSamples
(
self
):
samples
=
self
.
vcf_reader
.
header
.
samples
samples
=
self
.
vcf_reader
.
header
.
samples
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment