Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
svlib
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
SVdetection
svlib
Commits
4645bf7c
Commit
4645bf7c
authored
5 years ago
by
Thomas Faraut
Browse files
Options
Downloads
Patches
Plain Diff
modified annotation
parent
bf4fba45
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
svfilter.py
+31
-19
31 additions, 19 deletions
svfilter.py
svreader/Annotate.py
+79
-8
79 additions, 8 deletions
svreader/Annotate.py
svreader/__init__.py
+1
-28
1 addition, 28 deletions
svreader/__init__.py
with
111 additions
and
55 deletions
svfilter.py
+
31
−
19
View file @
4645bf7c
...
...
@@ -253,12 +253,30 @@ def setLikeliTag(genotyper):
exit
(
1
)
def
GenomeSTRIPLikeRedundancyAnnotator
(
SVSet
,
duplicatescore_threshold
=-
2
,
def
add_redundancy_infos_header
(
reader
):
# Adding DUPLICATESCORE info (equivalent to GSDUPLICATESCORE)
reader
.
addInfo
(
"
DUPLICATESCORE
"
,
1
,
"
Float
"
,
"
LOD score that the events are distinct based on the
"
"
genotypes of the most discordant sample
"
)
# Adding DUPLICATES info (equivalent to GSDUPLICATEOVERLAP)
reader
.
addInfo
(
"
DUPLICATEOVERLAP
"
,
1
,
"
Float
"
,
"
Highest overlap with a duplicate event
"
)
# Adding DUPLICATEOVERLAP info (equivalent to GSDUPLICATEOVERLAP)
reader
.
addInfo
(
"
DUPLICATES
"
,
"
.
"
,
"
String
"
,
"
List of duplicate events preferred to this one
"
)
# Adding NONDUPLICATEOVERLAP
reader
.
addInfo
(
"
NONDUPLICATEOVERLAP
"
,
1
,
"
Float
"
,
"
Amount of overlap with a non-duplicate
"
)
def
GenomeSTRIPLikeRedundancyAnnotator
(
SVSet
,
reader
,
duplicatescore_threshold
=-
2
,
genotyper
=
"
svtyper
"
):
"""
Annotating duplicate candidates based on the genotype likelihoods
- genotype likelihoods can be provided by svtyper or genomestrip
"""
add_redundancy_infos_header
(
reader
)
setLikeliTag
(
genotyper
)
variants
=
defaultdict
()
...
...
@@ -344,7 +362,16 @@ def add_overlap_info_sv(sv, overlap, duplicatescore):
sv
.
record
.
info
[
'
NONDUPLICATEOVERLAP
'
]
=
overlap
def
GenomeSTRIPLikefiltering
(
SVSet
):
def
add_filter_infos_header
(
reader
):
# FILTERS
# Adding specific filters
reader
.
addFilter
(
"
CALLRATE
"
,
"
Call rate <0.75
"
)
reader
.
addFilter
(
"
MONOMORPH
"
,
"
All samples have the same genotype
"
)
reader
.
addFilter
(
"
DUPLICATE
"
,
"
GSDUPLICATESCORE>0
"
)
reader
.
addFilter
(
"
OVERLAP
"
,
"
NONDUPLICATEOVERLAP>0.7
"
)
def
GenomeSTRIPLikefiltering
(
SVSet
,
reader
):
"""
Filtering the candidate CNVs according to the following criteria
- non duplicate sites
- variant sites
...
...
@@ -355,30 +382,15 @@ def GenomeSTRIPLikefiltering(SVSet):
(use NONVARIANTSCORE in both cases)
"""
add_filter_infos_header
(
reader
)
for
sv
in
SVSet
:
info
=
sv
.
record
.
info
if
callRate
(
sv
)
<
0.75
:
sv
.
filter
.
add
(
"
CALLRATE
"
)
if
not
Polymorph
(
sv
):
sv
.
filter
.
add
(
"
MONOMORPH
"
)
# if info['GSDUPLICATESCORE'] and info['GSDUPLICATESCORE']>0:
# sv.filter.add("DUPLICATE")
# DUPLICATE
if
'
NONDUPLICATEOVERLAP
'
in
info
and
info
[
'
NONDUPLICATEOVERLAP
'
]
>
0.7
:
sv
.
filter
.
add
(
"
OVERLAP
"
)
if
"
DUPLICATESCORE
"
in
info
is
not
None
and
info
[
'
DUPLICATESCORE
'
]
>
-
2
:
sv
.
filter
.
add
(
"
DUPLICATE
"
)
# NONVARIANT
if
'
GSNONVARSCORE
'
in
info
and
info
[
'
GSNONVARSCORE
'
]
and
info
[
'
GSNONVARSCORE
'
]
>
13
:
sv
.
filter
.
add
(
"
NONVARIANT
"
)
# GTDEPTH
if
'
GSDEPTHRATIO
'
in
info
and
(
info
[
'
GSDEPTHRATIO
'
]
is
None
or
info
[
'
GSDEPTHRATIO
'
]
>
0.8
):
sv
.
filter
.
add
(
"
DEPTH
"
)
if
'
DEPTHRATIO
'
in
info
and
(
isnan
(
info
[
'
DEPTHRATIO
'
])
or
info
[
'
DEPTHRATIO
'
]
>
0.8
):
sv
.
filter
.
add
(
"
DDEPTH
"
)
if
'
DEPTHCALLTHRESHOLD
'
in
info
and
info
[
'
DEPTHCALLTHRESHOLD
'
]
>
1
:
sv
.
filter
.
add
(
"
CCOVERAGE
"
)
# CLUSTERSEP
if
'
GSM1
'
in
info
and
(
info
[
'
GSM1
'
]
<=
-
0.5
or
info
[
'
GSM1
'
]
>=
2.0
):
sv
.
filter
.
add
(
"
CLUSTERSEP
"
)
This diff is collapsed.
Click to expand it.
svreader/Annotate.py
+
79
−
8
View file @
4645bf7c
from
svreader
import
SVRecord
,
SVReader
,
SVWriter
import
sys
from
svrunner_utils
import
eprint
from
svreader
import
SVReader
,
SVWriter
from
pysam
import
VariantFile
...
...
@@ -8,6 +9,7 @@ HOM_REF = (0, 0)
HET_VAR
=
(
0
,
1
)
HOM_VAR
=
(
1
,
1
)
class
AnnotatedRecord
(
object
):
"""
A lightweight object to annotated the final records
...
...
@@ -56,7 +58,49 @@ class AnnotatedRecord(object):
@property
def
samples
(
self
):
return
self
.
record
.
samples
return
[
AnnotatedRecordSample
(
s
)
for
s
in
self
.
record
.
samples
.
values
()]
def
setNewId
(
self
,
identifier
):
try
:
self
.
record
.
info
[
'
SOURCEID
'
]
=
self
.
id
except
KeyError
:
eprint
(
"
SOURCEID absent from record info keys,
"
)
sys
.
exit
(
1
)
self
.
id
=
identifier
def
num_variant_samples
(
self
):
return
sum
([
s
.
isVariant
for
s
in
self
.
samples
])
def
variant_read_support
(
self
):
return
sum
([
s
.
AltSupport
()
for
s
in
self
.
samples
])
def
qual
(
self
):
return
sum
([
s
.
SQ_score
()
for
s
in
self
.
samples
if
s
.
isVariant
])
def
setQual
(
self
):
self
.
record
.
qual
=
self
.
qual
()
def
AddSupportInfos
(
self
):
supp_reads
=
self
.
variant_read_support
()
num_supp_samples
=
self
.
num_variant_samples
()
try
:
self
.
record
.
info
[
'
SUPP_READS
'
]
=
supp_reads
self
.
record
.
info
[
'
NUM_SUPP_SAMPLES
'
]
=
num_supp_samples
except
KeyError
:
eprint
(
"
SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys
"
)
sys
.
exit
(
1
)
def
UnifiedPass
(
self
):
"""
All records passing the filters (PASS, .) ar now labelled PASS
"""
record
=
self
.
record
filters
=
[
f
for
f
in
record
.
filter
]
# We make the assumption when a "." is present no other filter
# are present
if
len
(
filters
)
==
0
or
"
.
"
in
filters
:
record
.
filter
.
clear
()
record
.
filter
.
add
(
"
PASS
"
)
class
AnnotatedRecordSample
(
object
):
...
...
@@ -66,25 +110,41 @@ class AnnotatedRecordSample(object):
def
__init__
(
self
,
variantsample
):
self
.
variantsample
=
variantsample
def
genotype
(
self
):
return
self
.
variantsample
.
get
(
'
GT
'
)
def
SQ_score
(
self
):
return
self
.
variantsample
.
get
(
'
SQ
'
)
def
AltSupport
(
self
):
return
self
.
variantsample
.
get
(
'
AO
'
)[
0
]
@property
def
isVariant
(
self
):
if
self
.
genotype
()
in
[
HET_VAR
,
HOM_VAR
]:
return
True
else
:
return
False
class
VCFReader
(
SVReader
):
def
__init__
(
self
,
file_name
,
sv_to_report
=
None
):
super
(
VCFReader
,
self
).
__init__
(
file_name
,
generic_name
,
reference_handle
)
super
(
VCFReader
,
self
).
__init__
(
file_name
)
self
.
vcf_reader
=
VariantFile
(
file_name
)
self
.
filename
=
file_name
self
.
sv_to_report
=
sv_to_report
self
.
sv_to_report
=
sv_to_report
self
.
vcf_reader
=
VariantFile
(
file_name
)
self
.
add_metadata
()
def
__iter__
(
self
):
return
self
def
__next__
(
self
):
while
True
:
raw_record
=
next
(
self
.
vcf_reader
)
record
=
(
raw_record
)
record
=
AnnotatedRecord
(
raw_record
)
return
record
def
getHeader
(
self
):
...
...
@@ -102,6 +162,17 @@ class VCFReader(SVReader):
type
=
None
,
description
=
description
)
def
add_metadata
(
self
):
self
.
addInfo
(
"
SOURCEID
"
,
1
,
"
String
"
,
"
The source sv identifier
"
)
self
.
addInfo
(
"
SUPP_READS
"
,
1
,
"
Integer
"
,
"
Number of supporting reads
"
)
self
.
addInfo
(
"
NUM_SUPP_SAMPLES
"
,
1
,
"
Integer
"
,
"
Number of supporting samples
"
)
self
.
addFilter
(
"
LOWSUPPORT
"
,
"
total supp reads < 5 or supp samples < 2
"
)
def
getOrderedSamples
(
self
):
samples
=
self
.
vcf_reader
.
header
.
samples
sample_names
=
[
sample
.
rsplit
(
'
.
'
)[
0
]
for
sample
in
samples
]
...
...
This diff is collapsed.
Click to expand it.
svreader/__init__.py
+
1
−
28
View file @
4645bf7c
...
...
@@ -228,34 +228,7 @@ class SVReader(object):
if
excluded_regions
is
not
None
:
vcf_records
=
self
.
_filter_for_gaps
(
vcf_records
,
excluded_regions
,
cutoff_proximity
)
# Convert vcf to intervals
# eprint("Converting to bedtools intervals")
# variants = vcf_to_pybed(vcf_records, excluded_regions,
# cutoff_proximity)
#
# # Identify SV overlapping regions to exclude
# eprint("Loading exlusion intervals ")
# eprint(excluded_regions)
# toexclude = BedTool(excluded_regions).sort()
# excluded = dict()
# if len(toexclude):
# eprint("Computing distance with gaps: %d gaps" % len(toexclude))
# # removing cnv in a close proximity to gaps in the assembly
# # -d=True In addition to the closest feature in B,
# # report its distance to A as an extra column
# # TODO this is questionnable ??? (remove this for genomestrip ?)
# for sv in variants.closest(toexclude, d=True):
# if (sv[4] != "."
# and sv[4] != "-1"
# and float(sv[-1]) < cutoff_proximity):
# excluded[sv[3]] = 1
#
# vcf_records_filtered = []
# for record in vcf_records:
# if record.id not in excluded:
# vcf_records_filtered.append(record)
# vcf_records = vcf_records_filtered
return
vcf_records
def
GeneralFilterPass
(
self
,
record
,
minlen
,
maxlen
):
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment