Newer
Older
from collections import defaultdict
from networkx import Graph, connected_components
import numpy as np
from math import isnan
from svrunner_utils import eprint, warn, vcf_to_pybed
from svreader.vcfwrapper import VCFRecord, VCFReader, VCFWriter
VALID_GENOTYPES = [HOM_REF, HET_VAR, HOM_VAR]
SVTYPER_GENO_LIKELI_TAG = "GL"
GENOMESTRIP_GENO_LIKELI_TAG = "GP"
Geno_likeli_tag = SVTYPER_GENO_LIKELI_TAG
def coded_geno(geno):
if geno == HOM_REF:
return "HOMO_REF"
elif geno == HET_VAR:
return "HET_VAR"
elif geno == HOM_VAR:
return "HOMO_VAR"
else:
return "UNDEF"
def Variant(sample):
if sample.get('GT') in [HET_VAR, HOM_VAR]:
return True
else:
return False
def Heterozygote(sample):
if sample.get('GT') in [HET_VAR]:
return True
else:
return False
class AnnotateRecord(VCFRecord):
"""
A lightweight object to annotated the final records
"""
def __init__(self, record):
"""
A pysam VariantRecord wrapper
"""
super(AnnotateRecord, self).__init__(record)
self.new_id = None
@property
def start(self):
return self.record.pos
@property
def end(self):
return self.record.stop
@property
def svtype(self):
return self._sv_type
@property
def svlen(self):
return abs(self.stop - self.start)
def passed(self):
if "PASS" in self.record.filter:
return True
else:
return False
@property
def num_samples(self):
return len(self.record.samples.keys())
def setNewId(self, new_id):
self.new_id = new_id
def rename(self):
try:
self.record.info['SOURCEID'] = self.id
except KeyError:
eprint("SOURCEID absent from record info keys,")
sys.exit(1)
self.id = self.new_id
return sum([Variant(s) for s in self.samples.values()])
support = []
for s in self.samples.values():
if s.get('AO') is not None:
support.append(s.get('AO')[0])
return max(support)
variant_qual = []
for s in self.samples.values():
if s.get('SQ') is not None:
variant_qual.append(s.get('SQ'))
return sum(variant_qual)
def GQ_samples(self):
genotype_qual = []
for s in self.samples.values():
if s.get('GQ') is not None:
genotype_qual.append(s.get('GQ'))
return genotype_qual
def GQ_sum_score(self):
def maxGQ(self):
def numdiffGenotypes(self):
genotypes = defaultdict()
for s in self.samples.values():
if s.get('GT') in VALID_GENOTYPES:
genotypes[coded_geno(s.get('GT'))] = 1
return len(genotypes.keys())
return self.numdiffGenotypes() > 1
supp_reads = self.variant_read_support()
num_supp_samples = self.num_variant_samples()
#print(supp_reads, num_supp_samples)
self.record.info['MAX_SUPP_READS'] = supp_reads
self.record.info['NUM_SUPP_SAMPLES'] = num_supp_samples
except KeyError:
eprint("SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys")
sys.exit(1)
call_qual = []
for s in self.samples.values():
if s.get('GQ') is not None:
call_qual.append(s.get('GQ'))
num_qual_call = sum([(qual > cutoff) for qual in call_qual])
samples = self.samples.values()
num_qual_var = 0
for s in samples:
if s.get("GQ") is not None and s.get("GQ") > cutoff and Variant(s):
num_qual_var += 1
num_var_samples = self.num_variant_samples()
var_call_rate = num_qual_var / num_var_samples if num_var_samples else 0
"""
All records passing the filters (PASS, .) ar now labelled PASS
"""
record = self.record
filters = [f for f in record.filter]
# We make the assumption when a "." is present no other filter
# are present
class AnnotateReader(VCFReader):
def __init__(self, file_name, sv_to_report=None):
super(AnnotateReader, self).__init__(file_name)
self.add_annotation_metadata()
def __iter__(self):
return self
def __next__(self):
while True:
raw_record = next(self.vcf_reader)
record = AnnotateRecord(raw_record)
return record
def getHeader(self):
return self.vcf_reader.header
def add_annotation_metadata(self):
self.addInfo("SOURCEID", 1, "String",
"The source sv identifier")
self.addInfo("MAX_SUPP_READS", 1, "Integer",
"Max number of supporting reads")
self.addInfo("NUM_SUPP_SAMPLES", 1, "Integer",
"Number of supporting samples")
self.addFilter("LOWSUPPORT",
"total supp reads < 5 or supp samples < 2")
def getOrderedSamples(self):
samples = self.vcf_reader.header.samples
sample_names = [sample.rsplit('.')[0] for sample in samples]
return sample_names
def numSamples(self):
return len(self.vcf_reader.header.samples)
class AnnotateWriter(VCFWriter):
def __init__(self, file_name, template_reader, index=True):
super(VCFWriter, self).__init__(file_name,
template_reader.tool_name,
template_reader)
def _open(self):
self.vcf_writer = VariantFile(self.filename, 'w',
header=self.template_reader.getHeader())
self._isopen = True
def _write(self, record):
self.vcf_writer.write(record.record)
def _close(self):
if self._isopen:
self.vcf_writer.close()
else: # nothing was written
self._dumpemptyvcf()
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
def ordered(a, b):
# simply ordering the two string
return (a, b) if a < b else (b, a)
def setLikeliTag(genotyper):
global Geno_likeli_tag
if genotyper == "svtyper":
Geno_likeli_tag = SVTYPER_GENO_LIKELI_TAG
warn("Assuming genotypes provided by svtyper software" +
" hence tag is %s" % (Geno_likeli_tag))
elif genotyper == "genomestrip":
Geno_likeli_tag = GENOMESTRIP_GENO_LIKELI_TAG
warn("Assuming genotypes provided by genomestrip software" +
" hence tag is %s" % (Geno_likeli_tag))
else:
print("Unknown genotyping software")
exit(1)
def getlikelihoods(sample):
return sample.get('GL')
def probas(likelihoods):
# transform log likelihoods into likelihoods
return np.exp(np.array(likelihoods))
def getprobas(sample):
# Transforming likelihods into probabilities
return probas(getlikelihoods(sample)) / np.sum(probas(getlikelihoods(sample)))
def ondiagonal(u_s, v_s):
# Probability that, for that individual, the two SVs are identically
# genotyped P1(0/0)*P2(0/0) + ... P1(1/1)*P2(1/1)
# in the same way :
p = getprobas(u_s)
q = getprobas(v_s)
proba = 0
for a, b in zip(p, q):
# print("Proba on %3.5f" %(proba))
return proba
def offdiagonal(u_s, v_s):
# Probability that, for that individual, the two SVs are not identically
# in the same way, complement of the previous one
p = getprobas(u_s)
q = getprobas(v_s)
proba = 0
for i, a in enumerate(p):
for j, b in enumerate(q):
if i != j:
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
# print("Proba off %3.2f" %(proba))
return proba
def duplicatescore(u, v):
# For the two SVs, max discordant genotype log-ratio proba
# same genotypes against discordant against (worse individual)
# u_s is the sample of id s of u (idem for v_s)
# Valid samples
valid_samples = []
for s in u.samples:
u_s = u.samples[s]
v_s = v.samples[s]
if (u_s.get('GQ') is not None
and v_s.get('GQ') is not None):
valid_samples.append(s)
max_disc = 0
computed = float('NaN')
for s in valid_samples:
# ondiago is not used, we keep it just for comprehension
# ondiago = ondiagonal(s, u, v)
u_s = u.samples[s]
v_s = v.samples[s]
offdiago = offdiagonal(u_s, v_s)
if offdiago > max_disc:
max_disc = offdiago
if max_disc > 0 and max_disc < 1:
computed = np.log(ratio)
return computed
def gstrength(u):
"""
Sum of phred-like genotype qualities provides a measure of the
combined genotype quality of the site
np.sum([s['GQ'] if s['GQ'] is not None else 0 for s in u.samples.values()])
"""
return u.GQ_sum_score()
def variantstrength(u):
"""
maximum SQ, where SQ stands for
Phred-scaled probability that this site is variant (non-reference)
in this sample)
QUAL = -10 * log(P(locus is reference in all samples)), which is
equal to the sum of the SQ scores.
see https://github.com/hall-lab/svtyper/issues/10
sum([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
"""
return u.qual()
# max([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
def getduplicates_GQ(u, v):
"""
select the prefered duplicate on the basis of the
Sum of phred-like genotype qualities
see gstrength
returns prefered, discarded, strength of both
"""
if gstrength(u) > gstrength(v):
return (u, v, gstrength(u), gstrength(v))
else:
return (v, u, gstrength(v), gstrength(u))
def getduplicates_QUAL(u, v):
"""
select the prefered duplicate on the basis of
Phred-scaled probability that this site is a variant
see variantstrength
returns prefered, discarded, strength of both
"""
if variantstrength(u) > variantstrength(v):
return (u, v, variantstrength(u), variantstrength(v))
else:
return (v, u, variantstrength(v), variantstrength(u))
def getoverlap(u, osize):
# percentage overlap given the size of the overlap
def add_redundancy_infos_header(reader):
# Adding DUPLICATESCORE info (equivalent to GSDUPLICATESCORE)
reader.addInfo("DUPLICATESCORE", 1, "Float",
"LOD score that the events are distinct based on the "
"genotypes of the most discordant sample")
# Adding DUPLICATES info (equivalent to GSDUPLICATEOVERLAP)
reader.addInfo("DUPLICATEOVERLAP", 1, "Float",
"Highest overlap with a duplicate event")
# Adding DUPLICATEOF info (list of variant prefered to this one)
reader.addInfo("DUPLICATEOF", ".", "String",
"List of duplicate events preferred to this one")
# Adding DUPLICATES info (list of variants duplicates of this one)
reader.addInfo("DUPLICATES", ".", "String",
"List of duplicate events represented by the current sv")
# Adding NONDUPLICATEOVERLAP
reader.addInfo("NONDUPLICATEOVERLAP", 1, "Float",
"Amount of overlap with a non-duplicate")
# Adding TOOLSUPPORT
reader.addInfo("TOOLSUPPORT", ".", "String",
"Tools supporting (detecting) the sv")
overlap_cutoff,
duplicatescore_threshold=-2,
genotyper="svtyper"):
""" Annotating duplicate candidates based on the genotype likelihoods
- genotype likelihoods can be provided by svtyper or genomestrip
"""
add_redundancy_infos_header(reader)
setLikeliTag(genotyper)
variants = defaultdict()
for sv in SVSet:
variants[sv.id] = sv
pybed_variants = vcf_to_pybed(SVSet)
self_overlap = pybed_variants.intersect(pybed_variants,
seen = defaultdict(tuple)
duplicates = defaultdict(list)
overlapping = defaultdict(tuple)
reference = defaultdict()
for o in self_overlap:
if o[3] == o[7]:
continue
a, b = ordered(o[3], o[7])
if seen[(a, b)]:
continue
seen[(a, b)] = True
u = variants[a]
v = variants[b]
score = duplicatescore(u, v)
# print("Comparing %s and %s : %3.8f" % (u.id, v.id, score))
if score > duplicatescore_threshold:
# print("%s prefered to %s %3.8f" % (ref.id, dupli.id, score))
reference[ref] = 1
overlap_size = int(o[-1])
overlap = getoverlap(dupli, overlap_size)
if ref.maxGQ() > 0 and dupli.passed:
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
# are dismissed
# - reference variant with 0 genotype quality for all markers
# - duplicate that are already tagged as filtered out
# dupli.id is considered as a duplicate of ref.id
duplicates[dupli.id].append((ref.id, score, overlap))
else:
overlap_size = int(o[-1])
overlap_u = getoverlap(u, overlap_size)
overlap_v = getoverlap(v, overlap_size)
overlapping[(u, v)] = {'dupli_score': score,
'overlap_left': overlap_u,
'overlap_right': overlap_v,
}
for u in SVSet:
if u.id in duplicates:
print("tagged as duplicate %s" % u.id)
duplis = sorted([a for (a, s, o) in duplicates[u.id]])
score = max([s for (a, s, o) in duplicates[u.id]])
overlap = max([o for (a, s, o) in duplicates[u.id]])
add_duplicate_info_sv(u, overlap, score, duplis)
def add_duplicate_info_sv(sv, duplicateoverlap, duplicatescore, duplicates):
"""
simply adding two information to the sv infos
"""
if isnan(duplicatescore):
sv.record.info['DUPLICATESCORE'] = float('nan')
else:
sv.record.info['DUPLICATESCORE'] = duplicatescore
sv.record.info['DUPLICATEOVERLAP'] = duplicateoverlap
sv.record.info['DUPLICATEOF'] = duplicates
def add_overlap_info_sv(sv, overlap, duplicatescore):
"""
simply adding two information to the sv infos
"""
if isnan(duplicatescore):
sv.record.info['DUPLICATESCORE'] = float('nan')
else:
sv.record.info['DUPLICATESCORE'] = duplicatescore
sv.record.info['NONDUPLICATEOVERLAP'] = overlap
def add_callrate_infos_header(reader):
# Adding CALLRATE info
reader.addInfo("CALLRATE", 1, "Float",
"Percentage of samples called with a GQ>13")
# Adding VARIANTCALLRATE info
reader.addInfo("VARIANTCALLRATE", 1, "Float",
"Percentage of variant samples called with a GQ>13")
def add_filter_infos_header(reader):
# FILTERS
# Adding specific filters
reader.addFilter("CALLRATE", "Call rate <0.75")
reader.addFilter("VARIANTCALLRATE", "Variant Call rate <0.75")
reader.addFilter("MONOMORPH", "All samples have the same genotype")
reader.addFilter("DUPLICATE", "GSDUPLICATESCORE>0")
reader.addFilter("OVERLAP", "NONDUPLICATEOVERLAP>0.7")
reader.addFilter("ABFREQ", "AB frequency <0.3 for >50% heterosamples")
def variant_filtration(variant_set, reader, filter_monomorph=False,
filter_callrate=False):
""" Filtering the candidate CNVs according to the following criteria
- non duplicate sites
- variant sites
- call rate > 0.8
- at least one variant (homozygous or heterozygote) has a genotype
quality > 20
- the variant is not everywhere heterozygote or homozygote
(use NONVARIANTSCORE in both cases)
"""
add_callrate_infos_header(reader)
add_filter_infos_header(reader)
sv.record.info['CALLRATE'] = sv.call_rate(13)
sv.record.info['VARIANTCALLRATE'] = sv.variant_call_rate(13)
if sv.call_rate(13) < 0.75 and filter_callrate:
sv.filter.add("CALLRATE")
if not sv.polymorph() and filter_monomorph:
sv.filter.add("MONOMORPH")
if 'NONDUPLICATEOVERLAP' in info and info['NONDUPLICATEOVERLAP'] > 0.8:
sv.filter.add("OVERLAP")
if "DUPLICATESCORE" in info is not None and info['DUPLICATESCORE'] > -2:
sv.filter.add("DUPLICATE")
""" Filtering the candidate CNVs according to the following criteria
- more than 50% of variant samples should have AB freq > 0.3
"""
for sv in variant_set:
valid_AB_freq = []
for s in sv.record.samples.values():
if Heterozygote(s):
valid_AB_freq.append((s.get('AB')[0] > 0.3))
if (len(valid_AB_freq) > 0 and
sum(valid_AB_freq) < len(valid_AB_freq) / 2):
"""
Construct connected components of duplicates and rename the variants
"""
undirected = Graph()
variant_dict = defaultdict()
representatives = defaultdict()
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
variant_dict[s.id] = s
if "DUPLICATE" in s.filter:
for dupli_repr in s.record.info["DUPLICATEOF"]:
undirected.add_edge(s.id, dupli_repr)
for component in connected_components(undirected):
for c in component:
if "DUPLICATEOF" in variant_dict[c].record.info:
# the current variant is a duplicate
continue
rep = c # the representative of the equivalence class
break
duplicates = component
duplicates.remove(rep)
representatives[rep] = duplicates
add_duplicate_infos(representatives, variant_dict)
def add_duplicate_infos(representatives, sv_dict):
for rep, elements in representatives.items():
for d in elements:
sv_dict[d].record.info['DUPLICATEOF'] = rep
duplicates = list(elements)
if 'DUPLICATES' in sv_dict[rep].record.info:
print(sv_dict[rep].record.info['DUPLICATES'])
duplicates.extend(sv_dict[rep].record.info['DUPLICATES'])
if len(duplicates) > 0:
sv_dict[rep].record.info['DUPLICATES'] = duplicates
def get_tool_name(sv_ident):
return sv_ident.split("_")[0]
def set_supporting_tools(variant_set):
for sv in variant_set:
tools = {get_tool_name(sv.id)}
if "DUPLICATES" in sv.record.info:
duplicates = sv.record.info['DUPLICATES']
# print(duplicates)
for dupli in duplicates:
tools.add(get_tool_name(dupli))
if 'TOOLSUPPORT' in sv.record.info:
supporting = set(sv.record.info['TOOLSUPPORT'])
sv.record.info['TOOLSUPPORT'] = list(tools)
def static_vars(**kwargs):
def decorate(func):
for k in kwargs:
setattr(func, k, kwargs[k])
return func
return decorate
@static_vars(counter=0)
def new_id_str(sv):
new_id_str.counter += 1
return "_".join(["cnvpipeline", sv.chrom, sv.svtype,
str(new_id_str.counter)])
def rename_info_field(sv, key, sv_dict):
if key in sv.record.info:
info_oldid = sv.record.info[key]
info_newid = [sv_dict[id] for id in info_oldid]
sv.record.info[key] = info_newid
sv_dict = defaultdict()
for sv in SVSet:
new_id = new_id_str(sv)
sv.setNewId(new_id)
sv_dict[sv.id] = new_id
for sv in SVSet:
rename_info_field(sv, "DUPLICATEOF", sv_dict)
rename_info_field(sv, "DUPLICATES", sv_dict)
sv.record.info['SOURCEID'] = sv.id
sv.rename()