Newer
Older
from collections import defaultdict
import numpy as np
from math import isnan
from svrunner_utils import eprint, warn, vcf_to_pybed
from svreader.vcfwrapper import VCFRecord, VCFReader, VCFWriter
VALID_GENOTYPES = [HOM_REF, HET_VAR, HOM_VAR]
SVTYPER_GENO_LIKELI_TAG = "GL"
GENOMESTRIP_GENO_LIKELI_TAG = "GP"
Geno_likeli_tag = SVTYPER_GENO_LIKELI_TAG
def coded_geno(geno):
if geno == HOM_REF:
return "HOMO_REF"
elif geno == HET_VAR:
return "HET_VAR"
elif geno == HOM_VAR:
return "HOMO_VAR"
else:
return "UNDEF"
def Variant(sample):
if sample.get('GT') in [HET_VAR, HOM_VAR]:
return True
else:
return False
class AnnotateRecord(VCFRecord):
"""
A lightweight object to annotated the final records
"""
def __init__(self, record):
"""
A pysam VariantRecord wrapper
"""
super(AnnotateRecord, self).__init__(record)
self.new_id = None
@property
def start(self):
return self.record.pos
@property
def end(self):
return self.record.stop
@property
def svtype(self):
return self._sv_type
@property
def svlen(self):
return abs(self.stop - self.start)
def passed(self):
if "PASS" in self.record.filter:
return True
else:
return False
@property
def num_samples(self):
return len(self.record.samples.keys())
self.new_id = identifier
def rename(self):
try:
self.record.info['SOURCEID'] = self.id
except KeyError:
eprint("SOURCEID absent from record info keys,")
sys.exit(1)
self.id = self.new_id
return sum([Variant(s) for s in self.samples.values()])
return max([s.get('AO')[0] for s in self.samples.values()])
return sum([s.get('SQ') for s in self.samples.values()])
def GQ_sum_score(self):
return sum([s.get('GQ') for s in self.samples.values()])
def maxGQ(self):
return max([s.get('GQ') for s in self.samples.values()])
def setQual(self):
self.record.qual = self.qual()
def numdiffGenotypes(self):
genotypes = defaultdict()
for s in self.samples.values():
if s.get('GT') in VALID_GENOTYPES:
genotypes[coded_geno(s.get('GT'))] = 1
return len(genotypes.keys())
def Polymorph(self):
return self.numdiffGenotypes() > 1
def AddSupportInfos(self):
supp_reads = self.variant_read_support()
num_supp_samples = self.num_variant_samples()
try:
self.record.info['MAX_SUPP_READS'] = supp_reads
self.record.info['NUM_SUPP_SAMPLES'] = num_supp_samples
except KeyError:
eprint("SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys")
sys.exit(1)
def CallRate(self, cutoff):
num_qual_call = sum([(s.get('GQ') > cutoff) for s in self.samples.values()])
return num_qual_call/self.num_samples
def VariantCallRate(self, cutoff):
num_qual_call = sum([(s.get('GQ') > cutoff) for s in self.samples.values() if Variant(s)])
return num_qual_call/self.num_variant_samples()
def UnifiedPass(self):
"""
All records passing the filters (PASS, .) ar now labelled PASS
"""
record = self.record
filters = [f for f in record.filter]
# We make the assumption when a "." is present no other filter
# are present
if len(filters) == 0 or "." in filters:
record.filter.clear()
record.filter.add("PASS")
class AnnotateReader(VCFReader):
def __init__(self, file_name, sv_to_report=None):
super(AnnotateReader, self).__init__(file_name)
self.add_annotation_metadata()
def __iter__(self):
return self
def __next__(self):
while True:
raw_record = next(self.vcf_reader)
record = AnnotateRecord(raw_record)
return record
def getHeader(self):
return self.vcf_reader.header
# def addInfo(self, name, number, type, description):
# self.vcf_reader.header.info.add(id=name,
# number=number,
# type=type,
# description=description)
#
# def addFilter(self, name, description):
# self.vcf_reader.header.filters.add(id=name,
# number=None,
# type=None,
# description=description)
def add_annotation_metadata(self):
self.addInfo("SOURCEID", 1, "String",
"The source sv identifier")
self.addInfo("MAX_SUPP_READS", 1, "Integer",
"Max number of supporting reads")
self.addInfo("NUM_SUPP_SAMPLES", 1, "Integer",
"Number of supporting samples")
self.addFilter("LOWSUPPORT",
"total supp reads < 5 or supp samples < 2")
def getOrderedSamples(self):
samples = self.vcf_reader.header.samples
sample_names = [sample.rsplit('.')[0] for sample in samples]
return sample_names
def numSamples(self):
return len(self.vcf_reader.header.samples)
class AnnotateWriter(VCFWriter):
def __init__(self, file_name, template_reader, index=True):
super(VCFWriter, self).__init__(file_name,
template_reader.tool_name,
template_reader)
def _open(self):
self.vcf_writer = VariantFile(self.filename, 'w',
header=self.template_reader.getHeader())
self._isopen = True
def _write(self, record):
self.vcf_writer.write(record.record)
def _close(self):
if self._isopen:
self.vcf_writer.close()
else: # nothing was written
self._dumpemptyvcf()
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
def ordered(a, b):
# simply ordering the two string
return (a, b) if a < b else (b, a)
def setLikeliTag(genotyper):
global Geno_likeli_tag
if genotyper == "svtyper":
Geno_likeli_tag = SVTYPER_GENO_LIKELI_TAG
warn("Assuming genotypes provided by svtyper software" +
" hence tag is %s" % (Geno_likeli_tag))
elif genotyper == "genomestrip":
Geno_likeli_tag = GENOMESTRIP_GENO_LIKELI_TAG
warn("Assuming genotypes provided by genomestrip software" +
" hence tag is %s" % (Geno_likeli_tag))
else:
print("Unknown genotyping software")
exit(1)
def getlikelihoods(sample):
return sample.get('GL')
def probas(likelihoods):
# transform log likelihoods into likelihoods
return np.exp(np.array(likelihoods))
def getprobas(sample):
# Transforming likelihods into probabilities
return probas(getlikelihoods(sample))/np.sum(probas(getlikelihoods(sample)))
def ondiagonal(u_s, v_s):
# Probability that, for that individual, the two SVs are identically
# genotyped P1(0/0)*P2(0/0) + ... P1(1/1)*P2(1/1)
# in the same way :
p = getprobas(u_s)
q = getprobas(v_s)
proba = 0
for a, b in zip(p, q):
proba += a*b
# print("Proba on %3.5f" %(proba))
return proba
def offdiagonal(u_s, v_s):
# Probability that, for that individual, the two SVs are not identically
# in the same way, complement of the previous one
p = getprobas(u_s)
q = getprobas(v_s)
proba = 0
for i, a in enumerate(p):
for j, b in enumerate(q):
if i != j:
proba += a*b
# print("Proba off %3.2f" %(proba))
return proba
def duplicatescore(u, v):
# For the two SVs, max discordant genotype log-ratio proba
# same genotypes against discordant against (worse individual)
# u_s is the sample of id s of u (idem for v_s)
# Valid samples
valid_samples = []
for s in u.samples:
u_s = u.samples[s]
v_s = v.samples[s]
if (u_s.get('GQ') is not None
and v_s.get('GQ') is not None):
valid_samples.append(s)
max_disc = 0
computed = float('NaN')
for s in valid_samples:
# ondiago is not used, we keep it just for comprehension
# ondiago = ondiagonal(s, u, v)
u_s = u.samples[s]
v_s = v.samples[s]
offdiago = offdiagonal(u_s, v_s)
if offdiago > max_disc:
max_disc = offdiago
if max_disc > 0 and max_disc < 1:
ratio = (1-max_disc)/max_disc
computed = np.log(ratio)
return computed
def gstrength(u):
# Sum of phred-like genotype qualities provides a measure of the
# combined genotype quality of the site
# np.sum([s['GQ'] if s['GQ'] is not None else 0 for s in u.samples.values()])
return u.GQ_sum_score()
def variantstrength(u):
# maximum SQ, where SQ stands for
# Phred-scaled probability that this site is variant (non-reference)
# in this sample)
# QUAL = -10 * log(P(locus is reference in all samples)), which is
# equal to the sum of the SQ scores.
# see https://github.com/hall-lab/svtyper/issues/10
# sum([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
return u.qual()
# max([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
def getduplicates_GQ(u, v):
# select the prefered duplicate on the basis of the
# Sum of phred-like genotype qualities
# see gstrength
# returns prefered, discarded, strength of both
if gstrength(u) > gstrength(v):
return (u, v, gstrength(u), gstrength(v))
else:
return (v, u, gstrength(v), gstrength(u))
def getduplicates_QUAL(u, v):
# select the prefered duplicate on the basis of
# Phred-scaled probability that this site is a variant
# see variantstrength
# returns prefered, discarded, strength of both
if variantstrength(u) > variantstrength(v):
return (u, v, variantstrength(u), variantstrength(v))
else:
return (v, u, variantstrength(v), variantstrength(u))
def getoverlap(u, osize):
# percentage overlap given the size of the overlap
return 100*osize/u.svlen
def add_redundancy_infos_header(reader):
# Adding DUPLICATESCORE info (equivalent to GSDUPLICATESCORE)
reader.addInfo("DUPLICATESCORE", 1, "Float",
"LOD score that the events are distinct based on the "
"genotypes of the most discordant sample")
# Adding DUPLICATES info (equivalent to GSDUPLICATEOVERLAP)
reader.addInfo("DUPLICATEOVERLAP", 1, "Float",
"Highest overlap with a duplicate event")
# Adding DUPLICATEOF info (list of variant prefered to this one)
reader.addInfo("DUPLICATEOF", ".", "String",
"List of duplicate events preferred to this one")
# Adding DUPLICATES info (list of variants duplicates of this one)
reader.addInfo("DUPLICATES", ".", "String",
"List of duplicate events represented by the current sv")
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
# Adding NONDUPLICATEOVERLAP
reader.addInfo("NONDUPLICATEOVERLAP", 1, "Float",
"Amount of overlap with a non-duplicate")
def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
duplicatescore_threshold=-2,
genotyper="svtyper"):
""" Annotating duplicate candidates based on the genotype likelihoods
- genotype likelihoods can be provided by svtyper or genomestrip
"""
add_redundancy_infos_header(reader)
setLikeliTag(genotyper)
variants = defaultdict()
for sv in SVSet:
variants[sv.id] = sv
pybed_variants = vcf_to_pybed(SVSet)
self_overlap = pybed_variants.intersect(pybed_variants,
f=0.5, r=True, wo=True)
seen = defaultdict(tuple)
duplicates = defaultdict(list)
overlapping = defaultdict(tuple)
reference = defaultdict()
for o in self_overlap:
if o[3] == o[7]:
continue
a, b = ordered(o[3], o[7])
if seen[(a, b)]:
continue
seen[(a, b)] = True
u = variants[a]
v = variants[b]
score = duplicatescore(u, v)
# print("Comparing %s and %s : %3.8f" % (u.id, v.id, score))
if score > duplicatescore_threshold:
ref, dupli, s1, s2 = getduplicates_GQ(u, v)
# print("%s prefered to %s %3.8f" % (ref.id, dupli.id, score))
reference[ref] = 1
overlap_size = int(o[-1])
overlap = getoverlap(dupli, overlap_size)
if ref.maxGQ() > 0 and dupli.passed:
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
# are dismissed
# - reference variant with 0 genotype quality for all markers
# - duplicate that are already tagged as filtered out
# dupli.id is considered as a duplicate of ref.id
duplicates[dupli.id].append((ref.id, score, overlap))
else:
overlap_size = int(o[-1])
overlap_u = getoverlap(u, overlap_size)
overlap_v = getoverlap(v, overlap_size)
overlapping[(u, v)] = {'dupli_score': score,
'overlap_left': overlap_u,
'overlap_right': overlap_v,
}
for u in SVSet:
if u.id in duplicates:
print("tagged as duplicate %s" % u.id)
duplis = sorted([a for (a, s, o) in duplicates[u.id]])
score = max([s for (a, s, o) in duplicates[u.id]])
overlap = max([o for (a, s, o) in duplicates[u.id]])
add_duplicate_info_sv(u, overlap, score, duplis)
def add_duplicate_info_sv(sv, duplicateoverlap, duplicatescore, duplicates):
"""
simply adding two information to the sv infos
"""
if isnan(duplicatescore):
sv.record.info['DUPLICATESCORE'] = float('nan')
else:
sv.record.info['DUPLICATESCORE'] = duplicatescore
sv.record.info['DUPLICATEOVERLAP'] = duplicateoverlap
sv.record.info['DUPLICATEOF'] = duplicates
def add_overlap_info_sv(sv, overlap, duplicatescore):
"""
simply adding two information to the sv infos
"""
if isnan(duplicatescore):
sv.record.info['DUPLICATESCORE'] = float('nan')
else:
sv.record.info['DUPLICATESCORE'] = duplicatescore
sv.record.info['NONDUPLICATEOVERLAP'] = overlap
def add_callrate_infos_header(reader):
# Adding CALLRATE info
reader.addInfo("CALLRATE", 1, "Float",
"Percentage of samples called with a GQ>13")
# Adding VARIANTCALLRATE info
reader.addInfo("VARIANTCALLRATE", 1, "Float",
"Percentage of variant samples called with a GQ>13")
def add_filter_infos_header(reader):
# FILTERS
# Adding specific filters
reader.addFilter("CALLRATE", "Call rate <0.75")
reader.addFilter("VARIANTCALLRATE", "Variant Call rate <0.75")
reader.addFilter("MONOMORPH", "All samples have the same genotype")
reader.addFilter("DUPLICATE", "GSDUPLICATESCORE>0")
reader.addFilter("OVERLAP", "NONDUPLICATEOVERLAP>0.7")
def GenomeSTRIPLikefiltering(SVSet, reader):
""" Filtering the candidate CNVs according to the following criteria
- non duplicate sites
- variant sites
- call rate > 0.8
- at least one variant (homozygous or heterozygote) has a genotype
quality > 20
- the variant is not everywhere heterozygote or homozygote
(use NONVARIANTSCORE in both cases)
"""
add_callrate_infos_header(reader)
add_filter_infos_header(reader)
for sv in SVSet:
info = sv.record.info
sv.record.info['CALLRATE'] = sv.CallRate(13)
sv.record.info['VARIANTCALLRATE'] = sv.VariantCallRate(13)
if sv.CallRate(13) < 0.75:
sv.filter.add("CALLRATE")
if not sv.Polymorph():
sv.filter.add("MONOMORPH")
if 'NONDUPLICATEOVERLAP' in info and info['NONDUPLICATEOVERLAP'] > 0.7:
sv.filter.add("OVERLAP")
if "DUPLICATESCORE" in info is not None and info['DUPLICATESCORE'] > -2:
sv.filter.add("DUPLICATE")