import sys
from collections import defaultdict
from math import isnan

import numpy as np
from pysam import VariantFile

from svrunner_utils import eprint, warn, vcf_to_pybed
from svreader.vcfwrapper import VCFRecord, SVReader, SVWriter

# Genotype likelihood FORMAT tags used by the supported genotypers
SVTYPER_GENO_LIKELI_TAG = "GL"
GENOMESTRIP_GENO_LIKELI_TAG = "GP"
Geno_likeli_tag = SVTYPER_GENO_LIKELI_TAG

# Genotype tuples as returned by pysam's sample.get('GT')
# (assumed definitions: these constants are used below but not defined here)
HET_VAR = (0, 1)
HOM_VAR = (1, 1)
def Variant(sample):
    # True if the sample carries at least one alternate allele
    return sample.get('GT') in [HET_VAR, HOM_VAR]
class VariantRecord(VCFRecord):
    """
    A lightweight object used to annotate the final records
    """
    def __init__(self, record):
        """
        A pysam VariantRecord wrapper
        """
        super(VariantRecord, self).__init__(record)
    @property
    def start(self):
        return self.record.pos

    @property
    def end(self):
        return self.record.stop

    @property
    def svtype(self):
        return self._sv_type

    @property
    def svlen(self):
        return abs(self.end - self.start)
    def passed(self):
        return "PASS" in self.record.filter
    def setNewId(self, identifier):
        # Keep the original identifier in SOURCEID before renaming the record
        try:
            self.record.info['SOURCEID'] = self.id
        except KeyError:
            eprint("SOURCEID absent from record info keys")
            sys.exit(1)
        self.id = identifier
    def num_variant_samples(self):
        # Number of samples carrying at least one alternate allele
        return sum([Variant(s) for s in self.samples.values()])

    def variant_read_support(self):
        # Maximum number of reads supporting the alternate allele (AO)
        # across samples
        return max([s.get('AO')[0] for s in self.samples.values()])

    def qual(self):
        # QUAL = sum of the per-sample SQ scores
        # (see https://github.com/hall-lab/svtyper/issues/10)
        return sum([s.get('SQ') for s in self.samples.values()])

    def GQ_sum_score(self):
        # Sum of the per-sample genotype qualities (GQ)
        return sum([s.get('GQ') if s.get('GQ') is not None else 0
                    for s in self.samples.values()])
def setQual(self):
self.record.qual = self.qual()
    def AddSupportInfos(self):
        supp_reads = self.variant_read_support()
        num_supp_samples = self.num_variant_samples()
        try:
            self.record.info['MAX_SUPP_READS'] = supp_reads
            self.record.info['NUM_SUPP_SAMPLES'] = num_supp_samples
        except KeyError:
            eprint("MAX_SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys")
            sys.exit(1)
    def UnifiedPass(self):
        """
        All records passing the filters (PASS, .) are now labelled PASS
        """
        record = self.record
        filters = [f for f in record.filter]
        # We assume that when "." is present no other filter is present
        if len(filters) == 0 or "." in filters:
            record.filter.clear()
            record.filter.add("PASS")
class VariantRecordSample(object):
    """
    A lightweight wrapper around a pysam VariantRecordSample
    """
    def __init__(self, variantsample):
        self.variantsample = variantsample

    def genotype(self):
        return self.variantsample.get('GT')
    def SQ_score(self):
        # Phred-scaled probability that this site is variant
        # (non-reference) in this sample
        return self.variantsample.get('SQ')
def GQ_score(self):
# Genotype quality
return self.variantsample.get('GQ')
def GL_score(self):
# Genotype Likelihood, log10-scaled likelihoods of the data given the
# called genotype for each possible genotype generated from the
# reference and alternate alleles given the sample ploidy
return self.variantsample.get('GL')
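    # For a diploid sample at a biallelic site, the VCF specification orders
    # the GL values as (0/0, 0/1, 1/1). Illustrative values (assumed, not
    # taken from real data): GL = (-0.05, -3.0, -9.0) means the
    # homozygous-reference genotype is by far the most likely.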
    def AltSupport(self):
        # Number of reads supporting the alternate allele (first ALT)
        return self.variantsample.get('AO')[0]

    @property
    def isVariant(self):
        return self.genotype() in [HET_VAR, HOM_VAR]
class VCFReader(SVReader):
    """
    A pysam-based VCF reader
    """
    # NOTE: the class statement, base-class call and __init__ body are
    # assumptions; only the methods below are certain
    def __init__(self, file_name, sv_to_report=None):
        super(VCFReader, self).__init__(file_name)
        self.vcf_reader = VariantFile(file_name)
        self.sv_to_report = sv_to_report

    def __iter__(self):
        return self

    def __next__(self):
        while True:
            raw_record = next(self.vcf_reader)
            record = VariantRecord(raw_record)
            return record
def getHeader(self):
return self.vcf_reader.header
def addInfo(self, name, number, type, description):
self.vcf_reader.header.info.add(id=name,
number=number,
type=type,
description=description)
def addFilter(self, name, description):
self.vcf_reader.header.filters.add(id=name,
number=None,
type=None,
description=description)
def add_metadata(self):
self.addInfo("SOURCEID", 1, "String",
"The source sv identifier")
self.addInfo("MAX_SUPP_READS", 1, "Integer",
"Max number of supporting reads")
self.addInfo("NUM_SUPP_SAMPLES", 1, "Integer",
"Number of supporting samples")
self.addFilter("LOWSUPPORT",
"total supp reads < 5 or supp samples < 2")
def getOrderedSamples(self):
samples = self.vcf_reader.header.samples
sample_names = [sample.rsplit('.')[0] for sample in samples]
return sample_names
def numSamples(self):
return len(self.vcf_reader.header.samples)
class VCFWriter(SVWriter):
def __init__(self, file_name, template_reader, index=True):
super(VCFWriter, self).__init__(file_name,
template_reader.tool_name,
template_reader)
def _open(self):
self.vcf_writer = VariantFile(self.filename, 'w',
header=self.template_reader.getHeader())
self._isopen = True
def _write(self, record):
self.vcf_writer.write(record.record)
def _close(self):
if self._isopen:
self.vcf_writer.close()
else: # nothing was written
self._dumpemptyvcf()
def ordered(a, b):
    # order the two strings lexicographically
    return (a, b) if a < b else (b, a)
def setLikeliTag(genotyper):
    global Geno_likeli_tag
    if genotyper == "svtyper":
        Geno_likeli_tag = SVTYPER_GENO_LIKELI_TAG
        warn("Assuming genotypes provided by the svtyper software,"
             " hence tag is %s" % Geno_likeli_tag)
    elif genotyper == "genomestrip":
        Geno_likeli_tag = GENOMESTRIP_GENO_LIKELI_TAG
        warn("Assuming genotypes provided by the genomestrip software,"
             " hence tag is %s" % Geno_likeli_tag)
    else:
        eprint("Unknown genotyping software %s" % genotyper)
        sys.exit(1)
def getlikelihoods(sample):
    # fetch the genotype likelihoods using the tag selected by setLikeliTag
    return sample.get(Geno_likeli_tag)
def probas(likelihoods):
    # transform log-scaled likelihoods into likelihoods
    return np.exp(np.array(likelihoods))


def getprobas(sample):
    # transform the likelihoods into normalized probabilities
    p = probas(getlikelihoods(sample))
    return p / np.sum(p)
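# Illustrative sketch (values assumed, not from real data; uses the default
# svtyper GL tag): for a sample whose likelihood field holds (-0.05, -3.0, -9.0),
# getprobas returns three probabilities summing to 1, ordered
# (hom-ref, het, hom-alt), with almost all of the mass on hom-ref:
#
#   getprobas({'GL': (-0.05, -3.0, -9.0)})   # a plain dict mimics the pysam sample
#   # -> array of three values, the first one close to 0.95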
def ondiagonal(u_s, v_s):
    # Probability that, for this individual, the two SVs are identically
    # genotyped: P1(0/0)*P2(0/0) + ... + P1(1/1)*P2(1/1)
    p = getprobas(u_s)
    q = getprobas(v_s)
    proba = 0
    for a, b in zip(p, q):
        proba += a*b
    # print("Proba on %3.5f" % proba)
    return proba
def offdiagonal(u_s, v_s):
    # Probability that, for this individual, the two SVs are not identically
    # genotyped (the complement of the previous one)
    p = getprobas(u_s)
    q = getprobas(v_s)
    proba = 0
    for i, a in enumerate(p):
        for j, b in enumerate(q):
            if i != j:
                proba += a*b
    # print("Proba off %3.2f" % proba)
    return proba
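# Since both probability vectors are normalized, the two quantities are
# complementary: ondiagonal(u_s, v_s) + offdiagonal(u_s, v_s) == 1
# (the sum over all pairs (i, j) of p[i]*q[j] equals sum(p)*sum(q) == 1).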
def duplicatescore(u, v):
    # For the two SVs, log-ratio of the probability of concordant genotypes
    # against the probability of discordant genotypes, taken for the most
    # discordant individual
    # u_s is the sample of id s of u (idem for v_s)
    # Valid samples: genotyped (GQ present) in both records
    valid_samples = []
    for s in u.samples:
        u_s = u.samples[s]
        v_s = v.samples[s]
        if (u_s.get('GQ') is not None
                and v_s.get('GQ') is not None):
            valid_samples.append(s)
    max_disc = 0
    computed = float('NaN')
    for s in valid_samples:
        # ondiago is not used, kept here only for comprehension
        # ondiago = ondiagonal(u_s, v_s)
        u_s = u.samples[s]
        v_s = v.samples[s]
        offdiago = offdiagonal(u_s, v_s)
        if offdiago > max_disc:
            max_disc = offdiago
    if 0 < max_disc < 1:
        ratio = (1 - max_disc) / max_disc
        computed = np.log(ratio)
    return computed
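# Worked example (numbers assumed for illustration): if the most discordant
# individual has offdiagonal probability max_disc = 0.1, then
# ratio = (1 - 0.1) / 0.1 = 9 and duplicatescore = log(9), about 2.2, which is
# above the default duplicatescore_threshold of -2, so the pair of SVs is
# treated as duplicate candidates in GenomeSTRIPLikeRedundancyAnnotator below.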
def gstrength(u):
# Sum of phred-like genotype qualities provides a measure of the
# combined genotype quality of the site
# np.sum([s['GQ'] if s['GQ'] is not None else 0 for s in u.samples.values()])
return u.GQ_sum_score()
def variantstrength(u):
    # QUAL: Phred-scaled probability that this site is variant
    # (non-reference in at least one sample)
    # QUAL = -10 * log(P(locus is reference in all samples)), which is
    # equal to the sum of the per-sample SQ scores
    # see https://github.com/hall-lab/svtyper/issues/10
    # sum([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
    return u.qual()
def getduplicates_GQ(u, v):
    # select the preferred duplicate on the basis of the sum of phred-like
    # genotype qualities (see gstrength)
    # returns (preferred, discarded, strength of both)
    if gstrength(u) > gstrength(v):
        return (u, v, gstrength(u), gstrength(v))
    else:
        return (v, u, gstrength(v), gstrength(u))


def getduplicates_QUAL(u, v):
    # select the preferred duplicate on the basis of the Phred-scaled
    # probability that this site is a variant (see variantstrength)
    # returns (preferred, discarded, strength of both)
    if variantstrength(u) > variantstrength(v):
        return (u, v, variantstrength(u), variantstrength(v))
    else:
        return (v, u, variantstrength(v), variantstrength(u))
def getoverlap(u, osize):
    # percentage of the SV u covered by an overlap of size osize
    return 100 * osize / u.svlen
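# For instance (values chosen for illustration), an 800 bp overlap over a
# 1000 bp SV gives getoverlap(u, 800) == 80.0.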
def add_redundancy_infos_header(reader):
    # Adding DUPLICATESCORE info (equivalent to GSDUPLICATESCORE)
    reader.addInfo("DUPLICATESCORE", 1, "Float",
                   "LOD score that the events are distinct based on the "
                   "genotypes of the most discordant sample")
    # Adding DUPLICATEOVERLAP info (equivalent to GSDUPLICATEOVERLAP)
    reader.addInfo("DUPLICATEOVERLAP", 1, "Float",
                   "Highest overlap with a duplicate event")
    # Adding DUPLICATES info
    reader.addInfo("DUPLICATES", ".", "String",
                   "List of duplicate events preferred to this one")
    # Adding NONDUPLICATEOVERLAP info
    reader.addInfo("NONDUPLICATEOVERLAP", 1, "Float",
                   "Amount of overlap with a non-duplicate")
def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
duplicatescore_threshold=-2,
genotyper="svtyper"):
""" Annotating duplicate candidates based on the genotype likelihoods
- genotype likelihoods can be provided by svtyper or genomestrip
"""
add_redundancy_infos_header(reader)
setLikeliTag(genotyper)
variants = defaultdict()
for sv in SVSet:
variants[sv.id] = sv
pybed_variants = vcf_to_pybed(SVSet)
self_overlap = pybed_variants.intersect(pybed_variants,
f=0.5, r=True, wo=True)
seen = defaultdict(tuple)
duplicates = defaultdict(list)
overlapping = defaultdict(tuple)
reference = defaultdict()
for o in self_overlap:
print("Here ?", o)
if o[3] == o[7]:
continue
a, b = ordered(o[3], o[7])
if seen[(a, b)]:
continue
seen[(a, b)] = True
u = variants[a]
v = variants[b]
score = duplicatescore(u, v)
# print("Comparing %s and %s : %3.8f" % (u.id, v.id, score))
if score > duplicatescore_threshold:
ref, dupli, s1, s2 = getduplicates_GQ(u, v)
# print("%s prefered to %s %3.8f" % (ref.id, dupli.id, score))
reference[ref] = 1
overlap_size = int(o[-1])
overlap = getoverlap(dupli, overlap_size)
            if maxGQ(ref) > 0 and passed(dupli):
                # dismissed here are:
                #  - reference variants with a 0 genotype quality for all samples
                #  - duplicates that are already tagged as filtered out
                # dupli.id is considered a duplicate of ref.id
                duplicates[dupli.id].append((ref.id, score, overlap))
else:
overlap_size = int(o[-1])
overlap_u = getoverlap(u, overlap_size)
overlap_v = getoverlap(v, overlap_size)
overlapping[(u, v)] = {'dupli_score': score,
'overlap_left': overlap_u,
'overlap_right': overlap_v,
}
for u in SVSet:
if u.id in duplicates:
print("tagged as duplicate %s" % u.id)
duplis = sorted([a for (a, s, o) in duplicates[u.id]])
score = max([s for (a, s, o) in duplicates[u.id]])
overlap = max([o for (a, s, o) in duplicates[u.id]])
add_duplicate_info_sv(u, overlap, score, duplis)
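# After annotation, a record tagged as a duplicate carries INFO fields such as
# (illustrative values, see add_duplicate_info_sv below):
#   DUPLICATESCORE=2.2;DUPLICATEOVERLAP=85.0;DUPLICATES=sv_123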
def add_duplicate_info_sv(sv, duplicateoverlap, duplicatescore, duplicates):
    """
    Add the duplicate-related annotations to the SV INFO field
    """
    if isnan(duplicatescore):
        sv.record.info['DUPLICATESCORE'] = float('nan')
    else:
        sv.record.info['DUPLICATESCORE'] = duplicatescore
    sv.record.info['DUPLICATEOVERLAP'] = duplicateoverlap
    sv.record.info['DUPLICATES'] = duplicates
def add_overlap_info_sv(sv, overlap, duplicatescore):
    """
    Add the non-duplicate overlap annotations to the SV INFO field
    """
    if isnan(duplicatescore):
        sv.record.info['DUPLICATESCORE'] = float('nan')
    else:
        sv.record.info['DUPLICATESCORE'] = duplicatescore
    sv.record.info['NONDUPLICATEOVERLAP'] = overlap
def add_filter_infos_header(reader):
# FILTERS
# Adding specific filters
reader.addFilter("CALLRATE", "Call rate <0.75")
reader.addFilter("MONOMORPH", "All samples have the same genotype")
reader.addFilter("DUPLICATE", "GSDUPLICATESCORE>0")
reader.addFilter("OVERLAP", "NONDUPLICATEOVERLAP>0.7")
def GenomeSTRIPLikefiltering(SVSet, reader):
    """ Filtering the candidate CNVs according to the following criteria
        - non-duplicate sites
        - variant sites
        - call rate > 0.75
        - at least one variant (homozygous or heterozygous) has a genotype
          quality > 20
        - the variant is not everywhere heterozygous or homozygous
          (use NONVARIANTSCORE in both cases)
    """
    add_filter_infos_header(reader)
    for sv in SVSet:
        info = sv.record.info
        if callRate(sv) < 0.75:
            sv.filter.add("CALLRATE")
        if not Polymorph(sv):
            sv.filter.add("MONOMORPH")
        if 'NONDUPLICATEOVERLAP' in info and info['NONDUPLICATEOVERLAP'] > 0.7:
            sv.filter.add("OVERLAP")
        if 'DUPLICATESCORE' in info and info['DUPLICATESCORE'] > -2:
            sv.filter.add("DUPLICATE")