Newer
Older
from collections import defaultdict
from networkx import Graph, connected_components
import numpy as np
from math import isnan
from svrunner_utils import eprint, warn, vcf_to_pybed
from svreader.vcfwrapper import VCFRecord, VCFReader, VCFWriter
VALID_GENOTYPES = [HOM_REF, HET_VAR, HOM_VAR]
SVTYPER_GENO_LIKELI_TAG = "GL"
GENOMESTRIP_GENO_LIKELI_TAG = "GP"
Geno_likeli_tag = SVTYPER_GENO_LIKELI_TAG
def coded_geno(geno):
if geno == HOM_REF:
return "HOMO_REF"
elif geno == HET_VAR:
return "HET_VAR"
elif geno == HOM_VAR:
return "HOMO_VAR"
else:
return "UNDEF"
def Variant(sample):
if sample.get('GT') in [HET_VAR, HOM_VAR]:
return True
else:
return False
def Heterozygote(sample):
if sample.get('GT') in [HET_VAR]:
return True
else:
return False
class AnnotateRecord(VCFRecord):
"""
A lightweight object to annotated the final records
"""
def __init__(self, record):
"""
A pysam VariantRecord wrapper
"""
super(AnnotateRecord, self).__init__(record)
self.new_id = None
@property
def start(self):
return self.record.pos
@property
def end(self):
return self.record.stop
@property
def svtype(self):
return self._sv_type
@property
def svlen(self):
return abs(self.stop - self.start)
def passed(self):
if "PASS" in self.record.filter:
return True
else:
return False
@property
def num_samples(self):
return len(self.record.samples.keys())
def setNewId(self, new_id):
self.new_id = new_id
def rename(self):
try:
self.record.info['SOURCEID'] = self.id
except KeyError:
eprint("SOURCEID absent from record info keys,")
sys.exit(1)
self.id = self.new_id
return sum([Variant(s) for s in self.samples.values()])
support = []
for s in self.samples.values():
if s.get('AO') is not None:
support.append(s.get('AO')[0])
return max(support)
variant_qual = []
for s in self.samples.values():
if s.get('SQ') is not None:
variant_qual.append(s.get('SQ'))
return sum(variant_qual)
def GQ_samples(self):
genotype_qual = []
for s in self.samples.values():
if s.get('GQ') is not None:
genotype_qual.append(s.get('GQ'))
return genotype_qual
def GQ_sum_score(self):
def maxGQ(self):
def setQual(self):
self.record.qual = self.qual()
def numdiffGenotypes(self):
genotypes = defaultdict()
for s in self.samples.values():
if s.get('GT') in VALID_GENOTYPES:
genotypes[coded_geno(s.get('GT'))] = 1
return len(genotypes.keys())
def Polymorph(self):
return self.numdiffGenotypes() > 1
def AddSupportInfos(self):
supp_reads = self.variant_read_support()
num_supp_samples = self.num_variant_samples()
try:
self.record.info['MAX_SUPP_READS'] = supp_reads
self.record.info['NUM_SUPP_SAMPLES'] = num_supp_samples
except KeyError:
eprint("SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys")
sys.exit(1)
def CallRate(self, cutoff):
call_qual = []
for s in self.samples.values():
if s.get('GQ') is not None:
call_qual.append(s.get('GQ'))
num_qual_call = sum([(qual > cutoff) for qual in call_qual])
return num_qual_call/self.num_samples
def VariantCallRate(self, cutoff):
samples = self.samples.values()
num_qual_var = 0
for s in samples:
if s.get("GQ") is not None and s.get("GQ") > cutoff and Variant(s):
num_qual_var += 1
num_var_samples = self.num_variant_samples()
var_call_rate = num_qual_var/num_var_samples if num_var_samples else 0
return var_call_rate
def UnifiedPass(self):
"""
All records passing the filters (PASS, .) ar now labelled PASS
"""
record = self.record
filters = [f for f in record.filter]
# We make the assumption when a "." is present no other filter
# are present
if len(filters) == 0 or "." in filters:
record.filter.clear()
record.filter.add("PASS")
class AnnotateReader(VCFReader):
def __init__(self, file_name, sv_to_report=None):
super(AnnotateReader, self).__init__(file_name)
self.add_annotation_metadata()
def __iter__(self):
return self
def __next__(self):
while True:
raw_record = next(self.vcf_reader)
record = AnnotateRecord(raw_record)
return record
def getHeader(self):
return self.vcf_reader.header
# def addInfo(self, name, number, type, description):
# self.vcf_reader.header.info.add(id=name,
# number=number,
# type=type,
# description=description)
#
# def addFilter(self, name, description):
# self.vcf_reader.header.filters.add(id=name,
# number=None,
# type=None,
# description=description)
def add_annotation_metadata(self):
self.addInfo("SOURCEID", 1, "String",
"The source sv identifier")
self.addInfo("MAX_SUPP_READS", 1, "Integer",
"Max number of supporting reads")
self.addInfo("NUM_SUPP_SAMPLES", 1, "Integer",
"Number of supporting samples")
self.addFilter("LOWSUPPORT",
"total supp reads < 5 or supp samples < 2")
def getOrderedSamples(self):
samples = self.vcf_reader.header.samples
sample_names = [sample.rsplit('.')[0] for sample in samples]
return sample_names
def numSamples(self):
return len(self.vcf_reader.header.samples)
class AnnotateWriter(VCFWriter):
def __init__(self, file_name, template_reader, index=True):
super(VCFWriter, self).__init__(file_name,
template_reader.tool_name,
template_reader)
def _open(self):
self.vcf_writer = VariantFile(self.filename, 'w',
header=self.template_reader.getHeader())
self._isopen = True
def _write(self, record):
self.vcf_writer.write(record.record)
def _close(self):
if self._isopen:
self.vcf_writer.close()
else: # nothing was written
self._dumpemptyvcf()
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
def ordered(a, b):
# simply ordering the two string
return (a, b) if a < b else (b, a)
def setLikeliTag(genotyper):
global Geno_likeli_tag
if genotyper == "svtyper":
Geno_likeli_tag = SVTYPER_GENO_LIKELI_TAG
warn("Assuming genotypes provided by svtyper software" +
" hence tag is %s" % (Geno_likeli_tag))
elif genotyper == "genomestrip":
Geno_likeli_tag = GENOMESTRIP_GENO_LIKELI_TAG
warn("Assuming genotypes provided by genomestrip software" +
" hence tag is %s" % (Geno_likeli_tag))
else:
print("Unknown genotyping software")
exit(1)
def getlikelihoods(sample):
return sample.get('GL')
def probas(likelihoods):
# transform log likelihoods into likelihoods
return np.exp(np.array(likelihoods))
def getprobas(sample):
# Transforming likelihods into probabilities
return probas(getlikelihoods(sample))/np.sum(probas(getlikelihoods(sample)))
def ondiagonal(u_s, v_s):
# Probability that, for that individual, the two SVs are identically
# genotyped P1(0/0)*P2(0/0) + ... P1(1/1)*P2(1/1)
# in the same way :
p = getprobas(u_s)
q = getprobas(v_s)
proba = 0
for a, b in zip(p, q):
proba += a*b
# print("Proba on %3.5f" %(proba))
return proba
def offdiagonal(u_s, v_s):
# Probability that, for that individual, the two SVs are not identically
# in the same way, complement of the previous one
p = getprobas(u_s)
q = getprobas(v_s)
proba = 0
for i, a in enumerate(p):
for j, b in enumerate(q):
if i != j:
proba += a*b
# print("Proba off %3.2f" %(proba))
return proba
def duplicatescore(u, v):
# For the two SVs, max discordant genotype log-ratio proba
# same genotypes against discordant against (worse individual)
# u_s is the sample of id s of u (idem for v_s)
# Valid samples
valid_samples = []
for s in u.samples:
u_s = u.samples[s]
v_s = v.samples[s]
if (u_s.get('GQ') is not None
and v_s.get('GQ') is not None):
valid_samples.append(s)
max_disc = 0
computed = float('NaN')
for s in valid_samples:
# ondiago is not used, we keep it just for comprehension
# ondiago = ondiagonal(s, u, v)
u_s = u.samples[s]
v_s = v.samples[s]
offdiago = offdiagonal(u_s, v_s)
if offdiago > max_disc:
max_disc = offdiago
if max_disc > 0 and max_disc < 1:
ratio = (1-max_disc)/max_disc
computed = np.log(ratio)
return computed
def gstrength(u):
# Sum of phred-like genotype qualities provides a measure of the
# combined genotype quality of the site
# np.sum([s['GQ'] if s['GQ'] is not None else 0 for s in u.samples.values()])
return u.GQ_sum_score()
def variantstrength(u):
# maximum SQ, where SQ stands for
# Phred-scaled probability that this site is variant (non-reference)
# in this sample)
# QUAL = -10 * log(P(locus is reference in all samples)), which is
# equal to the sum of the SQ scores.
# see https://github.com/hall-lab/svtyper/issues/10
# sum([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
return u.qual()
# max([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
def getduplicates_GQ(u, v):
# select the prefered duplicate on the basis of the
# Sum of phred-like genotype qualities
# see gstrength
# returns prefered, discarded, strength of both
if gstrength(u) > gstrength(v):
return (u, v, gstrength(u), gstrength(v))
else:
return (v, u, gstrength(v), gstrength(u))
def getduplicates_QUAL(u, v):
# select the prefered duplicate on the basis of
# Phred-scaled probability that this site is a variant
# see variantstrength
# returns prefered, discarded, strength of both
if variantstrength(u) > variantstrength(v):
return (u, v, variantstrength(u), variantstrength(v))
else:
return (v, u, variantstrength(v), variantstrength(u))
def getoverlap(u, osize):
# percentage overlap given the size of the overlap
return 100*osize/u.svlen
def add_redundancy_infos_header(reader):
# Adding DUPLICATESCORE info (equivalent to GSDUPLICATESCORE)
reader.addInfo("DUPLICATESCORE", 1, "Float",
"LOD score that the events are distinct based on the "
"genotypes of the most discordant sample")
# Adding DUPLICATES info (equivalent to GSDUPLICATEOVERLAP)
reader.addInfo("DUPLICATEOVERLAP", 1, "Float",
"Highest overlap with a duplicate event")
# Adding DUPLICATEOF info (list of variant prefered to this one)
reader.addInfo("DUPLICATEOF", ".", "String",
"List of duplicate events preferred to this one")
# Adding DUPLICATES info (list of variants duplicates of this one)
reader.addInfo("DUPLICATES", ".", "String",
"List of duplicate events represented by the current sv")
# Adding NONDUPLICATEOVERLAP
reader.addInfo("NONDUPLICATEOVERLAP", 1, "Float",
"Amount of overlap with a non-duplicate")
# Adding TOOLSUPPORT
reader.addInfo("TOOLSUPPORT", ".", "String",
"Tools supporting (detecting) the sv")
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
duplicatescore_threshold=-2,
genotyper="svtyper"):
""" Annotating duplicate candidates based on the genotype likelihoods
- genotype likelihoods can be provided by svtyper or genomestrip
"""
add_redundancy_infos_header(reader)
setLikeliTag(genotyper)
variants = defaultdict()
for sv in SVSet:
variants[sv.id] = sv
pybed_variants = vcf_to_pybed(SVSet)
self_overlap = pybed_variants.intersect(pybed_variants,
f=0.5, r=True, wo=True)
seen = defaultdict(tuple)
duplicates = defaultdict(list)
overlapping = defaultdict(tuple)
reference = defaultdict()
for o in self_overlap:
if o[3] == o[7]:
continue
a, b = ordered(o[3], o[7])
if seen[(a, b)]:
continue
seen[(a, b)] = True
u = variants[a]
v = variants[b]
score = duplicatescore(u, v)
# print("Comparing %s and %s : %3.8f" % (u.id, v.id, score))
if score > duplicatescore_threshold:
ref, dupli, s1, s2 = getduplicates_GQ(u, v)
# print("%s prefered to %s %3.8f" % (ref.id, dupli.id, score))
reference[ref] = 1
overlap_size = int(o[-1])
overlap = getoverlap(dupli, overlap_size)
if ref.maxGQ() > 0 and dupli.passed:
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
# are dismissed
# - reference variant with 0 genotype quality for all markers
# - duplicate that are already tagged as filtered out
# dupli.id is considered as a duplicate of ref.id
duplicates[dupli.id].append((ref.id, score, overlap))
else:
overlap_size = int(o[-1])
overlap_u = getoverlap(u, overlap_size)
overlap_v = getoverlap(v, overlap_size)
overlapping[(u, v)] = {'dupli_score': score,
'overlap_left': overlap_u,
'overlap_right': overlap_v,
}
for u in SVSet:
if u.id in duplicates:
print("tagged as duplicate %s" % u.id)
duplis = sorted([a for (a, s, o) in duplicates[u.id]])
score = max([s for (a, s, o) in duplicates[u.id]])
overlap = max([o for (a, s, o) in duplicates[u.id]])
add_duplicate_info_sv(u, overlap, score, duplis)
def add_duplicate_info_sv(sv, duplicateoverlap, duplicatescore, duplicates):
"""
simply adding two information to the sv infos
"""
if isnan(duplicatescore):
sv.record.info['DUPLICATESCORE'] = float('nan')
else:
sv.record.info['DUPLICATESCORE'] = duplicatescore
sv.record.info['DUPLICATEOVERLAP'] = duplicateoverlap
sv.record.info['DUPLICATEOF'] = duplicates
def add_overlap_info_sv(sv, overlap, duplicatescore):
"""
simply adding two information to the sv infos
"""
if isnan(duplicatescore):
sv.record.info['DUPLICATESCORE'] = float('nan')
else:
sv.record.info['DUPLICATESCORE'] = duplicatescore
sv.record.info['NONDUPLICATEOVERLAP'] = overlap
def add_callrate_infos_header(reader):
# Adding CALLRATE info
reader.addInfo("CALLRATE", 1, "Float",
"Percentage of samples called with a GQ>13")
# Adding VARIANTCALLRATE info
reader.addInfo("VARIANTCALLRATE", 1, "Float",
"Percentage of variant samples called with a GQ>13")
def add_filter_infos_header(reader):
# FILTERS
# Adding specific filters
reader.addFilter("CALLRATE", "Call rate <0.75")
reader.addFilter("VARIANTCALLRATE", "Variant Call rate <0.75")
reader.addFilter("MONOMORPH", "All samples have the same genotype")
reader.addFilter("DUPLICATE", "GSDUPLICATESCORE>0")
reader.addFilter("OVERLAP", "NONDUPLICATEOVERLAP>0.7")
reader.addFilter("ABFREQ", "AB frequency <0.3 for >50% heterosamples")
def GenomeSTRIPLikefiltering(SVSet, reader):
""" Filtering the candidate CNVs according to the following criteria
- non duplicate sites
- variant sites
- call rate > 0.8
- at least one variant (homozygous or heterozygote) has a genotype
quality > 20
- the variant is not everywhere heterozygote or homozygote
(use NONVARIANTSCORE in both cases)
"""
add_callrate_infos_header(reader)
add_filter_infos_header(reader)
for sv in SVSet:
info = sv.record.info
sv.record.info['CALLRATE'] = sv.CallRate(13)
sv.record.info['VARIANTCALLRATE'] = sv.VariantCallRate(13)
if sv.CallRate(13) < 0.75:
sv.filter.add("CALLRATE")
if not sv.Polymorph():
sv.filter.add("MONOMORPH")
if 'NONDUPLICATEOVERLAP' in info and info['NONDUPLICATEOVERLAP'] > 0.7:
sv.filter.add("OVERLAP")
if "DUPLICATESCORE" in info is not None and info['DUPLICATESCORE'] > -2:
sv.filter.add("DUPLICATE")
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
def ABFreqFiltering(SVSet):
""" Filtering the candidate CNVs according to the following criteria
- more than 50% of variant samples should have AB freq > 0.3
"""
for sv in SVSet:
ABfreqOK = []
for s in sv.record.samples.values():
if Heterozygote(s):
ABfreqOK.append((s.get('AB')[0] > 0.3))
if len(ABfreqOK) > 0 and sum(ABfreqOK) < len(ABfreqOK)/2:
sv.filter.add("ABFREQ")
def GetConnectedDuplicates(SVSet):
"""
Construct connected components of duplicates and rename the variants
"""
undirected = Graph()
variant_dict = defaultdict()
representatives = defaultdict()
for s in SVSet:
variant_dict[s.id] = s
if "DUPLICATE" in s.filter:
for dupli_repr in s.record.info["DUPLICATEOF"]:
undirected.add_edge(s.id, dupli_repr)
for component in connected_components(undirected):
for c in component:
if "DUPLICATEOF" in variant_dict[c].record.info:
# the current variant is a duplicate
continue
rep = c # the representative of the equivalence class
break
duplicates = component
duplicates.remove(rep)
representatives[rep] = duplicates
add_duplicate_infos(representatives, variant_dict)
def add_duplicate_infos(representatives, sv_dict):
for rep, elements in representatives.items():
for d in elements:
sv_dict[d].record.info['DUPLICATEOF'] = rep
duplicates = list(elements)
if 'DUPLICATES' in sv_dict[rep].record.info:
print(sv_dict[rep].record.info['DUPLICATES'])
duplicates.extend(sv_dict[rep].record.info['DUPLICATES'])
if len(duplicates) > 0:
sv_dict[rep].record.info['DUPLICATES'] = duplicates
def get_tool_name(sv_ident):
return sv_ident.split("_")[0]
def SetSupportingTools(SVSet):
for sv in SVSet:
tools = {get_tool_name(sv.id)}
if "DUPLICATES" in sv.record.info:
duplicates = sv.record.info['DUPLICATES']
# print(duplicates)
for dupli in duplicates:
tools.add(get_tool_name(dupli))
if 'TOOLSUPPORT' in sv.record.info:
supporting = set(sv.record.info['TOOLSUPPORT'])
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
sv.record.info['TOOLSUPPORT'] = list(tools)
def static_vars(**kwargs):
def decorate(func):
for k in kwargs:
setattr(func, k, kwargs[k])
return func
return decorate
@static_vars(counter=0)
def new_id_str(sv):
new_id_str.counter += 1
return "_".join(["cnvpipeline", sv.chrom, sv.svtype,
str(new_id_str.counter)])
def rename_info_field(sv, key, sv_dict):
if key in sv.record.info:
info_oldid = sv.record.info[key]
info_newid = [sv_dict[id] for id in info_oldid]
sv.record.info[key] = info_newid
def RenameSV(SVSet):
sv_dict = defaultdict()
for sv in SVSet:
new_id = new_id_str(sv)
sv.setNewId(new_id)
sv_dict[sv.id] = new_id
for sv in SVSet:
rename_info_field(sv, "DUPLICATEOF", sv_dict)
rename_info_field(sv, "DUPLICATES", sv_dict)
sv.record.info['SOURCEID'] = sv.id
sv.rename()