import sys from collections import defaultdict from networkx import Graph, connected_components import numpy as np from math import isnan from pysam import VariantFile from svrunner_utils import eprint, warn, vcf_to_pybed from svreader.vcfwrapper import VCFRecord, VCFReader, VCFWriter HOM_REF = (0, 0) HET_VAR = (0, 1) HOM_VAR = (1, 1) VALID_GENOTYPES = [HOM_REF, HET_VAR, HOM_VAR] SVTYPER_GENO_LIKELI_TAG = "GL" GENOMESTRIP_GENO_LIKELI_TAG = "GP" Geno_likeli_tag = SVTYPER_GENO_LIKELI_TAG def coded_geno(geno): if geno == HOM_REF: return "HOMO_REF" elif geno == HET_VAR: return "HET_VAR" elif geno == HOM_VAR: return "HOMO_VAR" else: return "UNDEF" def Variant(sample): if sample.get('GT') in [HET_VAR, HOM_VAR]: return True else: return False def Heterozygote(sample): if sample.get('GT') in [HET_VAR]: return True else: return False class AnnotateRecord(VCFRecord): """ A lightweight object to annotated the final records """ def __init__(self, record): """ A pysam VariantRecord wrapper """ super(AnnotateRecord, self).__init__(record) self.new_id = None @property def start(self): return self.record.pos @property def end(self): return self.record.stop @property def svtype(self): return self._sv_type @property def svlen(self): return abs(self.stop - self.start) @property def passed(self): if "PASS" in self.record.filter: return True else: return False @property def num_samples(self): return len(self.record.samples.keys()) def setNewId(self, new_id): self.new_id = new_id def rename(self): try: self.record.info['SOURCEID'] = self.id except KeyError: eprint("SOURCEID absent from record info keys,") sys.exit(1) self.id = self.new_id def num_variant_samples(self): return sum([Variant(s) for s in self.samples.values()]) def variant_read_support(self): support = [] for s in self.samples.values(): if s.get('AO') is not None: support.append(s.get('AO')[0]) return max(support) def qual(self): variant_qual = [] for s in self.samples.values(): if s.get('SQ') is not None: variant_qual.append(s.get('SQ')) return sum(variant_qual) def GQ_samples(self): genotype_qual = [] for s in self.samples.values(): if s.get('GQ') is not None: genotype_qual.append(s.get('GQ')) return genotype_qual def GQ_sum_score(self): return sum(self.GQ_samples()) def maxGQ(self): return max(self.GQ_samples()) def set_qual(self): self.record.qual = self.qual() def numdiffGenotypes(self): genotypes = defaultdict() for s in self.samples.values(): if s.get('GT') in VALID_GENOTYPES: genotypes[coded_geno(s.get('GT'))] = 1 return len(genotypes.keys()) def polymorph(self): return self.numdiffGenotypes() > 1 def add_supporting_infos(self): supp_reads = self.variant_read_support() num_supp_samples = self.num_variant_samples() #print(supp_reads, num_supp_samples) try: self.record.info['MAX_SUPP_READS'] = supp_reads self.record.info['NUM_SUPP_SAMPLES'] = num_supp_samples except KeyError: eprint("SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys") sys.exit(1) def call_rate(self, cutoff): call_qual = [] for s in self.samples.values(): if s.get('GQ') is not None: call_qual.append(s.get('GQ')) num_qual_call = sum([(qual > cutoff) for qual in call_qual]) return num_qual_call / self.num_samples def variant_call_rate(self, cutoff): samples = self.samples.values() num_qual_var = 0 for s in samples: if s.get("GQ") is not None and s.get("GQ") > cutoff and Variant(s): num_qual_var += 1 num_var_samples = self.num_variant_samples() var_call_rate = num_qual_var / num_var_samples if num_var_samples else 0 return var_call_rate def unify_pass_filtertag(self): """ All records passing the filters (PASS, .) ar now labelled PASS """ record = self.record filters = [f for f in record.filter] # We make the assumption when a "." is present no other filter # are present if not filters or "." in filters: record.filter.clear() record.filter.add("PASS") class AnnotateReader(VCFReader): def __init__(self, file_name, sv_to_report=None): super(AnnotateReader, self).__init__(file_name) self.filename = file_name self.sv_to_report = sv_to_report self.vcf_reader = VariantFile(file_name) self.add_annotation_metadata() def __iter__(self): return self def __next__(self): while True: raw_record = next(self.vcf_reader) record = AnnotateRecord(raw_record) return record def getHeader(self): return self.vcf_reader.header def add_annotation_metadata(self): self.addInfo("SOURCEID", 1, "String", "The source sv identifier") self.addInfo("MAX_SUPP_READS", 1, "Integer", "Max number of supporting reads") self.addInfo("NUM_SUPP_SAMPLES", 1, "Integer", "Number of supporting samples") self.addFilter("LOWSUPPORT", "total supp reads < 5 or supp samples < 2") def getOrderedSamples(self): samples = self.vcf_reader.header.samples sample_names = [sample.rsplit('.')[0] for sample in samples] return sample_names def numSamples(self): return len(self.vcf_reader.header.samples) class AnnotateWriter(VCFWriter): def __init__(self, file_name, template_reader, index=True): super(VCFWriter, self).__init__(file_name, template_reader.tool_name, template_reader) def _open(self): self.vcf_writer = VariantFile(self.filename, 'w', header=self.template_reader.getHeader()) self._isopen = True def _write(self, record): self.vcf_writer.write(record.record) def _close(self): if self._isopen: self.vcf_writer.close() else: # nothing was written self._dumpemptyvcf() def ordered(a, b): # simply ordering the two string return (a, b) if a < b else (b, a) def setLikeliTag(genotyper): global Geno_likeli_tag if genotyper == "svtyper": Geno_likeli_tag = SVTYPER_GENO_LIKELI_TAG warn("Assuming genotypes provided by svtyper software" + " hence tag is %s" % (Geno_likeli_tag)) elif genotyper == "genomestrip": Geno_likeli_tag = GENOMESTRIP_GENO_LIKELI_TAG warn("Assuming genotypes provided by genomestrip software" + " hence tag is %s" % (Geno_likeli_tag)) else: print("Unknown genotyping software") exit(1) def getlikelihoods(sample): return sample.get('GL') def probas(likelihoods): # transform log likelihoods into likelihoods return np.exp(np.array(likelihoods)) def getprobas(sample): # Transforming likelihods into probabilities return probas(getlikelihoods(sample)) / np.sum(probas(getlikelihoods(sample))) def ondiagonal(u_s, v_s): # Probability that, for that individual, the two SVs are identically # genotyped P1(0/0)*P2(0/0) + ... P1(1/1)*P2(1/1) # in the same way : p = getprobas(u_s) q = getprobas(v_s) proba = 0 for a, b in zip(p, q): proba += a * b # print("Proba on %3.5f" %(proba)) return proba def offdiagonal(u_s, v_s): # Probability that, for that individual, the two SVs are not identically # in the same way, complement of the previous one p = getprobas(u_s) q = getprobas(v_s) proba = 0 for i, a in enumerate(p): for j, b in enumerate(q): if i != j: proba += a * b # print("Proba off %3.2f" %(proba)) return proba def duplicatescore(u, v): # For the two SVs, max discordant genotype log-ratio proba # same genotypes against discordant against (worse individual) # u_s is the sample of id s of u (idem for v_s) # Valid samples valid_samples = [] for s in u.samples: u_s = u.samples[s] v_s = v.samples[s] if (u_s.get('GQ') is not None and v_s.get('GQ') is not None): valid_samples.append(s) max_disc = 0 computed = float('NaN') for s in valid_samples: # ondiago is not used, we keep it just for comprehension # ondiago = ondiagonal(s, u, v) u_s = u.samples[s] v_s = v.samples[s] offdiago = offdiagonal(u_s, v_s) if offdiago > max_disc: max_disc = offdiago if max_disc > 0 and max_disc < 1: ratio = (1 - max_disc) / max_disc computed = np.log(ratio) return computed def gstrength(u): """ Sum of phred-like genotype qualities provides a measure of the combined genotype quality of the site np.sum([s['GQ'] if s['GQ'] is not None else 0 for s in u.samples.values()]) """ return u.GQ_sum_score() def variantstrength(u): """ maximum SQ, where SQ stands for Phred-scaled probability that this site is variant (non-reference) in this sample) QUAL = -10 * log(P(locus is reference in all samples)), which is equal to the sum of the SQ scores. see https://github.com/hall-lab/svtyper/issues/10 sum([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()]) """ return u.qual() # max([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()]) def getduplicates_GQ(u, v): """ select the prefered duplicate on the basis of the Sum of phred-like genotype qualities see gstrength returns prefered, discarded, strength of both """ if gstrength(u) > gstrength(v): return (u, v, gstrength(u), gstrength(v)) else: return (v, u, gstrength(v), gstrength(u)) def getduplicates_QUAL(u, v): """ select the prefered duplicate on the basis of Phred-scaled probability that this site is a variant see variantstrength returns prefered, discarded, strength of both """ if variantstrength(u) > variantstrength(v): return (u, v, variantstrength(u), variantstrength(v)) else: return (v, u, variantstrength(v), variantstrength(u)) def getoverlap(u, osize): # percentage overlap given the size of the overlap return 100 * osize / u.svlen def add_redundancy_infos_header(reader): # Adding DUPLICATESCORE info (equivalent to GSDUPLICATESCORE) reader.addInfo("DUPLICATESCORE", 1, "Float", "LOD score that the events are distinct based on the " "genotypes of the most discordant sample") # Adding DUPLICATES info (equivalent to GSDUPLICATEOVERLAP) reader.addInfo("DUPLICATEOVERLAP", 1, "Float", "Highest overlap with a duplicate event") # Adding DUPLICATEOF info (list of variant prefered to this one) reader.addInfo("DUPLICATEOF", ".", "String", "List of duplicate events preferred to this one") # Adding DUPLICATES info (list of variants duplicates of this one) reader.addInfo("DUPLICATES", ".", "String", "List of duplicate events represented by the current sv") # Adding NONDUPLICATEOVERLAP reader.addInfo("NONDUPLICATEOVERLAP", 1, "Float", "Amount of overlap with a non-duplicate") # Adding TOOLSUPPORT reader.addInfo("TOOLSUPPORT", ".", "String", "Tools supporting (detecting) the sv") def redundancy_annotator(SVSet, reader, overlap_cutoff, duplicatescore_threshold=-2, genotyper="svtyper"): """ Annotating duplicate candidates based on the genotype likelihoods - genotype likelihoods can be provided by svtyper or genomestrip """ add_redundancy_infos_header(reader) setLikeliTag(genotyper) variants = defaultdict() for sv in SVSet: variants[sv.id] = sv pybed_variants = vcf_to_pybed(SVSet) self_overlap = pybed_variants.intersect(pybed_variants, f=overlap_cutoff, r=True, wo=True) seen = defaultdict(tuple) duplicates = defaultdict(list) overlapping = defaultdict(tuple) reference = defaultdict() for o in self_overlap: if o[3] == o[7]: continue a, b = ordered(o[3], o[7]) if seen[(a, b)]: continue seen[(a, b)] = True u = variants[a] v = variants[b] score = duplicatescore(u, v) # print("Comparing %s and %s : %3.8f" % (u.id, v.id, score)) if score > duplicatescore_threshold: ref, dupli, _, _ = getduplicates_GQ(u, v) # print("%s prefered to %s %3.8f" % (ref.id, dupli.id, score)) reference[ref] = 1 overlap_size = int(o[-1]) overlap = getoverlap(dupli, overlap_size) if ref.maxGQ() > 0 and dupli.passed: # are dismissed # - reference variant with 0 genotype quality for all markers # - duplicate that are already tagged as filtered out # dupli.id is considered as a duplicate of ref.id duplicates[dupli.id].append((ref.id, score, overlap)) else: overlap_size = int(o[-1]) overlap_u = getoverlap(u, overlap_size) overlap_v = getoverlap(v, overlap_size) overlapping[(u, v)] = {'dupli_score': score, 'overlap_left': overlap_u, 'overlap_right': overlap_v, } for u in SVSet: if u.id in duplicates: print("tagged as duplicate %s" % u.id) duplis = sorted([a for (a, s, o) in duplicates[u.id]]) score = max([s for (a, s, o) in duplicates[u.id]]) overlap = max([o for (a, s, o) in duplicates[u.id]]) add_duplicate_info_sv(u, overlap, score, duplis) def add_duplicate_info_sv(sv, duplicateoverlap, duplicatescore, duplicates): """ simply adding two information to the sv infos """ if isnan(duplicatescore): sv.record.info['DUPLICATESCORE'] = float('nan') else: sv.record.info['DUPLICATESCORE'] = duplicatescore sv.record.info['DUPLICATEOVERLAP'] = duplicateoverlap sv.record.info['DUPLICATEOF'] = duplicates def add_overlap_info_sv(sv, overlap, duplicatescore): """ simply adding two information to the sv infos """ if isnan(duplicatescore): sv.record.info['DUPLICATESCORE'] = float('nan') else: sv.record.info['DUPLICATESCORE'] = duplicatescore sv.record.info['NONDUPLICATEOVERLAP'] = overlap def add_callrate_infos_header(reader): # Adding CALLRATE info reader.addInfo("CALLRATE", 1, "Float", "Percentage of samples called with a GQ>13") # Adding VARIANTCALLRATE info reader.addInfo("VARIANTCALLRATE", 1, "Float", "Percentage of variant samples called with a GQ>13") def add_filter_infos_header(reader): # FILTERS # Adding specific filters reader.addFilter("CALLRATE", "Call rate <0.75") reader.addFilter("VARIANTCALLRATE", "Variant Call rate <0.75") reader.addFilter("MONOMORPH", "All samples have the same genotype") reader.addFilter("DUPLICATE", "GSDUPLICATESCORE>0") reader.addFilter("OVERLAP", "NONDUPLICATEOVERLAP>0.7") reader.addFilter("ABFREQ", "AB frequency <0.3 for >50% heterosamples") def variant_filtration(variant_set, reader, filter_monomorph=False, filter_callrate=False): """ Filtering the candidate CNVs according to the following criteria - non duplicate sites - variant sites - call rate > 0.8 - at least one variant (homozygous or heterozygote) has a genotype quality > 20 - the variant is not everywhere heterozygote or homozygote (use NONVARIANTSCORE in both cases) """ add_callrate_infos_header(reader) add_filter_infos_header(reader) for sv in variant_set: info = sv.record.info sv.record.info['CALLRATE'] = sv.call_rate(13) sv.record.info['VARIANTCALLRATE'] = sv.variant_call_rate(13) if sv.call_rate(13) < 0.75 and filter_callrate: sv.filter.add("CALLRATE") if not sv.polymorph() and filter_monomorph: sv.filter.add("MONOMORPH") if 'NONDUPLICATEOVERLAP' in info and info['NONDUPLICATEOVERLAP'] > 0.8: sv.filter.add("OVERLAP") if "DUPLICATESCORE" in info is not None and info['DUPLICATESCORE'] > -2: sv.filter.add("DUPLICATE") def AB_filtering(variant_set): """ Filtering the candidate CNVs according to the following criteria - more than 50% of variant samples should have AB freq > 0.3 """ for sv in variant_set: valid_AB_freq = [] for s in sv.record.samples.values(): if Heterozygote(s): valid_AB_freq.append((s.get('AB')[0] > 0.3)) if (len(valid_AB_freq) > 0 and sum(valid_AB_freq) < len(valid_AB_freq) / 2): sv.filter.add("ABFREQ") def get_connected_duplicates(variant_set): """ Construct connected components of duplicates and rename the variants """ undirected = Graph() variant_dict = defaultdict() representatives = defaultdict() for s in variant_set: variant_dict[s.id] = s if "DUPLICATE" in s.filter: for dupli_repr in s.record.info["DUPLICATEOF"]: undirected.add_edge(s.id, dupli_repr) for component in connected_components(undirected): for c in component: if "DUPLICATEOF" in variant_dict[c].record.info: # the current variant is a duplicate continue rep = c # the representative of the equivalence class break duplicates = component duplicates.remove(rep) representatives[rep] = duplicates add_duplicate_infos(representatives, variant_dict) def add_duplicate_infos(representatives, sv_dict): for rep, elements in representatives.items(): for d in elements: sv_dict[d].record.info['DUPLICATEOF'] = rep duplicates = list(elements) if 'DUPLICATES' in sv_dict[rep].record.info: print(sv_dict[rep].record.info['DUPLICATES']) duplicates.extend(sv_dict[rep].record.info['DUPLICATES']) if len(duplicates) > 0: sv_dict[rep].record.info['DUPLICATES'] = duplicates def get_tool_name(sv_ident): return sv_ident.split("_")[0] def set_supporting_tools(variant_set): for sv in variant_set: tools = {get_tool_name(sv.id)} if "DUPLICATES" in sv.record.info: duplicates = sv.record.info['DUPLICATES'] # print(duplicates) for dupli in duplicates: tools.add(get_tool_name(dupli)) if 'TOOLSUPPORT' in sv.record.info: supporting = set(sv.record.info['TOOLSUPPORT']) tools = tools.union(supporting) sv.record.info['TOOLSUPPORT'] = list(tools) def static_vars(**kwargs): def decorate(func): for k in kwargs: setattr(func, k, kwargs[k]) return func return decorate @static_vars(counter=0) def new_id_str(sv): new_id_str.counter += 1 return "_".join(["cnvpipeline", sv.chrom, sv.svtype, str(new_id_str.counter)]) def rename_info_field(sv, key, sv_dict): if key in sv.record.info: info_oldid = sv.record.info[key] info_newid = [sv_dict[id] for id in info_oldid] sv.record.info[key] = info_newid def rename_variants(SVSet): sv_dict = defaultdict() for sv in SVSet: new_id = new_id_str(sv) sv.setNewId(new_id) sv_dict[sv.id] = new_id for sv in SVSet: rename_info_field(sv, "DUPLICATEOF", sv_dict) rename_info_field(sv, "DUPLICATES", sv_dict) sv.record.info['SOURCEID'] = sv.id sv.rename()