diff --git a/svreader/annotation.py b/svreader/annotation.py index 8c5ce8b154915c5f5467a6af3350cf02d8a81ea8..11c2b9855d973ca60e8159a1a7e85a431ae0fc34 100644 --- a/svreader/annotation.py +++ b/svreader/annotation.py @@ -2,24 +2,39 @@ import sys from collections import defaultdict +from networkx import Graph, connected_components + import numpy as np from math import isnan from pysam import VariantFile from svrunner_utils import eprint, warn, vcf_to_pybed -from svreader.vcfwrapper import VCFRecord, SVReader, SVWriter +from svreader.vcfwrapper import VCFRecord, VCFReader, VCFWriter HOM_REF = (0, 0) HET_VAR = (0, 1) HOM_VAR = (1, 1) +VALID_GENOTYPES = [HOM_REF, HET_VAR, HOM_VAR] + SVTYPER_GENO_LIKELI_TAG = "GL" GENOMESTRIP_GENO_LIKELI_TAG = "GP" Geno_likeli_tag = SVTYPER_GENO_LIKELI_TAG +def coded_geno(geno): + if geno == HOM_REF: + return "HOMO_REF" + elif geno == HET_VAR: + return "HET_VAR" + elif geno == HOM_VAR: + return "HOMO_VAR" + else: + return "UNDEF" + + def Variant(sample): if sample.get('GT') in [HET_VAR, HOM_VAR]: return True @@ -27,7 +42,14 @@ def Variant(sample): return False -class VariantRecord(VCFRecord): +def Heterozygote(sample): + if sample.get('GT') in [HET_VAR]: + return True + else: + return False + + +class AnnotateRecord(VCFRecord): """ A lightweight object to annotated the final records """ @@ -35,7 +57,8 @@ class VariantRecord(VCFRecord): """ A pysam VariantRecord wrapper """ - super(VariantRecord, self).__init__(record) + super(AnnotateRecord, self).__init__(record) + self.new_id = None @property def start(self): @@ -60,29 +83,64 @@ class VariantRecord(VCFRecord): else: return False - def setNewId(self, identifier): + @property + def num_samples(self): + return len(self.record.samples.keys()) + + def setNewId(self, new_id): + self.new_id = new_id + + def rename(self): try: self.record.info['SOURCEID'] = self.id except KeyError: eprint("SOURCEID absent from record info keys,") sys.exit(1) - self.id = identifier + self.id = self.new_id def num_variant_samples(self): return sum([Variant(s) for s in self.samples.values()]) def variant_read_support(self): - return max([s.get('AO')[0] for s in self.samples.values()]) + support = [] + for s in self.samples.values(): + if s.get('AO') is not None: + support.append(s.get('AO')[0]) + return max(support) def qual(self): - return sum([s.get('SQ') for s in self.samples.values()]) + variant_qual = [] + for s in self.samples.values(): + if s.get('SQ') is not None: + variant_qual.append(s.get('SQ')) + return sum(variant_qual) + + def GQ_samples(self): + genotype_qual = [] + for s in self.samples.values(): + if s.get('GQ') is not None: + genotype_qual.append(s.get('GQ')) + return genotype_qual def GQ_sum_score(self): - return sum([s.get('SQ') for s in self.samples.values()]) + return sum(self.GQ_samples()) + + def maxGQ(self): + return max(self.GQ_samples()) def setQual(self): self.record.qual = self.qual() + def numdiffGenotypes(self): + genotypes = defaultdict() + for s in self.samples.values(): + if s.get('GT') in VALID_GENOTYPES: + genotypes[coded_geno(s.get('GT'))] = 1 + return len(genotypes.keys()) + + def Polymorph(self): + return self.numdiffGenotypes() > 1 + def AddSupportInfos(self): supp_reads = self.variant_read_support() num_supp_samples = self.num_variant_samples() @@ -93,6 +151,24 @@ class VariantRecord(VCFRecord): eprint("SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys") sys.exit(1) + def CallRate(self, cutoff): + call_qual = [] + for s in self.samples.values(): + if s.get('GQ') is not None: + call_qual.append(s.get('GQ')) + num_qual_call = sum([(qual > cutoff) for qual in call_qual]) + return num_qual_call/self.num_samples + + def VariantCallRate(self, cutoff): + samples = self.samples.values() + num_qual_var = 0 + for s in samples: + if s.get("GQ") is not None and s.get("GQ") > cutoff and Variant(s): + num_qual_var += 1 + num_var_samples = self.num_variant_samples() + var_call_rate = num_qual_var/num_var_samples if num_var_samples else 0 + return var_call_rate + def UnifiedPass(self): """ All records passing the filters (PASS, .) ar now labelled PASS @@ -106,52 +182,14 @@ class VariantRecord(VCFRecord): record.filter.add("PASS") -class VariantRecordSample(object): - """ - A lightweight object to VariantRecordSample - """ - def __init__(self, variantsample): - self.variantsample = variantsample - - def genotype(self): - return self.variantsample.get('GT') - - def SQ_score(self): - # Phred-scaled probability that this site is variant - # (non-reference in this sample) - return self.variantsample.get('SQ') - - def GQ_score(self): - # Genotype quality - return self.variantsample.get('GQ') - - def GL_score(self): - # Genotype Likelihood, log10-scaled likelihoods of the data given the - # called genotype for each possible genotype generated from the - # reference and alternate alleles given the sample ploidy - return self.variantsample.get('GL') - - def AltSupport(self): - return self.variantsample.get('AO')[0] - - @property - def isVariant(self): - if self.genotype() in [HET_VAR, HOM_VAR]: - return True - else: - return False - - -class VCFReader(SVReader): - +class AnnotateReader(VCFReader): def __init__(self, file_name, sv_to_report=None): - super(VCFReader, self).__init__(file_name) - self.vcf_reader = VariantFile(file_name) + super(AnnotateReader, self).__init__(file_name) self.filename = file_name self.sv_to_report = sv_to_report self.vcf_reader = VariantFile(file_name) - self.add_metadata() + self.add_annotation_metadata() def __iter__(self): return self @@ -159,25 +197,25 @@ class VCFReader(SVReader): def __next__(self): while True: raw_record = next(self.vcf_reader) - record = VariantRecord(raw_record) + record = AnnotateRecord(raw_record) return record def getHeader(self): return self.vcf_reader.header - def addInfo(self, name, number, type, description): - self.vcf_reader.header.info.add(id=name, - number=number, - type=type, - description=description) - - def addFilter(self, name, description): - self.vcf_reader.header.filters.add(id=name, - number=None, - type=None, - description=description) - - def add_metadata(self): + # def addInfo(self, name, number, type, description): + # self.vcf_reader.header.info.add(id=name, + # number=number, + # type=type, + # description=description) + # + # def addFilter(self, name, description): + # self.vcf_reader.header.filters.add(id=name, + # number=None, + # type=None, + # description=description) + + def add_annotation_metadata(self): self.addInfo("SOURCEID", 1, "String", "The source sv identifier") self.addInfo("MAX_SUPP_READS", 1, "Integer", @@ -187,7 +225,6 @@ class VCFReader(SVReader): self.addFilter("LOWSUPPORT", "total supp reads < 5 or supp samples < 2") - def getOrderedSamples(self): samples = self.vcf_reader.header.samples sample_names = [sample.rsplit('.')[0] for sample in samples] @@ -197,7 +234,7 @@ class VCFReader(SVReader): return len(self.vcf_reader.header.samples) -class VCFWriter(SVWriter): +class AnnotateWriter(VCFWriter): def __init__(self, file_name, template_reader, index=True): @@ -367,12 +404,18 @@ def add_redundancy_infos_header(reader): # Adding DUPLICATES info (equivalent to GSDUPLICATEOVERLAP) reader.addInfo("DUPLICATEOVERLAP", 1, "Float", "Highest overlap with a duplicate event") - # Adding DUPLICATEOVERLAP info (equivalent to GSDUPLICATEOVERLAP) - reader.addInfo("DUPLICATES", ".", "String", + # Adding DUPLICATEOF info (list of variant prefered to this one) + reader.addInfo("DUPLICATEOF", ".", "String", "List of duplicate events preferred to this one") + # Adding DUPLICATES info (list of variants duplicates of this one) + reader.addInfo("DUPLICATES", ".", "String", + "List of duplicate events represented by the current sv") # Adding NONDUPLICATEOVERLAP reader.addInfo("NONDUPLICATEOVERLAP", 1, "Float", "Amount of overlap with a non-duplicate") + # Adding TOOLSUPPORT + reader.addInfo("TOOLSUPPORT", ".", "String", + "Tools supporting (detecting) the sv") def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader, @@ -398,7 +441,6 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader, overlapping = defaultdict(tuple) reference = defaultdict() for o in self_overlap: - print("Here ?", o) if o[3] == o[7]: continue a, b = ordered(o[3], o[7]) @@ -415,7 +457,7 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader, reference[ref] = 1 overlap_size = int(o[-1]) overlap = getoverlap(dupli, overlap_size) - if maxGQ(ref) > 0 and passed(dupli): + if ref.maxGQ() > 0 and dupli.passed: # are dismissed # - reference variant with 0 genotype quality for all markers # - duplicate that are already tagged as filtered out @@ -447,7 +489,7 @@ def add_duplicate_info_sv(sv, duplicateoverlap, duplicatescore, duplicates): else: sv.record.info['DUPLICATESCORE'] = duplicatescore sv.record.info['DUPLICATEOVERLAP'] = duplicateoverlap - sv.record.info['DUPLICATES'] = duplicates + sv.record.info['DUPLICATEOF'] = duplicates def add_overlap_info_sv(sv, overlap, duplicatescore): @@ -461,13 +503,24 @@ def add_overlap_info_sv(sv, overlap, duplicatescore): sv.record.info['NONDUPLICATEOVERLAP'] = overlap +def add_callrate_infos_header(reader): + # Adding CALLRATE info + reader.addInfo("CALLRATE", 1, "Float", + "Percentage of samples called with a GQ>13") + # Adding VARIANTCALLRATE info + reader.addInfo("VARIANTCALLRATE", 1, "Float", + "Percentage of variant samples called with a GQ>13") + + def add_filter_infos_header(reader): # FILTERS # Adding specific filters reader.addFilter("CALLRATE", "Call rate <0.75") + reader.addFilter("VARIANTCALLRATE", "Variant Call rate <0.75") reader.addFilter("MONOMORPH", "All samples have the same genotype") reader.addFilter("DUPLICATE", "GSDUPLICATESCORE>0") reader.addFilter("OVERLAP", "NONDUPLICATEOVERLAP>0.7") + reader.addFilter("ABFREQ", "AB frequency <0.3 for >50% heterosamples") def GenomeSTRIPLikefiltering(SVSet, reader): @@ -481,15 +534,122 @@ def GenomeSTRIPLikefiltering(SVSet, reader): (use NONVARIANTSCORE in both cases) """ + add_callrate_infos_header(reader) add_filter_infos_header(reader) for sv in SVSet: info = sv.record.info - if callRate(sv) < 0.75: + sv.record.info['CALLRATE'] = sv.CallRate(13) + sv.record.info['VARIANTCALLRATE'] = sv.VariantCallRate(13) + if sv.CallRate(13) < 0.75: sv.filter.add("CALLRATE") - if not Polymorph(sv): + if not sv.Polymorph(): sv.filter.add("MONOMORPH") if 'NONDUPLICATEOVERLAP' in info and info['NONDUPLICATEOVERLAP'] > 0.7: sv.filter.add("OVERLAP") if "DUPLICATESCORE" in info is not None and info['DUPLICATESCORE'] > -2: sv.filter.add("DUPLICATE") + + +def ABFreqFiltering(SVSet): + """ Filtering the candidate CNVs according to the following criteria + - more than 50% of variant samples should have AB freq > 0.3 + """ + + for sv in SVSet: + ABfreqOK = [] + for s in sv.record.samples.values(): + if Heterozygote(s): + ABfreqOK.append((s.get('AB')[0] > 0.3)) + if len(ABfreqOK) > 0 and sum(ABfreqOK) < len(ABfreqOK)/2: + sv.filter.add("ABFREQ") + + +def GetConnectedDuplicates(SVSet): + """ + Construct connected components of duplicates and rename the variants + """ + undirected = Graph() + variant_dict = defaultdict() + representatives = defaultdict() + for s in SVSet: + variant_dict[s.id] = s + if "DUPLICATE" in s.filter: + for dupli_repr in s.record.info["DUPLICATEOF"]: + undirected.add_edge(s.id, dupli_repr) + for component in connected_components(undirected): + for c in component: + if "DUPLICATEOF" in variant_dict[c].record.info: + # the current variant is a duplicate + continue + rep = c # the representative of the equivalence class + break + duplicates = component + duplicates.remove(rep) + representatives[rep] = duplicates + add_duplicate_infos(representatives, variant_dict) + + +def add_duplicate_infos(representatives, sv_dict): + for rep, elements in representatives.items(): + for d in elements: + sv_dict[d].record.info['DUPLICATEOF'] = rep + duplicates = list(elements) + if 'DUPLICATES' in sv_dict[rep].record.info: + print(sv_dict[rep].record.info['DUPLICATES']) + duplicates.extend(sv_dict[rep].record.info['DUPLICATES']) + if len(duplicates) > 0: + sv_dict[rep].record.info['DUPLICATES'] = duplicates + + +def get_tool_name(sv_ident): + return sv_ident.split("_")[0] + + +def SetSupportingTools(SVSet): + for sv in SVSet: + tools = {get_tool_name(sv.id)} + if "DUPLICATES" in sv.record.info: + duplicates = sv.record.info['DUPLICATES'] + # print(duplicates) + for dupli in duplicates: + tools.add(get_tool_name(dupli)) + if 'TOOLSUPPORT' in sv.record.info: + supporting = set(sv.record.info['TOOLSUPPORT']) + tools.union(supporting) + sv.record.info['TOOLSUPPORT'] = list(tools) + + +def static_vars(**kwargs): + def decorate(func): + for k in kwargs: + setattr(func, k, kwargs[k]) + return func + return decorate + + +@static_vars(counter=0) +def new_id_str(sv): + new_id_str.counter += 1 + return "_".join(["cnvpipeline", sv.chrom, sv.svtype, + str(new_id_str.counter)]) + + +def rename_info_field(sv, key, sv_dict): + if key in sv.record.info: + info_oldid = sv.record.info[key] + info_newid = [sv_dict[id] for id in info_oldid] + sv.record.info[key] = info_newid + + +def RenameSV(SVSet): + sv_dict = defaultdict() + for sv in SVSet: + new_id = new_id_str(sv) + sv.setNewId(new_id) + sv_dict[sv.id] = new_id + for sv in SVSet: + rename_info_field(sv, "DUPLICATEOF", sv_dict) + rename_info_field(sv, "DUPLICATES", sv_dict) + sv.record.info['SOURCEID'] = sv.id + sv.rename() diff --git a/svreader/pindel.py b/svreader/pindel.py index 29c3593365e469d0ec224b85492c3df4781679ee..9b320e096f1d1bd9ce61574c41f742a3fd193e61 100644 --- a/svreader/pindel.py +++ b/svreader/pindel.py @@ -388,6 +388,7 @@ class PindelReader(SVReader): # # Sudmant et al 2015 SuppInfo # return (record.length() > 60) def SpecificFilterPass(self, record): + # new pindel specific filter if record.length() >= 800 or record.length() <= 60: return False elif record.MaxIndividualSupport() <= 3: diff --git a/svreader/vcfwrapper.py b/svreader/vcfwrapper.py index 7c3bd7346f461e9a1694876ffc21db2312ca3eb6..605830db0911b5bdd21f89b6ab0af3f890985941 100644 --- a/svreader/vcfwrapper.py +++ b/svreader/vcfwrapper.py @@ -39,6 +39,10 @@ class VCFRecord(SVRecord): def id(self): return self.record.id + @id.setter + def id(self, id): + self.record.id = id + @property def pos(self): return self.record.pos @@ -51,6 +55,10 @@ class VCFRecord(SVRecord): def stop(self): return self.record.stop + @property + def svtype(self): + return self._sv_type + @property def filter(self): return self.record.filter @@ -84,16 +92,24 @@ class VCFReader(SVReader): self.vcf_reader.header.info.remove_header(key) def addInfo(self, name, number, type, description): - self.vcf_reader.header.info.add(id=name, - number=number, - type=type, - description=description) + # we add info only if absent from the current header + if name in self.vcf_reader.header.info: + return + else: + self.vcf_reader.header.info.add(id=name, + number=number, + type=type, + description=description) def addFilter(self, name, description): - self.vcf_reader.header.filters.add(id=name, - number=None, - type=None, - description=description) + # we add filter info only if absent from the current header + if name in self.vcf_reader.header.filters: + return + else: + self.vcf_reader.header.filters.add(id=name, + number=None, + type=None, + description=description) def getOrderedSamples(self): samples = self.vcf_reader.header.samples