adding duplicate informations with the new identifiers

a47a5d83 · Thomas Faraut · 4f118e1d · a47a5d83 · a47a5d83
Commit a47a5d83 authored 5 years ago by Thomas Faraut
--- a/svreader/annotation.py
+++ b/svreader/annotation.py
@@ -8,18 +8,31 @@ from math import isnan
 from pysam import VariantFile

 from svrunner_utils import eprint, warn, vcf_to_pybed
-from svreader.vcfwrapper import VCFRecord, SVReader, SVWriter
+from svreader.vcfwrapper import VCFRecord, VCFReader, VCFWriter

 HOM_REF = (0, 0)
 HET_VAR = (0, 1)
 HOM_VAR = (1, 1)

+VALID_GENOTYPES = [HOM_REF, HET_VAR,  HOM_VAR]
+
 SVTYPER_GENO_LIKELI_TAG = "GL"
 GENOMESTRIP_GENO_LIKELI_TAG = "GP"

 Geno_likeli_tag = SVTYPER_GENO_LIKELI_TAG


+def coded_geno(geno):
+    if geno == HOM_REF:
+        return "HOMO_REF"
+    elif geno == HET_VAR:
+        return "HET_VAR"
+    elif geno == HOM_VAR:
+        return "HOMO_VAR"
+    else:
+        return "UNDEF"
+
+
 def Variant(sample):
    if sample.get('GT') in [HET_VAR, HOM_VAR]:
        return True
@@ -27,7 +40,7 @@ def Variant(sample):
        return False


-class VariantRecord(VCFRecord):
+class AnnotateRecord(VCFRecord):
    """
    A lightweight object to annotated the final records
    """
@@ -35,7 +48,8 @@ class VariantRecord(VCFRecord):
        """
        A pysam VariantRecord wrapper
        """
-        super(VariantRecord, self).__init__(record)
+        super(AnnotateRecord, self).__init__(record)
+        self.new_id = None

    @property
    def start(self):
@@ -60,13 +74,20 @@ class VariantRecord(VCFRecord):
        else:
            return False

+    @property
+    def num_samples(self):
+        return len(self.record.samples.keys())
+
    def setNewId(self, identifier):
+        self.new_id = identifier
+
+    def rename(self):
        try:
            self.record.info['SOURCEID'] = self.id
        except KeyError:
            eprint("SOURCEID absent from record info keys,")
            sys.exit(1)
-        self.id = identifier
+        self.id = self.new_id

    def num_variant_samples(self):
        return sum([Variant(s) for s in self.samples.values()])
@@ -78,11 +99,24 @@ class VariantRecord(VCFRecord):
        return sum([s.get('SQ') for s in self.samples.values()])

    def GQ_sum_score(self):
-        return sum([s.get('SQ') for s in self.samples.values()])
+        return sum([s.get('GQ') for s in self.samples.values()])
+
+    def maxGQ(self):
+        return max([s.get('GQ') for s in self.samples.values()])

    def setQual(self):
        self.record.qual = self.qual()

+    def numdiffGenotypes(self):
+        genotypes = defaultdict()
+        for s in self.samples.values():
+            if s.get('GT') in VALID_GENOTYPES:
+                genotypes[coded_geno(s.get('GT'))] = 1
+        return len(genotypes.keys())
+
+    def Polymorph(self):
+        return self.numdiffGenotypes() > 1
+
    def AddSupportInfos(self):
        supp_reads = self.variant_read_support()
        num_supp_samples = self.num_variant_samples()
@@ -93,6 +127,14 @@ class VariantRecord(VCFRecord):
            eprint("SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys")
            sys.exit(1)

+    def CallRate(self, cutoff):
+        num_qual_call = sum([(s.get('GQ') > cutoff) for s in self.samples.values()])
+        return num_qual_call/self.num_samples
+
+    def VariantCallRate(self, cutoff):
+        num_qual_call = sum([(s.get('GQ') > cutoff) for s in self.samples.values() if Variant(s)])
+        return num_qual_call/self.num_variant_samples()
+
    def UnifiedPass(self):
        """
           All records passing the filters (PASS, .) ar now labelled PASS
@@ -106,51 +148,14 @@ class VariantRecord(VCFRecord):
            record.filter.add("PASS")


-class VariantRecordSample(object):
-    """
-    A lightweight object to VariantRecordSample
-    """
-    def __init__(self, variantsample):
-        self.variantsample = variantsample
-
-    def genotype(self):
-        return self.variantsample.get('GT')
-
-    def SQ_score(self):
-        # Phred-scaled probability that this site is variant
-        # (non-reference in this sample)
-        return self.variantsample.get('SQ')
-
-    def GQ_score(self):
-        # Genotype quality
-        return self.variantsample.get('GQ')
-
-    def GL_score(self):
-        # Genotype Likelihood, log10-scaled likelihoods of the data given the
-        # called genotype for each possible genotype generated from the
-        # reference and alternate alleles given the sample ploidy
-        return self.variantsample.get('GL')
-
-    def AltSupport(self):
-        return self.variantsample.get('AO')[0]
-
-    @property
-    def isVariant(self):
-        if self.genotype() in [HET_VAR, HOM_VAR]:
-            return True
-        else:
-            return False
-
-
-class VCFReader(SVReader):
-
+class AnnotateReader(VCFReader):
    def __init__(self, file_name, sv_to_report=None):
-        super(VCFReader, self).__init__(file_name)
+        super(AnnotateReader, self).__init__(file_name)
        self.filename = file_name
        self.sv_to_report = sv_to_report
        self.vcf_reader = VariantFile(file_name)

-        self.add_metadata()
+        self.add_annotation_metadata()

    def __iter__(self):
        return self
@@ -158,25 +163,25 @@ class VCFReader(SVReader):
    def __next__(self):
        while True:
            raw_record = next(self.vcf_reader)
-            record = VariantRecord(raw_record)
+            record = AnnotateRecord(raw_record)
            return record

    def getHeader(self):
        return self.vcf_reader.header

-    def addInfo(self, name, number, type, description):
-        self.vcf_reader.header.info.add(id=name,
-                                        number=number,
-                                        type=type,
-                                        description=description)
-
-    def addFilter(self, name, description):
-        self.vcf_reader.header.filters.add(id=name,
-                                           number=None,
-                                           type=None,
-                                           description=description)
-
-    def add_metadata(self):
+    # def addInfo(self, name, number, type, description):
+    #     self.vcf_reader.header.info.add(id=name,
+    #                                     number=number,
+    #                                     type=type,
+    #                                     description=description)
+    #
+    # def addFilter(self, name, description):
+    #     self.vcf_reader.header.filters.add(id=name,
+    #                                        number=None,
+    #                                        type=None,
+    #                                        description=description)
+
+    def add_annotation_metadata(self):
        self.addInfo("SOURCEID", 1, "String",
                     "The source sv identifier")
        self.addInfo("MAX_SUPP_READS", 1, "Integer",
@@ -195,7 +200,7 @@ class VCFReader(SVReader):
        return len(self.vcf_reader.header.samples)


-class VCFWriter(SVWriter):
+class AnnotateWriter(VCFWriter):

    def __init__(self, file_name,  template_reader, index=True):

@@ -365,9 +370,12 @@ def add_redundancy_infos_header(reader):
    # Adding DUPLICATES info (equivalent to GSDUPLICATEOVERLAP)
    reader.addInfo("DUPLICATEOVERLAP", 1, "Float",
                   "Highest overlap with a duplicate event")
-    # Adding DUPLICATEOVERLAP info (equivalent to GSDUPLICATEOVERLAP)
-    reader.addInfo("DUPLICATES", ".", "String",
+    # Adding DUPLICATEOF info (list of variant prefered to this one)
+    reader.addInfo("DUPLICATEOF", ".", "String",
                   "List of duplicate events preferred to this one")
+    # Adding DUPLICATES info (list of variants duplicates of this one)
+    reader.addInfo("DUPLICATES", ".", "String",
+                   "List of duplicate events represented by the current sv")
    # Adding NONDUPLICATEOVERLAP
    reader.addInfo("NONDUPLICATEOVERLAP", 1, "Float",
                   "Amount of overlap with a non-duplicate")
@@ -396,7 +404,6 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
    overlapping = defaultdict(tuple)
    reference = defaultdict()
    for o in self_overlap:
-        print("Here ?", o)
        if o[3] == o[7]:
            continue
        a, b = ordered(o[3], o[7])
@@ -413,7 +420,7 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
            reference[ref] = 1
            overlap_size = int(o[-1])
            overlap = getoverlap(dupli, overlap_size)
-            if maxGQ(ref) > 0 and passed(dupli):
+            if ref.maxGQ() > 0 and dupli.passed:
                # are dismissed
                # - reference variant with 0 genotype quality for all markers
                # - duplicate that are already tagged as filtered out
@@ -445,7 +452,7 @@ def add_duplicate_info_sv(sv, duplicateoverlap, duplicatescore, duplicates):
    else:
        sv.record.info['DUPLICATESCORE'] = duplicatescore
    sv.record.info['DUPLICATEOVERLAP'] = duplicateoverlap
-    sv.record.info['DUPLICATES'] = duplicates
+    sv.record.info['DUPLICATEOF'] = duplicates


 def add_overlap_info_sv(sv, overlap, duplicatescore):
@@ -459,10 +466,20 @@ def add_overlap_info_sv(sv, overlap, duplicatescore):
    sv.record.info['NONDUPLICATEOVERLAP'] = overlap


+def add_callrate_infos_header(reader):
+    # Adding CALLRATE info
+    reader.addInfo("CALLRATE", 1, "Float",
+                   "Percentage of samples called with a GQ>13")
+    # Adding VARIANTCALLRATE info
+    reader.addInfo("VARIANTCALLRATE", 1, "Float",
+                   "Percentage of variant samples called with a GQ>13")
+
+
 def add_filter_infos_header(reader):
    # FILTERS
    # Adding specific filters
    reader.addFilter("CALLRATE", "Call rate <0.75")
+    reader.addFilter("VARIANTCALLRATE", "Variant Call rate <0.75")
    reader.addFilter("MONOMORPH", "All samples have the same genotype")
    reader.addFilter("DUPLICATE", "GSDUPLICATESCORE>0")
    reader.addFilter("OVERLAP", "NONDUPLICATEOVERLAP>0.7")
@@ -479,13 +496,16 @@ def GenomeSTRIPLikefiltering(SVSet, reader):
            (use NONVARIANTSCORE in both cases)
    """

+    add_callrate_infos_header(reader)
    add_filter_infos_header(reader)

    for sv in SVSet:
        info = sv.record.info
-        if callRate(sv) < 0.75:
+        sv.record.info['CALLRATE'] = sv.CallRate(13)
+        sv.record.info['VARIANTCALLRATE'] = sv.VariantCallRate(13)
+        if sv.CallRate(13) < 0.75:
            sv.filter.add("CALLRATE")
-        if not Polymorph(sv):
+        if not sv.Polymorph():
            sv.filter.add("MONOMORPH")
        if 'NONDUPLICATEOVERLAP' in info and info['NONDUPLICATEOVERLAP'] > 0.7:
            sv.filter.add("OVERLAP")

--- a/svreader/vcfwrapper.py
+++ b/svreader/vcfwrapper.py
@@ -39,6 +39,10 @@ class VCFRecord(SVRecord):
    def id(self):
        return self.record.id

+    @id.setter
+    def id(self, id):
+        self.record.id = id
+
    @property
    def pos(self):
        return self.record.pos
@@ -51,6 +55,10 @@ class VCFRecord(SVRecord):
    def stop(self):
        return self.record.stop

+    @property
+    def svtype(self):
+        return self._sv_type
+
    @property
    def filter(self):
        return self.record.filter
@@ -84,16 +92,24 @@ class VCFReader(SVReader):
        self.vcf_reader.header.info.remove_header(key)

    def addInfo(self, name, number, type, description):
-        self.vcf_reader.header.info.add(id=name,
-                                        number=number,
-                                        type=type,
-                                        description=description)
+        # we add info only if absent from the current header
+        if name in self.vcf_reader.header.info:
+            return
+        else:
+            self.vcf_reader.header.info.add(id=name,
+                                            number=number,
+                                            type=type,
+                                            description=description)

    def addFilter(self, name, description):
-        self.vcf_reader.header.filters.add(id=name,
-                                           number=None,
-                                           type=None,
-                                           description=description)
+        # we add filter info only if absent from the current header
+        if name in self.vcf_reader.header.filters:
+            return
+        else:
+            self.vcf_reader.header.filters.add(id=name,
+                                               number=None,
+                                               type=None,
+                                               description=description)

    def getOrderedSamples(self):
        samples = self.vcf_reader.header.samples