modified annotation

85d17402 · Thomas Faraut · 3ab9cd8e · 85d17402 · 85d17402 · 85d17402
Commit 85d17402 authored 5 years ago by Thomas Faraut
--- a/svfilter.py
+++ b/svfilter.py
@@ -253,12 +253,30 @@ def setLikeliTag(genotyper):
        exit(1)
-def GenomeSTRIPLikeRedundancyAnnotator(SVSet, duplicatescore_threshold=-2,
+def add_redundancy_infos_header(reader):
+    # Adding DUPLICATESCORE info (equivalent to GSDUPLICATESCORE)
+    reader.addInfo("DUPLICATESCORE", 1, "Float",
+                   "LOD score that the events are distinct based on the "
+                   "genotypes of the most discordant sample")
+    # Adding DUPLICATES info (equivalent to GSDUPLICATEOVERLAP)
+    reader.addInfo("DUPLICATEOVERLAP", 1, "Float",
+                   "Highest overlap with a duplicate event")
+    # Adding DUPLICATEOVERLAP info (equivalent to GSDUPLICATEOVERLAP)
+    reader.addInfo("DUPLICATES", ".", "String",
+                   "List of duplicate events preferred to this one")
+    # Adding NONDUPLICATEOVERLAP
+    reader.addInfo("NONDUPLICATEOVERLAP", 1, "Float",
+                    "Amount of overlap with a non-duplicate")
+def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
+                                       duplicatescore_threshold=-2,
                                       genotyper="svtyper"):
    """ Annotating duplicate candidates based on the genotype likelihoods
      - genotype likelihoods can be provided by svtyper or genomestrip
    """
+    add_redundancy_infos_header(reader)
    setLikeliTag(genotyper)
    variants = defaultdict()
@@ -344,7 +362,16 @@ def add_overlap_info_sv(sv, overlap, duplicatescore):
    sv.record.info['NONDUPLICATEOVERLAP'] = overlap
-def GenomeSTRIPLikefiltering(SVSet):
+def add_filter_infos_header(reader):
+    # FILTERS
+    # Adding specific filters
+    reader.addFilter("CALLRATE", "Call rate <0.75")
+    reader.addFilter("MONOMORPH", "All samples have the same genotype")
+    reader.addFilter("DUPLICATE", "GSDUPLICATESCORE>0")
+    reader.addFilter("OVERLAP", "NONDUPLICATEOVERLAP>0.7")
+def GenomeSTRIPLikefiltering(SVSet, reader):
    """ Filtering the candidate CNVs according to the following criteria
          - non duplicate sites
          - variant sites
@@ -355,30 +382,15 @@ def GenomeSTRIPLikefiltering(SVSet):
            (use NONVARIANTSCORE in both cases)
    """
+    add_filter_infos_header(reader)
    for sv in SVSet:
        info = sv.record.info
        if callRate(sv) < 0.75:
            sv.filter.add("CALLRATE")
        if not Polymorph(sv):
            sv.filter.add("MONOMORPH")
-        # if info['GSDUPLICATESCORE'] and info['GSDUPLICATESCORE']>0:
-        #    sv.filter.add("DUPLICATE")
-        # DUPLICATE
        if 'NONDUPLICATEOVERLAP' in info and info['NONDUPLICATEOVERLAP'] > 0.7:
            sv.filter.add("OVERLAP")
        if "DUPLICATESCORE" in info is not None and info['DUPLICATESCORE'] > -2:
            sv.filter.add("DUPLICATE")
-        # NONVARIANT
-        if 'GSNONVARSCORE' in info and info['GSNONVARSCORE'] and info['GSNONVARSCORE'] > 13:
-            sv.filter.add("NONVARIANT")
-        # GTDEPTH
-        if 'GSDEPTHRATIO' in info and (info['GSDEPTHRATIO'] is None or info['GSDEPTHRATIO'] > 0.8):
-            sv.filter.add("DEPTH")
-        if 'DEPTHRATIO' in info and (isnan(info['DEPTHRATIO']) or info['DEPTHRATIO'] > 0.8):
-            sv.filter.add("DDEPTH")
-        if 'DEPTHCALLTHRESHOLD' in info and info['DEPTHCALLTHRESHOLD'] > 1:
-            sv.filter.add("CCOVERAGE")
-        # CLUSTERSEP
-        if 'GSM1' in info and (info['GSM1'] <= -0.5 or info['GSM1'] >= 2.0):
-            sv.filter.add("CLUSTERSEP")
--- a/svreader/Annotate.py
+++ b/svreader/Annotate.py
+import sys
-from svreader import SVRecord, SVReader, SVWriter
+from svrunner_utils import eprint
+from svreader import SVReader, SVWriter
 from pysam import VariantFile
@@ -8,6 +9,7 @@ HOM_REF = (0, 0)
 HET_VAR = (0, 1)
 HOM_VAR = (1, 1)
 class AnnotatedRecord(object):
    """
    A lightweight object to annotated the final records
@@ -56,7 +58,49 @@ class AnnotatedRecord(object):
    @property
    def samples(self):
-        return self.record.samples
+        return [AnnotatedRecordSample(s) for s in self.record.samples.values()]
+    def setNewId(self, identifier):
+        try:
+            self.record.info['SOURCEID'] = self.id
+        except KeyError:
+            eprint("SOURCEID absent from record info keys,")
+            sys.exit(1)
+        self.id = identifier
+    def num_variant_samples(self):
+        return sum([s.isVariant for s in self.samples])
+    def variant_read_support(self):
+        return sum([s.AltSupport() for s in self.samples])
+    def qual(self):
+        return sum([s.SQ_score() for s in self.samples if s.isVariant])
+    def setQual(self):
+        self.record.qual = self.qual()
+    def AddSupportInfos(self):
+        supp_reads = self.variant_read_support()
+        num_supp_samples = self.num_variant_samples()
+        try:
+            self.record.info['SUPP_READS'] = supp_reads
+            self.record.info['NUM_SUPP_SAMPLES'] = num_supp_samples
+        except KeyError:
+            eprint("SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys")
+            sys.exit(1)
+    def UnifiedPass(self):
+        """
+           All records passing the filters (PASS, .) ar now labelled PASS
+        """
+        record = self.record
+        filters = [f for f in record.filter]
+        # We make the assumption when a "." is present no other filter
+        # are present
+        if len(filters) == 0 or "." in filters:
+            record.filter.clear()
+            record.filter.add("PASS")
 class AnnotatedRecordSample(object):
@@ -66,25 +110,41 @@ class AnnotatedRecordSample(object):
    def __init__(self, variantsample):
        self.variantsample = variantsample
+    def genotype(self):
+        return self.variantsample.get('GT')
+    def SQ_score(self):
+        return self.variantsample.get('SQ')
+    def AltSupport(self):
+        return self.variantsample.get('AO')[0]
+    @property
+    def isVariant(self):
+        if self.genotype() in [HET_VAR, HOM_VAR]:
+            return True
+        else:
+            return False
 class VCFReader(SVReader):
    def __init__(self, file_name, sv_to_report=None):
-        super(VCFReader, self).__init__(file_name,
+        super(VCFReader, self).__init__(file_name)
-                                        generic_name,
-                                        reference_handle)
        self.vcf_reader = VariantFile(file_name)
        self.filename = file_name
-        self.sv_to_report =sv_to_report
+        self.sv_to_report = sv_to_report
        self.vcf_reader = VariantFile(file_name)
+        self.add_metadata()
    def __iter__(self):
        return self
    def __next__(self):
        while True:
            raw_record = next(self.vcf_reader)
-            record = (raw_record)
+            record = AnnotatedRecord(raw_record)
            return record
    def getHeader(self):
@@ -102,6 +162,17 @@ class VCFReader(SVReader):
                                           type=None,
                                           description=description)
+    def add_metadata(self):
+        self.addInfo("SOURCEID", 1, "String",
+                     "The source sv identifier")
+        self.addInfo("SUPP_READS", 1, "Integer",
+                     "Number of supporting reads")
+        self.addInfo("NUM_SUPP_SAMPLES", 1, "Integer",
+                     "Number of supporting samples")
+        self.addFilter("LOWSUPPORT",
+                       "total supp reads < 5 or supp samples < 2")
    def getOrderedSamples(self):
        samples = self.vcf_reader.header.samples
        sample_names = [sample.rsplit('.')[0] for sample in samples]

--- a/svreader/__init__.py
+++ b/svreader/__init__.py
@@ -228,34 +228,7 @@ class SVReader(object):
        if excluded_regions is not None:
            vcf_records = self._filter_for_gaps(vcf_records, excluded_regions,
                                                cutoff_proximity)
-            # Convert vcf to intervals
-        #     eprint("Converting to bedtools intervals")
-        #     variants = vcf_to_pybed(vcf_records, excluded_regions,
-        #                             cutoff_proximity)
-        #
-        #     # Identify SV overlapping regions to exclude
-        #     eprint("Loading exlusion intervals ")
-        #     eprint(excluded_regions)
-        #     toexclude = BedTool(excluded_regions).sort()
-        #     excluded = dict()
-        #     if len(toexclude):
-        #         eprint("Computing distance with gaps: %d gaps" % len(toexclude))
-        #         # removing cnv in a close proximity to gaps in the assembly
-        #         # -d=True In addition to the closest feature in B,
-        #         # report its distance to A as an extra column
-        #         # TODO this is questionnable ??? (remove this for genomestrip ?)
-        #         for sv in variants.closest(toexclude, d=True):
-        #             if (sv[4] != "."
-        #                     and sv[4] != "-1"
-        #                     and float(sv[-1]) < cutoff_proximity):
-        #                 excluded[sv[3]] = 1
-        #
-        #     vcf_records_filtered = []
-        #     for record in vcf_records:
-        #         if record.id not in excluded:
-        #             vcf_records_filtered.append(record)
-        # vcf_records = vcf_records_filtered
        return vcf_records
    def GeneralFilterPass(self, record, minlen, maxlen):