From 4645bf7cfb76bd799090380f23c444e7620a9af4 Mon Sep 17 00:00:00 2001
From: Thomas Faraut <Thomas.Faraut@inra.fr>
Date: Wed, 27 Nov 2019 14:23:30 +0100
Subject: [PATCH] modified annotation

---
 svfilter.py          | 50 +++++++++++++++----------
 svreader/Annotate.py | 87 ++++++++++++++++++++++++++++++++++++++++----
 svreader/__init__.py | 29 +--------------
 3 files changed, 111 insertions(+), 55 deletions(-)

diff --git a/svfilter.py b/svfilter.py
index abd885f..4a568eb 100644
--- a/svfilter.py
+++ b/svfilter.py
@@ -253,12 +253,30 @@ def setLikeliTag(genotyper):
         exit(1)
 
 
-def GenomeSTRIPLikeRedundancyAnnotator(SVSet, duplicatescore_threshold=-2,
+def add_redundancy_infos_header(reader):
+    # Adding DUPLICATESCORE info (equivalent to GSDUPLICATESCORE)
+    reader.addInfo("DUPLICATESCORE", 1, "Float",
+                   "LOD score that the events are distinct based on the "
+                   "genotypes of the most discordant sample")
+    # Adding DUPLICATES info (equivalent to GSDUPLICATEOVERLAP)
+    reader.addInfo("DUPLICATEOVERLAP", 1, "Float",
+                   "Highest overlap with a duplicate event")
+    # Adding DUPLICATEOVERLAP info (equivalent to GSDUPLICATEOVERLAP)
+    reader.addInfo("DUPLICATES", ".", "String",
+                   "List of duplicate events preferred to this one")
+    # Adding NONDUPLICATEOVERLAP
+    reader.addInfo("NONDUPLICATEOVERLAP", 1, "Float",
+                    "Amount of overlap with a non-duplicate")
+
+
+def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
+                                       duplicatescore_threshold=-2,
                                        genotyper="svtyper"):
     """ Annotating duplicate candidates based on the genotype likelihoods
       - genotype likelihoods can be provided by svtyper or genomestrip
     """
 
+    add_redundancy_infos_header(reader)
     setLikeliTag(genotyper)
 
     variants = defaultdict()
@@ -344,7 +362,16 @@ def add_overlap_info_sv(sv, overlap, duplicatescore):
     sv.record.info['NONDUPLICATEOVERLAP'] = overlap
 
 
-def GenomeSTRIPLikefiltering(SVSet):
+def add_filter_infos_header(reader):
+    # FILTERS
+    # Adding specific filters
+    reader.addFilter("CALLRATE", "Call rate <0.75")
+    reader.addFilter("MONOMORPH", "All samples have the same genotype")
+    reader.addFilter("DUPLICATE", "GSDUPLICATESCORE>0")
+    reader.addFilter("OVERLAP", "NONDUPLICATEOVERLAP>0.7")
+
+
+def GenomeSTRIPLikefiltering(SVSet, reader):
     """ Filtering the candidate CNVs according to the following criteria
           - non duplicate sites
           - variant sites
@@ -355,30 +382,15 @@ def GenomeSTRIPLikefiltering(SVSet):
             (use NONVARIANTSCORE in both cases)
     """
 
+    add_filter_infos_header(reader)
+
     for sv in SVSet:
         info = sv.record.info
         if callRate(sv) < 0.75:
             sv.filter.add("CALLRATE")
         if not Polymorph(sv):
             sv.filter.add("MONOMORPH")
-        # if info['GSDUPLICATESCORE'] and info['GSDUPLICATESCORE']>0:
-        #    sv.filter.add("DUPLICATE")
-        # DUPLICATE
         if 'NONDUPLICATEOVERLAP' in info and info['NONDUPLICATEOVERLAP'] > 0.7:
             sv.filter.add("OVERLAP")
-
         if "DUPLICATESCORE" in info is not None and info['DUPLICATESCORE'] > -2:
             sv.filter.add("DUPLICATE")
-        # NONVARIANT
-        if 'GSNONVARSCORE' in info and info['GSNONVARSCORE'] and info['GSNONVARSCORE'] > 13:
-            sv.filter.add("NONVARIANT")
-        # GTDEPTH
-        if 'GSDEPTHRATIO' in info and (info['GSDEPTHRATIO'] is None or info['GSDEPTHRATIO'] > 0.8):
-            sv.filter.add("DEPTH")
-        if 'DEPTHRATIO' in info and (isnan(info['DEPTHRATIO']) or info['DEPTHRATIO'] > 0.8):
-            sv.filter.add("DDEPTH")
-        if 'DEPTHCALLTHRESHOLD' in info and info['DEPTHCALLTHRESHOLD'] > 1:
-            sv.filter.add("CCOVERAGE")
-        # CLUSTERSEP
-        if 'GSM1' in info and (info['GSM1'] <= -0.5 or info['GSM1'] >= 2.0):
-            sv.filter.add("CLUSTERSEP")
diff --git a/svreader/Annotate.py b/svreader/Annotate.py
index d6bb22a..66d6813 100644
--- a/svreader/Annotate.py
+++ b/svreader/Annotate.py
@@ -1,6 +1,7 @@
 
-
-from svreader import SVRecord, SVReader, SVWriter
+import sys
+from svrunner_utils import eprint
+from svreader import SVReader, SVWriter
 
 from pysam import VariantFile
 
@@ -8,6 +9,7 @@ HOM_REF = (0, 0)
 HET_VAR = (0, 1)
 HOM_VAR = (1, 1)
 
+
 class AnnotatedRecord(object):
     """
     A lightweight object to annotated the final records
@@ -56,7 +58,49 @@ class AnnotatedRecord(object):
 
     @property
     def samples(self):
-        return self.record.samples
+        return [AnnotatedRecordSample(s) for s in self.record.samples.values()]
+
+    def setNewId(self, identifier):
+        try:
+            self.record.info['SOURCEID'] = self.id
+        except KeyError:
+            eprint("SOURCEID absent from record info keys,")
+            sys.exit(1)
+        self.id = identifier
+
+    def num_variant_samples(self):
+        return sum([s.isVariant for s in self.samples])
+
+    def variant_read_support(self):
+        return sum([s.AltSupport() for s in self.samples])
+
+    def qual(self):
+        return sum([s.SQ_score() for s in self.samples if s.isVariant])
+
+    def setQual(self):
+        self.record.qual = self.qual()
+
+    def AddSupportInfos(self):
+        supp_reads = self.variant_read_support()
+        num_supp_samples = self.num_variant_samples()
+        try:
+            self.record.info['SUPP_READS'] = supp_reads
+            self.record.info['NUM_SUPP_SAMPLES'] = num_supp_samples
+        except KeyError:
+            eprint("SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys")
+            sys.exit(1)
+
+    def UnifiedPass(self):
+        """
+           All records passing the filters (PASS, .) ar now labelled PASS
+        """
+        record = self.record
+        filters = [f for f in record.filter]
+        # We make the assumption when a "." is present no other filter
+        # are present
+        if len(filters) == 0 or "." in filters:
+            record.filter.clear()
+            record.filter.add("PASS")
 
 
 class AnnotatedRecordSample(object):
@@ -66,25 +110,41 @@ class AnnotatedRecordSample(object):
     def __init__(self, variantsample):
         self.variantsample = variantsample
 
+    def genotype(self):
+        return self.variantsample.get('GT')
+
+    def SQ_score(self):
+        return self.variantsample.get('SQ')
+
+    def AltSupport(self):
+        return self.variantsample.get('AO')[0]
+
+    @property
+    def isVariant(self):
+        if self.genotype() in [HET_VAR, HOM_VAR]:
+            return True
+        else:
+            return False
+
 
 class VCFReader(SVReader):
 
     def __init__(self, file_name, sv_to_report=None):
-        super(VCFReader, self).__init__(file_name,
-                                        generic_name,
-                                        reference_handle)
+        super(VCFReader, self).__init__(file_name)
         self.vcf_reader = VariantFile(file_name)
         self.filename = file_name
-        self.sv_to_report =sv_to_report
+        self.sv_to_report = sv_to_report
         self.vcf_reader = VariantFile(file_name)
 
+        self.add_metadata()
+
     def __iter__(self):
         return self
 
     def __next__(self):
         while True:
             raw_record = next(self.vcf_reader)
-            record = (raw_record)
+            record = AnnotatedRecord(raw_record)
             return record
 
     def getHeader(self):
@@ -102,6 +162,17 @@ class VCFReader(SVReader):
                                            type=None,
                                            description=description)
 
+    def add_metadata(self):
+        self.addInfo("SOURCEID", 1, "String",
+                     "The source sv identifier")
+        self.addInfo("SUPP_READS", 1, "Integer",
+                     "Number of supporting reads")
+        self.addInfo("NUM_SUPP_SAMPLES", 1, "Integer",
+                     "Number of supporting samples")
+        self.addFilter("LOWSUPPORT",
+                       "total supp reads < 5 or supp samples < 2")
+
+
     def getOrderedSamples(self):
         samples = self.vcf_reader.header.samples
         sample_names = [sample.rsplit('.')[0] for sample in samples]
diff --git a/svreader/__init__.py b/svreader/__init__.py
index be3e22c..2309f13 100644
--- a/svreader/__init__.py
+++ b/svreader/__init__.py
@@ -228,34 +228,7 @@ class SVReader(object):
         if excluded_regions is not None:
             vcf_records = self._filter_for_gaps(vcf_records, excluded_regions,
                                                 cutoff_proximity)
-            # Convert vcf to intervals
-        #     eprint("Converting to bedtools intervals")
-        #     variants = vcf_to_pybed(vcf_records, excluded_regions,
-        #                             cutoff_proximity)
-        #
-        #     # Identify SV overlapping regions to exclude
-        #     eprint("Loading exlusion intervals ")
-        #     eprint(excluded_regions)
-        #     toexclude = BedTool(excluded_regions).sort()
-        #     excluded = dict()
-        #     if len(toexclude):
-        #         eprint("Computing distance with gaps: %d gaps" % len(toexclude))
-        #         # removing cnv in a close proximity to gaps in the assembly
-        #         # -d=True In addition to the closest feature in B,
-        #         # report its distance to A as an extra column
-        #         # TODO this is questionnable ??? (remove this for genomestrip ?)
-        #         for sv in variants.closest(toexclude, d=True):
-        #             if (sv[4] != "."
-        #                     and sv[4] != "-1"
-        #                     and float(sv[-1]) < cutoff_proximity):
-        #                 excluded[sv[3]] = 1
-        #
-        #     vcf_records_filtered = []
-        #     for record in vcf_records:
-        #         if record.id not in excluded:
-        #             vcf_records_filtered.append(record)
-        # vcf_records = vcf_records_filtered
-
+        
         return vcf_records
 
     def GeneralFilterPass(self, record, minlen, maxlen):
-- 
GitLab