Compare revisions

Thomas Faraut · 814c02a2 · 814c02a2 · 814c02a2 · 814c02a2
--- a/svfilter.py
+++ b/svfilter.py
@@ -118,7 +118,7 @@ def gstrength(u):

 def variantstrength(u):
    # maximum SQ, where SQ stands for
-    # Phred-scaled probability that this site is variant (non-reference
+    # Phred-scaled probability that this site is variant (non-reference)
    # in this sample)
    return max([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])

@@ -176,7 +176,9 @@ def duplicatescore(u, v):


 def getduplicatesOld(u, v):
-    # select the prefered duplicate
+    # select the prefered duplicate on the basis of the
+    # Sum of phred-like genotype qualities
+    # see gstrength
    # returns prefered, discarded, strength of both
    if gstrength(u) > gstrength(v):
        return (u, v, gstrength(u), gstrength(v))
@@ -185,7 +187,9 @@ def getduplicatesOld(u, v):


 def getduplicates(u, v):
-    # select the prefered duplicate
+    # select the prefered duplicate on the basis of
+    # Phred-scaled probability that this site is a variant
+    # see variantstrength
    # returns prefered, discarded, strength of both
    if variantstrength(u) > variantstrength(v):
        return (u, v, variantstrength(u), variantstrength(v))
@@ -258,7 +262,7 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, duplicatescore_threshold=-2,
        variants[sv.id] = sv

    pybed_variants = vcf_to_pybed(SVSet)
-    self_overlap = pybed_variants.intersect(pybed_variants, f=0.5, wo=True)
+    self_overlap = pybed_variants.intersect(pybed_variants, f=0.5, r=True, wo=True)

    seen = defaultdict(tuple)
    duplicates = defaultdict(list)
@@ -274,8 +278,10 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, duplicatescore_threshold=-2,
        u = variants[a]
        v = variants[b]
        score = duplicatescore(u, v)
+        # print("Comparing %s and %s : %3.8f" % (u.id, v.id, score))
        if score > duplicatescore_threshold:
            ref, dupli, s1, s2 = getduplicates(u, v)
+            # print("%s prefered to %s %3.8f" % (ref.id, dupli.id, score))
            reference[ref] = 1
            overlap_size = int(o[-1])
            overlap = getoverlap(dupli, overlap_size)
@@ -296,6 +302,7 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, duplicatescore_threshold=-2,

    for u in SVSet:
        if u.id in duplicates:
+            # print("tagged as duplicate %s" % u.id)
            duplis = sorted([a for (a, s, o) in duplicates[u.id]])
            score = max([s for (a, s, o) in duplicates[u.id]])
            overlap = max([o for (a, s, o) in duplicates[u.id]])

--- a/svreader/__init__.py
+++ b/svreader/__init__.py
@@ -120,8 +120,6 @@ class SVRecord(SVInter):

    def addbatch2Id(self, batch=None):
        raise NotImplementedError()
-        if batch:
-            self.id += "_" + batch

    def __str__(self):
        return self.chrom+":"+str(self.start)+"-"+str(self.end)
@@ -174,6 +172,34 @@ class SVReader(object):
    def tool_name(self):
        return self.__tool_name

+    def _filter_for_gaps(sef, vcf_records, gap_regions, cutoff_proximity=50):
+        eprint("Converting to bedtools intervals")
+        variants = vcf_to_pybed(vcf_records)
+
+        # Identify SV overlapping regions to exclude
+        eprint("Loading gap intervals ")
+        eprint(gap_regions)
+        toexclude = BedTool(gap_regions).sort()
+        excluded = dict()
+        if len(toexclude):
+            eprint("Computing distance with gaps: %d gaps" % len(toexclude))
+            # removing cnv in a close proximity to gaps in the assembly
+            # -d=True In addition to the closest feature in B,
+            # report its distance to A as an extra column
+            # TODO this is questionnable ??? (remove this for genomestrip ?)
+            for sv in variants.closest(toexclude, d=True):
+                if (sv[4] != "."
+                        and sv[4] != "-1"
+                        and float(sv[-1]) < cutoff_proximity):
+                    excluded[sv[3]] = 1
+
+        vcf_records_filtered = []
+        for record in vcf_records:
+            if record.id not in excluded:
+                vcf_records_filtered.append(record)
+
+        return vcf_records_filtered
+
    def filtering(self, all_vcf_records, minlen, maxlen,
                  excluded_regions=None, cutoff_proximity=50):
        """
@@ -200,31 +226,35 @@ class SVReader(object):
        eprint("%d records" % (len(vcf_records)))

        if excluded_regions is not None:
+            vcf_records = self._filter_for_gaps(vcf_records, excluded_regions,
+                                                cutoff_proximity)
            # Convert vcf to intervals
-            eprint("Converting to bedtools intervals")
-            variants = vcf_to_pybed(vcf_records)
-
-            # Identify SV overlapping regions to exclude
-            eprint("Loading exlusion intervals ")
-            eprint(excluded_regions)
-            toexclude = BedTool(excluded_regions).sort()
-            excluded = dict()
-            if len(toexclude):
-                eprint("Computing distance with gaps: %d gaps" % len(toexclude))
-                # removing cnv in a close proximity to gaps in the assembly
-                # -d=True In addition to the closest feature in B,
-                # report its distance to A as an extra column
-                for sv in variants.closest(toexclude, d=True):
-                    if (sv[4] != "."
-                            and sv[4] != "-1"
-                            and float(sv[-1]) < cutoff_proximity):
-                        excluded[sv[3]] = 1
-
-            vcf_records_filtered = []
-            for record in vcf_records:
-                if record.id not in excluded:
-                    vcf_records_filtered.append(record)
-        vcf_records = vcf_records_filtered
+        #     eprint("Converting to bedtools intervals")
+        #     variants = vcf_to_pybed(vcf_records, excluded_regions,
+        #                             cutoff_proximity)
+        #
+        #     # Identify SV overlapping regions to exclude
+        #     eprint("Loading exlusion intervals ")
+        #     eprint(excluded_regions)
+        #     toexclude = BedTool(excluded_regions).sort()
+        #     excluded = dict()
+        #     if len(toexclude):
+        #         eprint("Computing distance with gaps: %d gaps" % len(toexclude))
+        #         # removing cnv in a close proximity to gaps in the assembly
+        #         # -d=True In addition to the closest feature in B,
+        #         # report its distance to A as an extra column
+        #         # TODO this is questionnable ??? (remove this for genomestrip ?)
+        #         for sv in variants.closest(toexclude, d=True):
+        #             if (sv[4] != "."
+        #                     and sv[4] != "-1"
+        #                     and float(sv[-1]) < cutoff_proximity):
+        #                 excluded[sv[3]] = 1
+        #
+        #     vcf_records_filtered = []
+        #     for record in vcf_records:
+        #         if record.id not in excluded:
+        #             vcf_records_filtered.append(record)
+        # vcf_records = vcf_records_filtered

        return vcf_records


--- a/svreader/genomestrip.py
+++ b/svreader/genomestrip.py
@@ -193,6 +193,12 @@ class GenomeSTRIPReader(SVReader):
        sample_names = [sample.rsplit('.')[0] for sample in samples]
        return sample_names

+    def _filter_for_gaps(sef, vcf_records, gap_regions, cutoff_proximity=50):
+        """
+        We do not filter for gaps for genomestrip
+        """
+        return vcf_records
+

 class GenomeSTRIPWriter(SVWriter):


--- a/svreader/pindel.py
+++ b/svreader/pindel.py
@@ -195,7 +195,7 @@ def pysam_header(pindel_header):
 class PindelRecord(SVRecord):
    counter = {"I": 0, "D": 0, "LI": 0, "TD": 0, "INV": 0, "RPL": 0}

-    # Only DEL and DUP for the moment
+    # Only DEL, INV and DUP for the moment
    svs_supported = set(["DEL", "DUP", "INV"])

    def __init__(self, record_string, reference_handle=None):
No results found