- new criteria for selecting the representative of duplicate calls (based on SQ)

- remove sorting contig based on contig names when adding contigs info to vcf file

- new criteria for selecting the representative of duplicate calls (based on SQ)
785c7324 · Thomas Faraut · 2f124943 · 785c7324 · 785c7324
Commit 785c7324 authored 6 years ago by Thomas Faraut
--- a/svfilter.py
+++ b/svfilter.py
@@ -116,6 +116,13 @@ def gstrength(u):
    return np.sum([s['GQ'] if s['GQ'] is not None else 0 for s in u.samples.values()])


+def variantstrength(u):
+    # maximum SQ, where SQ stands for
+    # Phred-scaled probability that this site is variant (non-reference
+    # in this sample)
+    return max([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
+
+
 def ondiagonal(s, u, v):
    # Probability that, for that individual, the two SVs are identically
    # genotyped P1(0/0)*P2(0/0) + ... P1(1/1)*P2(1/1)
@@ -168,7 +175,7 @@ def duplicatescore(u, v):
    return computed


-def getduplicates(u, v):
+def getduplicatesOld(u, v):
    # select the prefered duplicate
    # returns prefered, discarded, strength of both
    if gstrength(u) > gstrength(v):
@@ -177,6 +184,15 @@ def getduplicates(u, v):
        return (v, u, gstrength(v), gstrength(u))


+def getduplicates(u, v):
+    # select the prefered duplicate
+    # returns prefered, discarded, strength of both
+    if variantstrength(u) > variantstrength(v):
+        return (u, v, variantstrength(u), variantstrength(v))
+    else:
+        return (v, u, variantstrength(v), variantstrength(u))
+
+
 def GenotypeQuals(u):
    # the genotype qualities as an array
    return [u.samples[s]['GQ'] if u.samples[s]['GQ'] is not None else 0
@@ -247,6 +263,7 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, duplicatescore_threshold=-2,
    seen = defaultdict(tuple)
    duplicates = defaultdict(list)
    overlapping = defaultdict(tuple)
+    reference = defaultdict()
    for o in self_overlap:
        if o[3] == o[7]:
            continue
@@ -259,6 +276,7 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, duplicatescore_threshold=-2,
        score = duplicatescore(u, v)
        if score > duplicatescore_threshold:
            ref, dupli, s1, s2 = getduplicates(u, v)
+            reference[ref] = 1
            overlap_size = int(o[-1])
            overlap = getoverlap(dupli, overlap_size)
            if maxGQ(ref) > 0 and passed(dupli):
@@ -286,8 +304,10 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, duplicatescore_threshold=-2,
    for o in overlapping:
        info = overlapping[o]
        u, v = o
-        add_overlap_info_sv(u, info['overlap_left'], info['dupli_score'])
-        add_overlap_info_sv(v, info['overlap_right'], info['dupli_score'])
+        if u not in reference:
+            add_overlap_info_sv(u, info['overlap_left'], info['dupli_score'])
+        if v not in reference:
+            add_overlap_info_sv(v, info['overlap_right'], info['dupli_score'])


 def add_duplicate_info_sv(sv, duplicateoverlap, duplicatescore, duplicates):

--- a/svrunner_utils.py
+++ b/svrunner_utils.py
@@ -11,6 +11,7 @@ from collections import defaultdict
 from pybedtools import BedTool, create_interval_from_list
 from pysam import AlignmentFile
 import numpy as np
+import natsort

 VERBOSE = True

@@ -129,6 +130,7 @@ def addContigInfosTovcf(vcffile, outvcffile, reference):
    """
    # compute contig information
    contigs = get_contigs(reference)
+
    filename, extension = os.path.splitext(outvcffile)
    if extension == ".gz":
        outputfile = filename
@@ -142,9 +144,12 @@ def addContigInfosTovcf(vcffile, outvcffile, reference):
                if re.search(r"^##contig", line) is not None:
                    continue
                if re.search(r"^#CHROM", line):
-                    for ctg in sorted(contigs, key=lambda x: chromNum(x.name)):
+                    for ctg in contigs:
                        fout.write('##contig=<ID=%s,length=%d>\n'
                                   % (ctg.name, ctg.length))
+                    #  for ctg in sorted(contigs,
+                    #                    key=lambda x: chromNum(x.name)):
+
                fout.write(line)
    # bgzip file
    if gzip: