Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • svdetection/svlib
1 result
Show changes
Commits on Source (1)
......@@ -118,7 +118,7 @@ def gstrength(u):
def variantstrength(u):
# maximum SQ, where SQ stands for
# Phred-scaled probability that this site is variant (non-reference
# Phred-scaled probability that this site is variant (non-reference)
# in this sample)
return max([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
......@@ -176,7 +176,9 @@ def duplicatescore(u, v):
def getduplicatesOld(u, v):
# select the prefered duplicate
# select the prefered duplicate on the basis of the
# Sum of phred-like genotype qualities
# see gstrength
# returns prefered, discarded, strength of both
if gstrength(u) > gstrength(v):
return (u, v, gstrength(u), gstrength(v))
......@@ -185,7 +187,9 @@ def getduplicatesOld(u, v):
def getduplicates(u, v):
# select the prefered duplicate
# select the prefered duplicate on the basis of
# Phred-scaled probability that this site is a variant
# see variantstrength
# returns prefered, discarded, strength of both
if variantstrength(u) > variantstrength(v):
return (u, v, variantstrength(u), variantstrength(v))
......@@ -258,7 +262,7 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, duplicatescore_threshold=-2,
variants[sv.id] = sv
pybed_variants = vcf_to_pybed(SVSet)
self_overlap = pybed_variants.intersect(pybed_variants, f=0.5, wo=True)
self_overlap = pybed_variants.intersect(pybed_variants, f=0.5, r=True, wo=True)
seen = defaultdict(tuple)
duplicates = defaultdict(list)
......@@ -274,8 +278,10 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, duplicatescore_threshold=-2,
u = variants[a]
v = variants[b]
score = duplicatescore(u, v)
# print("Comparing %s and %s : %3.8f" % (u.id, v.id, score))
if score > duplicatescore_threshold:
ref, dupli, s1, s2 = getduplicates(u, v)
# print("%s prefered to %s %3.8f" % (ref.id, dupli.id, score))
reference[ref] = 1
overlap_size = int(o[-1])
overlap = getoverlap(dupli, overlap_size)
......@@ -296,6 +302,7 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, duplicatescore_threshold=-2,
for u in SVSet:
if u.id in duplicates:
# print("tagged as duplicate %s" % u.id)
duplis = sorted([a for (a, s, o) in duplicates[u.id]])
score = max([s for (a, s, o) in duplicates[u.id]])
overlap = max([o for (a, s, o) in duplicates[u.id]])
......
......@@ -120,8 +120,6 @@ class SVRecord(SVInter):
def addbatch2Id(self, batch=None):
raise NotImplementedError()
if batch:
self.id += "_" + batch
def __str__(self):
return self.chrom+":"+str(self.start)+"-"+str(self.end)
......@@ -174,6 +172,34 @@ class SVReader(object):
def tool_name(self):
return self.__tool_name
def _filter_for_gaps(sef, vcf_records, gap_regions, cutoff_proximity=50):
eprint("Converting to bedtools intervals")
variants = vcf_to_pybed(vcf_records)
# Identify SV overlapping regions to exclude
eprint("Loading gap intervals ")
eprint(gap_regions)
toexclude = BedTool(gap_regions).sort()
excluded = dict()
if len(toexclude):
eprint("Computing distance with gaps: %d gaps" % len(toexclude))
# removing cnv in a close proximity to gaps in the assembly
# -d=True In addition to the closest feature in B,
# report its distance to A as an extra column
# TODO this is questionnable ??? (remove this for genomestrip ?)
for sv in variants.closest(toexclude, d=True):
if (sv[4] != "."
and sv[4] != "-1"
and float(sv[-1]) < cutoff_proximity):
excluded[sv[3]] = 1
vcf_records_filtered = []
for record in vcf_records:
if record.id not in excluded:
vcf_records_filtered.append(record)
return vcf_records_filtered
def filtering(self, all_vcf_records, minlen, maxlen,
excluded_regions=None, cutoff_proximity=50):
"""
......@@ -200,31 +226,35 @@ class SVReader(object):
eprint("%d records" % (len(vcf_records)))
if excluded_regions is not None:
vcf_records = self._filter_for_gaps(vcf_records, excluded_regions,
cutoff_proximity)
# Convert vcf to intervals
eprint("Converting to bedtools intervals")
variants = vcf_to_pybed(vcf_records)
# Identify SV overlapping regions to exclude
eprint("Loading exlusion intervals ")
eprint(excluded_regions)
toexclude = BedTool(excluded_regions).sort()
excluded = dict()
if len(toexclude):
eprint("Computing distance with gaps: %d gaps" % len(toexclude))
# removing cnv in a close proximity to gaps in the assembly
# -d=True In addition to the closest feature in B,
# report its distance to A as an extra column
for sv in variants.closest(toexclude, d=True):
if (sv[4] != "."
and sv[4] != "-1"
and float(sv[-1]) < cutoff_proximity):
excluded[sv[3]] = 1
vcf_records_filtered = []
for record in vcf_records:
if record.id not in excluded:
vcf_records_filtered.append(record)
vcf_records = vcf_records_filtered
# eprint("Converting to bedtools intervals")
# variants = vcf_to_pybed(vcf_records, excluded_regions,
# cutoff_proximity)
#
# # Identify SV overlapping regions to exclude
# eprint("Loading exlusion intervals ")
# eprint(excluded_regions)
# toexclude = BedTool(excluded_regions).sort()
# excluded = dict()
# if len(toexclude):
# eprint("Computing distance with gaps: %d gaps" % len(toexclude))
# # removing cnv in a close proximity to gaps in the assembly
# # -d=True In addition to the closest feature in B,
# # report its distance to A as an extra column
# # TODO this is questionnable ??? (remove this for genomestrip ?)
# for sv in variants.closest(toexclude, d=True):
# if (sv[4] != "."
# and sv[4] != "-1"
# and float(sv[-1]) < cutoff_proximity):
# excluded[sv[3]] = 1
#
# vcf_records_filtered = []
# for record in vcf_records:
# if record.id not in excluded:
# vcf_records_filtered.append(record)
# vcf_records = vcf_records_filtered
return vcf_records
......
......@@ -193,6 +193,12 @@ class GenomeSTRIPReader(SVReader):
sample_names = [sample.rsplit('.')[0] for sample in samples]
return sample_names
def _filter_for_gaps(sef, vcf_records, gap_regions, cutoff_proximity=50):
"""
We do not filter for gaps for genomestrip
"""
return vcf_records
class GenomeSTRIPWriter(SVWriter):
......
......@@ -195,7 +195,7 @@ def pysam_header(pindel_header):
class PindelRecord(SVRecord):
counter = {"I": 0, "D": 0, "LI": 0, "TD": 0, "INV": 0, "RPL": 0}
# Only DEL and DUP for the moment
# Only DEL, INV and DUP for the moment
svs_supported = set(["DEL", "DUP", "INV"])
def __init__(self, record_string, reference_handle=None):
......