Skip to content
Snippets Groups Projects
Commit 85d17402 authored by Thomas Faraut's avatar Thomas Faraut
Browse files

modified annotation

parent 3ab9cd8e
No related branches found
No related tags found
No related merge requests found
...@@ -253,12 +253,30 @@ def setLikeliTag(genotyper): ...@@ -253,12 +253,30 @@ def setLikeliTag(genotyper):
exit(1) exit(1)
def GenomeSTRIPLikeRedundancyAnnotator(SVSet, duplicatescore_threshold=-2, def add_redundancy_infos_header(reader):
# Adding DUPLICATESCORE info (equivalent to GSDUPLICATESCORE)
reader.addInfo("DUPLICATESCORE", 1, "Float",
"LOD score that the events are distinct based on the "
"genotypes of the most discordant sample")
# Adding DUPLICATES info (equivalent to GSDUPLICATEOVERLAP)
reader.addInfo("DUPLICATEOVERLAP", 1, "Float",
"Highest overlap with a duplicate event")
# Adding DUPLICATEOVERLAP info (equivalent to GSDUPLICATEOVERLAP)
reader.addInfo("DUPLICATES", ".", "String",
"List of duplicate events preferred to this one")
# Adding NONDUPLICATEOVERLAP
reader.addInfo("NONDUPLICATEOVERLAP", 1, "Float",
"Amount of overlap with a non-duplicate")
def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
duplicatescore_threshold=-2,
genotyper="svtyper"): genotyper="svtyper"):
""" Annotating duplicate candidates based on the genotype likelihoods """ Annotating duplicate candidates based on the genotype likelihoods
- genotype likelihoods can be provided by svtyper or genomestrip - genotype likelihoods can be provided by svtyper or genomestrip
""" """
add_redundancy_infos_header(reader)
setLikeliTag(genotyper) setLikeliTag(genotyper)
variants = defaultdict() variants = defaultdict()
...@@ -344,7 +362,16 @@ def add_overlap_info_sv(sv, overlap, duplicatescore): ...@@ -344,7 +362,16 @@ def add_overlap_info_sv(sv, overlap, duplicatescore):
sv.record.info['NONDUPLICATEOVERLAP'] = overlap sv.record.info['NONDUPLICATEOVERLAP'] = overlap
def GenomeSTRIPLikefiltering(SVSet): def add_filter_infos_header(reader):
# FILTERS
# Adding specific filters
reader.addFilter("CALLRATE", "Call rate <0.75")
reader.addFilter("MONOMORPH", "All samples have the same genotype")
reader.addFilter("DUPLICATE", "GSDUPLICATESCORE>0")
reader.addFilter("OVERLAP", "NONDUPLICATEOVERLAP>0.7")
def GenomeSTRIPLikefiltering(SVSet, reader):
""" Filtering the candidate CNVs according to the following criteria """ Filtering the candidate CNVs according to the following criteria
- non duplicate sites - non duplicate sites
- variant sites - variant sites
...@@ -355,30 +382,15 @@ def GenomeSTRIPLikefiltering(SVSet): ...@@ -355,30 +382,15 @@ def GenomeSTRIPLikefiltering(SVSet):
(use NONVARIANTSCORE in both cases) (use NONVARIANTSCORE in both cases)
""" """
add_filter_infos_header(reader)
for sv in SVSet: for sv in SVSet:
info = sv.record.info info = sv.record.info
if callRate(sv) < 0.75: if callRate(sv) < 0.75:
sv.filter.add("CALLRATE") sv.filter.add("CALLRATE")
if not Polymorph(sv): if not Polymorph(sv):
sv.filter.add("MONOMORPH") sv.filter.add("MONOMORPH")
# if info['GSDUPLICATESCORE'] and info['GSDUPLICATESCORE']>0:
# sv.filter.add("DUPLICATE")
# DUPLICATE
if 'NONDUPLICATEOVERLAP' in info and info['NONDUPLICATEOVERLAP'] > 0.7: if 'NONDUPLICATEOVERLAP' in info and info['NONDUPLICATEOVERLAP'] > 0.7:
sv.filter.add("OVERLAP") sv.filter.add("OVERLAP")
if "DUPLICATESCORE" in info is not None and info['DUPLICATESCORE'] > -2: if "DUPLICATESCORE" in info is not None and info['DUPLICATESCORE'] > -2:
sv.filter.add("DUPLICATE") sv.filter.add("DUPLICATE")
# NONVARIANT
if 'GSNONVARSCORE' in info and info['GSNONVARSCORE'] and info['GSNONVARSCORE'] > 13:
sv.filter.add("NONVARIANT")
# GTDEPTH
if 'GSDEPTHRATIO' in info and (info['GSDEPTHRATIO'] is None or info['GSDEPTHRATIO'] > 0.8):
sv.filter.add("DEPTH")
if 'DEPTHRATIO' in info and (isnan(info['DEPTHRATIO']) or info['DEPTHRATIO'] > 0.8):
sv.filter.add("DDEPTH")
if 'DEPTHCALLTHRESHOLD' in info and info['DEPTHCALLTHRESHOLD'] > 1:
sv.filter.add("CCOVERAGE")
# CLUSTERSEP
if 'GSM1' in info and (info['GSM1'] <= -0.5 or info['GSM1'] >= 2.0):
sv.filter.add("CLUSTERSEP")
import sys
from svreader import SVRecord, SVReader, SVWriter from svrunner_utils import eprint
from svreader import SVReader, SVWriter
from pysam import VariantFile from pysam import VariantFile
...@@ -8,6 +9,7 @@ HOM_REF = (0, 0) ...@@ -8,6 +9,7 @@ HOM_REF = (0, 0)
HET_VAR = (0, 1) HET_VAR = (0, 1)
HOM_VAR = (1, 1) HOM_VAR = (1, 1)
class AnnotatedRecord(object): class AnnotatedRecord(object):
""" """
A lightweight object to annotated the final records A lightweight object to annotated the final records
...@@ -56,7 +58,49 @@ class AnnotatedRecord(object): ...@@ -56,7 +58,49 @@ class AnnotatedRecord(object):
@property @property
def samples(self): def samples(self):
return self.record.samples return [AnnotatedRecordSample(s) for s in self.record.samples.values()]
def setNewId(self, identifier):
try:
self.record.info['SOURCEID'] = self.id
except KeyError:
eprint("SOURCEID absent from record info keys,")
sys.exit(1)
self.id = identifier
def num_variant_samples(self):
return sum([s.isVariant for s in self.samples])
def variant_read_support(self):
return sum([s.AltSupport() for s in self.samples])
def qual(self):
return sum([s.SQ_score() for s in self.samples if s.isVariant])
def setQual(self):
self.record.qual = self.qual()
def AddSupportInfos(self):
supp_reads = self.variant_read_support()
num_supp_samples = self.num_variant_samples()
try:
self.record.info['SUPP_READS'] = supp_reads
self.record.info['NUM_SUPP_SAMPLES'] = num_supp_samples
except KeyError:
eprint("SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys")
sys.exit(1)
def UnifiedPass(self):
"""
All records passing the filters (PASS, .) ar now labelled PASS
"""
record = self.record
filters = [f for f in record.filter]
# We make the assumption when a "." is present no other filter
# are present
if len(filters) == 0 or "." in filters:
record.filter.clear()
record.filter.add("PASS")
class AnnotatedRecordSample(object): class AnnotatedRecordSample(object):
...@@ -66,25 +110,41 @@ class AnnotatedRecordSample(object): ...@@ -66,25 +110,41 @@ class AnnotatedRecordSample(object):
def __init__(self, variantsample): def __init__(self, variantsample):
self.variantsample = variantsample self.variantsample = variantsample
def genotype(self):
return self.variantsample.get('GT')
def SQ_score(self):
return self.variantsample.get('SQ')
def AltSupport(self):
return self.variantsample.get('AO')[0]
@property
def isVariant(self):
if self.genotype() in [HET_VAR, HOM_VAR]:
return True
else:
return False
class VCFReader(SVReader): class VCFReader(SVReader):
def __init__(self, file_name, sv_to_report=None): def __init__(self, file_name, sv_to_report=None):
super(VCFReader, self).__init__(file_name, super(VCFReader, self).__init__(file_name)
generic_name,
reference_handle)
self.vcf_reader = VariantFile(file_name) self.vcf_reader = VariantFile(file_name)
self.filename = file_name self.filename = file_name
self.sv_to_report =sv_to_report self.sv_to_report = sv_to_report
self.vcf_reader = VariantFile(file_name) self.vcf_reader = VariantFile(file_name)
self.add_metadata()
def __iter__(self): def __iter__(self):
return self return self
def __next__(self): def __next__(self):
while True: while True:
raw_record = next(self.vcf_reader) raw_record = next(self.vcf_reader)
record = (raw_record) record = AnnotatedRecord(raw_record)
return record return record
def getHeader(self): def getHeader(self):
...@@ -102,6 +162,17 @@ class VCFReader(SVReader): ...@@ -102,6 +162,17 @@ class VCFReader(SVReader):
type=None, type=None,
description=description) description=description)
def add_metadata(self):
self.addInfo("SOURCEID", 1, "String",
"The source sv identifier")
self.addInfo("SUPP_READS", 1, "Integer",
"Number of supporting reads")
self.addInfo("NUM_SUPP_SAMPLES", 1, "Integer",
"Number of supporting samples")
self.addFilter("LOWSUPPORT",
"total supp reads < 5 or supp samples < 2")
def getOrderedSamples(self): def getOrderedSamples(self):
samples = self.vcf_reader.header.samples samples = self.vcf_reader.header.samples
sample_names = [sample.rsplit('.')[0] for sample in samples] sample_names = [sample.rsplit('.')[0] for sample in samples]
......
...@@ -228,34 +228,7 @@ class SVReader(object): ...@@ -228,34 +228,7 @@ class SVReader(object):
if excluded_regions is not None: if excluded_regions is not None:
vcf_records = self._filter_for_gaps(vcf_records, excluded_regions, vcf_records = self._filter_for_gaps(vcf_records, excluded_regions,
cutoff_proximity) cutoff_proximity)
# Convert vcf to intervals
# eprint("Converting to bedtools intervals")
# variants = vcf_to_pybed(vcf_records, excluded_regions,
# cutoff_proximity)
#
# # Identify SV overlapping regions to exclude
# eprint("Loading exlusion intervals ")
# eprint(excluded_regions)
# toexclude = BedTool(excluded_regions).sort()
# excluded = dict()
# if len(toexclude):
# eprint("Computing distance with gaps: %d gaps" % len(toexclude))
# # removing cnv in a close proximity to gaps in the assembly
# # -d=True In addition to the closest feature in B,
# # report its distance to A as an extra column
# # TODO this is questionnable ??? (remove this for genomestrip ?)
# for sv in variants.closest(toexclude, d=True):
# if (sv[4] != "."
# and sv[4] != "-1"
# and float(sv[-1]) < cutoff_proximity):
# excluded[sv[3]] = 1
#
# vcf_records_filtered = []
# for record in vcf_records:
# if record.id not in excluded:
# vcf_records_filtered.append(record)
# vcf_records = vcf_records_filtered
return vcf_records return vcf_records
def GeneralFilterPass(self, record, minlen, maxlen): def GeneralFilterPass(self, record, minlen, maxlen):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment