Commit 4645bf7c authored by Thomas Faraut

modified annotation

parent bf4fba45
@@ -253,12 +253,30 @@ def setLikeliTag(genotyper):
exit(1)
def GenomeSTRIPLikeRedundancyAnnotator(SVSet, duplicatescore_threshold=-2,
def add_redundancy_infos_header(reader):
# Adding DUPLICATESCORE info (equivalent to GSDUPLICATESCORE)
reader.addInfo("DUPLICATESCORE", 1, "Float",
"LOD score that the events are distinct based on the "
"genotypes of the most discordant sample")
# Adding DUPLICATEOVERLAP info (equivalent to GSDUPLICATEOVERLAP)
reader.addInfo("DUPLICATEOVERLAP", 1, "Float",
"Highest overlap with a duplicate event")
# Adding DUPLICATES info (list of duplicate events preferred to this one)
reader.addInfo("DUPLICATES", ".", "String",
"List of duplicate events preferred to this one")
# Adding NONDUPLICATEOVERLAP
reader.addInfo("NONDUPLICATEOVERLAP", 1, "Float",
"Amount of overlap with a non-duplicate")
def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
duplicatescore_threshold=-2,
genotyper="svtyper"):
""" Annotating duplicate candidates based on the genotype likelihoods
- genotype likelihoods can be provided by svtyper or genomestrip
"""
add_redundancy_infos_header(reader)
setLikeliTag(genotyper)
variants = defaultdict()
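For context, the new reader argument lets the annotator declare its INFO fields before any record is written. The following sketch is illustrative only and not part of this commit: it makes the same four INFO declarations directly on a pysam header, assuming a hypothetical input file name.
# Illustrative sketch, not project code: declaring the same INFO fields
# directly on a pysam header before running the duplicate annotation.
from pysam import VariantFile

vcf = VariantFile("candidates.vcf")  # hypothetical input file
vcf.header.info.add("DUPLICATESCORE", 1, "Float",
                    "LOD score that the events are distinct based on the "
                    "genotypes of the most discordant sample")
vcf.header.info.add("DUPLICATEOVERLAP", 1, "Float",
                    "Highest overlap with a duplicate event")
vcf.header.info.add("DUPLICATES", ".", "String",
                    "List of duplicate events preferred to this one")
vcf.header.info.add("NONDUPLICATEOVERLAP", 1, "Float",
                    "Amount of overlap with a non-duplicate")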
@@ -344,7 +362,16 @@ def add_overlap_info_sv(sv, overlap, duplicatescore):
sv.record.info['NONDUPLICATEOVERLAP'] = overlap
def GenomeSTRIPLikefiltering(SVSet):
def add_filter_infos_header(reader):
# FILTERS
# Adding specific filters
reader.addFilter("CALLRATE", "Call rate <0.75")
reader.addFilter("MONOMORPH", "All samples have the same genotype")
reader.addFilter("DUPLICATE", "GSDUPLICATESCORE>0")
reader.addFilter("OVERLAP", "NONDUPLICATEOVERLAP>0.7")
def GenomeSTRIPLikefiltering(SVSet, reader):
""" Filtering the candidate CNVs according to the following criteria
- non duplicate sites
- variant sites
@@ -355,30 +382,15 @@ def GenomeSTRIPLikefiltering(SVSet):
(use NONVARIANTSCORE in both cases)
"""
add_filter_infos_header(reader)
for sv in SVSet:
info = sv.record.info
if callRate(sv) < 0.75:
sv.filter.add("CALLRATE")
if not Polymorph(sv):
sv.filter.add("MONOMORPH")
# if info['GSDUPLICATESCORE'] and info['GSDUPLICATESCORE']>0:
# sv.filter.add("DUPLICATE")
# DUPLICATE
if 'NONDUPLICATEOVERLAP' in info and info['NONDUPLICATEOVERLAP'] > 0.7:
sv.filter.add("OVERLAP")
if "DUPLICATESCORE" in info is not None and info['DUPLICATESCORE'] > -2:
sv.filter.add("DUPLICATE")
# NONVARIANT
if 'GSNONVARSCORE' in info and info['GSNONVARSCORE'] and info['GSNONVARSCORE'] > 13:
sv.filter.add("NONVARIANT")
# GTDEPTH
if 'GSDEPTHRATIO' in info and (info['GSDEPTHRATIO'] is None or info['GSDEPTHRATIO'] > 0.8):
sv.filter.add("DEPTH")
if 'DEPTHRATIO' in info and (isnan(info['DEPTHRATIO']) or info['DEPTHRATIO'] > 0.8):
sv.filter.add("DDEPTH")
if 'DEPTHCALLTHRESHOLD' in info and info['DEPTHCALLTHRESHOLD'] > 1:
sv.filter.add("CCOVERAGE")
# CLUSTERSEP
if 'GSM1' in info and (info['GSM1'] <= -0.5 or info['GSM1'] >= 2.0):
sv.filter.add("CLUSTERSEP")
from svreader import SVRecord, SVReader, SVWriter
import sys
from svrunner_utils import eprint
from svreader import SVReader, SVWriter
from pysam import VariantFile
@@ -8,6 +9,7 @@ HOM_REF = (0, 0)
HET_VAR = (0, 1)
HOM_VAR = (1, 1)
class AnnotatedRecord(object):
"""
A lightweight object to annotate the final records
@@ -56,7 +58,49 @@ class AnnotatedRecord(object):
@property
def samples(self):
return self.record.samples
return [AnnotatedRecordSample(s) for s in self.record.samples.values()]
def setNewId(self, identifier):
try:
self.record.info['SOURCEID'] = self.id
except KeyError:
eprint("SOURCEID absent from record info keys,")
sys.exit(1)
self.id = identifier
def num_variant_samples(self):
return sum([s.isVariant for s in self.samples])
def variant_read_support(self):
return sum([s.AltSupport() for s in self.samples])
def qual(self):
return sum([s.SQ_score() for s in self.samples if s.isVariant])
def setQual(self):
self.record.qual = self.qual()
def AddSupportInfos(self):
supp_reads = self.variant_read_support()
num_supp_samples = self.num_variant_samples()
try:
self.record.info['SUPP_READS'] = supp_reads
self.record.info['NUM_SUPP_SAMPLES'] = num_supp_samples
except KeyError:
eprint("SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys")
sys.exit(1)
def UnifiedPass(self):
"""
All records passing the filters (PASS, .) are now labelled PASS
"""
record = self.record
filters = [f for f in record.filter]
# We make the assumption that when a "." is present no other filters
# are present
if len(filters) == 0 or "." in filters:
record.filter.clear()
record.filter.add("PASS")
class AnnotatedRecordSample(object):
@@ -66,25 +110,41 @@ class AnnotatedRecordSample(object):
def __init__(self, variantsample):
self.variantsample = variantsample
def genotype(self):
return self.variantsample.get('GT')
def SQ_score(self):
return self.variantsample.get('SQ')
def AltSupport(self):
return self.variantsample.get('AO')[0]
@property
def isVariant(self):
if self.genotype() in [HET_VAR, HOM_VAR]:
return True
else:
return False
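To make the per-sample aggregation above concrete, here is a hypothetical walk-through of how QUAL and read support would be derived from the FORMAT fields; the GT/SQ/AO names follow the accessors above, but the sample values are invented.
# Illustrative sketch, not project code: invented GT/SQ/AO values for three samples.
samples = [
    {"GT": (0, 0), "SQ": 0.0,  "AO": (0,)},   # hom-ref, not a variant sample
    {"GT": (0, 1), "SQ": 35.2, "AO": (12,)},  # het, 12 alt-supporting reads
    {"GT": (1, 1), "SQ": 60.0, "AO": (25,)},  # hom-alt, 25 alt-supporting reads
]

HET_VAR, HOM_VAR = (0, 1), (1, 1)
is_variant = [s["GT"] in (HET_VAR, HOM_VAR) for s in samples]
num_supp_samples = sum(is_variant)                              # 2
supp_reads = sum(s["AO"][0] for s in samples)                   # 37
qual = sum(s["SQ"] for s, v in zip(samples, is_variant) if v)   # 95.2
print(num_supp_samples, supp_reads, qual)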
class VCFReader(SVReader):
def __init__(self, file_name, sv_to_report=None):
super(VCFReader, self).__init__(file_name,
generic_name,
reference_handle)
super(VCFReader, self).__init__(file_name)
self.vcf_reader = VariantFile(file_name)
self.filename = file_name
self.sv_to_report =sv_to_report
self.sv_to_report = sv_to_report
self.vcf_reader = VariantFile(file_name)
self.add_metadata()
def __iter__(self):
return self
def __next__(self):
while True:
raw_record = next(self.vcf_reader)
record = (raw_record)
record = AnnotatedRecord(raw_record)
return record
def getHeader(self):
@@ -102,6 +162,17 @@ class VCFReader(SVReader):
type=None,
description=description)
def add_metadata(self):
self.addInfo("SOURCEID", 1, "String",
"The source sv identifier")
self.addInfo("SUPP_READS", 1, "Integer",
"Number of supporting reads")
self.addInfo("NUM_SUPP_SAMPLES", 1, "Integer",
"Number of supporting samples")
self.addFilter("LOWSUPPORT",
"total supp reads < 5 or supp samples < 2")
def getOrderedSamples(self):
samples = self.vcf_reader.header.samples
sample_names = [sample.rsplit('.')[0] for sample in samples]
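The LOWSUPPORT description above ("total supp reads < 5 or supp samples < 2") suggests how the SUPP_READS and NUM_SUPP_SAMPLES fields declared in add_metadata are meant to be used downstream. A hedged sketch of that check, not part of the commit:
# Illustrative sketch, not project code: the LOWSUPPORT criterion as a predicate.
def low_support(info):
    return info.get("SUPP_READS", 0) < 5 or info.get("NUM_SUPP_SAMPLES", 0) < 2

print(low_support({"SUPP_READS": 10, "NUM_SUPP_SAMPLES": 1}))  # True
print(low_support({"SUPP_READS": 7, "NUM_SUPP_SAMPLES": 3}))   # False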
@@ -228,34 +228,7 @@ class SVReader(object):
if excluded_regions is not None:
vcf_records = self._filter_for_gaps(vcf_records, excluded_regions,
cutoff_proximity)
# Convert vcf to intervals
# eprint("Converting to bedtools intervals")
# variants = vcf_to_pybed(vcf_records, excluded_regions,
# cutoff_proximity)
#
# # Identify SV overlapping regions to exclude
# eprint("Loading exlusion intervals ")
# eprint(excluded_regions)
# toexclude = BedTool(excluded_regions).sort()
# excluded = dict()
# if len(toexclude):
# eprint("Computing distance with gaps: %d gaps" % len(toexclude))
# # removing cnv in a close proximity to gaps in the assembly
# # -d=True In addition to the closest feature in B,
# # report its distance to A as an extra column
# TODO this is questionable ??? (remove this for genomestrip ?)
# for sv in variants.closest(toexclude, d=True):
# if (sv[4] != "."
# and sv[4] != "-1"
# and float(sv[-1]) < cutoff_proximity):
# excluded[sv[3]] = 1
#
# vcf_records_filtered = []
# for record in vcf_records:
# if record.id not in excluded:
# vcf_records_filtered.append(record)
# vcf_records = vcf_records_filtered
return vcf_records
def GeneralFilterPass(self, record, minlen, maxlen):