Skip to content
Snippets Groups Projects
Commit a47a5d83 authored by Thomas Faraut's avatar Thomas Faraut
Browse files

adding duplicate informations with the new identifiers

parent 4f118e1d
No related branches found
No related tags found
No related merge requests found
......@@ -8,18 +8,31 @@ from math import isnan
from pysam import VariantFile
from svrunner_utils import eprint, warn, vcf_to_pybed
from svreader.vcfwrapper import VCFRecord, SVReader, SVWriter
from svreader.vcfwrapper import VCFRecord, VCFReader, VCFWriter
HOM_REF = (0, 0)
HET_VAR = (0, 1)
HOM_VAR = (1, 1)
VALID_GENOTYPES = [HOM_REF, HET_VAR, HOM_VAR]
SVTYPER_GENO_LIKELI_TAG = "GL"
GENOMESTRIP_GENO_LIKELI_TAG = "GP"
Geno_likeli_tag = SVTYPER_GENO_LIKELI_TAG
def coded_geno(geno):
if geno == HOM_REF:
return "HOMO_REF"
elif geno == HET_VAR:
return "HET_VAR"
elif geno == HOM_VAR:
return "HOMO_VAR"
else:
return "UNDEF"
def Variant(sample):
if sample.get('GT') in [HET_VAR, HOM_VAR]:
return True
......@@ -27,7 +40,7 @@ def Variant(sample):
return False
class VariantRecord(VCFRecord):
class AnnotateRecord(VCFRecord):
"""
A lightweight object to annotated the final records
"""
......@@ -35,7 +48,8 @@ class VariantRecord(VCFRecord):
"""
A pysam VariantRecord wrapper
"""
super(VariantRecord, self).__init__(record)
super(AnnotateRecord, self).__init__(record)
self.new_id = None
@property
def start(self):
......@@ -60,13 +74,20 @@ class VariantRecord(VCFRecord):
else:
return False
@property
def num_samples(self):
return len(self.record.samples.keys())
def setNewId(self, identifier):
self.new_id = identifier
def rename(self):
try:
self.record.info['SOURCEID'] = self.id
except KeyError:
eprint("SOURCEID absent from record info keys,")
sys.exit(1)
self.id = identifier
self.id = self.new_id
def num_variant_samples(self):
return sum([Variant(s) for s in self.samples.values()])
......@@ -78,11 +99,24 @@ class VariantRecord(VCFRecord):
return sum([s.get('SQ') for s in self.samples.values()])
def GQ_sum_score(self):
return sum([s.get('SQ') for s in self.samples.values()])
return sum([s.get('GQ') for s in self.samples.values()])
def maxGQ(self):
return max([s.get('GQ') for s in self.samples.values()])
def setQual(self):
self.record.qual = self.qual()
def numdiffGenotypes(self):
genotypes = defaultdict()
for s in self.samples.values():
if s.get('GT') in VALID_GENOTYPES:
genotypes[coded_geno(s.get('GT'))] = 1
return len(genotypes.keys())
def Polymorph(self):
return self.numdiffGenotypes() > 1
def AddSupportInfos(self):
supp_reads = self.variant_read_support()
num_supp_samples = self.num_variant_samples()
......@@ -93,6 +127,14 @@ class VariantRecord(VCFRecord):
eprint("SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys")
sys.exit(1)
def CallRate(self, cutoff):
num_qual_call = sum([(s.get('GQ') > cutoff) for s in self.samples.values()])
return num_qual_call/self.num_samples
def VariantCallRate(self, cutoff):
num_qual_call = sum([(s.get('GQ') > cutoff) for s in self.samples.values() if Variant(s)])
return num_qual_call/self.num_variant_samples()
def UnifiedPass(self):
"""
All records passing the filters (PASS, .) ar now labelled PASS
......@@ -106,51 +148,14 @@ class VariantRecord(VCFRecord):
record.filter.add("PASS")
class VariantRecordSample(object):
"""
A lightweight object to VariantRecordSample
"""
def __init__(self, variantsample):
self.variantsample = variantsample
def genotype(self):
return self.variantsample.get('GT')
def SQ_score(self):
# Phred-scaled probability that this site is variant
# (non-reference in this sample)
return self.variantsample.get('SQ')
def GQ_score(self):
# Genotype quality
return self.variantsample.get('GQ')
def GL_score(self):
# Genotype Likelihood, log10-scaled likelihoods of the data given the
# called genotype for each possible genotype generated from the
# reference and alternate alleles given the sample ploidy
return self.variantsample.get('GL')
def AltSupport(self):
return self.variantsample.get('AO')[0]
@property
def isVariant(self):
if self.genotype() in [HET_VAR, HOM_VAR]:
return True
else:
return False
class VCFReader(SVReader):
class AnnotateReader(VCFReader):
def __init__(self, file_name, sv_to_report=None):
super(VCFReader, self).__init__(file_name)
super(AnnotateReader, self).__init__(file_name)
self.filename = file_name
self.sv_to_report = sv_to_report
self.vcf_reader = VariantFile(file_name)
self.add_metadata()
self.add_annotation_metadata()
def __iter__(self):
return self
......@@ -158,25 +163,25 @@ class VCFReader(SVReader):
def __next__(self):
while True:
raw_record = next(self.vcf_reader)
record = VariantRecord(raw_record)
record = AnnotateRecord(raw_record)
return record
def getHeader(self):
return self.vcf_reader.header
def addInfo(self, name, number, type, description):
self.vcf_reader.header.info.add(id=name,
number=number,
type=type,
description=description)
def addFilter(self, name, description):
self.vcf_reader.header.filters.add(id=name,
number=None,
type=None,
description=description)
def add_metadata(self):
# def addInfo(self, name, number, type, description):
# self.vcf_reader.header.info.add(id=name,
# number=number,
# type=type,
# description=description)
#
# def addFilter(self, name, description):
# self.vcf_reader.header.filters.add(id=name,
# number=None,
# type=None,
# description=description)
def add_annotation_metadata(self):
self.addInfo("SOURCEID", 1, "String",
"The source sv identifier")
self.addInfo("MAX_SUPP_READS", 1, "Integer",
......@@ -195,7 +200,7 @@ class VCFReader(SVReader):
return len(self.vcf_reader.header.samples)
class VCFWriter(SVWriter):
class AnnotateWriter(VCFWriter):
def __init__(self, file_name, template_reader, index=True):
......@@ -365,9 +370,12 @@ def add_redundancy_infos_header(reader):
# Adding DUPLICATES info (equivalent to GSDUPLICATEOVERLAP)
reader.addInfo("DUPLICATEOVERLAP", 1, "Float",
"Highest overlap with a duplicate event")
# Adding DUPLICATEOVERLAP info (equivalent to GSDUPLICATEOVERLAP)
reader.addInfo("DUPLICATES", ".", "String",
# Adding DUPLICATEOF info (list of variant prefered to this one)
reader.addInfo("DUPLICATEOF", ".", "String",
"List of duplicate events preferred to this one")
# Adding DUPLICATES info (list of variants duplicates of this one)
reader.addInfo("DUPLICATES", ".", "String",
"List of duplicate events represented by the current sv")
# Adding NONDUPLICATEOVERLAP
reader.addInfo("NONDUPLICATEOVERLAP", 1, "Float",
"Amount of overlap with a non-duplicate")
......@@ -396,7 +404,6 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
overlapping = defaultdict(tuple)
reference = defaultdict()
for o in self_overlap:
print("Here ?", o)
if o[3] == o[7]:
continue
a, b = ordered(o[3], o[7])
......@@ -413,7 +420,7 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
reference[ref] = 1
overlap_size = int(o[-1])
overlap = getoverlap(dupli, overlap_size)
if maxGQ(ref) > 0 and passed(dupli):
if ref.maxGQ() > 0 and dupli.passed:
# are dismissed
# - reference variant with 0 genotype quality for all markers
# - duplicate that are already tagged as filtered out
......@@ -445,7 +452,7 @@ def add_duplicate_info_sv(sv, duplicateoverlap, duplicatescore, duplicates):
else:
sv.record.info['DUPLICATESCORE'] = duplicatescore
sv.record.info['DUPLICATEOVERLAP'] = duplicateoverlap
sv.record.info['DUPLICATES'] = duplicates
sv.record.info['DUPLICATEOF'] = duplicates
def add_overlap_info_sv(sv, overlap, duplicatescore):
......@@ -459,10 +466,20 @@ def add_overlap_info_sv(sv, overlap, duplicatescore):
sv.record.info['NONDUPLICATEOVERLAP'] = overlap
def add_callrate_infos_header(reader):
# Adding CALLRATE info
reader.addInfo("CALLRATE", 1, "Float",
"Percentage of samples called with a GQ>13")
# Adding VARIANTCALLRATE info
reader.addInfo("VARIANTCALLRATE", 1, "Float",
"Percentage of variant samples called with a GQ>13")
def add_filter_infos_header(reader):
# FILTERS
# Adding specific filters
reader.addFilter("CALLRATE", "Call rate <0.75")
reader.addFilter("VARIANTCALLRATE", "Variant Call rate <0.75")
reader.addFilter("MONOMORPH", "All samples have the same genotype")
reader.addFilter("DUPLICATE", "GSDUPLICATESCORE>0")
reader.addFilter("OVERLAP", "NONDUPLICATEOVERLAP>0.7")
......@@ -479,13 +496,16 @@ def GenomeSTRIPLikefiltering(SVSet, reader):
(use NONVARIANTSCORE in both cases)
"""
add_callrate_infos_header(reader)
add_filter_infos_header(reader)
for sv in SVSet:
info = sv.record.info
if callRate(sv) < 0.75:
sv.record.info['CALLRATE'] = sv.CallRate(13)
sv.record.info['VARIANTCALLRATE'] = sv.VariantCallRate(13)
if sv.CallRate(13) < 0.75:
sv.filter.add("CALLRATE")
if not Polymorph(sv):
if not sv.Polymorph():
sv.filter.add("MONOMORPH")
if 'NONDUPLICATEOVERLAP' in info and info['NONDUPLICATEOVERLAP'] > 0.7:
sv.filter.add("OVERLAP")
......
......@@ -39,6 +39,10 @@ class VCFRecord(SVRecord):
def id(self):
return self.record.id
@id.setter
def id(self, id):
self.record.id = id
@property
def pos(self):
return self.record.pos
......@@ -51,6 +55,10 @@ class VCFRecord(SVRecord):
def stop(self):
return self.record.stop
@property
def svtype(self):
return self._sv_type
@property
def filter(self):
return self.record.filter
......@@ -84,16 +92,24 @@ class VCFReader(SVReader):
self.vcf_reader.header.info.remove_header(key)
def addInfo(self, name, number, type, description):
self.vcf_reader.header.info.add(id=name,
number=number,
type=type,
description=description)
# we add info only if absent from the current header
if name in self.vcf_reader.header.info:
return
else:
self.vcf_reader.header.info.add(id=name,
number=number,
type=type,
description=description)
def addFilter(self, name, description):
self.vcf_reader.header.filters.add(id=name,
number=None,
type=None,
description=description)
# we add filter info only if absent from the current header
if name in self.vcf_reader.header.filters:
return
else:
self.vcf_reader.header.filters.add(id=name,
number=None,
type=None,
description=description)
def getOrderedSamples(self):
samples = self.vcf_reader.header.samples
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment