Skip to content
Snippets Groups Projects
Commit 093720f3 authored by Thomas Faraut's avatar Thomas Faraut
Browse files

Merge branch 'newfilter' of forgemia.inra.fr:svdetection/svlib into newfilter

parents affa74ae aa96035d
No related branches found
No related tags found
No related merge requests found
...@@ -2,24 +2,39 @@ ...@@ -2,24 +2,39 @@
import sys import sys
from collections import defaultdict from collections import defaultdict
from networkx import Graph, connected_components
import numpy as np import numpy as np
from math import isnan from math import isnan
from pysam import VariantFile from pysam import VariantFile
from svrunner_utils import eprint, warn, vcf_to_pybed from svrunner_utils import eprint, warn, vcf_to_pybed
from svreader.vcfwrapper import VCFRecord, SVReader, SVWriter from svreader.vcfwrapper import VCFRecord, VCFReader, VCFWriter
HOM_REF = (0, 0) HOM_REF = (0, 0)
HET_VAR = (0, 1) HET_VAR = (0, 1)
HOM_VAR = (1, 1) HOM_VAR = (1, 1)
VALID_GENOTYPES = [HOM_REF, HET_VAR, HOM_VAR]
SVTYPER_GENO_LIKELI_TAG = "GL" SVTYPER_GENO_LIKELI_TAG = "GL"
GENOMESTRIP_GENO_LIKELI_TAG = "GP" GENOMESTRIP_GENO_LIKELI_TAG = "GP"
Geno_likeli_tag = SVTYPER_GENO_LIKELI_TAG Geno_likeli_tag = SVTYPER_GENO_LIKELI_TAG
def coded_geno(geno):
if geno == HOM_REF:
return "HOMO_REF"
elif geno == HET_VAR:
return "HET_VAR"
elif geno == HOM_VAR:
return "HOMO_VAR"
else:
return "UNDEF"
def Variant(sample): def Variant(sample):
if sample.get('GT') in [HET_VAR, HOM_VAR]: if sample.get('GT') in [HET_VAR, HOM_VAR]:
return True return True
...@@ -27,7 +42,14 @@ def Variant(sample): ...@@ -27,7 +42,14 @@ def Variant(sample):
return False return False
class VariantRecord(VCFRecord): def Heterozygote(sample):
if sample.get('GT') in [HET_VAR]:
return True
else:
return False
class AnnotateRecord(VCFRecord):
""" """
A lightweight object to annotated the final records A lightweight object to annotated the final records
""" """
...@@ -35,7 +57,8 @@ class VariantRecord(VCFRecord): ...@@ -35,7 +57,8 @@ class VariantRecord(VCFRecord):
""" """
A pysam VariantRecord wrapper A pysam VariantRecord wrapper
""" """
super(VariantRecord, self).__init__(record) super(AnnotateRecord, self).__init__(record)
self.new_id = None
@property @property
def start(self): def start(self):
...@@ -60,29 +83,64 @@ class VariantRecord(VCFRecord): ...@@ -60,29 +83,64 @@ class VariantRecord(VCFRecord):
else: else:
return False return False
def setNewId(self, identifier): @property
def num_samples(self):
return len(self.record.samples.keys())
def setNewId(self, new_id):
self.new_id = new_id
def rename(self):
try: try:
self.record.info['SOURCEID'] = self.id self.record.info['SOURCEID'] = self.id
except KeyError: except KeyError:
eprint("SOURCEID absent from record info keys,") eprint("SOURCEID absent from record info keys,")
sys.exit(1) sys.exit(1)
self.id = identifier self.id = self.new_id
def num_variant_samples(self): def num_variant_samples(self):
return sum([Variant(s) for s in self.samples.values()]) return sum([Variant(s) for s in self.samples.values()])
def variant_read_support(self): def variant_read_support(self):
return max([s.get('AO')[0] for s in self.samples.values()]) support = []
for s in self.samples.values():
if s.get('AO') is not None:
support.append(s.get('AO')[0])
return max(support)
def qual(self): def qual(self):
return sum([s.get('SQ') for s in self.samples.values()]) variant_qual = []
for s in self.samples.values():
if s.get('SQ') is not None:
variant_qual.append(s.get('SQ'))
return sum(variant_qual)
def GQ_samples(self):
genotype_qual = []
for s in self.samples.values():
if s.get('GQ') is not None:
genotype_qual.append(s.get('GQ'))
return genotype_qual
def GQ_sum_score(self): def GQ_sum_score(self):
return sum([s.get('SQ') for s in self.samples.values()]) return sum(self.GQ_samples())
def maxGQ(self):
return max(self.GQ_samples())
def setQual(self): def setQual(self):
self.record.qual = self.qual() self.record.qual = self.qual()
def numdiffGenotypes(self):
genotypes = defaultdict()
for s in self.samples.values():
if s.get('GT') in VALID_GENOTYPES:
genotypes[coded_geno(s.get('GT'))] = 1
return len(genotypes.keys())
def Polymorph(self):
return self.numdiffGenotypes() > 1
def AddSupportInfos(self): def AddSupportInfos(self):
supp_reads = self.variant_read_support() supp_reads = self.variant_read_support()
num_supp_samples = self.num_variant_samples() num_supp_samples = self.num_variant_samples()
...@@ -93,6 +151,24 @@ class VariantRecord(VCFRecord): ...@@ -93,6 +151,24 @@ class VariantRecord(VCFRecord):
eprint("SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys") eprint("SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys")
sys.exit(1) sys.exit(1)
def CallRate(self, cutoff):
call_qual = []
for s in self.samples.values():
if s.get('GQ') is not None:
call_qual.append(s.get('GQ'))
num_qual_call = sum([(qual > cutoff) for qual in call_qual])
return num_qual_call/self.num_samples
def VariantCallRate(self, cutoff):
samples = self.samples.values()
num_qual_var = 0
for s in samples:
if s.get("GQ") is not None and s.get("GQ") > cutoff and Variant(s):
num_qual_var += 1
num_var_samples = self.num_variant_samples()
var_call_rate = num_qual_var/num_var_samples if num_var_samples else 0
return var_call_rate
def UnifiedPass(self): def UnifiedPass(self):
""" """
All records passing the filters (PASS, .) ar now labelled PASS All records passing the filters (PASS, .) ar now labelled PASS
...@@ -106,52 +182,14 @@ class VariantRecord(VCFRecord): ...@@ -106,52 +182,14 @@ class VariantRecord(VCFRecord):
record.filter.add("PASS") record.filter.add("PASS")
class VariantRecordSample(object): class AnnotateReader(VCFReader):
"""
A lightweight object to VariantRecordSample
"""
def __init__(self, variantsample):
self.variantsample = variantsample
def genotype(self):
return self.variantsample.get('GT')
def SQ_score(self):
# Phred-scaled probability that this site is variant
# (non-reference in this sample)
return self.variantsample.get('SQ')
def GQ_score(self):
# Genotype quality
return self.variantsample.get('GQ')
def GL_score(self):
# Genotype Likelihood, log10-scaled likelihoods of the data given the
# called genotype for each possible genotype generated from the
# reference and alternate alleles given the sample ploidy
return self.variantsample.get('GL')
def AltSupport(self):
return self.variantsample.get('AO')[0]
@property
def isVariant(self):
if self.genotype() in [HET_VAR, HOM_VAR]:
return True
else:
return False
class VCFReader(SVReader):
def __init__(self, file_name, sv_to_report=None): def __init__(self, file_name, sv_to_report=None):
super(VCFReader, self).__init__(file_name) super(AnnotateReader, self).__init__(file_name)
self.vcf_reader = VariantFile(file_name)
self.filename = file_name self.filename = file_name
self.sv_to_report = sv_to_report self.sv_to_report = sv_to_report
self.vcf_reader = VariantFile(file_name) self.vcf_reader = VariantFile(file_name)
self.add_metadata() self.add_annotation_metadata()
def __iter__(self): def __iter__(self):
return self return self
...@@ -159,25 +197,25 @@ class VCFReader(SVReader): ...@@ -159,25 +197,25 @@ class VCFReader(SVReader):
def __next__(self): def __next__(self):
while True: while True:
raw_record = next(self.vcf_reader) raw_record = next(self.vcf_reader)
record = VariantRecord(raw_record) record = AnnotateRecord(raw_record)
return record return record
def getHeader(self): def getHeader(self):
return self.vcf_reader.header return self.vcf_reader.header
def addInfo(self, name, number, type, description): # def addInfo(self, name, number, type, description):
self.vcf_reader.header.info.add(id=name, # self.vcf_reader.header.info.add(id=name,
number=number, # number=number,
type=type, # type=type,
description=description) # description=description)
#
def addFilter(self, name, description): # def addFilter(self, name, description):
self.vcf_reader.header.filters.add(id=name, # self.vcf_reader.header.filters.add(id=name,
number=None, # number=None,
type=None, # type=None,
description=description) # description=description)
def add_metadata(self): def add_annotation_metadata(self):
self.addInfo("SOURCEID", 1, "String", self.addInfo("SOURCEID", 1, "String",
"The source sv identifier") "The source sv identifier")
self.addInfo("MAX_SUPP_READS", 1, "Integer", self.addInfo("MAX_SUPP_READS", 1, "Integer",
...@@ -187,7 +225,6 @@ class VCFReader(SVReader): ...@@ -187,7 +225,6 @@ class VCFReader(SVReader):
self.addFilter("LOWSUPPORT", self.addFilter("LOWSUPPORT",
"total supp reads < 5 or supp samples < 2") "total supp reads < 5 or supp samples < 2")
def getOrderedSamples(self): def getOrderedSamples(self):
samples = self.vcf_reader.header.samples samples = self.vcf_reader.header.samples
sample_names = [sample.rsplit('.')[0] for sample in samples] sample_names = [sample.rsplit('.')[0] for sample in samples]
...@@ -197,7 +234,7 @@ class VCFReader(SVReader): ...@@ -197,7 +234,7 @@ class VCFReader(SVReader):
return len(self.vcf_reader.header.samples) return len(self.vcf_reader.header.samples)
class VCFWriter(SVWriter): class AnnotateWriter(VCFWriter):
def __init__(self, file_name, template_reader, index=True): def __init__(self, file_name, template_reader, index=True):
...@@ -367,12 +404,18 @@ def add_redundancy_infos_header(reader): ...@@ -367,12 +404,18 @@ def add_redundancy_infos_header(reader):
# Adding DUPLICATES info (equivalent to GSDUPLICATEOVERLAP) # Adding DUPLICATES info (equivalent to GSDUPLICATEOVERLAP)
reader.addInfo("DUPLICATEOVERLAP", 1, "Float", reader.addInfo("DUPLICATEOVERLAP", 1, "Float",
"Highest overlap with a duplicate event") "Highest overlap with a duplicate event")
# Adding DUPLICATEOVERLAP info (equivalent to GSDUPLICATEOVERLAP) # Adding DUPLICATEOF info (list of variant prefered to this one)
reader.addInfo("DUPLICATES", ".", "String", reader.addInfo("DUPLICATEOF", ".", "String",
"List of duplicate events preferred to this one") "List of duplicate events preferred to this one")
# Adding DUPLICATES info (list of variants duplicates of this one)
reader.addInfo("DUPLICATES", ".", "String",
"List of duplicate events represented by the current sv")
# Adding NONDUPLICATEOVERLAP # Adding NONDUPLICATEOVERLAP
reader.addInfo("NONDUPLICATEOVERLAP", 1, "Float", reader.addInfo("NONDUPLICATEOVERLAP", 1, "Float",
"Amount of overlap with a non-duplicate") "Amount of overlap with a non-duplicate")
# Adding TOOLSUPPORT
reader.addInfo("TOOLSUPPORT", ".", "String",
"Tools supporting (detecting) the sv")
def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader, def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
...@@ -398,7 +441,6 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader, ...@@ -398,7 +441,6 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
overlapping = defaultdict(tuple) overlapping = defaultdict(tuple)
reference = defaultdict() reference = defaultdict()
for o in self_overlap: for o in self_overlap:
print("Here ?", o)
if o[3] == o[7]: if o[3] == o[7]:
continue continue
a, b = ordered(o[3], o[7]) a, b = ordered(o[3], o[7])
...@@ -415,7 +457,7 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader, ...@@ -415,7 +457,7 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
reference[ref] = 1 reference[ref] = 1
overlap_size = int(o[-1]) overlap_size = int(o[-1])
overlap = getoverlap(dupli, overlap_size) overlap = getoverlap(dupli, overlap_size)
if maxGQ(ref) > 0 and passed(dupli): if ref.maxGQ() > 0 and dupli.passed:
# are dismissed # are dismissed
# - reference variant with 0 genotype quality for all markers # - reference variant with 0 genotype quality for all markers
# - duplicate that are already tagged as filtered out # - duplicate that are already tagged as filtered out
...@@ -447,7 +489,7 @@ def add_duplicate_info_sv(sv, duplicateoverlap, duplicatescore, duplicates): ...@@ -447,7 +489,7 @@ def add_duplicate_info_sv(sv, duplicateoverlap, duplicatescore, duplicates):
else: else:
sv.record.info['DUPLICATESCORE'] = duplicatescore sv.record.info['DUPLICATESCORE'] = duplicatescore
sv.record.info['DUPLICATEOVERLAP'] = duplicateoverlap sv.record.info['DUPLICATEOVERLAP'] = duplicateoverlap
sv.record.info['DUPLICATES'] = duplicates sv.record.info['DUPLICATEOF'] = duplicates
def add_overlap_info_sv(sv, overlap, duplicatescore): def add_overlap_info_sv(sv, overlap, duplicatescore):
...@@ -461,13 +503,24 @@ def add_overlap_info_sv(sv, overlap, duplicatescore): ...@@ -461,13 +503,24 @@ def add_overlap_info_sv(sv, overlap, duplicatescore):
sv.record.info['NONDUPLICATEOVERLAP'] = overlap sv.record.info['NONDUPLICATEOVERLAP'] = overlap
def add_callrate_infos_header(reader):
# Adding CALLRATE info
reader.addInfo("CALLRATE", 1, "Float",
"Percentage of samples called with a GQ>13")
# Adding VARIANTCALLRATE info
reader.addInfo("VARIANTCALLRATE", 1, "Float",
"Percentage of variant samples called with a GQ>13")
def add_filter_infos_header(reader): def add_filter_infos_header(reader):
# FILTERS # FILTERS
# Adding specific filters # Adding specific filters
reader.addFilter("CALLRATE", "Call rate <0.75") reader.addFilter("CALLRATE", "Call rate <0.75")
reader.addFilter("VARIANTCALLRATE", "Variant Call rate <0.75")
reader.addFilter("MONOMORPH", "All samples have the same genotype") reader.addFilter("MONOMORPH", "All samples have the same genotype")
reader.addFilter("DUPLICATE", "GSDUPLICATESCORE>0") reader.addFilter("DUPLICATE", "GSDUPLICATESCORE>0")
reader.addFilter("OVERLAP", "NONDUPLICATEOVERLAP>0.7") reader.addFilter("OVERLAP", "NONDUPLICATEOVERLAP>0.7")
reader.addFilter("ABFREQ", "AB frequency <0.3 for >50% heterosamples")
def GenomeSTRIPLikefiltering(SVSet, reader): def GenomeSTRIPLikefiltering(SVSet, reader):
...@@ -481,15 +534,122 @@ def GenomeSTRIPLikefiltering(SVSet, reader): ...@@ -481,15 +534,122 @@ def GenomeSTRIPLikefiltering(SVSet, reader):
(use NONVARIANTSCORE in both cases) (use NONVARIANTSCORE in both cases)
""" """
add_callrate_infos_header(reader)
add_filter_infos_header(reader) add_filter_infos_header(reader)
for sv in SVSet: for sv in SVSet:
info = sv.record.info info = sv.record.info
if callRate(sv) < 0.75: sv.record.info['CALLRATE'] = sv.CallRate(13)
sv.record.info['VARIANTCALLRATE'] = sv.VariantCallRate(13)
if sv.CallRate(13) < 0.75:
sv.filter.add("CALLRATE") sv.filter.add("CALLRATE")
if not Polymorph(sv): if not sv.Polymorph():
sv.filter.add("MONOMORPH") sv.filter.add("MONOMORPH")
if 'NONDUPLICATEOVERLAP' in info and info['NONDUPLICATEOVERLAP'] > 0.7: if 'NONDUPLICATEOVERLAP' in info and info['NONDUPLICATEOVERLAP'] > 0.7:
sv.filter.add("OVERLAP") sv.filter.add("OVERLAP")
if "DUPLICATESCORE" in info is not None and info['DUPLICATESCORE'] > -2: if "DUPLICATESCORE" in info is not None and info['DUPLICATESCORE'] > -2:
sv.filter.add("DUPLICATE") sv.filter.add("DUPLICATE")
def ABFreqFiltering(SVSet):
""" Filtering the candidate CNVs according to the following criteria
- more than 50% of variant samples should have AB freq > 0.3
"""
for sv in SVSet:
ABfreqOK = []
for s in sv.record.samples.values():
if Heterozygote(s):
ABfreqOK.append((s.get('AB')[0] > 0.3))
if len(ABfreqOK) > 0 and sum(ABfreqOK) < len(ABfreqOK)/2:
sv.filter.add("ABFREQ")
def GetConnectedDuplicates(SVSet):
"""
Construct connected components of duplicates and rename the variants
"""
undirected = Graph()
variant_dict = defaultdict()
representatives = defaultdict()
for s in SVSet:
variant_dict[s.id] = s
if "DUPLICATE" in s.filter:
for dupli_repr in s.record.info["DUPLICATEOF"]:
undirected.add_edge(s.id, dupli_repr)
for component in connected_components(undirected):
for c in component:
if "DUPLICATEOF" in variant_dict[c].record.info:
# the current variant is a duplicate
continue
rep = c # the representative of the equivalence class
break
duplicates = component
duplicates.remove(rep)
representatives[rep] = duplicates
add_duplicate_infos(representatives, variant_dict)
def add_duplicate_infos(representatives, sv_dict):
for rep, elements in representatives.items():
for d in elements:
sv_dict[d].record.info['DUPLICATEOF'] = rep
duplicates = list(elements)
if 'DUPLICATES' in sv_dict[rep].record.info:
print(sv_dict[rep].record.info['DUPLICATES'])
duplicates.extend(sv_dict[rep].record.info['DUPLICATES'])
if len(duplicates) > 0:
sv_dict[rep].record.info['DUPLICATES'] = duplicates
def get_tool_name(sv_ident):
return sv_ident.split("_")[0]
def SetSupportingTools(SVSet):
for sv in SVSet:
tools = {get_tool_name(sv.id)}
if "DUPLICATES" in sv.record.info:
duplicates = sv.record.info['DUPLICATES']
# print(duplicates)
for dupli in duplicates:
tools.add(get_tool_name(dupli))
if 'TOOLSUPPORT' in sv.record.info:
supporting = set(sv.record.info['TOOLSUPPORT'])
tools.union(supporting)
sv.record.info['TOOLSUPPORT'] = list(tools)
def static_vars(**kwargs):
def decorate(func):
for k in kwargs:
setattr(func, k, kwargs[k])
return func
return decorate
@static_vars(counter=0)
def new_id_str(sv):
new_id_str.counter += 1
return "_".join(["cnvpipeline", sv.chrom, sv.svtype,
str(new_id_str.counter)])
def rename_info_field(sv, key, sv_dict):
if key in sv.record.info:
info_oldid = sv.record.info[key]
info_newid = [sv_dict[id] for id in info_oldid]
sv.record.info[key] = info_newid
def RenameSV(SVSet):
sv_dict = defaultdict()
for sv in SVSet:
new_id = new_id_str(sv)
sv.setNewId(new_id)
sv_dict[sv.id] = new_id
for sv in SVSet:
rename_info_field(sv, "DUPLICATEOF", sv_dict)
rename_info_field(sv, "DUPLICATES", sv_dict)
sv.record.info['SOURCEID'] = sv.id
sv.rename()
...@@ -388,6 +388,7 @@ class PindelReader(SVReader): ...@@ -388,6 +388,7 @@ class PindelReader(SVReader):
# # Sudmant et al 2015 SuppInfo # # Sudmant et al 2015 SuppInfo
# return (record.length() > 60) # return (record.length() > 60)
def SpecificFilterPass(self, record): def SpecificFilterPass(self, record):
# new pindel specific filter
if record.length() >= 800 or record.length() <= 60: if record.length() >= 800 or record.length() <= 60:
return False return False
elif record.MaxIndividualSupport() <= 3: elif record.MaxIndividualSupport() <= 3:
......
...@@ -39,6 +39,10 @@ class VCFRecord(SVRecord): ...@@ -39,6 +39,10 @@ class VCFRecord(SVRecord):
def id(self): def id(self):
return self.record.id return self.record.id
@id.setter
def id(self, id):
self.record.id = id
@property @property
def pos(self): def pos(self):
return self.record.pos return self.record.pos
...@@ -51,6 +55,10 @@ class VCFRecord(SVRecord): ...@@ -51,6 +55,10 @@ class VCFRecord(SVRecord):
def stop(self): def stop(self):
return self.record.stop return self.record.stop
@property
def svtype(self):
return self._sv_type
@property @property
def filter(self): def filter(self):
return self.record.filter return self.record.filter
...@@ -84,16 +92,24 @@ class VCFReader(SVReader): ...@@ -84,16 +92,24 @@ class VCFReader(SVReader):
self.vcf_reader.header.info.remove_header(key) self.vcf_reader.header.info.remove_header(key)
def addInfo(self, name, number, type, description): def addInfo(self, name, number, type, description):
self.vcf_reader.header.info.add(id=name, # we add info only if absent from the current header
number=number, if name in self.vcf_reader.header.info:
type=type, return
description=description) else:
self.vcf_reader.header.info.add(id=name,
number=number,
type=type,
description=description)
def addFilter(self, name, description): def addFilter(self, name, description):
self.vcf_reader.header.filters.add(id=name, # we add filter info only if absent from the current header
number=None, if name in self.vcf_reader.header.filters:
type=None, return
description=description) else:
self.vcf_reader.header.filters.add(id=name,
number=None,
type=None,
description=description)
def getOrderedSamples(self): def getOrderedSamples(self):
samples = self.vcf_reader.header.samples samples = self.vcf_reader.header.samples
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment