Skip to content
Snippets Groups Projects
Commit 093720f3 authored by Thomas Faraut's avatar Thomas Faraut
Browse files

Merge branch 'newfilter' of forgemia.inra.fr:svdetection/svlib into newfilter

parents affa74ae aa96035d
No related branches found
No related tags found
No related merge requests found
......@@ -2,24 +2,39 @@
import sys
from collections import defaultdict
from networkx import Graph, connected_components
import numpy as np
from math import isnan
from pysam import VariantFile
from svrunner_utils import eprint, warn, vcf_to_pybed
from svreader.vcfwrapper import VCFRecord, SVReader, SVWriter
from svreader.vcfwrapper import VCFRecord, VCFReader, VCFWriter
HOM_REF = (0, 0)
HET_VAR = (0, 1)
HOM_VAR = (1, 1)
VALID_GENOTYPES = [HOM_REF, HET_VAR, HOM_VAR]
SVTYPER_GENO_LIKELI_TAG = "GL"
GENOMESTRIP_GENO_LIKELI_TAG = "GP"
Geno_likeli_tag = SVTYPER_GENO_LIKELI_TAG
def coded_geno(geno):
if geno == HOM_REF:
return "HOMO_REF"
elif geno == HET_VAR:
return "HET_VAR"
elif geno == HOM_VAR:
return "HOMO_VAR"
else:
return "UNDEF"
def Variant(sample):
if sample.get('GT') in [HET_VAR, HOM_VAR]:
return True
......@@ -27,7 +42,14 @@ def Variant(sample):
return False
class VariantRecord(VCFRecord):
def Heterozygote(sample):
if sample.get('GT') in [HET_VAR]:
return True
else:
return False
class AnnotateRecord(VCFRecord):
"""
A lightweight object to annotated the final records
"""
......@@ -35,7 +57,8 @@ class VariantRecord(VCFRecord):
"""
A pysam VariantRecord wrapper
"""
super(VariantRecord, self).__init__(record)
super(AnnotateRecord, self).__init__(record)
self.new_id = None
@property
def start(self):
......@@ -60,29 +83,64 @@ class VariantRecord(VCFRecord):
else:
return False
def setNewId(self, identifier):
@property
def num_samples(self):
return len(self.record.samples.keys())
def setNewId(self, new_id):
self.new_id = new_id
def rename(self):
try:
self.record.info['SOURCEID'] = self.id
except KeyError:
eprint("SOURCEID absent from record info keys,")
sys.exit(1)
self.id = identifier
self.id = self.new_id
def num_variant_samples(self):
return sum([Variant(s) for s in self.samples.values()])
def variant_read_support(self):
return max([s.get('AO')[0] for s in self.samples.values()])
support = []
for s in self.samples.values():
if s.get('AO') is not None:
support.append(s.get('AO')[0])
return max(support)
def qual(self):
return sum([s.get('SQ') for s in self.samples.values()])
variant_qual = []
for s in self.samples.values():
if s.get('SQ') is not None:
variant_qual.append(s.get('SQ'))
return sum(variant_qual)
def GQ_samples(self):
genotype_qual = []
for s in self.samples.values():
if s.get('GQ') is not None:
genotype_qual.append(s.get('GQ'))
return genotype_qual
def GQ_sum_score(self):
return sum([s.get('SQ') for s in self.samples.values()])
return sum(self.GQ_samples())
def maxGQ(self):
return max(self.GQ_samples())
def setQual(self):
self.record.qual = self.qual()
def numdiffGenotypes(self):
genotypes = defaultdict()
for s in self.samples.values():
if s.get('GT') in VALID_GENOTYPES:
genotypes[coded_geno(s.get('GT'))] = 1
return len(genotypes.keys())
def Polymorph(self):
return self.numdiffGenotypes() > 1
def AddSupportInfos(self):
supp_reads = self.variant_read_support()
num_supp_samples = self.num_variant_samples()
......@@ -93,6 +151,24 @@ class VariantRecord(VCFRecord):
eprint("SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys")
sys.exit(1)
def CallRate(self, cutoff):
call_qual = []
for s in self.samples.values():
if s.get('GQ') is not None:
call_qual.append(s.get('GQ'))
num_qual_call = sum([(qual > cutoff) for qual in call_qual])
return num_qual_call/self.num_samples
def VariantCallRate(self, cutoff):
samples = self.samples.values()
num_qual_var = 0
for s in samples:
if s.get("GQ") is not None and s.get("GQ") > cutoff and Variant(s):
num_qual_var += 1
num_var_samples = self.num_variant_samples()
var_call_rate = num_qual_var/num_var_samples if num_var_samples else 0
return var_call_rate
def UnifiedPass(self):
"""
All records passing the filters (PASS, .) ar now labelled PASS
......@@ -106,52 +182,14 @@ class VariantRecord(VCFRecord):
record.filter.add("PASS")
class VariantRecordSample(object):
"""
A lightweight object to VariantRecordSample
"""
def __init__(self, variantsample):
self.variantsample = variantsample
def genotype(self):
return self.variantsample.get('GT')
def SQ_score(self):
# Phred-scaled probability that this site is variant
# (non-reference in this sample)
return self.variantsample.get('SQ')
def GQ_score(self):
# Genotype quality
return self.variantsample.get('GQ')
def GL_score(self):
# Genotype Likelihood, log10-scaled likelihoods of the data given the
# called genotype for each possible genotype generated from the
# reference and alternate alleles given the sample ploidy
return self.variantsample.get('GL')
def AltSupport(self):
return self.variantsample.get('AO')[0]
@property
def isVariant(self):
if self.genotype() in [HET_VAR, HOM_VAR]:
return True
else:
return False
class VCFReader(SVReader):
class AnnotateReader(VCFReader):
def __init__(self, file_name, sv_to_report=None):
super(VCFReader, self).__init__(file_name)
self.vcf_reader = VariantFile(file_name)
super(AnnotateReader, self).__init__(file_name)
self.filename = file_name
self.sv_to_report = sv_to_report
self.vcf_reader = VariantFile(file_name)
self.add_metadata()
self.add_annotation_metadata()
def __iter__(self):
return self
......@@ -159,25 +197,25 @@ class VCFReader(SVReader):
def __next__(self):
while True:
raw_record = next(self.vcf_reader)
record = VariantRecord(raw_record)
record = AnnotateRecord(raw_record)
return record
def getHeader(self):
return self.vcf_reader.header
def addInfo(self, name, number, type, description):
self.vcf_reader.header.info.add(id=name,
number=number,
type=type,
description=description)
def addFilter(self, name, description):
self.vcf_reader.header.filters.add(id=name,
number=None,
type=None,
description=description)
def add_metadata(self):
# def addInfo(self, name, number, type, description):
# self.vcf_reader.header.info.add(id=name,
# number=number,
# type=type,
# description=description)
#
# def addFilter(self, name, description):
# self.vcf_reader.header.filters.add(id=name,
# number=None,
# type=None,
# description=description)
def add_annotation_metadata(self):
self.addInfo("SOURCEID", 1, "String",
"The source sv identifier")
self.addInfo("MAX_SUPP_READS", 1, "Integer",
......@@ -187,7 +225,6 @@ class VCFReader(SVReader):
self.addFilter("LOWSUPPORT",
"total supp reads < 5 or supp samples < 2")
def getOrderedSamples(self):
samples = self.vcf_reader.header.samples
sample_names = [sample.rsplit('.')[0] for sample in samples]
......@@ -197,7 +234,7 @@ class VCFReader(SVReader):
return len(self.vcf_reader.header.samples)
class VCFWriter(SVWriter):
class AnnotateWriter(VCFWriter):
def __init__(self, file_name, template_reader, index=True):
......@@ -367,12 +404,18 @@ def add_redundancy_infos_header(reader):
# Adding DUPLICATES info (equivalent to GSDUPLICATEOVERLAP)
reader.addInfo("DUPLICATEOVERLAP", 1, "Float",
"Highest overlap with a duplicate event")
# Adding DUPLICATEOVERLAP info (equivalent to GSDUPLICATEOVERLAP)
reader.addInfo("DUPLICATES", ".", "String",
# Adding DUPLICATEOF info (list of variant prefered to this one)
reader.addInfo("DUPLICATEOF", ".", "String",
"List of duplicate events preferred to this one")
# Adding DUPLICATES info (list of variants duplicates of this one)
reader.addInfo("DUPLICATES", ".", "String",
"List of duplicate events represented by the current sv")
# Adding NONDUPLICATEOVERLAP
reader.addInfo("NONDUPLICATEOVERLAP", 1, "Float",
"Amount of overlap with a non-duplicate")
# Adding TOOLSUPPORT
reader.addInfo("TOOLSUPPORT", ".", "String",
"Tools supporting (detecting) the sv")
def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
......@@ -398,7 +441,6 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
overlapping = defaultdict(tuple)
reference = defaultdict()
for o in self_overlap:
print("Here ?", o)
if o[3] == o[7]:
continue
a, b = ordered(o[3], o[7])
......@@ -415,7 +457,7 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
reference[ref] = 1
overlap_size = int(o[-1])
overlap = getoverlap(dupli, overlap_size)
if maxGQ(ref) > 0 and passed(dupli):
if ref.maxGQ() > 0 and dupli.passed:
# are dismissed
# - reference variant with 0 genotype quality for all markers
# - duplicate that are already tagged as filtered out
......@@ -447,7 +489,7 @@ def add_duplicate_info_sv(sv, duplicateoverlap, duplicatescore, duplicates):
else:
sv.record.info['DUPLICATESCORE'] = duplicatescore
sv.record.info['DUPLICATEOVERLAP'] = duplicateoverlap
sv.record.info['DUPLICATES'] = duplicates
sv.record.info['DUPLICATEOF'] = duplicates
def add_overlap_info_sv(sv, overlap, duplicatescore):
......@@ -461,13 +503,24 @@ def add_overlap_info_sv(sv, overlap, duplicatescore):
sv.record.info['NONDUPLICATEOVERLAP'] = overlap
def add_callrate_infos_header(reader):
# Adding CALLRATE info
reader.addInfo("CALLRATE", 1, "Float",
"Percentage of samples called with a GQ>13")
# Adding VARIANTCALLRATE info
reader.addInfo("VARIANTCALLRATE", 1, "Float",
"Percentage of variant samples called with a GQ>13")
def add_filter_infos_header(reader):
# FILTERS
# Adding specific filters
reader.addFilter("CALLRATE", "Call rate <0.75")
reader.addFilter("VARIANTCALLRATE", "Variant Call rate <0.75")
reader.addFilter("MONOMORPH", "All samples have the same genotype")
reader.addFilter("DUPLICATE", "GSDUPLICATESCORE>0")
reader.addFilter("OVERLAP", "NONDUPLICATEOVERLAP>0.7")
reader.addFilter("ABFREQ", "AB frequency <0.3 for >50% heterosamples")
def GenomeSTRIPLikefiltering(SVSet, reader):
......@@ -481,15 +534,122 @@ def GenomeSTRIPLikefiltering(SVSet, reader):
(use NONVARIANTSCORE in both cases)
"""
add_callrate_infos_header(reader)
add_filter_infos_header(reader)
for sv in SVSet:
info = sv.record.info
if callRate(sv) < 0.75:
sv.record.info['CALLRATE'] = sv.CallRate(13)
sv.record.info['VARIANTCALLRATE'] = sv.VariantCallRate(13)
if sv.CallRate(13) < 0.75:
sv.filter.add("CALLRATE")
if not Polymorph(sv):
if not sv.Polymorph():
sv.filter.add("MONOMORPH")
if 'NONDUPLICATEOVERLAP' in info and info['NONDUPLICATEOVERLAP'] > 0.7:
sv.filter.add("OVERLAP")
if "DUPLICATESCORE" in info is not None and info['DUPLICATESCORE'] > -2:
sv.filter.add("DUPLICATE")
def ABFreqFiltering(SVSet):
""" Filtering the candidate CNVs according to the following criteria
- more than 50% of variant samples should have AB freq > 0.3
"""
for sv in SVSet:
ABfreqOK = []
for s in sv.record.samples.values():
if Heterozygote(s):
ABfreqOK.append((s.get('AB')[0] > 0.3))
if len(ABfreqOK) > 0 and sum(ABfreqOK) < len(ABfreqOK)/2:
sv.filter.add("ABFREQ")
def GetConnectedDuplicates(SVSet):
"""
Construct connected components of duplicates and rename the variants
"""
undirected = Graph()
variant_dict = defaultdict()
representatives = defaultdict()
for s in SVSet:
variant_dict[s.id] = s
if "DUPLICATE" in s.filter:
for dupli_repr in s.record.info["DUPLICATEOF"]:
undirected.add_edge(s.id, dupli_repr)
for component in connected_components(undirected):
for c in component:
if "DUPLICATEOF" in variant_dict[c].record.info:
# the current variant is a duplicate
continue
rep = c # the representative of the equivalence class
break
duplicates = component
duplicates.remove(rep)
representatives[rep] = duplicates
add_duplicate_infos(representatives, variant_dict)
def add_duplicate_infos(representatives, sv_dict):
for rep, elements in representatives.items():
for d in elements:
sv_dict[d].record.info['DUPLICATEOF'] = rep
duplicates = list(elements)
if 'DUPLICATES' in sv_dict[rep].record.info:
print(sv_dict[rep].record.info['DUPLICATES'])
duplicates.extend(sv_dict[rep].record.info['DUPLICATES'])
if len(duplicates) > 0:
sv_dict[rep].record.info['DUPLICATES'] = duplicates
def get_tool_name(sv_ident):
return sv_ident.split("_")[0]
def SetSupportingTools(SVSet):
for sv in SVSet:
tools = {get_tool_name(sv.id)}
if "DUPLICATES" in sv.record.info:
duplicates = sv.record.info['DUPLICATES']
# print(duplicates)
for dupli in duplicates:
tools.add(get_tool_name(dupli))
if 'TOOLSUPPORT' in sv.record.info:
supporting = set(sv.record.info['TOOLSUPPORT'])
tools.union(supporting)
sv.record.info['TOOLSUPPORT'] = list(tools)
def static_vars(**kwargs):
def decorate(func):
for k in kwargs:
setattr(func, k, kwargs[k])
return func
return decorate
@static_vars(counter=0)
def new_id_str(sv):
new_id_str.counter += 1
return "_".join(["cnvpipeline", sv.chrom, sv.svtype,
str(new_id_str.counter)])
def rename_info_field(sv, key, sv_dict):
if key in sv.record.info:
info_oldid = sv.record.info[key]
info_newid = [sv_dict[id] for id in info_oldid]
sv.record.info[key] = info_newid
def RenameSV(SVSet):
sv_dict = defaultdict()
for sv in SVSet:
new_id = new_id_str(sv)
sv.setNewId(new_id)
sv_dict[sv.id] = new_id
for sv in SVSet:
rename_info_field(sv, "DUPLICATEOF", sv_dict)
rename_info_field(sv, "DUPLICATES", sv_dict)
sv.record.info['SOURCEID'] = sv.id
sv.rename()
......@@ -388,6 +388,7 @@ class PindelReader(SVReader):
# # Sudmant et al 2015 SuppInfo
# return (record.length() > 60)
def SpecificFilterPass(self, record):
# new pindel specific filter
if record.length() >= 800 or record.length() <= 60:
return False
elif record.MaxIndividualSupport() <= 3:
......
......@@ -39,6 +39,10 @@ class VCFRecord(SVRecord):
def id(self):
return self.record.id
@id.setter
def id(self, id):
self.record.id = id
@property
def pos(self):
return self.record.pos
......@@ -51,6 +55,10 @@ class VCFRecord(SVRecord):
def stop(self):
return self.record.stop
@property
def svtype(self):
return self._sv_type
@property
def filter(self):
return self.record.filter
......@@ -84,16 +92,24 @@ class VCFReader(SVReader):
self.vcf_reader.header.info.remove_header(key)
def addInfo(self, name, number, type, description):
self.vcf_reader.header.info.add(id=name,
number=number,
type=type,
description=description)
# we add info only if absent from the current header
if name in self.vcf_reader.header.info:
return
else:
self.vcf_reader.header.info.add(id=name,
number=number,
type=type,
description=description)
def addFilter(self, name, description):
self.vcf_reader.header.filters.add(id=name,
number=None,
type=None,
description=description)
# we add filter info only if absent from the current header
if name in self.vcf_reader.header.filters:
return
else:
self.vcf_reader.header.filters.add(id=name,
number=None,
type=None,
description=description)
def getOrderedSamples(self):
samples = self.vcf_reader.header.samples
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment