Commit b0069bcf authored by Thomas Faraut

a more pep8 compliant code

parent 28cadf8a
@@ -53,6 +53,7 @@ class AnnotateRecord(VCFRecord):
    """
    A lightweight object to annotated the final records
    """
+
    def __init__(self, record):
        """
        A pysam VariantRecord wrapper
@@ -128,7 +129,7 @@ class AnnotateRecord(VCFRecord):
    def maxGQ(self):
        return max(self.GQ_samples())

-    def setQual(self):
+    def set_qual(self):
        self.record.qual = self.qual()

    def numdiffGenotypes(self):
@@ -138,10 +139,10 @@ class AnnotateRecord(VCFRecord):
            genotypes[coded_geno(s.get('GT'))] = 1
        return len(genotypes.keys())

-    def Polymorph(self):
+    def polymorph(self):
        return self.numdiffGenotypes() > 1

-    def AddSupportInfos(self):
+    def add_supporting_infos(self):
        supp_reads = self.variant_read_support()
        num_supp_samples = self.num_variant_samples()
        try:
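
Aside: numdiffGenotypes counts the distinct called genotypes across samples and polymorph simply checks that more than one is present. A minimal standalone sketch of that check (is_polymorph_sketch and the genotype list are illustrative, not part of this module):

def is_polymorph_sketch(genotypes):
    # genotypes: per-sample GT tuples as returned by pysam, e.g. (0, 1)
    return len(set(genotypes)) > 1

# is_polymorph_sketch([(0, 0), (0, 1), (0, 0)]) -> True (site is polymorphic)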
@@ -151,25 +152,25 @@ class AnnotateRecord(VCFRecord):
            eprint("SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys")
            sys.exit(1)

-    def CallRate(self, cutoff):
+    def call_rate(self, cutoff):
        call_qual = []
        for s in self.samples.values():
            if s.get('GQ') is not None:
                call_qual.append(s.get('GQ'))
        num_qual_call = sum([(qual > cutoff) for qual in call_qual])
-        return num_qual_call/self.num_samples
+        return num_qual_call / self.num_samples

-    def VariantCallRate(self, cutoff):
+    def variant_call_rate(self, cutoff):
        samples = self.samples.values()
        num_qual_var = 0
        for s in samples:
            if s.get("GQ") is not None and s.get("GQ") > cutoff and Variant(s):
                num_qual_var += 1
        num_var_samples = self.num_variant_samples()
-        var_call_rate = num_qual_var/num_var_samples if num_var_samples else 0
+        var_call_rate = num_qual_var / num_var_samples if num_var_samples else 0
        return var_call_rate

-    def UnifiedPass(self):
+    def unify_pass_filtertag(self):
        """
        All records passing the filters (PASS, .) ar now labelled PASS
        """
@@ -203,17 +204,6 @@ class AnnotateReader(VCFReader):
    def getHeader(self):
        return self.vcf_reader.header

-    # def addInfo(self, name, number, type, description):
-    #     self.vcf_reader.header.info.add(id=name,
-    #                                     number=number,
-    #                                     type=type,
-    #                                     description=description)
-    #
-    # def addFilter(self, name, description):
-    #     self.vcf_reader.header.filters.add(id=name,
-    #                                        number=None,
-    #                                        type=None,
-    #                                        description=description)

    def add_annotation_metadata(self):
        self.addInfo("SOURCEID", 1, "String",
@@ -290,7 +280,7 @@ def probas(likelihoods):

def getprobas(sample):
    # Transforming likelihods into probabilities
-    return probas(getlikelihoods(sample))/np.sum(probas(getlikelihoods(sample)))
+    return probas(getlikelihoods(sample)) / np.sum(probas(getlikelihoods(sample)))


def ondiagonal(u_s, v_s):
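
Aside: getprobas normalises the output of probas so that one sample's genotype probabilities sum to 1. Assuming the genotype likelihoods are log10-scaled (as svtyper GL values are), the transformation can be sketched as follows (getprobas_sketch is illustrative and stands in for the module's probas/getlikelihoods pair):

import numpy as np

def getprobas_sketch(log10_likelihoods):
    # Turn log10 genotype likelihoods into relative probabilities,
    # then normalise them to sum to 1 (assumed scale, see lead-in).
    raw = np.power(10.0, np.asarray(log10_likelihoods, dtype=float))
    return raw / np.sum(raw)

# getprobas_sketch([-0.1, -2.0, -8.0]) -> approximately [0.99, 0.01, 0.00]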
@@ -301,7 +291,7 @@ def ondiagonal(u_s, v_s):
    q = getprobas(v_s)
    proba = 0
    for a, b in zip(p, q):
-        proba += a*b
+        proba += a * b
    # print("Proba on %3.5f" %(proba))
    return proba
@@ -315,7 +305,7 @@ def offdiagonal(u_s, v_s):
    for i, a in enumerate(p):
        for j, b in enumerate(q):
            if i != j:
-                proba += a*b
+                proba += a * b
    # print("Proba off %3.2f" %(proba))
    return proba
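
Aside: for one sample, ondiagonal sums p_i * q_i over the two records' genotype probability vectors (probability both calls carry the same genotype) and offdiagonal sums p_i * q_j for i != j (probability they disagree). With normalised vectors the two quantities are complementary, which a vectorised sketch makes explicit (names are illustrative):

import numpy as np

def agreement_probs_sketch(p, q):
    # p, q: normalised genotype probability vectors, e.g. [P(0/0), P(0/1), P(1/1)]
    p, q = np.asarray(p, dtype=float), np.asarray(q, dtype=float)
    on = float(np.dot(p, q))                   # same-genotype probability
    off = float(np.sum(np.outer(p, q)) - on)   # different-genotype probability
    return on, off                             # off == 1 - on when p and q each sum to 1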
@@ -345,35 +335,41 @@ def duplicatescore(u, v):
        if offdiago > max_disc:
            max_disc = offdiago
    if max_disc > 0 and max_disc < 1:
-        ratio = (1-max_disc)/max_disc
+        ratio = (1 - max_disc) / max_disc
        computed = np.log(ratio)
    return computed


def gstrength(u):
-    # Sum of phred-like genotype qualities provides a measure of the
-    # combined genotype quality of the site
-    # np.sum([s['GQ'] if s['GQ'] is not None else 0 for s in u.samples.values()])
+    """
+    Sum of phred-like genotype qualities provides a measure of the
+    combined genotype quality of the site
+    np.sum([s['GQ'] if s['GQ'] is not None else 0 for s in u.samples.values()])
+    """
    return u.GQ_sum_score()


def variantstrength(u):
-    # maximum SQ, where SQ stands for
-    # Phred-scaled probability that this site is variant (non-reference)
-    # in this sample)
-    # QUAL = -10 * log(P(locus is reference in all samples)), which is
-    # equal to the sum of the SQ scores.
-    # see https://github.com/hall-lab/svtyper/issues/10
-    # sum([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
+    """
+    maximum SQ, where SQ stands for
+    Phred-scaled probability that this site is variant (non-reference)
+    in this sample)
+    QUAL = -10 * log(P(locus is reference in all samples)), which is
+    equal to the sum of the SQ scores.
+    see https://github.com/hall-lab/svtyper/issues/10
+    sum([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
+    """
    return u.qual()
    # max([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])


def getduplicates_GQ(u, v):
-    # select the prefered duplicate on the basis of the
-    # Sum of phred-like genotype qualities
-    # see gstrength
-    # returns prefered, discarded, strength of both
+    """
+    select the prefered duplicate on the basis of the
+    Sum of phred-like genotype qualities
+    see gstrength
+    returns prefered, discarded, strength of both
+    """
    if gstrength(u) > gstrength(v):
        return (u, v, gstrength(u), gstrength(v))
    else:
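
Aside: duplicatescore turns the largest observed disagreement probability between two candidate records into a log ratio, log((1 - max_disc) / max_disc), so near-identical calls get a large positive score and strongly discordant calls a negative one; the annotator further down treats scores above -2 as duplicates. A sketch of the scoring step only (duplicate_score_sketch is illustrative):

import numpy as np

def duplicate_score_sketch(max_disc):
    # max_disc: maximum per-sample probability that the two records
    # carry different genotypes (see offdiagonal above).
    if 0 < max_disc < 1:
        return np.log((1 - max_disc) / max_disc)
    return None  # outside the open interval the score is left undefined in this sketch

# duplicate_score_sketch(0.1) ->  2.197 (likely the same variant)
# duplicate_score_sketch(0.9) -> -2.197 (likely distinct variants)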
@@ -381,10 +377,12 @@ def getduplicates_GQ(u, v):

def getduplicates_QUAL(u, v):
-    # select the prefered duplicate on the basis of
-    # Phred-scaled probability that this site is a variant
-    # see variantstrength
-    # returns prefered, discarded, strength of both
+    """
+    select the prefered duplicate on the basis of
+    Phred-scaled probability that this site is a variant
+    see variantstrength
+    returns prefered, discarded, strength of both
+    """
    if variantstrength(u) > variantstrength(v):
        return (u, v, variantstrength(u), variantstrength(v))
    else:
@@ -393,7 +391,7 @@ def getduplicates_QUAL(u, v):

def getoverlap(u, osize):
    # percentage overlap given the size of the overlap
-    return 100*osize/u.svlen
+    return 100 * osize / u.svlen


def add_redundancy_infos_header(reader):
@@ -418,9 +416,8 @@ def add_redundancy_infos_header(reader):
                   "Tools supporting (detecting) the sv")


-def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
-                                       duplicatescore_threshold=-2,
-                                       genotyper="svtyper"):
+def redundancy_annotator(SVSet, reader,
+                         duplicatescore_threshold=-2, genotyper="svtyper"):
    """ Annotating duplicate candidates based on the genotype likelihoods
    - genotype likelihoods can be provided by svtyper or genomestrip
    """
@@ -452,7 +449,7 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
        score = duplicatescore(u, v)
        # print("Comparing %s and %s : %3.8f" % (u.id, v.id, score))
        if score > duplicatescore_threshold:
-            ref, dupli, s1, s2 = getduplicates_GQ(u, v)
+            ref, dupli, _, _ = getduplicates_GQ(u, v)
            # print("%s prefered to %s %3.8f" % (ref.id, dupli.id, score))
            reference[ref] = 1
            overlap_size = int(o[-1])
@@ -523,7 +520,7 @@ def add_filter_infos_header(reader):
    reader.addFilter("ABFREQ", "AB frequency <0.3 for >50% heterosamples")


-def GenomeSTRIPLikefiltering(SVSet, reader):
+def variant_filtration(SVSet, reader):
    """ Filtering the candidate CNVs according to the following criteria
    - non duplicate sites
    - variant sites
@@ -539,11 +536,11 @@ def GenomeSTRIPLikefiltering(SVSet, reader):
    for sv in SVSet:
        info = sv.record.info
-        sv.record.info['CALLRATE'] = sv.CallRate(13)
-        sv.record.info['VARIANTCALLRATE'] = sv.VariantCallRate(13)
+        sv.record.info['CALLRATE'] = sv.call_rate(13)
+        sv.record.info['VARIANTCALLRATE'] = sv.variant_call_rate(13)

-        if sv.CallRate(13) < 0.75:
+        if sv.call_rate(13) < 0.75:
            sv.filter.add("CALLRATE")
-        if not sv.Polymorph():
+        if not sv.polymorph():
            sv.filter.add("MONOMORPH")
        if 'NONDUPLICATEOVERLAP' in info and info['NONDUPLICATEOVERLAP'] > 0.7:
            sv.filter.add("OVERLAP")
@@ -551,21 +548,22 @@ def GenomeSTRIPLikefiltering(SVSet, reader):
            sv.filter.add("DUPLICATE")


-def ABFreqFiltering(SVSet):
+def AB_filtering(variant_set):
    """ Filtering the candidate CNVs according to the following criteria
    - more than 50% of variant samples should have AB freq > 0.3
    """
-    for sv in SVSet:
-        ABfreqOK = []
+    for sv in variant_set:
+        valid_AB_freq = []
        for s in sv.record.samples.values():
            if Heterozygote(s):
-                ABfreqOK.append((s.get('AB')[0] > 0.3))
-        if len(ABfreqOK) > 0 and sum(ABfreqOK) < len(ABfreqOK)/2:
+                valid_AB_freq.append((s.get('AB')[0] > 0.3))
+        if (len(valid_AB_freq) > 0 and
+                sum(valid_AB_freq) < len(valid_AB_freq) / 2):
            sv.filter.add("ABFREQ")


-def GetConnectedDuplicates(SVSet):
+def get_connected_duplicates(SVSet):
    """
    Construct connected components of duplicates and rename the variants
    """
@@ -606,7 +604,7 @@ def get_tool_name(sv_ident):
    return sv_ident.split("_")[0]


-def SetSupportingTools(SVSet):
+def set_supporting_tools(SVSet):
    for sv in SVSet:
        tools = {get_tool_name(sv.id)}
        if "DUPLICATES" in sv.record.info:
@@ -642,7 +640,7 @@ def rename_info_field(sv, key, sv_dict):
        sv.record.info[key] = info_newid


-def RenameSV(SVSet):
+def rename_variants(SVSet):
    sv_dict = defaultdict()
    for sv in SVSet:
        new_id = new_id_str(sv)
......