Commit b0069bcf authored by Thomas Faraut

More PEP 8 compliant code

parent 28cadf8a
@@ -53,6 +53,7 @@ class AnnotateRecord(VCFRecord):
"""
A lightweight object to annotate the final records
"""
def __init__(self, record):
"""
A pysam VariantRecord wrapper
@@ -128,7 +129,7 @@ class AnnotateRecord(VCFRecord):
def maxGQ(self):
return max(self.GQ_samples())
def setQual(self):
def set_qual(self):
self.record.qual = self.qual()
def numdiffGenotypes(self):
@@ -138,10 +139,10 @@ class AnnotateRecord(VCFRecord):
genotypes[coded_geno(s.get('GT'))] = 1
return len(genotypes.keys())
def Polymorph(self):
def polymorph(self):
return self.numdiffGenotypes() > 1
def AddSupportInfos(self):
def add_supporting_infos(self):
supp_reads = self.variant_read_support()
num_supp_samples = self.num_variant_samples()
try:
@@ -151,25 +152,25 @@ class AnnotateRecord(VCFRecord):
eprint("SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys")
sys.exit(1)
def CallRate(self, cutoff):
def call_rate(self, cutoff):
call_qual = []
for s in self.samples.values():
if s.get('GQ') is not None:
call_qual.append(s.get('GQ'))
num_qual_call = sum([(qual > cutoff) for qual in call_qual])
return num_qual_call/self.num_samples
return num_qual_call / self.num_samples
def VariantCallRate(self, cutoff):
def variant_call_rate(self, cutoff):
samples = self.samples.values()
num_qual_var = 0
for s in samples:
if s.get("GQ") is not None and s.get("GQ") > cutoff and Variant(s):
num_qual_var += 1
num_var_samples = self.num_variant_samples()
var_call_rate = num_qual_var/num_var_samples if num_var_samples else 0
var_call_rate = num_qual_var / num_var_samples if num_var_samples else 0
return var_call_rate
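For orientation, a small standalone sketch of what call_rate and variant_call_rate compute; the per-sample GQ values and variant flags below are invented, and plain lists stand in for the pysam sample records.

# Illustrative only: toy GQ values and Variant() outcomes, not real samples.
gq_values = [35, 10, 22, None, 40]             # hypothetical per-sample GQ
is_variant = [True, False, True, False, True]  # hypothetical Variant(s) results
cutoff = 13

called = [gq for gq in gq_values if gq is not None]
call_rate = sum(gq > cutoff for gq in called) / len(gq_values)   # 3 / 5 = 0.6

num_qual_var = sum(gq is not None and gq > cutoff and var
                   for gq, var in zip(gq_values, is_variant))
num_var_samples = sum(is_variant)
variant_call_rate = num_qual_var / num_var_samples if num_var_samples else 0  # 3 / 3 = 1.0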
def UnifiedPass(self):
def unify_pass_filtertag(self):
"""
All records passing the filters (PASS, .) are now labelled PASS
"""
@@ -203,17 +204,6 @@ class AnnotateReader(VCFReader):
def getHeader(self):
return self.vcf_reader.header
# def addInfo(self, name, number, type, description):
# self.vcf_reader.header.info.add(id=name,
# number=number,
# type=type,
# description=description)
#
# def addFilter(self, name, description):
# self.vcf_reader.header.filters.add(id=name,
# number=None,
# type=None,
# description=description)
def add_annotation_metadata(self):
self.addInfo("SOURCEID", 1, "String",
@@ -290,7 +280,7 @@ def probas(likelihoods):
def getprobas(sample):
# Transforming likelihoods into probabilities
return probas(getlikelihoods(sample))/np.sum(probas(getlikelihoods(sample)))
return probas(getlikelihoods(sample)) / np.sum(probas(getlikelihoods(sample)))
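The bodies of getlikelihoods() and probas() fall outside this hunk; assuming svtyper-style log10 genotype likelihoods (GL) and probas() behaving as 10**GL, the normalisation above works roughly like this sketch.

import numpy as np

gl = np.array([-0.2, -2.5, -6.0])  # hypothetical log10 likelihoods for RR, RA, AA
raw = 10.0 ** gl                   # assumed behaviour of probas()
p = raw / np.sum(raw)              # what getprobas() would return, summing to 1
# p ~ [0.995, 0.005, 0.000]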
def ondiagonal(u_s, v_s):
@@ -301,7 +291,7 @@ def ondiagonal(u_s, v_s):
q = getprobas(v_s)
proba = 0
for a, b in zip(p, q):
proba += a*b
proba += a * b
# print("Proba on %3.5f" %(proba))
return proba
@@ -315,7 +305,7 @@ def offdiagonal(u_s, v_s):
for i, a in enumerate(p):
for j, b in enumerate(q):
if i != j:
proba += a*b
proba += a * b
# print("Proba off %3.2f" %(proba))
return proba
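A toy pair of genotype probability vectors makes the two quantities concrete: ondiagonal is the probability that both samples carry the same genotype, offdiagonal the probability that they differ, and the two sum to 1 when each vector is normalised.

p = [0.90, 0.08, 0.02]   # hypothetical getprobas() output for sample u_s
q = [0.85, 0.10, 0.05]   # hypothetical getprobas() output for sample v_s

on = sum(a * b for a, b in zip(p, q))                    # 0.774
off = sum(a * b for i, a in enumerate(p)
                for j, b in enumerate(q) if i != j)      # 0.226 == 1 - on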
@@ -345,35 +335,41 @@ def duplicatescore(u, v):
if offdiago > max_disc:
max_disc = offdiago
if max_disc > 0 and max_disc < 1:
ratio = (1-max_disc)/max_disc
ratio = (1 - max_disc) / max_disc
computed = np.log(ratio)
return computed
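Plugging the toy numbers from the previous sketch into the log-odds above: the larger the score, the more concordant the two call sets, and a pair is treated as duplicate candidates when the score exceeds duplicatescore_threshold (default -2 in redundancy_annotator below).

import numpy as np

max_disc = 0.226                             # largest per-sample offdiagonal() value
score = np.log((1 - max_disc) / max_disc)    # log(0.774 / 0.226) ~ 1.23 > -2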
def gstrength(u):
# Sum of phred-like genotype qualities provides a measure of the
# combined genotype quality of the site
# np.sum([s['GQ'] if s['GQ'] is not None else 0 for s in u.samples.values()])
"""
Sum of phred-like genotype qualities provides a measure of the
combined genotype quality of the site
np.sum([s['GQ'] if s['GQ'] is not None else 0 for s in u.samples.values()])
"""
return u.GQ_sum_score()
def variantstrength(u):
# maximum SQ, where SQ stands for
# Phred-scaled probability that this site is variant (non-reference)
# in this sample)
# QUAL = -10 * log(P(locus is reference in all samples)), which is
# equal to the sum of the SQ scores.
# see https://github.com/hall-lab/svtyper/issues/10
# sum([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
"""
maximum SQ, where SQ stands for
Phred-scaled probability that this site is variant (non-reference)
in this sample
QUAL = -10 * log(P(locus is reference in all samples)), which is
equal to the sum of the SQ scores.
see https://github.com/hall-lab/svtyper/issues/10
sum([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
"""
return u.qual()
# max([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
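A minimal numeric sketch of the two strength measures, with invented per-sample values: gstrength sums the GQ fields, while the record QUAL compared by variantstrength equals the sum of the per-sample SQ values, following the svtyper issue referenced in the docstring.

gq = [30, 0, 45, 12]          # hypothetical per-sample GQ (None counted as 0)
sq = [60.1, 0.0, 80.5, 5.2]   # hypothetical per-sample SQ

gstrength_value = sum(gq)     # 87, what GQ_sum_score() aggregates
qual_value = sum(sq)          # 145.8 == -10 * log10(P(all samples are reference))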
def getduplicates_GQ(u, v):
# select the prefered duplicate on the basis of the
# Sum of phred-like genotype qualities
# see gstrength
# returns prefered, discarded, strength of both
"""
select the preferred duplicate on the basis of the
sum of phred-like genotype qualities
see gstrength
returns preferred, discarded, strength of both
"""
if gstrength(u) > gstrength(v):
return (u, v, gstrength(u), gstrength(v))
else:
@@ -381,10 +377,12 @@ def getduplicates_GQ(u, v):
def getduplicates_QUAL(u, v):
# select the prefered duplicate on the basis of
# Phred-scaled probability that this site is a variant
# see variantstrength
# returns prefered, discarded, strength of both
"""
select the preferred duplicate on the basis of the
Phred-scaled probability that this site is a variant
see variantstrength
returns preferred, discarded, strength of both
"""
if variantstrength(u) > variantstrength(v):
return (u, v, variantstrength(u), variantstrength(v))
else:
@@ -393,7 +391,7 @@ def getduplicates_QUAL(u, v):
def getoverlap(u, osize):
# percentage overlap given the size of the overlap
return 100*osize/u.svlen
return 100 * osize / u.svlen
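For example, with the hypothetical values below a 1.5 kb overlap on a 2 kb variant is reported as 75% of that variant's length.

svlen = 2000          # hypothetical u.svlen
overlap_size = 1500   # osize reported for the overlapping pair
overlap_pct = 100 * overlap_size / svlen   # 75.0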
def add_redundancy_infos_header(reader):
@@ -418,9 +416,8 @@ def add_redundancy_infos_header(reader):
"Tools supporting (detecting) the sv")
def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
duplicatescore_threshold=-2,
genotyper="svtyper"):
def redundancy_annotator(SVSet, reader,
duplicatescore_threshold=-2, genotyper="svtyper"):
""" Annotating duplicate candidates based on the genotype likelihoods
- genotype likelihoods can be provided by svtyper or genomestrip
"""
@@ -452,7 +449,7 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
score = duplicatescore(u, v)
# print("Comparing %s and %s : %3.8f" % (u.id, v.id, score))
if score > duplicatescore_threshold:
ref, dupli, s1, s2 = getduplicates_GQ(u, v)
ref, dupli, _, _ = getduplicates_GQ(u, v)
# print("%s prefered to %s %3.8f" % (ref.id, dupli.id, score))
reference[ref] = 1
overlap_size = int(o[-1])
@@ -523,7 +520,7 @@ def add_filter_infos_header(reader):
reader.addFilter("ABFREQ", "AB frequency <0.3 for >50% heterosamples")
def GenomeSTRIPLikefiltering(SVSet, reader):
def variant_filtration(SVSet, reader):
""" Filtering the candidate CNVs according to the following criteria
- non duplicate sites
- variant sites
@@ -539,11 +536,11 @@ def GenomeSTRIPLikefiltering(SVSet, reader):
for sv in SVSet:
info = sv.record.info
sv.record.info['CALLRATE'] = sv.CallRate(13)
sv.record.info['VARIANTCALLRATE'] = sv.VariantCallRate(13)
if sv.CallRate(13) < 0.75:
sv.record.info['CALLRATE'] = sv.call_rate(13)
sv.record.info['VARIANTCALLRATE'] = sv.variant_call_rate(13)
if sv.call_rate(13) < 0.75:
sv.filter.add("CALLRATE")
if not sv.Polymorph():
if not sv.polymorph():
sv.filter.add("MONOMORPH")
if 'NONDUPLICATEOVERLAP' in info and info['NONDUPLICATEOVERLAP'] > 0.7:
sv.filter.add("OVERLAP")
@@ -551,21 +548,22 @@ def GenomeSTRIPLikefiltering(SVSet, reader):
sv.filter.add("DUPLICATE")
def ABFreqFiltering(SVSet):
def AB_filtering(variant_set):
""" Filtering the candidate CNVs according to the following criteria
- more than 50% of variant samples should have AB freq > 0.3
"""
for sv in SVSet:
ABfreqOK = []
for sv in variant_set:
valid_AB_freq = []
for s in sv.record.samples.values():
if Heterozygote(s):
ABfreqOK.append((s.get('AB')[0] > 0.3))
if len(ABfreqOK) > 0 and sum(ABfreqOK) < len(ABfreqOK)/2:
valid_AB_freq.append((s.get('AB')[0] > 0.3))
if (len(valid_AB_freq) > 0 and
sum(valid_AB_freq) < len(valid_AB_freq) / 2):
sv.filter.add("ABFREQ")
def GetConnectedDuplicates(SVSet):
def get_connected_duplicates(SVSet):
"""
Construct connected components of duplicates and rename the variants
"""
@@ -606,7 +604,7 @@ def get_tool_name(sv_ident):
return sv_ident.split("_")[0]
def SetSupportingTools(SVSet):
def set_supporting_tools(SVSet):
for sv in SVSet:
tools = {get_tool_name(sv.id)}
if "DUPLICATES" in sv.record.info:
@@ -642,7 +640,7 @@ def rename_info_field(sv, key, sv_dict):
sv.record.info[key] = info_newid
def RenameSV(SVSet):
def rename_variants(SVSet):
sv_dict = defaultdict()
for sv in SVSet:
new_id = new_id_str(sv)
......