From b0069bcf24889450188ba0a23156efd8bb021665 Mon Sep 17 00:00:00 2001
From: Thomas Faraut <Thomas.Faraut@inra.fr>
Date: Mon, 20 Jan 2020 15:08:34 +0100
Subject: [PATCH] a more pep8 compliant code

---
 svreader/annotation.py | 116 ++++++++++++++++++++---------------------
 1 file changed, 57 insertions(+), 59 deletions(-)

diff --git a/svreader/annotation.py b/svreader/annotation.py
index 6d2363e..af72a34 100644
--- a/svreader/annotation.py
+++ b/svreader/annotation.py
@@ -53,6 +53,7 @@ class AnnotateRecord(VCFRecord):
     """ A lightweight object to annotated the final records
     """
+
     def __init__(self, record):
         """
         A pysam VariantRecord wrapper
         """
@@ -128,7 +129,7 @@ class AnnotateRecord(VCFRecord):
     def maxGQ(self):
         return max(self.GQ_samples())

-    def setQual(self):
+    def set_qual(self):
         self.record.qual = self.qual()

     def numdiffGenotypes(self):
@@ -138,10 +139,10 @@ class AnnotateRecord(VCFRecord):
             genotypes[coded_geno(s.get('GT'))] = 1
         return len(genotypes.keys())

-    def Polymorph(self):
+    def polymorph(self):
         return self.numdiffGenotypes() > 1

-    def AddSupportInfos(self):
+    def add_supporting_infos(self):
         supp_reads = self.variant_read_support()
         num_supp_samples = self.num_variant_samples()
         try:
@@ -151,25 +152,25 @@ class AnnotateRecord(VCFRecord):
             eprint("SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys")
             sys.exit(1)

-    def CallRate(self, cutoff):
+    def call_rate(self, cutoff):
         call_qual = []
         for s in self.samples.values():
             if s.get('GQ') is not None:
                 call_qual.append(s.get('GQ'))
         num_qual_call = sum([(qual > cutoff) for qual in call_qual])
-        return num_qual_call/self.num_samples
+        return num_qual_call / self.num_samples

-    def VariantCallRate(self, cutoff):
+    def variant_call_rate(self, cutoff):
         samples = self.samples.values()
         num_qual_var = 0
         for s in samples:
             if s.get("GQ") is not None and s.get("GQ") > cutoff and Variant(s):
                 num_qual_var += 1
         num_var_samples = self.num_variant_samples()
-        var_call_rate = num_qual_var/num_var_samples if num_var_samples else 0
+        var_call_rate = num_qual_var / num_var_samples if num_var_samples else 0
         return var_call_rate

-    def UnifiedPass(self):
+    def unify_pass_filtertag(self):
         """ All records passing the filters (PASS, .)
             ar now labelled PASS
         """
@@ -203,17 +204,6 @@ class AnnotateReader(VCFReader):
     def getHeader(self):
         return self.vcf_reader.header

-    # def addInfo(self, name, number, type, description):
-    #     self.vcf_reader.header.info.add(id=name,
-    #                                     number=number,
-    #                                     type=type,
-    #                                     description=description)
-    #
-    # def addFilter(self, name, description):
-    #     self.vcf_reader.header.filters.add(id=name,
-    #                                        number=None,
-    #                                        type=None,
-    #                                        description=description)

     def add_annotation_metadata(self):
         self.addInfo("SOURCEID", 1, "String",
@@ -290,7 +280,7 @@ def probas(likelihoods):

 def getprobas(sample):
     # Transforming likelihods into probabilities
-    return probas(getlikelihoods(sample))/np.sum(probas(getlikelihoods(sample)))
+    return probas(getlikelihoods(sample)) / np.sum(probas(getlikelihoods(sample)))


 def ondiagonal(u_s, v_s):
@@ -301,7 +291,7 @@ def ondiagonal(u_s, v_s):
     q = getprobas(v_s)
     proba = 0
     for a, b in zip(p, q):
-        proba += a*b
+        proba += a * b
     # print("Proba on %3.5f" %(proba))
     return proba

@@ -315,7 +305,7 @@ def offdiagonal(u_s, v_s):
     for i, a in enumerate(p):
         for j, b in enumerate(q):
             if i != j:
-                proba += a*b
+                proba += a * b
     # print("Proba off %3.2f" %(proba))
     return proba

@@ -345,35 +335,41 @@ def duplicatescore(u, v):
         if offdiago > max_disc:
             max_disc = offdiago
     if max_disc > 0 and max_disc < 1:
-        ratio = (1-max_disc)/max_disc
+        ratio = (1 - max_disc) / max_disc
         computed = np.log(ratio)
     return computed


 def gstrength(u):
-    # Sum of phred-like genotype qualities provides a measure of the
-    # combined genotype quality of the site
-    # np.sum([s['GQ'] if s['GQ'] is not None else 0 for s in u.samples.values()])
+    """
+    Sum of phred-like genotype qualities provides a measure of the
+    combined genotype quality of the site
+    np.sum([s['GQ'] if s['GQ'] is not None else 0 for s in u.samples.values()])
+    """
     return u.GQ_sum_score()


 def variantstrength(u):
-    # maximum SQ, where SQ stands for
-    # Phred-scaled probability that this site is variant (non-reference)
-    # in this sample)
-    # QUAL = -10 * log(P(locus is reference in all samples)), which is
-    # equal to the sum of the SQ scores.
-    # see https://github.com/hall-lab/svtyper/issues/10
-    # sum([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
+    """
+    maximum SQ, where SQ stands for
+    Phred-scaled probability that this site is variant (non-reference)
+    in this sample)
+    QUAL = -10 * log(P(locus is reference in all samples)), which is
+    equal to the sum of the SQ scores.
+    see https://github.com/hall-lab/svtyper/issues/10
+    sum([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
+    """
     return u.qual()
     # max([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])


 def getduplicates_GQ(u, v):
-    # select the prefered duplicate on the basis of the
-    # Sum of phred-like genotype qualities
-    # see gstrength
-    # returns prefered, discarded, strength of both
+    """
+    select the prefered duplicate on the basis of the
+    Sum of phred-like genotype qualities
+    see gstrength
+    returns prefered, discarded, strength of both
+    """
     if gstrength(u) > gstrength(v):
         return (u, v, gstrength(u), gstrength(v))
     else:
@@ -381,10 +377,12 @@


 def getduplicates_QUAL(u, v):
-    # select the prefered duplicate on the basis of
-    # Phred-scaled probability that this site is a variant
-    # see variantstrength
-    # returns prefered, discarded, strength of both
+    """
+    select the prefered duplicate on the basis of
+    Phred-scaled probability that this site is a variant
+    see variantstrength
+    returns prefered, discarded, strength of both
+    """
     if variantstrength(u) > variantstrength(v):
         return (u, v, variantstrength(u), variantstrength(v))
     else:
@@ -393,7 +391,7 @@

 def getoverlap(u, osize):
     # percentage overlap given the size of the overlap
-    return 100*osize/u.svlen
+    return 100 * osize / u.svlen


 def add_redundancy_infos_header(reader):
@@ -418,9 +416,8 @@
                    "Tools supporting (detecting) the sv")


-def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
-                                       duplicatescore_threshold=-2,
-                                       genotyper="svtyper"):
+def redundancy_annotator(SVSet, reader,
+                         duplicatescore_threshold=-2, genotyper="svtyper"):
     """ Annotating duplicate candidates based on the genotype likelihoods
         - genotype likelihoods can be provided by svtyper or genomestrip
     """
@@ -452,7 +449,7 @@
         score = duplicatescore(u, v)
         # print("Comparing %s and %s : %3.8f" % (u.id, v.id, score))
         if score > duplicatescore_threshold:
-            ref, dupli, s1, s2 = getduplicates_GQ(u, v)
+            ref, dupli, _, _ = getduplicates_GQ(u, v)
             # print("%s prefered to %s %3.8f" % (ref.id, dupli.id, score))
             reference[ref] = 1
             overlap_size = int(o[-1])
@@ -523,7 +520,7 @@ def add_filter_infos_header(reader):
     reader.addFilter("ABFREQ", "AB frequency <0.3 for >50% heterosamples")


-def GenomeSTRIPLikefiltering(SVSet, reader):
+def variant_filtration(SVSet, reader):
     """ Filtering the candidate CNVs according to the following criteria
         - non duplicate sites
         - variant sites
@@ -539,11 +536,11 @@

     for sv in SVSet:
         info = sv.record.info
-        sv.record.info['CALLRATE'] = sv.CallRate(13)
-        sv.record.info['VARIANTCALLRATE'] = sv.VariantCallRate(13)
-        if sv.CallRate(13) < 0.75:
+        sv.record.info['CALLRATE'] = sv.call_rate(13)
+        sv.record.info['VARIANTCALLRATE'] = sv.variant_call_rate(13)
+        if sv.call_rate(13) < 0.75:
             sv.filter.add("CALLRATE")
-        if not sv.Polymorph():
+        if not sv.polymorph():
             sv.filter.add("MONOMORPH")
         if 'NONDUPLICATEOVERLAP' in info and info['NONDUPLICATEOVERLAP'] > 0.7:
             sv.filter.add("OVERLAP")
@@ -551,21 +548,22 @@
             sv.filter.add("DUPLICATE")


-def ABFreqFiltering(SVSet):
+def AB_filtering(variant_set):
     """ Filtering the candidate CNVs according to the following criteria
         - more than 50% of variant samples should have AB freq > 0.3
     """

-    for sv in SVSet:
-        ABfreqOK = []
+    for sv in variant_set:
+        valid_AB_freq = []
         for s in sv.record.samples.values():
             if Heterozygote(s):
-                ABfreqOK.append((s.get('AB')[0] > 0.3))
-        if len(ABfreqOK) > 0 and sum(ABfreqOK) < len(ABfreqOK)/2:
+                valid_AB_freq.append((s.get('AB')[0] > 0.3))
+        if (len(valid_AB_freq) > 0 and
+                sum(valid_AB_freq) < len(valid_AB_freq) / 2):
             sv.filter.add("ABFREQ")


-def GetConnectedDuplicates(SVSet):
+def get_connected_duplicates(SVSet):
     """
     Construct connected components of duplicates and rename the variants
     """
@@ -606,7 +604,7 @@ def get_tool_name(sv_ident):
     return sv_ident.split("_")[0]


-def SetSupportingTools(SVSet):
+def set_supporting_tools(SVSet):
     for sv in SVSet:
         tools = {get_tool_name(sv.id)}
         if "DUPLICATES" in sv.record.info:
@@ -642,7 +640,7 @@ def rename_info_field(sv, key, sv_dict):
     sv.record.info[key] = info_newid


-def RenameSV(SVSet):
+def rename_variants(SVSet):
     sv_dict = defaultdict()
     for sv in SVSet:
         new_id = new_id_str(sv)
--
GitLab
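
The hunks around ondiagonal(), offdiagonal() and duplicatescore() carry the duplicate test that redundancy_annotator() relies on: per sample, the genotype likelihoods of the two candidate records are turned into probabilities, the chance that the two calls agree (diagonal) or disagree (off-diagonal) is computed, and the worst per-sample discordance is turned into log((1 - max_disc) / max_disc). A minimal standalone sketch of that idea, assuming Phred-scaled likelihood triplets as input (the module's own probas()/getlikelihoods() helpers are not shown in this patch, so the conversion step and the function names below are assumptions, not the module's API):

import numpy as np

def genotype_probabilities(phred_likelihoods):
    # Assumed conversion: Phred-scaled likelihoods -> normalised probabilities.
    lin = np.power(10.0, -np.asarray(phred_likelihoods, dtype=float) / 10.0)
    return lin / lin.sum()

def concordance(p, q):
    # "ondiagonal": probability that the two records carry the same genotype.
    return float(np.sum(p * q))

def discordance(p, q):
    # "offdiagonal": probability that the two records disagree.
    return float(np.outer(p, q).sum() - np.sum(p * q))

def duplicate_score(likelihoods_u, likelihoods_v):
    # Worst per-sample discordance, then log((1 - max_disc) / max_disc):
    # a large positive score means no sample contradicts the duplicate call.
    max_disc = max(
        discordance(genotype_probabilities(pl_u), genotype_probabilities(pl_v))
        for pl_u, pl_v in zip(likelihoods_u, likelihoods_v)
    )
    if 0 < max_disc < 1:
        return float(np.log((1 - max_disc) / max_disc))
    return float("inf") if max_disc == 0 else float("-inf")

# Two records genotyped on the same two samples (hypothetical PL-like triplets
# for 0/0, 0/1, 1/1); similar likelihoods give a clearly positive score.
u = [[0, 30, 200], [120, 0, 20]]
v = [[0, 40, 180], [100, 0, 30]]
print(duplicate_score(u, v))

With the -2 threshold used by redundancy_annotator(), any overlapping pair scoring above it is treated as a duplicate candidate, and getduplicates_GQ() keeps the record with the larger summed genotype quality.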
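
The docstring that replaces the old variantstrength() comments restates the svtyper convention (hall-lab/svtyper issue 10): under the usual Phred scaling, each sample's SQ is -10 * log10 of the probability that the sample is homozygous reference, and the site QUAL equals the sum of the per-sample SQ values because the probability that every sample is reference is the product of the per-sample probabilities. A quick numeric check of that identity, with made-up SQ values:

import math

# Hypothetical per-sample SQ values, SQ_i = -10 * log10(P(sample i is reference)).
sq = [20.0, 13.0, 7.0]

# P(every sample is reference) is the product of the per-sample probabilities,
# so the Phred-scaled site quality is the sum of the per-sample SQ values.
p_all_ref = math.prod(10 ** (-s / 10) for s in sq)
qual = -10 * math.log10(p_all_ref)

print(f"{qual:.1f}")   # 40.0
print(sum(sq))         # 40.0 -> QUAL equals the sum of the SQ scores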
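
variant_filtration() and AB_filtering() apply fixed thresholds that are easy to miss among the renamed calls: a call counts as confident when GQ > 13, a record needs a call rate of at least 0.75, and the ABFREQ tag is added when fewer than half of the heterozygous samples have an allele balance above 0.3. A small sketch of those two checks on plain lists, assuming the GQ and AB values have already been pulled out of the pysam records (the helper names below are illustrative, not part of the module):

def call_rate(gq_values, gq_cutoff=13):
    # CALLRATE criterion: fraction of all samples genotyped with GQ above the
    # cutoff; records with a call rate below 0.75 receive the CALLRATE filter.
    confident = sum(gq is not None and gq > gq_cutoff for gq in gq_values)
    return confident / len(gq_values)

def fails_ab_filter(het_ab_values, min_ab=0.3):
    # ABFREQ criterion: flag the record when fewer than half of its
    # heterozygous samples have an allele balance above min_ab.
    ok = [ab > min_ab for ab in het_ab_values]
    return len(ok) > 0 and sum(ok) < len(ok) / 2

# Ten samples, two with missing genotypes: 6 confident calls -> call rate 0.6.
gqs = [45, 20, 5, 99, 60, None, 14, 3, None, 30]
print(call_rate(gqs))                         # 0.6 -> CALLRATE filter
print(fails_ab_filter([0.45, 0.10, 0.05]))    # True -> ABFREQ filter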