From b0069bcf24889450188ba0a23156efd8bb021665 Mon Sep 17 00:00:00 2001
From: Thomas Faraut <Thomas.Faraut@inra.fr>
Date: Mon, 20 Jan 2020 15:08:34 +0100
Subject: [PATCH] a more pep8 compliant code

---
 svreader/annotation.py | 116 ++++++++++++++++++++---------------------
 1 file changed, 57 insertions(+), 59 deletions(-)

diff --git a/svreader/annotation.py b/svreader/annotation.py
index 6d2363e..af72a34 100644
--- a/svreader/annotation.py
+++ b/svreader/annotation.py
@@ -53,6 +53,7 @@ class AnnotateRecord(VCFRecord):
     """ A lightweight object to annotated the final records
     """
+
     def __init__(self, record):
         """
         A pysam VariantRecord wrapper
         """
@@ -128,7 +129,7 @@ class AnnotateRecord(VCFRecord):
     def maxGQ(self):
         return max(self.GQ_samples())

-    def setQual(self):
+    def set_qual(self):
         self.record.qual = self.qual()

     def numdiffGenotypes(self):
@@ -138,10 +139,10 @@ class AnnotateRecord(VCFRecord):
             genotypes[coded_geno(s.get('GT'))] = 1
         return len(genotypes.keys())

-    def Polymorph(self):
+    def polymorph(self):
         return self.numdiffGenotypes() > 1

-    def AddSupportInfos(self):
+    def add_supporting_infos(self):
         supp_reads = self.variant_read_support()
         num_supp_samples = self.num_variant_samples()
         try:
@@ -151,25 +152,25 @@ class AnnotateRecord(VCFRecord):
             eprint("SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys")
             sys.exit(1)

-    def CallRate(self, cutoff):
+    def call_rate(self, cutoff):
         call_qual = []
         for s in self.samples.values():
             if s.get('GQ') is not None:
                 call_qual.append(s.get('GQ'))
         num_qual_call = sum([(qual > cutoff) for qual in call_qual])
-        return num_qual_call/self.num_samples
+        return num_qual_call / self.num_samples

-    def VariantCallRate(self, cutoff):
+    def variant_call_rate(self, cutoff):
         samples = self.samples.values()
         num_qual_var = 0
         for s in samples:
             if s.get("GQ") is not None and s.get("GQ") > cutoff and Variant(s):
                 num_qual_var += 1
         num_var_samples = self.num_variant_samples()
-        var_call_rate = num_qual_var/num_var_samples if num_var_samples else 0
+        var_call_rate = num_qual_var / num_var_samples if num_var_samples else 0
         return var_call_rate

-    def UnifiedPass(self):
+    def unify_pass_filtertag(self):
         """ All records passing the filters (PASS, .)
             ar now labelled PASS
         """
@@ -203,17 +204,6 @@ class AnnotateReader(VCFReader):
     def getHeader(self):
         return self.vcf_reader.header

-    # def addInfo(self, name, number, type, description):
-    #     self.vcf_reader.header.info.add(id=name,
-    #                                     number=number,
-    #                                     type=type,
-    #                                     description=description)
-    #
-    # def addFilter(self, name, description):
-    #     self.vcf_reader.header.filters.add(id=name,
-    #                                        number=None,
-    #                                        type=None,
-    #                                        description=description)

     def add_annotation_metadata(self):
         self.addInfo("SOURCEID", 1, "String",
@@ -290,7 +280,7 @@ def probas(likelihoods):

 def getprobas(sample):
     # Transforming likelihods into probabilities
-    return probas(getlikelihoods(sample))/np.sum(probas(getlikelihoods(sample)))
+    return probas(getlikelihoods(sample)) / np.sum(probas(getlikelihoods(sample)))


 def ondiagonal(u_s, v_s):
@@ -301,7 +291,7 @@ def ondiagonal(u_s, v_s):
     q = getprobas(v_s)
     proba = 0
     for a, b in zip(p, q):
-        proba += a*b
+        proba += a * b
     # print("Proba on %3.5f" %(proba))
     return proba

@@ -315,7 +305,7 @@ def offdiagonal(u_s, v_s):
     for i, a in enumerate(p):
         for j, b in enumerate(q):
             if i != j:
-                proba += a*b
+                proba += a * b
     # print("Proba off %3.2f" %(proba))
     return proba

@@ -345,35 +335,41 @@ def duplicatescore(u, v):
         if offdiago > max_disc:
             max_disc = offdiago
     if max_disc > 0 and max_disc < 1:
-        ratio = (1-max_disc)/max_disc
+        ratio = (1 - max_disc) / max_disc
         computed = np.log(ratio)
     return computed


 def gstrength(u):
-    # Sum of phred-like genotype qualities provides a measure of the
-    # combined genotype quality of the site
-    # np.sum([s['GQ'] if s['GQ'] is not None else 0 for s in u.samples.values()])
+    """
+    Sum of phred-like genotype qualities provides a measure of the
+    combined genotype quality of the site
+    np.sum([s['GQ'] if s['GQ'] is not None else 0 for s in u.samples.values()])
+    """
     return u.GQ_sum_score()


 def variantstrength(u):
-    # maximum SQ, where SQ stands for
-    # Phred-scaled probability that this site is variant (non-reference)
-    # in this sample)
-    # QUAL = -10 * log(P(locus is reference in all samples)), which is
-    # equal to the sum of the SQ scores.
-    # see https://github.com/hall-lab/svtyper/issues/10
-    # sum([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
+    """
+    maximum SQ, where SQ stands for
+    Phred-scaled probability that this site is variant (non-reference)
+    in this sample)
+    QUAL = -10 * log(P(locus is reference in all samples)), which is
+    equal to the sum of the SQ scores.
+    see https://github.com/hall-lab/svtyper/issues/10
+    sum([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
+    """
     return u.qual()
     # max([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])


 def getduplicates_GQ(u, v):
-    # select the prefered duplicate on the basis of the
-    # Sum of phred-like genotype qualities
-    # see gstrength
-    # returns prefered, discarded, strength of both
+    """
+    select the prefered duplicate on the basis of the
+    Sum of phred-like genotype qualities
+    see gstrength
+    returns prefered, discarded, strength of both
+    """
     if gstrength(u) > gstrength(v):
         return (u, v, gstrength(u), gstrength(v))
     else:
@@ -381,10 +377,12 @@


 def getduplicates_QUAL(u, v):
-    # select the prefered duplicate on the basis of
-    # Phred-scaled probability that this site is a variant
-    # see variantstrength
-    # returns prefered, discarded, strength of both
+    """
+    select the prefered duplicate on the basis of
+    Phred-scaled probability that this site is a variant
+    see variantstrength
+    returns prefered, discarded, strength of both
+    """
     if variantstrength(u) > variantstrength(v):
         return (u, v, variantstrength(u), variantstrength(v))
     else:
@@ -393,7 +391,7 @@

 def getoverlap(u, osize):
     # percentage overlap given the size of the overlap
-    return 100*osize/u.svlen
+    return 100 * osize / u.svlen


 def add_redundancy_infos_header(reader):
@@ -418,9 +416,8 @@
                    "Tools supporting (detecting) the sv")


-def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
-                                       duplicatescore_threshold=-2,
-                                       genotyper="svtyper"):
+def redundancy_annotator(SVSet, reader,
+                         duplicatescore_threshold=-2, genotyper="svtyper"):
     """ Annotating duplicate candidates based on the genotype likelihoods
         - genotype likelihoods can be provided by svtyper or genomestrip
     """
@@ -452,7 +449,7 @@
         score = duplicatescore(u, v)
         # print("Comparing %s and %s : %3.8f" % (u.id, v.id, score))
         if score > duplicatescore_threshold:
-            ref, dupli, s1, s2 = getduplicates_GQ(u, v)
+            ref, dupli, _, _ = getduplicates_GQ(u, v)
             # print("%s prefered to %s %3.8f" % (ref.id, dupli.id, score))
             reference[ref] = 1
             overlap_size = int(o[-1])
@@ -523,7 +520,7 @@ def add_filter_infos_header(reader):
     reader.addFilter("ABFREQ", "AB frequency <0.3 for >50% heterosamples")


-def GenomeSTRIPLikefiltering(SVSet, reader):
+def variant_filtration(SVSet, reader):
     """ Filtering the candidate CNVs according to the following criteria
         - non duplicate sites
         - variant sites
@@ -539,11 +536,11 @@

     for sv in SVSet:
         info = sv.record.info
-        sv.record.info['CALLRATE'] = sv.CallRate(13)
-        sv.record.info['VARIANTCALLRATE'] = sv.VariantCallRate(13)
-        if sv.CallRate(13) < 0.75:
+        sv.record.info['CALLRATE'] = sv.call_rate(13)
+        sv.record.info['VARIANTCALLRATE'] = sv.variant_call_rate(13)
+        if sv.call_rate(13) < 0.75:
             sv.filter.add("CALLRATE")
-        if not sv.Polymorph():
+        if not sv.polymorph():
             sv.filter.add("MONOMORPH")
         if 'NONDUPLICATEOVERLAP' in info and info['NONDUPLICATEOVERLAP'] > 0.7:
             sv.filter.add("OVERLAP")
@@ -551,21 +548,22 @@
             sv.filter.add("DUPLICATE")


-def ABFreqFiltering(SVSet):
+def AB_filtering(variant_set):
     """ Filtering the candidate CNVs according to the following criteria
         - more than 50% of variant samples should have AB freq > 0.3
     """

-    for sv in SVSet:
-        ABfreqOK = []
+    for sv in variant_set:
+        valid_AB_freq = []
         for s in sv.record.samples.values():
             if Heterozygote(s):
-                ABfreqOK.append((s.get('AB')[0] > 0.3))
-        if len(ABfreqOK) > 0 and sum(ABfreqOK) < len(ABfreqOK)/2:
+                valid_AB_freq.append((s.get('AB')[0] > 0.3))
+        if (len(valid_AB_freq) > 0 and
+                sum(valid_AB_freq) < len(valid_AB_freq) / 2):
             sv.filter.add("ABFREQ")


-def GetConnectedDuplicates(SVSet):
+def get_connected_duplicates(SVSet):
     """
     Construct connected components of duplicates and rename the variants
     """
@@ -606,7 +604,7 @@ def get_tool_name(sv_ident):
     return sv_ident.split("_")[0]


-def SetSupportingTools(SVSet):
+def set_supporting_tools(SVSet):
     for sv in SVSet:
         tools = {get_tool_name(sv.id)}
         if "DUPLICATES" in sv.record.info:
@@ -642,7 +640,7 @@ def rename_info_field(sv, key, sv_dict):
     sv.record.info[key] = info_newid


-def RenameSV(SVSet):
+def rename_variants(SVSet):
     sv_dict = defaultdict()
     for sv in SVSet:
         new_id = new_id_str(sv)
--
GitLab
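
The hunks around ondiagonal(), offdiagonal() and duplicatescore() carry the duplicate test that redundancy_annotator() relies on: per sample, the genotype likelihoods of the two candidate records are turned into probabilities, the chance that the two calls agree (diagonal) or disagree (off-diagonal) is computed, and the worst per-sample discordance is turned into log((1 - max_disc) / max_disc). A minimal standalone sketch of that idea, assuming Phred-scaled likelihood triplets as input (the module's own probas()/getlikelihoods() helpers are not shown in this patch, so the conversion step and the function names below are assumptions, not the module's API):

import numpy as np

def genotype_probabilities(phred_likelihoods):
    # Assumed conversion: Phred-scaled likelihoods -> normalised probabilities.
    lin = np.power(10.0, -np.asarray(phred_likelihoods, dtype=float) / 10.0)
    return lin / lin.sum()

def concordance(p, q):
    # "ondiagonal": probability that the two records carry the same genotype.
    return float(np.sum(p * q))

def discordance(p, q):
    # "offdiagonal": probability that the two records disagree.
    return float(np.outer(p, q).sum() - np.sum(p * q))

def duplicate_score(likelihoods_u, likelihoods_v):
    # Worst per-sample discordance, then log((1 - max_disc) / max_disc):
    # a large positive score means no sample contradicts the duplicate call.
    max_disc = max(
        discordance(genotype_probabilities(pl_u), genotype_probabilities(pl_v))
        for pl_u, pl_v in zip(likelihoods_u, likelihoods_v)
    )
    if 0 < max_disc < 1:
        return float(np.log((1 - max_disc) / max_disc))
    return float("inf") if max_disc == 0 else float("-inf")

# Two records genotyped on the same two samples (hypothetical PL-like triplets
# for 0/0, 0/1, 1/1); similar likelihoods give a clearly positive score.
u = [[0, 30, 200], [120, 0, 20]]
v = [[0, 40, 180], [100, 0, 30]]
print(duplicate_score(u, v))

With the -2 threshold used by redundancy_annotator(), any overlapping pair scoring above it is treated as a duplicate candidate, and getduplicates_GQ() keeps the record with the larger summed genotype quality.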
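
The docstring that replaces the old variantstrength() comments restates the svtyper convention (hall-lab/svtyper issue 10): under the usual Phred scaling, each sample's SQ is -10 * log10 of the probability that the sample is homozygous reference, and the site QUAL equals the sum of the per-sample SQ values because the probability that every sample is reference is the product of the per-sample probabilities. A quick numeric check of that identity, with made-up SQ values:

import math

# Hypothetical per-sample SQ values, SQ_i = -10 * log10(P(sample i is reference)).
sq = [20.0, 13.0, 7.0]

# P(every sample is reference) is the product of the per-sample probabilities,
# so the Phred-scaled site quality is the sum of the per-sample SQ values.
p_all_ref = math.prod(10 ** (-s / 10) for s in sq)
qual = -10 * math.log10(p_all_ref)

print(f"{qual:.1f}")   # 40.0
print(sum(sq))         # 40.0 -> QUAL equals the sum of the SQ scores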
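
variant_filtration() and AB_filtering() apply fixed thresholds that are easy to miss among the renamed calls: a call counts as confident when GQ > 13, a record needs a call rate of at least 0.75, and the ABFREQ tag is added when fewer than half of the heterozygous samples have an allele balance above 0.3. A small sketch of those two checks on plain lists, assuming the GQ and AB values have already been pulled out of the pysam records (the helper names below are illustrative, not part of the module):

def call_rate(gq_values, gq_cutoff=13):
    # CALLRATE criterion: fraction of all samples genotyped with GQ above the
    # cutoff; records with a call rate below 0.75 receive the CALLRATE filter.
    confident = sum(gq is not None and gq > gq_cutoff for gq in gq_values)
    return confident / len(gq_values)

def fails_ab_filter(het_ab_values, min_ab=0.3):
    # ABFREQ criterion: flag the record when fewer than half of its
    # heterozygous samples have an allele balance above min_ab.
    ok = [ab > min_ab for ab in het_ab_values]
    return len(ok) > 0 and sum(ok) < len(ok) / 2

# Ten samples, two with missing genotypes: 6 confident calls -> call rate 0.6.
gqs = [45, 20, 5, 99, 60, None, 14, 3, None, 30]
print(call_rate(gqs))                         # 0.6 -> CALLRATE filter
print(fails_ab_filter([0.45, 0.10, 0.05]))    # True -> ABFREQ filter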