a more pep8 compliant code

b0069bcf · Thomas Faraut · 28cadf8a · b0069bcf
Commit b0069bcf authored 5 years ago by Thomas Faraut
--- a/svreader/annotation.py
+++ b/svreader/annotation.py
@@ -53,6 +53,7 @@ class AnnotateRecord(VCFRecord):
    """
    A lightweight object to annotated the final records
    """
+
    def __init__(self, record):
        """
        A pysam VariantRecord wrapper
@@ -128,7 +129,7 @@ class AnnotateRecord(VCFRecord):
    def maxGQ(self):
        return max(self.GQ_samples())

-    def setQual(self):
+    def set_qual(self):
        self.record.qual = self.qual()

    def numdiffGenotypes(self):
@@ -138,10 +139,10 @@ class AnnotateRecord(VCFRecord):
                genotypes[coded_geno(s.get('GT'))] = 1
        return len(genotypes.keys())

-    def Polymorph(self):
+    def polymorph(self):
        return self.numdiffGenotypes() > 1

-    def AddSupportInfos(self):
+    def add_supporting_infos(self):
        supp_reads = self.variant_read_support()
        num_supp_samples = self.num_variant_samples()
        try:
@@ -151,25 +152,25 @@ class AnnotateRecord(VCFRecord):
            eprint("SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys")
            sys.exit(1)

-    def CallRate(self, cutoff):
+    def call_rate(self, cutoff):
        call_qual = []
        for s in self.samples.values():
            if s.get('GQ') is not None:
                call_qual.append(s.get('GQ'))
        num_qual_call = sum([(qual > cutoff) for qual in call_qual])
-        return num_qual_call/self.num_samples
+        return num_qual_call / self.num_samples

-    def VariantCallRate(self, cutoff):
+    def variant_call_rate(self, cutoff):
        samples = self.samples.values()
        num_qual_var = 0
        for s in samples:
            if s.get("GQ") is not None and s.get("GQ") > cutoff and Variant(s):
                num_qual_var += 1
        num_var_samples = self.num_variant_samples()
-        var_call_rate = num_qual_var/num_var_samples if num_var_samples else 0
+        var_call_rate = num_qual_var / num_var_samples if num_var_samples else 0
        return var_call_rate

-    def UnifiedPass(self):
+    def unify_pass_filtertag(self):
        """
           All records passing the filters (PASS, .) ar now labelled PASS
        """
@@ -203,17 +204,6 @@ class AnnotateReader(VCFReader):
    def getHeader(self):
        return self.vcf_reader.header

-    # def addInfo(self, name, number, type, description):
-    #     self.vcf_reader.header.info.add(id=name,
-    #                                     number=number,
-    #                                     type=type,
-    #                                     description=description)
-    #
-    # def addFilter(self, name, description):
-    #     self.vcf_reader.header.filters.add(id=name,
-    #                                        number=None,
-    #                                        type=None,
-    #                                        description=description)

    def add_annotation_metadata(self):
        self.addInfo("SOURCEID", 1, "String",
@@ -290,7 +280,7 @@ def probas(likelihoods):

 def getprobas(sample):
    # Transforming likelihods into probabilities
-    return probas(getlikelihoods(sample))/np.sum(probas(getlikelihoods(sample)))
+    return probas(getlikelihoods(sample)) / np.sum(probas(getlikelihoods(sample)))


 def ondiagonal(u_s, v_s):
@@ -301,7 +291,7 @@ def ondiagonal(u_s, v_s):
    q = getprobas(v_s)
    proba = 0
    for a, b in zip(p, q):
-        proba += a*b
+        proba += a * b
    # print("Proba on %3.5f" %(proba))
    return proba

@@ -315,7 +305,7 @@ def offdiagonal(u_s, v_s):
    for i, a in enumerate(p):
        for j, b in enumerate(q):
            if i != j:
-                proba += a*b
+                proba += a * b
    # print("Proba off %3.2f" %(proba))
    return proba

@@ -345,35 +335,41 @@ def duplicatescore(u, v):
        if offdiago > max_disc:
            max_disc = offdiago
    if max_disc > 0 and max_disc < 1:
-        ratio = (1-max_disc)/max_disc
+        ratio = (1 - max_disc) / max_disc
        computed = np.log(ratio)
    return computed


 def gstrength(u):
-    # Sum of phred-like genotype qualities provides a measure of the
-    # combined genotype quality of the site
-    # np.sum([s['GQ'] if s['GQ'] is not None else 0 for s in u.samples.values()])
+    """
+    Sum of phred-like genotype qualities provides a measure of the
+    combined genotype quality of the site
+    np.sum([s['GQ'] if s['GQ'] is not None else 0 for s in u.samples.values()])
+    """
    return u.GQ_sum_score()


 def variantstrength(u):
-    # maximum SQ, where SQ stands for
-    # Phred-scaled probability that this site is variant (non-reference)
-    # in this sample)
-    # QUAL = -10 * log(P(locus is reference in all samples)), which is
-    # equal to the sum of the SQ scores.
-    # see https://github.com/hall-lab/svtyper/issues/10
-    # sum([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
+    """
+    maximum SQ, where SQ stands for
+    Phred-scaled probability that this site is variant (non-reference)
+    in this sample)
+    QUAL = -10 * log(P(locus is reference in all samples)), which is
+    equal to the sum of the SQ scores.
+    see https://github.com/hall-lab/svtyper/issues/10
+    sum([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
+    """
    return u.qual()
    # max([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])


 def getduplicates_GQ(u, v):
-    # select the prefered duplicate on the basis of the
-    # Sum of phred-like genotype qualities
-    # see gstrength
-    # returns prefered, discarded, strength of both
+    """
+    select the prefered duplicate on the basis of the
+    Sum of phred-like genotype qualities
+    see gstrength
+    returns prefered, discarded, strength of both
+    """
    if gstrength(u) > gstrength(v):
        return (u, v, gstrength(u), gstrength(v))
    else:
@@ -381,10 +377,12 @@ def getduplicates_GQ(u, v):


 def getduplicates_QUAL(u, v):
-    # select the prefered duplicate on the basis of
-    # Phred-scaled probability that this site is a variant
-    # see variantstrength
-    # returns prefered, discarded, strength of both
+    """
+    select the prefered duplicate on the basis of
+    Phred-scaled probability that this site is a variant
+    see variantstrength
+    returns prefered, discarded, strength of both
+    """
    if variantstrength(u) > variantstrength(v):
        return (u, v, variantstrength(u), variantstrength(v))
    else:
@@ -393,7 +391,7 @@ def getduplicates_QUAL(u, v):

 def getoverlap(u, osize):
    # percentage overlap given the size of the overlap
-    return 100*osize/u.svlen
+    return 100 * osize / u.svlen


 def add_redundancy_infos_header(reader):
@@ -418,9 +416,8 @@ def add_redundancy_infos_header(reader):
                   "Tools supporting (detecting) the sv")


-def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
-                                       duplicatescore_threshold=-2,
-                                       genotyper="svtyper"):
+def redundancy_annotator(SVSet, reader,
+                          duplicatescore_threshold=-2, genotyper="svtyper"):
    """ Annotating duplicate candidates based on the genotype likelihoods
      - genotype likelihoods can be provided by svtyper or genomestrip
    """
@@ -452,7 +449,7 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader,
        score = duplicatescore(u, v)
        # print("Comparing %s and %s : %3.8f" % (u.id, v.id, score))
        if score > duplicatescore_threshold:
-            ref, dupli, s1, s2 = getduplicates_GQ(u, v)
+            ref, dupli, _, _ = getduplicates_GQ(u, v)
            # print("%s prefered to %s %3.8f" % (ref.id, dupli.id, score))
            reference[ref] = 1
            overlap_size = int(o[-1])
@@ -523,7 +520,7 @@ def add_filter_infos_header(reader):
    reader.addFilter("ABFREQ", "AB frequency <0.3 for >50% heterosamples")


-def GenomeSTRIPLikefiltering(SVSet, reader):
+def variant_filtration(SVSet, reader):
    """ Filtering the candidate CNVs according to the following criteria
          - non duplicate sites
          - variant sites
@@ -539,11 +536,11 @@ def GenomeSTRIPLikefiltering(SVSet, reader):

    for sv in SVSet:
        info = sv.record.info
-        sv.record.info['CALLRATE'] = sv.CallRate(13)
-        sv.record.info['VARIANTCALLRATE'] = sv.VariantCallRate(13)
-        if sv.CallRate(13) < 0.75:
+        sv.record.info['CALLRATE'] = sv.call_rate(13)
+        sv.record.info['VARIANTCALLRATE'] = sv.variant_call_rate(13)
+        if sv.call_rate(13) < 0.75:
            sv.filter.add("CALLRATE")
-        if not sv.Polymorph():
+        if not sv.polymorph():
            sv.filter.add("MONOMORPH")
        if 'NONDUPLICATEOVERLAP' in info and info['NONDUPLICATEOVERLAP'] > 0.7:
            sv.filter.add("OVERLAP")
@@ -551,21 +548,22 @@ def GenomeSTRIPLikefiltering(SVSet, reader):
            sv.filter.add("DUPLICATE")


-def ABFreqFiltering(SVSet):
+def AB_filtering(variant_set):
    """ Filtering the candidate CNVs according to the following criteria
          - more than 50% of variant samples should have AB freq > 0.3
    """

-    for sv in SVSet:
-        ABfreqOK = []
+    for sv in variant_set:
+        valid_AB_freq = []
        for s in sv.record.samples.values():
            if Heterozygote(s):
-                ABfreqOK.append((s.get('AB')[0] > 0.3))
-        if len(ABfreqOK) > 0 and sum(ABfreqOK) < len(ABfreqOK)/2:
+                valid_AB_freq.append((s.get('AB')[0] > 0.3))
+        if (len(valid_AB_freq) > 0 and
+            sum(valid_AB_freq) < len(valid_AB_freq) / 2):
            sv.filter.add("ABFREQ")


-def GetConnectedDuplicates(SVSet):
+def get_connected_duplicates(SVSet):
    """
    Construct connected components of duplicates and rename the variants
    """
@@ -606,7 +604,7 @@ def get_tool_name(sv_ident):
    return sv_ident.split("_")[0]


-def SetSupportingTools(SVSet):
+def set_supporting_tools(SVSet):
    for sv in SVSet:
        tools = {get_tool_name(sv.id)}
        if "DUPLICATES" in sv.record.info:
@@ -642,7 +640,7 @@ def rename_info_field(sv, key, sv_dict):
        sv.record.info[key] = info_newid


-def RenameSV(SVSet):
+def rename_variants(SVSet):
    sv_dict = defaultdict()
    for sv in SVSet:
        new_id = new_id_str(sv)