Skip to content
Snippets Groups Projects
annotation.py 20.4 KiB
Newer Older
Thomas Faraut's avatar
Thomas Faraut committed

Thomas Faraut's avatar
Thomas Faraut committed
import sys

from collections import defaultdict
from networkx import Graph, connected_components

import numpy as np
from math import isnan
Thomas Faraut's avatar
Thomas Faraut committed
from pysam import VariantFile

from svrunner_utils import eprint, warn, vcf_to_pybed
from svreader.vcfwrapper import VCFRecord, VCFReader, VCFWriter
Thomas Faraut's avatar
Thomas Faraut committed
HOM_REF = (0, 0)
HET_VAR = (0, 1)
HOM_VAR = (1, 1)

VALID_GENOTYPES = [HOM_REF, HET_VAR,  HOM_VAR]

SVTYPER_GENO_LIKELI_TAG = "GL"
GENOMESTRIP_GENO_LIKELI_TAG = "GP"

Geno_likeli_tag = SVTYPER_GENO_LIKELI_TAG


def coded_geno(geno):
    if geno == HOM_REF:
        return "HOMO_REF"
    elif geno == HET_VAR:
        return "HET_VAR"
    elif geno == HOM_VAR:
        return "HOMO_VAR"
    else:
        return "UNDEF"


def Variant(sample):
    if sample.get('GT') in [HET_VAR, HOM_VAR]:
        return True
    else:
        return False
def Heterozygote(sample):
    if sample.get('GT') in [HET_VAR]:
        return True
    else:
        return False


class AnnotateRecord(VCFRecord):
Thomas Faraut's avatar
Thomas Faraut committed
    """
    A lightweight object to annotated the final records
    """
Thomas Faraut's avatar
Thomas Faraut committed
    def __init__(self, record):
        """
        A pysam VariantRecord wrapper
        """
        super(AnnotateRecord, self).__init__(record)
        self.new_id = None
Thomas Faraut's avatar
Thomas Faraut committed

    @property
    def start(self):
        return self.record.pos

    @property
    def end(self):
        return self.record.stop

Thomas Faraut's avatar
Thomas Faraut committed
    @property
    def svtype(self):
        return self._sv_type

    @property
    def svlen(self):
        return abs(self.stop - self.start)
Thomas Faraut's avatar
Thomas Faraut committed

    @property
    def passed(self):
        if "PASS" in self.record.filter:
            return True
        else:
            return False
    @property
    def num_samples(self):
        return len(self.record.samples.keys())

    def setNewId(self, new_id):
        self.new_id = new_id
Thomas Faraut's avatar
Thomas Faraut committed
        try:
            self.record.info['SOURCEID'] = self.id
        except KeyError:
            eprint("SOURCEID absent from record info keys,")
            sys.exit(1)
Thomas Faraut's avatar
Thomas Faraut committed

    def num_variant_samples(self):
        return sum([Variant(s) for s in self.samples.values()])
Thomas Faraut's avatar
Thomas Faraut committed

    def variant_read_support(self):
        support = []
        for s in self.samples.values():
            if s.get('AO') is not None:
                support.append(s.get('AO')[0])
        return max(support)
Thomas Faraut's avatar
Thomas Faraut committed

    def qual(self):
        variant_qual = []
        for s in self.samples.values():
            if s.get('SQ') is not None:
                variant_qual.append(s.get('SQ'))
        return sum(variant_qual)

    def GQ_samples(self):
        genotype_qual = []
        for s in self.samples.values():
            if s.get('GQ') is not None:
                genotype_qual.append(s.get('GQ'))
        return genotype_qual
        return sum(self.GQ_samples())
        return max(self.GQ_samples())
    def set_qual(self):
Thomas Faraut's avatar
Thomas Faraut committed
        self.record.qual = self.qual()

    def numdiffGenotypes(self):
        genotypes = defaultdict()
        for s in self.samples.values():
            if s.get('GT') in VALID_GENOTYPES:
                genotypes[coded_geno(s.get('GT'))] = 1
        return len(genotypes.keys())

    def polymorph(self):
        return self.numdiffGenotypes() > 1

    def add_supporting_infos(self):
Thomas Faraut's avatar
Thomas Faraut committed
        supp_reads = self.variant_read_support()
        num_supp_samples = self.num_variant_samples()
        #print(supp_reads, num_supp_samples)
Thomas Faraut's avatar
Thomas Faraut committed
        try:
            self.record.info['MAX_SUPP_READS'] = supp_reads
Thomas Faraut's avatar
Thomas Faraut committed
            self.record.info['NUM_SUPP_SAMPLES'] = num_supp_samples
        except KeyError:
            eprint("SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys")
            sys.exit(1)

    def call_rate(self, cutoff):
        call_qual = []
        for s in self.samples.values():
            if s.get('GQ') is not None:
                call_qual.append(s.get('GQ'))
        num_qual_call = sum([(qual > cutoff) for qual in call_qual])
        return num_qual_call / self.num_samples
    def variant_call_rate(self, cutoff):
        samples = self.samples.values()
        num_qual_var = 0
        for s in samples:
            if s.get("GQ") is not None and s.get("GQ") > cutoff and Variant(s):
                num_qual_var += 1
        num_var_samples = self.num_variant_samples()
        var_call_rate = num_qual_var / num_var_samples if num_var_samples else 0
        return var_call_rate
    def unify_pass_filtertag(self):
Thomas Faraut's avatar
Thomas Faraut committed
        """
           All records passing the filters (PASS, .) ar now labelled PASS
        """
        record = self.record
        filters = [f for f in record.filter]
        # We make the assumption when a "." is present no other filter
        # are present
Thomas Faraut's avatar
Thomas Faraut committed
        if not filters or "." in filters:
Thomas Faraut's avatar
Thomas Faraut committed
            record.filter.clear()
            record.filter.add("PASS")
class AnnotateReader(VCFReader):
Thomas Faraut's avatar
Thomas Faraut committed
    def __init__(self, file_name, sv_to_report=None):
        super(AnnotateReader, self).__init__(file_name)
Thomas Faraut's avatar
Thomas Faraut committed
        self.filename = file_name
Thomas Faraut's avatar
Thomas Faraut committed
        self.sv_to_report = sv_to_report
Thomas Faraut's avatar
Thomas Faraut committed
        self.vcf_reader = VariantFile(file_name)
Thomas Faraut's avatar
Thomas Faraut committed

        self.add_annotation_metadata()
Thomas Faraut's avatar
Thomas Faraut committed
    def __iter__(self):
        return self

    def __next__(self):
        while True:
            raw_record = next(self.vcf_reader)
            record = AnnotateRecord(raw_record)
Thomas Faraut's avatar
Thomas Faraut committed
            return record

    def getHeader(self):
        return self.vcf_reader.header

    def add_annotation_metadata(self):
Thomas Faraut's avatar
Thomas Faraut committed
        self.addInfo("SOURCEID", 1, "String",
                     "The source sv identifier")
        self.addInfo("MAX_SUPP_READS", 1, "Integer",
                     "Max number of supporting reads")
Thomas Faraut's avatar
Thomas Faraut committed
        self.addInfo("NUM_SUPP_SAMPLES", 1, "Integer",
                     "Number of supporting samples")
        self.addFilter("LOWSUPPORT",
                       "total supp reads < 5 or supp samples < 2")

Thomas Faraut's avatar
Thomas Faraut committed
    def getOrderedSamples(self):
        samples = self.vcf_reader.header.samples
        sample_names = [sample.rsplit('.')[0] for sample in samples]
        return sample_names

    def numSamples(self):
        return len(self.vcf_reader.header.samples)


class AnnotateWriter(VCFWriter):
Thomas Faraut's avatar
Thomas Faraut committed

    def __init__(self, file_name,  template_reader, index=True):
Thomas Faraut's avatar
Thomas Faraut committed
        super(VCFWriter, self).__init__(file_name,
                                        template_reader.tool_name,
                                        template_reader)

    def _open(self):
        self.vcf_writer = VariantFile(self.filename, 'w',
                                      header=self.template_reader.getHeader())
        self._isopen = True

    def _write(self, record):
        self.vcf_writer.write(record.record)

    def _close(self):
        if self._isopen:
            self.vcf_writer.close()
        else:   # nothing was written
            self._dumpemptyvcf()


def ordered(a, b):
    # simply ordering the two string
    return (a, b) if a < b else (b, a)


def setLikeliTag(genotyper):
    global Geno_likeli_tag

    if genotyper == "svtyper":
        Geno_likeli_tag = SVTYPER_GENO_LIKELI_TAG
        warn("Assuming genotypes provided by svtyper software" +
             " hence tag is %s" % (Geno_likeli_tag))

    elif genotyper == "genomestrip":
        Geno_likeli_tag = GENOMESTRIP_GENO_LIKELI_TAG
        warn("Assuming genotypes provided by genomestrip software" +
             " hence tag is %s" % (Geno_likeli_tag))
    else:
        print("Unknown genotyping software")
        exit(1)


def getlikelihoods(sample):
    return sample.get('GL')


def probas(likelihoods):
    # transform log likelihoods into likelihoods
    return np.exp(np.array(likelihoods))


def getprobas(sample):
    # Transforming likelihods into probabilities
    return probas(getlikelihoods(sample)) / np.sum(probas(getlikelihoods(sample)))


def ondiagonal(u_s, v_s):
    # Probability that, for that individual, the two SVs are identically
    # genotyped P1(0/0)*P2(0/0) + ... P1(1/1)*P2(1/1)
    # in the same way :
    p = getprobas(u_s)
    q = getprobas(v_s)
    proba = 0
    for a, b in zip(p, q):
        proba += a * b
    # print("Proba on %3.5f" %(proba))
    return proba


def offdiagonal(u_s, v_s):
    # Probability that, for that individual, the two SVs are not identically
    # in the same way, complement of the previous one
    p = getprobas(u_s)
    q = getprobas(v_s)
    proba = 0
    for i, a in enumerate(p):
        for j, b in enumerate(q):
            if i != j:
                proba += a * b
    # print("Proba off %3.2f" %(proba))
    return proba


def duplicatescore(u, v):
    # For the two SVs, max discordant genotype log-ratio proba
    # same genotypes against discordant against (worse individual)

    # u_s is the sample of id s of u (idem for v_s)
    # Valid samples
    valid_samples = []
    for s in u.samples:
        u_s = u.samples[s]
        v_s = v.samples[s]
        if (u_s.get('GQ') is not None
                and v_s.get('GQ') is not None):
            valid_samples.append(s)

    max_disc = 0
    computed = float('NaN')
    for s in valid_samples:
        # ondiago is not used, we keep it just for comprehension
        # ondiago = ondiagonal(s, u, v)
        u_s = u.samples[s]
        v_s = v.samples[s]
        offdiago = offdiagonal(u_s, v_s)
        if offdiago > max_disc:
            max_disc = offdiago
    if max_disc > 0 and max_disc < 1:
        ratio = (1 - max_disc) / max_disc
        computed = np.log(ratio)
    return computed


def gstrength(u):
    """
    Sum of phred-like genotype qualities provides a measure of the
    combined genotype quality of the site
    np.sum([s['GQ'] if s['GQ'] is not None else 0 for s in u.samples.values()])
    """
    return u.GQ_sum_score()


def variantstrength(u):
    """
    maximum SQ, where SQ stands for
    Phred-scaled probability that this site is variant (non-reference)
    in this sample)
    QUAL = -10 * log(P(locus is reference in all samples)), which is
    equal to the sum of the SQ scores.
    see https://github.com/hall-lab/svtyper/issues/10
    sum([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
    """
    return u.qual()
    # max([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])


def getduplicates_GQ(u, v):
    """
    select the prefered duplicate on the basis of the
    Sum of phred-like genotype qualities
    see gstrength
    returns prefered, discarded, strength of both
    """
    if gstrength(u) > gstrength(v):
        return (u, v, gstrength(u), gstrength(v))
    else:
        return (v, u, gstrength(v), gstrength(u))


def getduplicates_QUAL(u, v):
    """
    select the prefered duplicate on the basis of
    Phred-scaled probability that this site is a variant
    see variantstrength
    returns prefered, discarded, strength of both
    """
    if variantstrength(u) > variantstrength(v):
        return (u, v, variantstrength(u), variantstrength(v))
    else:
        return (v, u, variantstrength(v), variantstrength(u))


def getoverlap(u, osize):
    # percentage overlap given the size of the overlap
    return 100 * osize / u.svlen


def add_redundancy_infos_header(reader):
    # Adding DUPLICATESCORE info (equivalent to GSDUPLICATESCORE)
    reader.addInfo("DUPLICATESCORE", 1, "Float",
                   "LOD score that the events are distinct based on the "
                   "genotypes of the most discordant sample")
    # Adding DUPLICATES info (equivalent to GSDUPLICATEOVERLAP)
    reader.addInfo("DUPLICATEOVERLAP", 1, "Float",
                   "Highest overlap with a duplicate event")
    # Adding DUPLICATEOF info (list of variant prefered to this one)
    reader.addInfo("DUPLICATEOF", ".", "String",
                   "List of duplicate events preferred to this one")
    # Adding DUPLICATES info (list of variants duplicates of this one)
    reader.addInfo("DUPLICATES", ".", "String",
                   "List of duplicate events represented by the current sv")
    # Adding NONDUPLICATEOVERLAP
    reader.addInfo("NONDUPLICATEOVERLAP", 1, "Float",
                   "Amount of overlap with a non-duplicate")
    # Adding TOOLSUPPORT
    reader.addInfo("TOOLSUPPORT", ".", "String",
                   "Tools supporting (detecting) the sv")
def redundancy_annotator(SVSet, reader,
Thomas Faraut's avatar
Thomas Faraut committed
                         overlap_cutoff,
                         duplicatescore_threshold=-2,
                         genotyper="svtyper"):
    """ Annotating duplicate candidates based on the genotype likelihoods
      - genotype likelihoods can be provided by svtyper or genomestrip
    """

    add_redundancy_infos_header(reader)
    setLikeliTag(genotyper)

    variants = defaultdict()
    for sv in SVSet:
        variants[sv.id] = sv

    pybed_variants = vcf_to_pybed(SVSet)
    self_overlap = pybed_variants.intersect(pybed_variants,
Thomas Faraut's avatar
Thomas Faraut committed
                                            f=overlap_cutoff, r=True, wo=True)

    seen = defaultdict(tuple)
    duplicates = defaultdict(list)
    overlapping = defaultdict(tuple)
    reference = defaultdict()
    for o in self_overlap:
        if o[3] == o[7]:
            continue
        a, b = ordered(o[3], o[7])
        if seen[(a, b)]:
            continue
        seen[(a, b)] = True
        u = variants[a]
        v = variants[b]
        score = duplicatescore(u, v)
        # print("Comparing %s and %s : %3.8f" % (u.id, v.id, score))
        if score > duplicatescore_threshold:
            ref, dupli, _, _ = getduplicates_GQ(u, v)
            # print("%s prefered to %s %3.8f" % (ref.id, dupli.id, score))
            reference[ref] = 1
            overlap_size = int(o[-1])
            overlap = getoverlap(dupli, overlap_size)
            if ref.maxGQ() > 0 and dupli.passed:
                # are dismissed
                # - reference variant with 0 genotype quality for all markers
                # - duplicate that are already tagged as filtered out
                # dupli.id is considered as a duplicate of ref.id
                duplicates[dupli.id].append((ref.id, score, overlap))
        else:
            overlap_size = int(o[-1])
            overlap_u = getoverlap(u, overlap_size)
            overlap_v = getoverlap(v, overlap_size)
            overlapping[(u, v)] = {'dupli_score': score,
                                   'overlap_left': overlap_u,
                                   'overlap_right': overlap_v,
                                   }
    for u in SVSet:
        if u.id in duplicates:
            print("tagged as duplicate %s" % u.id)
            duplis = sorted([a for (a, s, o) in duplicates[u.id]])
            score = max([s for (a, s, o) in duplicates[u.id]])
            overlap = max([o for (a, s, o) in duplicates[u.id]])
            add_duplicate_info_sv(u, overlap, score, duplis)


def add_duplicate_info_sv(sv, duplicateoverlap, duplicatescore, duplicates):
    """
        simply adding two information to the sv infos
    """
    if isnan(duplicatescore):
        sv.record.info['DUPLICATESCORE'] = float('nan')
    else:
        sv.record.info['DUPLICATESCORE'] = duplicatescore
    sv.record.info['DUPLICATEOVERLAP'] = duplicateoverlap
    sv.record.info['DUPLICATEOF'] = duplicates


def add_overlap_info_sv(sv, overlap, duplicatescore):
    """
        simply adding two information to the sv infos
    """
    if isnan(duplicatescore):
        sv.record.info['DUPLICATESCORE'] = float('nan')
    else:
        sv.record.info['DUPLICATESCORE'] = duplicatescore
    sv.record.info['NONDUPLICATEOVERLAP'] = overlap


def add_callrate_infos_header(reader):
    # Adding CALLRATE info
    reader.addInfo("CALLRATE", 1, "Float",
                   "Percentage of samples called with a GQ>13")
    # Adding VARIANTCALLRATE info
    reader.addInfo("VARIANTCALLRATE", 1, "Float",
                   "Percentage of variant samples called with a GQ>13")


def add_filter_infos_header(reader):
    # FILTERS
    # Adding specific filters
    reader.addFilter("CALLRATE", "Call rate <0.75")
    reader.addFilter("VARIANTCALLRATE", "Variant Call rate <0.75")
    reader.addFilter("MONOMORPH", "All samples have the same genotype")
    reader.addFilter("DUPLICATE", "GSDUPLICATESCORE>0")
    reader.addFilter("OVERLAP", "NONDUPLICATEOVERLAP>0.7")
    reader.addFilter("ABFREQ", "AB frequency <0.3 for >50% heterosamples")
Thomas Faraut's avatar
Thomas Faraut committed
def variant_filtration(variant_set, reader, filter_monomorph=False,
                       filter_callrate=False):
    """ Filtering the candidate CNVs according to the following criteria
          - non duplicate sites
          - variant sites
          - call rate > 0.8
          - at least one variant (homozygous or heterozygote) has a genotype
            quality > 20
          - the variant is not everywhere heterozygote or homozygote
            (use NONVARIANTSCORE in both cases)
    """

    add_callrate_infos_header(reader)
    add_filter_infos_header(reader)

Thomas Faraut's avatar
Thomas Faraut committed
    for sv in variant_set:
        info = sv.record.info
        sv.record.info['CALLRATE'] = sv.call_rate(13)
        sv.record.info['VARIANTCALLRATE'] = sv.variant_call_rate(13)
        if sv.call_rate(13) < 0.75 and filter_callrate:
            sv.filter.add("CALLRATE")
        if not sv.polymorph() and filter_monomorph:
            sv.filter.add("MONOMORPH")
        if 'NONDUPLICATEOVERLAP' in info and info['NONDUPLICATEOVERLAP'] > 0.8:
            sv.filter.add("OVERLAP")
        if "DUPLICATESCORE" in info is not None and info['DUPLICATESCORE'] > -2:
            sv.filter.add("DUPLICATE")
def AB_filtering(variant_set):
    """ Filtering the candidate CNVs according to the following criteria
          - more than 50% of variant samples should have AB freq > 0.3
    """

    for sv in variant_set:
        valid_AB_freq = []
        for s in sv.record.samples.values():
            if Heterozygote(s):
                valid_AB_freq.append((s.get('AB')[0] > 0.3))
        if (len(valid_AB_freq) > 0 and
                sum(valid_AB_freq) < len(valid_AB_freq) / 2):
            sv.filter.add("ABFREQ")


Thomas Faraut's avatar
Thomas Faraut committed
def get_connected_duplicates(variant_set):
    """
    Construct connected components of duplicates and rename the variants
    """
    undirected = Graph()
    variant_dict = defaultdict()
    representatives = defaultdict()
Thomas Faraut's avatar
Thomas Faraut committed
    for s in variant_set:
        variant_dict[s.id] = s
        if "DUPLICATE" in s.filter:
            for dupli_repr in s.record.info["DUPLICATEOF"]:
                undirected.add_edge(s.id, dupli_repr)
    for component in connected_components(undirected):
        for c in component:
            if "DUPLICATEOF" in variant_dict[c].record.info:
                # the current variant is a duplicate
                continue
            rep = c  # the representative of the equivalence class
            break
        duplicates = component
        duplicates.remove(rep)
        representatives[rep] = duplicates
    add_duplicate_infos(representatives, variant_dict)


def add_duplicate_infos(representatives, sv_dict):
    for rep, elements in representatives.items():
        for d in elements:
            sv_dict[d].record.info['DUPLICATEOF'] = rep
        duplicates = list(elements)
        if 'DUPLICATES' in sv_dict[rep].record.info:
            print(sv_dict[rep].record.info['DUPLICATES'])
            duplicates.extend(sv_dict[rep].record.info['DUPLICATES'])
        if len(duplicates) > 0:
            sv_dict[rep].record.info['DUPLICATES'] = duplicates


def get_tool_name(sv_ident):
    return sv_ident.split("_")[0]


Thomas Faraut's avatar
Thomas Faraut committed
def set_supporting_tools(variant_set):
    for sv in variant_set:
        tools = {get_tool_name(sv.id)}
        if "DUPLICATES" in sv.record.info:
            duplicates = sv.record.info['DUPLICATES']
            # print(duplicates)
            for dupli in duplicates:
                tools.add(get_tool_name(dupli))
        if 'TOOLSUPPORT' in sv.record.info:
            supporting = set(sv.record.info['TOOLSUPPORT'])
Thomas Faraut's avatar
Thomas Faraut committed
            tools = tools.union(supporting)
        sv.record.info['TOOLSUPPORT'] = list(tools)


def static_vars(**kwargs):
    def decorate(func):
        for k in kwargs:
            setattr(func, k, kwargs[k])
        return func
    return decorate


@static_vars(counter=0)
def new_id_str(sv):
    new_id_str.counter += 1
    return "_".join(["cnvpipeline", sv.chrom, sv.svtype,
                     str(new_id_str.counter)])


def rename_info_field(sv, key, sv_dict):
    if key in sv.record.info:
        info_oldid = sv.record.info[key]
        info_newid = [sv_dict[id] for id in info_oldid]
        sv.record.info[key] = info_newid


def rename_variants(SVSet):
    sv_dict = defaultdict()
    for sv in SVSet:
        new_id = new_id_str(sv)
        sv.setNewId(new_id)
        sv_dict[sv.id] = new_id
    for sv in SVSet:
        rename_info_field(sv, "DUPLICATEOF", sv_dict)
        rename_info_field(sv, "DUPLICATES", sv_dict)
        sv.record.info['SOURCEID'] = sv.id
        sv.rename()