lumpy.py


import logging
import os

import re

from pysam import VariantFile

from svreader import SVRecord, SVReader, SVWriter

logger = logging.getLogger(__name__)
mydir = os.path.dirname(os.path.realpath(__file__))

'''
An ultra lightweight vcf reader for lumpy

INFO:
    SVTYPE         Type of structural variant
    SVLEN          Difference in length between REF and ALT alleles
    END            End position of the variant described in this record
    STRANDS        Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)
    IMPRECISE      Imprecise structural variation
    CIPOS          Confidence interval around POS for imprecise variants
    CIEND          Confidence interval around END for imprecise variants
    CIPOS95        Confidence interval (95%) around POS for imprecise variants
    CIEND95        Confidence interval (95%) around END for imprecise variants
    MATEID         ID of mate breakends
    EVENT          ID of event associated to breakend
    SECONDARY      Secondary breakend in a multi-line variants
    SU             Number of pieces of evidence supporting the variant across all samples
    PE             Number of paired-end reads supporting the variant across all samples
    SR             Number of split reads supporting the variant across all samples
    EV             Type of LUMPY evidence contributing to the variant call
    PRPOS          LUMPY probability curve of the POS breakend
    PREND          LUMPY probability curve of the END breakend

FORMAT:
    GT             Genotype
    SU             Number of pieces of evidence supporting the variant
    PE             Number of paired-end reads supporting the variant
    SR             Number of split reads supporting the variant
    BD             Amount of BED evidence supporting the variant
    RO             Reference allele observation count, with partial observations recorded fractionally
    AO             Alternate allele observations, with partial observations recorded fractionally
    QR             Sum of quality of reference observations
    QA             Sum of quality of alternate observations
    RS             Reference allele split-read observation count, with partial observations recorded fractionally
    AS             Alternate allele split-read observation count, with partial observations recorded fractionally
    RP             Reference allele paired-end observation count, with partial observations recorded fractionally
    AP             Alternate allele paired-end observation count, with partial observations recorded fractionally
    AB             Allele balance, fraction of observations from alternate allele, QA/(QR+QA)


'''

lumpy_name = "lumpy"
lumpy_source = set([lumpy_name])


class LumpyRecord(SVRecord):

    def __init__(self, record, reference_handle=None):
        record.id = "_".join(
            [lumpy_name,
             record.chrom,
             record.info['SVTYPE'],
             record.id])
        super(LumpyRecord, self).__init__(lumpy_name,
                                          record.chrom,
                                          record.pos,
                                          record.stop,
                                          record.id,
                                          ref=record.ref,
                                          qual=record.qual,
                                          filter=record.filter)
        self.__record = record

        sv = {}
        for sample in record.samples:
            sample_name = sample.rsplit('.')[0]
            sv[sample_name] = record.samples[sample]
        self.sv = sv
        # self.__record.info["MAX_IND_SU"] = self.MaxIndSupportingRP()
        self._sv_type = self.__record.info['SVTYPE']
        self.__record.info['VSAMPLES'] = ",".join(self.variantSamples())

    @property
    def svlen(self):
        svlen = 0
        if self.__record.INFO["SVTYPE"] == "BND":
            chrom, end = self. getAltBND()
            svlen = end - self.start + 1
        else:
            svlen = abs(int(self.__record.INFO["SVLEN"][0]))
        return svlen

    @property
    def record(self):
        return self.__record

    @property
    def sr(self):
        return self.__record.info['SR']

    def addbatch2Id(self, batch=None):
        if batch:
            self.__record.id += "_" + batch

    def getAltBND(self):
        ''' extract second breakend chr,pos from ALT field '''
        alt = self.__record.ALT[0]
        m = re.search(r"[\[\]]([\w\.:0-9]+)[\[\]]", str(alt))
        if m:
            return m.group(1).split(":")
        else:
            return 0

    def variantSampleOld(self, sample):
        if (max(self.sv[sample]["AS"]) > 0
                or max(self.sv[sample]["AP"])
                or max(self.sv[sample]["ASC"]) > 0):
            return True
        else:
            return False

    def variantSample(self, sample):
        if self.sv[sample]["SU"] > 0:
            return True
        else:
            return False

    def variantSamples(self):
        variant_samples = []
        for sample in self.sv:
            if self.variantSample(sample):
                variant_samples.append(sample)
        return sorted(variant_samples)

    def MaxIndSupportingRP(self):
        # Among all the individuals returns the support of the individual with
        # the maximum support
        sv = self.sv
        max_ind_supp = 0
        for sample in sv:
            max_ind_supp = max(max_ind_supp, sv[sample]['SU'])
        return max_ind_supp


class LumpyReader(SVReader):
    svs_supported = set(["DEL", "INS", "DUP", "INV"])

    def __init__(self, file_name, reference_handle=None, svs_to_report=None):
        super(LumpyReader, self).__init__(file_name,
                                          lumpy_name,
                                          reference_handle)
        self.vcf_reader = VariantFile(file_name)
        self.vcf_reader.header.info.add(
            'VSAMPLES',
            number='.',
            type='String',
            description='Samples with variant alleles')
        if svs_to_report is not None:
            self.svs_supported &= set(svs_to_report)

    def __iter__(self):
        return self

    def __next__(self):
        while True:
            lumpy_record = next(self.vcf_reader)
            # BND not supported
            if lumpy_record.info['SVTYPE'] == "BND":
                continue
            record = LumpyRecord(lumpy_record, self.reference_handle)
            if record.sv_type in self.svs_supported:
                return record

    def getHeader(self):
        return self.vcf_reader.header

    def AreSamplesSpecified(self):
        return 1

    # def getOrderedSamples(self):
    #     if not hasattr(self.vcf_reader, "samples"):
    #         return []
    #     return self.vcf_reader.samples

    def getOrderedSamples(self):
        samples = self.vcf_reader.header.samples
        sample_names = [sample.rsplit('.')[0] for sample in samples]
        return sample_names

    def SpecificFilterPass(self, record):
        # First try after sensibility analysis with Mathieu
        # return record.sr[0] > 0
        # Filtering criteria more than 4 PE for at least one invididual
        # or sr > 0
        # see
        # http://bcb.io/2014/08/12/validated-whole-genome-structural-variation-detection-using-multiple-callers
        return record.MaxIndSupportingRP() > 4 or record.sr[0] > 0


class LumpyWriter(SVWriter):

    def __init__(self, file_name, reference_contigs, template_reader):
        super(
            LumpyWriter,
            self).__init__(file_name,
                           template_reader.tool_name,
                           template_reader)

    def _open(self):
        self.vcf_writer = VariantFile(
            self.filename,
            'w',
            header=self.template_reader.getHeader())
        self._isopen = True

    def _write(self, record):
        self.vcf_writer.write(record.record)

    def _close(self):
        if self._isopen:
            self.vcf_writer.close()
        else:  # nothing was written
            # Make sure appropriate samples are added to the vcf
            self._dumpemptyvcf()