delly.py

import logging
import os


from collections import defaultdict

from pysam import VariantFile

from svreader import SVRecord, SVReader, SVWriter
from svrunner_utils import eprint, vcf_to_pybed

logger = logging.getLogger(__name__)

mydir = os.path.dirname(os.path.realpath(__file__))

'''
Delly output is a vcf file, fields are described in the header

#using awk
cat vcf |
awk  '/##FORMAT/{ last=index($0,","); id=index($0,"ID");  \
     desc=substr($0,match($0,"Description")+12) ; gsub(/"/, "", desc); \
     printf "%-30s %s\n", substr($0,id+3,last-id-3), \
     substr(desc,0,length(desc)-1)}'

FILTER
    PASS         All filters passed
    LowQual      PE/SR support below 3 or mapping quality below 20.

INFO
    CIEND        PE confidence interval around END
    CIPOS        PE confidence interval around POS
    CHR2         Chromosome for END coordinate in case of a translocation
    END          End position of the structural variant
    PE           Paired-end support of the structural variant
    MAPQ         Median mapping quality of paired-ends
    SR           Split-read supportSpecificFilterPass
    SRQ          Split-read consensus alignment quality
    CONSENSUS    Split-read consensus sequence
    CE           Consensus sequence entropy
    CT           Paired-end signature induced connection type
    IMPRECISE    Imprecise structural variation
    PRECISE      Precise structural variation
    SVTYPE       Type of structural variant
    SVMETHOD     Type of approach used to detect SV
    INSLEN       Predicted length of the insertion
    HOMLEN       Predicted microhomology length using a max. edit distance of 2
    RDRATIO      Read-depth ratio of het. SV carrier vs. non-carrier.

FORMAT
    GT           Genotype
    GL           Log10-scaled genotype likelihoods for RR,RA,AA genotypes
    GQ           Genotype Quality
    FT           Per-sample genotype filter
    RC           Raw high-quality read counts for the SV
    RCL          Raw high-quality read counts for the left control region
    RCR          Raw high-quality read counts for the right control region
    CN           Read-depth based copy-number estimate for autosomal sites
    DR           high-quality reference pairs
    DV           high-quality variant pairs
    RR           high-quality reference junction reads
    RV           high-quality variant junction reads


'''

delly_name = "delly"
delly_source = set([delly_name])


class DellyRecord(SVRecord):

    def __init__(self, record, sample_order, reference_handle=None):

        # Custom id
        record.id = "_".join([delly_name, record.chrom, record.id])
        super(DellyRecord, self).__init__(delly_name,
                                          record.chrom,
                                          record.pos,
                                          record.stop,
                                          record.id,
                                          ref=record.ref,
                                          qual=record.qual,
                                          filter=record.filter)

        self.__record = record
        self._sv_type = record.info["SVTYPE"]
        self.__pe = record.info["PE"]
        self.__sr = record.info["SR"] if "SR" in list(record.info.keys()) else 0
        self.__ct = record.info["CT"]
        self.__record.info['SR'] = self.__sr

        sv = {}
        for sample in record.samples:
            sample_name = sample.rsplit('.')[0]
            sv[sample_name] = record.samples[sample]
        self.sv = sv
        self.__record.info['VSAMPLES'] = ",".join(self.variantSamples())

    @property
    def record(self):
        return self.__record

    @property
    def start(self):
        return self.record.pos

    @property
    def end(self):
        return self.record.stop

    @property
    def pe(self):
        return self.__pe

    @property
    def sr(self):
        return self.__sr

    @property
    def ct(self):
        return self.__ct

    def update(self, start, end, pe_supp):
        self.record.pos = start
        self.record.stop = end
        self.record.info['PE'] = pe_supp

    def variantSample(self, sample):
        if self.sv[sample]["DV"] > 0 or self.sv[sample]["RV"] > 0:
            return True
        else:
            return False

    def variantSamples(self):
        variant_samples = []
        for sample in self.sv:
            if self.variantSample(sample):
                variant_samples.append(sample)
        return sorted(variant_samples)

    def MaxIndSupportingRP(self):
        ''' Maximum number of supporting PE among individuals.
        '''
        max_ind_supp = 0
        su = 0
        rep_sample = ""
        for sample in self.sv:
            dv = self.sv[sample]["DV"]
            su += dv
            if dv > max_ind_supp:
                rep_sample = sample
                max_ind_supp = dv
        if max_ind_supp == 0:
            ratio = 0
        else:
            total = self.sv[rep_sample]["DV"] + self.sv[rep_sample]["DR"]
            ratio = float(self.sv[rep_sample]["DV"])/total
        return su, max_ind_supp, ratio

    def to_svcf_record(self):
        return self.record


class DellyReader(SVReader):
    svs_supported = set(["DEL", "INS", "DUP", "INV"])

    def __init__(self, file_name, reference_handle=None, svs_to_report=None):
        super(DellyReader, self).__init__(file_name,
                                          delly_name,
                                          reference_handle)
        self.vcf_reader = VariantFile(file_name)
        if 'VSAMPLES' not in self.vcf_reader.header.info.keys():
            self.vcf_reader.header.info.add('VSAMPLES',
                                            number='.',
                                            type='String',
                                            description='Samples with variant alleles')
        self.svs_supported = DellyReader.svs_supported
        if svs_to_report is not None:
            self.svs_supported &= set(svs_to_report)

    def __iter__(self):
        return self

    def __next__(self):
        while True:
            delly_record = next(self.vcf_reader)
            record = DellyRecord(delly_record, self.reference_handle)
            if record.sv_type in self.svs_supported:
                return record

    def getHeader(self):
        return self.vcf_reader.header

    def getOrderedSamples(self):
        samples = self.vcf_reader.header.samples
        sample_names = [sample.rsplit('.')[0] for sample in samples]
        return sample_names

    def SpecificFilterPass(self, record):
        # Filtering criteria more than 5 PE or more than SR and PE > 0
        # see http://bcb.io/2014/08/12/validated-whole-genome-structural-variation-detection-using-multiple-callers
        # TODO : should we modifiy this in order to account for individual
        #       specific filtering (e.g pe > 5 at least for an individual)
        return record.pe > 5 or record.sr > 0

    def bnd_merge(sef, svtype, records):
        """
            Merging delly two lines inversion format according to the
            CT info field (distal connection)
              3to3 tail-to-tail
              5to5 head-to-head
            CT types are separated and for each 3to3, the corresponding
            5to5 is searched for (bedtools intersect with >90% overlap)
            A single line corresponding to the intersection of both
            signature is kept
        """
        if svtype != "INV":
            return records
        # a dictionnary of records with id as key
        records_dict = defaultdict()
        records_CT = defaultdict(list)
        for r in records:
            records_dict[r.id] = r
            records_CT[r.ct].append(r)

        eprint("Converting to bedtools intervals")
        tail2tail = vcf_to_pybed(records_CT["3to3"])
        head2head = vcf_to_pybed(records_CT["5to5"])

        # TODO check if the following code really recover balanced inversions

        balanced = tail2tail.intersect(head2head, r=True, f=0.95, wo=True)

        confirmed = []
        seen = defaultdict()
        for b in balanced:
            start = int(b[5])  # tail
            end = int(b[2])  # head
            t2t = records_dict[b[3]]
            h2h = records_dict[b[7]]
            repre = t2t
            if repre in seen:
                continue
            seen[repre] = 1
            pe_supp = t2t.record.info['PE'] + h2h.record.info['PE']
            print("merging %s and %s" % (t2t, h2h))
            repre.update(start, end, pe_supp)
            confirmed.append(repre)
        return confirmed


class DellyWriter(SVWriter):

    def __init__(self, file_name, reference_contigs, template_reader):
        super(DellyWriter, self).__init__(file_name,
                                          template_reader.tool_name,
                                          template_reader)

    def _open(self):
        self.vcf_writer = VariantFile(self.filename, 'w',
                                      header=self.template_reader.getHeader())
        self._isopen = True

    def _write(self, record):
        self.vcf_writer.write(record.record)

    def _close(self):
        if self._isopen:
            self.vcf_writer.close()
        else:  # nothing was written
            self._dumpemptyvcf()