import logging
import sys
import os
import gzip
import vcf
from svrunner_utils import smart_open
from svreader.vcf_utils import get_template, get_template_header
from svreader import SVRecord, SVReader, SVWriter
from pysam import VariantFile, VariantHeader
logger = logging.getLogger(__name__)
mydir = os.path.dirname(os.path.realpath(__file__))
'''
from http://gmt.genome.wustl.edu/packages/pindel/user-manual.html
There is a head line for each variant reported, followed by the alignment of supporting reads to the reference on the
second line. The example variants are a 1671bp deletion and a 10bp insertion on chr20. The breakpoints are specified
after "BP". Due to microhomology around the breakpoints, the breakpoint coordinates may shift both upstream and
downstream; 'BP_range' is used to indicate the range of this shift. The header line contains the following data:
1) The index of the indel/SV (57 means that 57 insertions precede this insertion in the file)
2) The type of indel/SV: I for insertion, D for deletion, INV for inversion, TD for tandem duplication
3) The length of the SV
4) "NT" (to indicate that the next number is the length of non-template sequences inserted; insertions are fully covered
by the NT-fields, deletions can have NT bases if the deletion is not 'pure', meaning that while bases have been
deleted, some bases have been inserted between the breakpoints)
5) the length(s) of the NT fragment(s)
6) the sequence(s) of the NT fragment(s)
7-8) the identifier of the chromosome the read was found on
9-10-11) BP: the start and end positions of the SV
12-13-14) BP_range: if the exact position of the SV is unclear since bases at the edge of one read-half could equally
well be appended to the other read-half. In the deletion example, ACA could be on any side of the gap, so the original
deletion could have been between 1337143 and 1338815, between 1337144 and 1338816, between 1337145 and 1338817, or
between 1337146 and 1338818. BP_range is used to indicate this range.
15) "Supports": announces that the total count of reads supporting the SV follow.
16) The number of reads supporting the SV
17) The number of unique reads supporting the SV (so not counting duplicate reads)
18) +: supports from reads whose anchors are upstream of the SV
19-20) total number of supporting reads and unique number of supporting reads whose anchors are upstream of the SV.
21) -: supports from reads whose anchors are downstream of the SV
22-23) total number of supporting reads and unique number of supporting reads whose anchors are downstream of the SV
24-25) S1: a simple score, ("# +" + 1) * ("# -" + 1);
26-27) SUM_MS: sum of mapping qualities of anchor reads. Reads carrying the variant (or unmapped) are called split
reads; their mates are called anchor reads. Anchor reads are used to narrow down the search space, to speed up the
search and increase sensitivity;
28) the number of different samples scanned
29-30-31) NumSupSamples?: the number of samples supporting the SV, as well as the number of samples having unique
reads supporting the SV (in practice, these numbers are the same)
32+) Per sample: the sample name, followed by the total number of supporting reads whose anchors are upstream, the
total number of unique supporting reads whose anchors are upstream, the total number of supporting reads whose anchors
are downstream, and finally the total number of unique supporting reads whose anchors are downstream.
WARNING see below
----
The reported larger insertion (LI) record is rather different from the other variant types. Here is the format:
1) index,
2) type(LI),
3) "ChrID",
4) chrName,
5) left breakpoint,
6) "+"
7) number of supporting reads for the left coordinate,
8) right breakpoint,
9) "-"
10) number of supporting reads for the right coordinate.
The following lines are repeated for each sample:
11) Sample name
12) "+"
13) upstream supporting reads
14) "-"
15) downstream supporting reads
'''
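# Illustrative mapping of the 0-based column indices used by PindelRecord
# below onto the fields documented above:
#
#   fields = head_line.split()
#   fields[0]               -> index of the indel/SV
#   fields[1]               -> type ("I", "D", "INV", "TD", ...)
#   fields[2]               -> SV length
#   fields[4]               -> length of the inserted non-template sequence
#   fields[7]               -> chromosome name (after "ChrID")
#   fields[9], fields[10]   -> start and end positions (after "BP")
#   fields[12], fields[13]  -> BP_range bounds
#   fields[15], fields[16]  -> total and unique supporting read counts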
'''
Newer versions of pindel change the output starting from field 32+.
Check https://trac.nbic.nl/pipermail/pindel-users/2013-March/000229.html
hi mike,
This is the output format for the new version where the number of reads
supporting the reference allele is also counted on the left and right
breakpoints of the variants.
TS1_3051 164 167 0 0 2 2 TS1_3054 117 118 0 0 0 0
For example, 164 and 167 reads supporting the reference for sample TS1_3051
but only 2 reads and 2 unique reads from - strand support the alternative.
It is less likely that this is a true germline variant. with the read count
for the reference allele, you may filter the calls in the same way as snps.
'''
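# Illustrative decoding of the per-sample block quoted above (7 columns per
# sample in pindel 0.2.4u and later, as parsed by PindelRecord below):
#
#   TS1_3051 164 167 0 0 2 2
#   name, up_ref, down_ref, up_var, up_var_uniq, down_var, down_var_uniq
#   -> 164/167 reads support the reference upstream/downstream of the
#      breakpoints, while only 2 reads (2 unique) support the variant.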
pindel_name = "pindel"
pindel_source = set([pindel_name])
min_coverage = 10
het_cutoff = 0.2
hom_cutoff = 0.8
GT_REF = "0/0"
GT_HET = "0/1"
GT_HOM = "1/1"
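# A minimal sketch of how such genotyping cutoffs are conventionally applied
# (assumption: nothing in this module as excerpted calls them; the helper
# below is illustrative only and not part of the original API):
def _genotype_from_allele_fraction(var_reads, total_reads):
    """Illustrative genotyping rule based on the cutoffs above."""
    if total_reads < min_coverage:
        return GT_REF  # too few reads to call a genotype confidently
    fraction = var_reads / total_reads
    if fraction >= hom_cutoff:    # >= 80% of reads support the variant
        return GT_HOM
    if fraction >= het_cutoff:    # >= 20% of reads support the variant
        return GT_HET
    return GT_REF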
PINDEL_TO_SV_TYPE = {"I": "INS", "D": "DEL", "LI": "INS",
"TD": "DUP", "INV": "INV", "RPL":"RPL"}
class PindelHeader:
def __init__(self, fd):
self.header_dict = {}
self.header_fields = []
start = fd.tell()
        # Parse the header: read lines until the first record head line
        # (containing "ChrID") is encountered, then stop
while True:
try:
line = next(fd)
if line.find("ChrID") >= 1:
self.parse_firstrecord_line(line)
break
except StopIteration:
break
# We return to the start of the file
fd.seek(start)
def parse_firstrecord_line(self, line):
fields = line.split()
num_samples = int(fields[27])
pindel024u_or_later = len(fields) > 31 + 5 * num_samples
if pindel024u_or_later:
self.samples = [fields[i] for i in range(31, len(fields), 7)]
else:
self.samples = [fields[i] for i in range(31, len(fields), 5)]
logger.info("\t".join(self.samples))
def getOrderedSamples(self):
# sample order is given by the sample order in the pindel output
if not hasattr(self, "samples"):
return []
return self.samples
def __str__(self):
return str(self.__dict__)
def pysam_header(pindel_header):
    # Uses the newer pysam interface for creating records and headers.
    # It is not implemented in the Python 3.4 build of pysam (0.10.0);
    # pysam >= 0.11.2.2 is required.
header = VariantHeader()
vcf_header = get_template_header("pindel")
with open(vcf_header) as fin:
for line in fin:
if (line.startswith("##INFO")
or line.startswith("##FORMAT")
or line.startswith("##FILTER")):
header.add_line(line.rstrip())
for sample in pindel_header.getOrderedSamples():
header.add_sample(sample)
return header
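# Usage sketch (assumes "pindel_output_D" is a raw pindel output file; all
# names below are defined in this module):
#
#   fd = smart_open("pindel_output_D")
#   pindel_header = PindelHeader(fd)
#   header = pysam_header(pindel_header)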
class PindelRecord(SVRecord):
counter = {"I": 0, "D": 0, "LI": 0, "TD": 0, "INV": 0, "RPL": 0}
# Only DEL, INV and DUP for the moment
svs_supported = set(["DEL", "DUP", "INV"])
def __init__(self, record_string, reference_handle=None):
self.name = pindel_name
fields = record_string.split()
index = fields[0]
sv_type = fields[1]
if PINDEL_TO_SV_TYPE[sv_type] not in PindelRecord.svs_supported:
print(("Unsupported structural variation " + sv_type))
exit(1)
sv_len = int(fields[2])
num_nt_added = fields[4]
# nt_added = fields[5]
chrom = fields[7]
start_pos = int(fields[9])
end_pos = int(fields[10])
bp_range = (int(fields[12]), int(fields[13]))
read_supp = int(fields[15])
uniq_read_supp = int(fields[16])
up_read_supp = int(fields[18])
up_uniq_read_supp = int(fields[19])
down_read_supp = int(fields[21])
down_uniq_read_supp = int(fields[22])
# score = int(fields[24])
# nums_samples = int(fields[27])
num_sample_supp = int(fields[29])
# num_sample_uniq_supp = int(fields[30])
sv = {}
samples = []
for i in range(31, len(fields), 7):
sv[fields[i]] = {"name": fields[i],
"up_ref_read_supp": int(fields[i + 1]),
"down_ref_read_supp": int(fields[i + 2]),
"up_var_read_supp": int(fields[i + 3]),
"up_var_uniq_read_supp": int(fields[i + 4]),
"down_var_read_supp": int(fields[i + 5]),
"down_var_uniq_read_supp": int(fields[i + 6])
}
samples.append(sv[fields[i]])
self.pindel_sv_type = sv_type
self.up_read_supp = up_read_supp
self.down_read_supp = down_read_supp
self.up_uniq_read_supp = up_uniq_read_supp
self.down_uniq_read_supp = down_uniq_read_supp
self.uniq_read_supp = uniq_read_supp
# Computing CIPOS and CIEND
pos_conf = max(1, abs(int((bp_range[0] - start_pos)/2)))
end_conf = max(1, abs(int((bp_range[1] - end_pos)/2)))
self.cipos = (-pos_conf, pos_conf)
self.ciend = (-end_conf, end_conf)
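        # Worked example (values taken from the deletion example in the
        # format notes above): with start_pos = 1337143 and
        # bp_range[0] = 1337146, pos_conf = max(1, abs(int(3 / 2))) = 1,
        # so CIPOS = (-1, 1); CIEND is derived from bp_range[1] the same way.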
PindelRecord.counter[sv_type] += 1
id = "_".join([pindel_name, chrom, sv_type,
str(PindelRecord.counter[sv_type])])
super(PindelRecord, self).__init__(pindel_name, chrom,
start_pos, end_pos, id)
self._sv_type = PINDEL_TO_SV_TYPE[sv_type]
self.__sv_len = sv_len
self.__samples = samples
self.__sv = sv
self.Modinfo = {
"END": end_pos,
"SU": read_supp,
"INDEX": index,
"NTLEN": num_nt_added,
"CIPOS": ",".join([str(x) for x in self.cipos]),
"CIEND": ",".join([str(x) for x in self.ciend]),
"NUM_SUPP_SAMPLES": num_sample_supp
}
def addbatch2Id(self, batch=None):
if batch:
self.id += "_" + batch
    def variantSample(self, sample):
        return (self.__sv[sample]["up_var_read_supp"] > 0
                or self.__sv[sample]["down_var_read_supp"] > 0)
def variantSamples(self):
variant_samples = []
for sample in self.__sv:
if self.variantSample(sample):
variant_samples.append(sample)
return sorted(variant_samples)
def getOrderedSamples(self):
# sample order is given by the sample order in the pindel output
return self.__samples
def getSampleCalls(self):
        # TODO make sure that the order of the samples conforms to
        # the order of the samples in the VCF header
calls = []
for s in self.getOrderedSamples():
sample = s["name"]
ad = s["up_var_read_supp"] + s["down_var_read_supp"]
called_value = vcf.model.make_calldata_tuple("AD")(AD=ad)
calldata = vcf.model._Call(None, sample, called_value)
calls.append(calldata)
return calls
def MaxIndSupportingRP(self):
max_ind_supp = 0
for s in self.__samples:
ad = s["up_var_read_supp"] + s["down_var_read_supp"]
max_ind_supp = max(ad, max_ind_supp)
return max_ind_supp
def to_vcf_record(self):
alt = [vcf.model._SV(self.sv_type)]
info = {"SVLEN": self.sv_len, "SVTYPE": self.sv_type}
info["MAX_IND_SU"] = self.MaxIndSupportingRP()
info["VSAMPLES"] = ",".join(self.variantSamples())
info.update(self.Modinfo)
calls = self.getSampleCalls()
vcf_record = vcf.model._Record(self.chrom,
self.start,
self.id,
"N",
alt,
".",
"PASS",
info,
"AD",
[0],
calls)
return vcf_record
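    # Illustrative shape of the serialized VCF line produced from the record
    # above (values are hypothetical):
    #   chr20  1337143  pindel_chr20_D_1  N  <DEL>  .  PASS
    #       SVTYPE=DEL;SVLEN=1671;END=1338815;SU=5;...  AD  5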
class PindelReader(SVReader):
svs_supported = set(["DEL", "INS", "DUP", "INV"])
def __init__(self, file_name, reference_handle=None, svs_to_report=None):
super(PindelReader, self).__init__(file_name, pindel_name,
reference_handle)
self.file_fd = smart_open(file_name)
self.header = PindelHeader(self.file_fd)
self.pysamheader = pysam_header(self.header)
self.svs_supported = PindelReader.svs_supported
if svs_to_report is not None:
self.svs_supported &= set(svs_to_report)
def __iter__(self):
return self
def __next__(self):
while True:
line = next(self.file_fd)
if line.find("ChrID") >= 1:
record = PindelRecord(line.strip(), self.reference_handle)
if record.sv_type in self.svs_supported:
return record
    def AreSamplesSpecified(self):
        return True
def getOrderedSamples(self):
return self.header.getOrderedSamples()
# def SpecificFilterPass(self, record):
# # Filtering criteria more than 4 PE
# # see
# # http://bcb.io/2014/08/12/validated-whole-genome-structural-variation-detection-using-multiple-callers
# return (record.length() > 60)
    def SpecificFilterPass(self, record):
        if (record.MaxIndSupportingRP() <= 3):
            return False
        else:
            return True
    def postprocess(self, records):
        """
        Returns a vector of records where duplicates were removed
        """
        vcf_records_without_duplicate = self.RemoveDuplicates(records)
        return vcf_records_without_duplicate
    def RemoveDuplicates(self, records, cutoff=1):
        """
        An analysis of all variants to identify the duplicates
        that share exactly the same breakpoints (cutoff = 1)
        or slightly different breakpoints controlled by cutoff
        """
        # Sorting the vcf records by position, then by decreasing support
        records.sort(key=lambda x: (x.chrom, x.start, x.end, -x.su))
        vcf_records_without_duplicate = []
        for r in records:
            if self.IsDuplicate(vcf_records_without_duplicate, r, cutoff):
                print("removed entry %s identified as exact duplicate " % r)
                continue
            vcf_records_without_duplicate.append(r)
        return vcf_records_without_duplicate
def IsDuplicate(self, valid_records, record, cutoff):
if len(valid_records) == 0:
return False
last_included = valid_records[-1]
start_offset = abs(record.start - last_included.start)
end_offset = abs(record.end - last_included.end)
if start_offset + end_offset < cutoff:
return True
return False
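    # Example: with the default cutoff of 1, a record is flagged as a
    # duplicate only when both its start and end match the previously kept
    # record exactly (offsets sum to 0, and 0 < 1); a larger cutoff also
    # collapses records whose breakpoints differ by a few bases.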
class PindelWriter(SVWriter):
def __init__(self, file_name, reference_contigs, template_reader):
super(PindelWriter, self).__init__(file_name,
template_reader.tool_name,
template_reader)
vcf_template_reader = get_template(template_reader.tool_name)
vcf_template_reader.samples = template_reader.getOrderedSamples()
self.vcf_writer = vcf.Writer(open(self.filename, 'w'),
vcf_template_reader)
def write(self, record):
self.vcf_writer.write_record(record.to_vcf_record())
def _close(self):
self.vcf_writer.close()
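# Usage sketch of the reader/writer round trip (assumes "pindel_output_D"
# exists and that the "pindel" VCF template is installed; reference_contigs
# is accepted but unused by PindelWriter.__init__ as written):
#
#   reader = PindelReader("pindel_output_D")
#   writer = PindelWriter("pindel_output_D.vcf", None, reader)
#   for record in reader:
#       writer.write(record)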