svrunner_utils.py


import sys
import os
import collections
import gzip
from subprocess import call
import re

from collections import defaultdict

from pybedtools import BedTool, create_interval_from_list
from pysam import AlignmentFile
import numpy as np
# import natsort

VERBOSE = True


def get_insert_size(bam):
    """
    Compute mean insert size
    :param bam: bam file
    :type bam: str
    :return: mean insert size
    :rtype: int
    """
    fbam = AlignmentFile(bam, "rb")
    count = 0
    isizes_raw = []
    for read in fbam:
        if read.is_proper_pair:
            isizes_raw.append(np.abs(read.template_length))
            count += 1
            if count == 50000:
                break
    n99 = np.percentile(isizes_raw, 99)
    n1 = np.percentile(isizes_raw, 1)
    isizes = []
    for isize in isizes_raw:
        if n1 < isize < n99:
            isizes.append(isize)
    fbam.close()
    return int(np.round(np.mean(np.round(isizes))))


def getChromLength(reference, chrom):
    """
    Retrieve the length of chromosome chrom from .fai file
    :param reference: the genome fasta file
    :param chrom: the chromosome of interest
    :type reference: file
    :type chrom: str
    :return: chromosome length
    :rtype: int
    """
    reference_fai = str(reference) + ".fai"
    chroms = {}
    if reference is not None and os.path.isfile(reference_fai):
        with open(reference_fai) as fai_file:
            for line in fai_file:
                line_items = line.strip().split("\t")
                name, length = line_items[0:2]
                name = name.split(" ")[0]
                chroms[name] = int(length)
    if chrom not in chroms:
        raise KeyError('the chromosome '+chrom+' is not in the *.fai file....')
    return chroms[chrom]


def fetchId(bamfile):
    """
    Fetch sample id in a bam file
    :param bamfile: the bam file
    :type bamfile: file
    :return: sample name
    :rtype: str
    """
    bamfile_fin = AlignmentFile(bamfile, 'rb')
    name = bamfile_fin.header['RG'][0]['SM']
    bamfile_fin.close()
    return name


_Contig = collections.namedtuple('Contig', ['name', 'length'])


def get_contigs(reference):
    """
    Retrieve the contigs (chromosomes) from the genome fasta file
    :param reference: the genome fasta file
    :type reference: fasta filefile
    :return: list of contigs in the format specified by PyCvf
    :rtype: list
    """
    contigs = []
    reference_fai = str(reference) + ".fai"
    if reference is not None and os.path.isfile(reference_fai):
        with open(reference_fai) as fai_file:
            for line in fai_file:
                line_items = line.strip().split("\t")
                name, length = line_items[0:2]
                name = name.split(" ")[0]
                contigs.append(_Contig(name, int(length)))
    return contigs


def chromNum(chrom):
    m = re.search(r"[cC]hr(?P<num>[\dXY]+)", chrom)
    if m is not None:
        if m.group('num') == 'X':
            num = 100
        elif m.group('num') == 'Y':
            num = 101
        else:
            num = int(m.group('num'))
        return num
    else:
        return chrom


def addContigInfosTovcf(vcffile, outvcffile, reference):
    """
    Add contig infos to vcf
    :param vcffile: input vcffile
    :param outvcffile: output vcffile
    :param reference: the reference genome fasta filefile
    :type vcffile: file
    :type outvcffile: vcffile
    :type reference: fasta filefile
    :return: nothing
    """
    # compute contig information
    contigs = get_contigs(reference)

    filename, extension = os.path.splitext(outvcffile)
    if extension == ".gz":
        outputfile = filename
        gzip = True
    else:
        outputfile = outvcffile
        gzip = False
    with open(outputfile, "w") as fout:
        with smart_open(vcffile) as fd:
            for line in fd:
                if re.search(r"^##contig", line) is not None:
                    continue
                if re.search(r"^#CHROM", line):
                    for ctg in contigs:
                        fout.write('##contig=<ID=%s,length=%d>\n'
                                   % (ctg.name, ctg.length))
                    #  for ctg in sorted(contigs,
                    #                    key=lambda x: chromNum(x.name)):

                fout.write(line)
    # bgzip file
    if gzip:
        cmd = "bgzip -f "+outputfile
        call(cmd, shell=True)
    cmd = "tabix -p vcf " + outvcffile
    call(cmd, shell=True)


def sorting(records, reference_contigs):
    if reference_contigs:
        contigs_order_dict = defaultdict()
        for (index, contig) in enumerate(reference_contigs):
            contigs_order_dict[contig.name] = index
        records.sort(
            key=lambda x: (contigs_order_dict[x.chrom], x.start))
    else:
        records.sort(key=lambda x: (x.chrom, x.start))
    return records


def vcf_to_pybed(vcf_records):
    """
    Convert vcf records to bed records
    :param vcf_records: the vcf records
    :type vcfrecords: a list of vcf records
    :return: the corresponding betools records
    :rtype: a bedtools object
    """
    intervals = []
    for record in vcf_records:
        chrom = record.chrom
        start = record.start - 1
        end = record.end
        name = record.id
        interval_list = list(map(str, [chrom, start, end, name]))
        intervals.append(create_interval_from_list(interval_list))
    return BedTool(intervals).sort()


# A small wrapper to print to stderr
def eprint(*args, **kwargs):
    if VERBOSE:
        print(*args, file=sys.stderr, **kwargs)


def warn(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)


def log(objs):
    print("-- log -- ", objs, file=sys.stderr)


def warning(*objs):
    print("WARNING: ", *objs, file=sys.stderr)


def smart_open(file_name):
    """
    Opening compressed or uncompressed file according to the extension (.gz)
    :param file_name: the file to open
    :type file_name: str
    :return: the corresponding file handle
    :rtype: a file handle
    """
    if file_name is not None:
        if file_name.endswith(".gz"):
            file_fd = gzip.open(file_name, "rt")
        else:
            file_fd = open(file_name)
    else:
        file_fd = sys.stdin
    return file_fd


def flatten(l):
    for el in l:
        if isinstance(el, collections.Iterable) and not isinstance(el, str):
            for sub in flatten(el):
                yield sub
        else:
            yield el