Skip to content
Snippets Groups Projects
svrunner_utils.py 6.24 KiB
Newer Older
Floreal Cabanettes's avatar
Floreal Cabanettes committed

import sys
import os
import collections
import gzip
from subprocess import call
import re

from collections import defaultdict

from pybedtools import BedTool, create_interval_from_list
from pysam import AlignmentFile
import numpy as np

VERBOSE = True

def get_insert_size(bam):
    """
    Compute mean insert size
    :param bam: bam file
    :type bam: str
    :return: mean insert size
    :rtype: int
    """
    fbam = AlignmentFile(bam, "rb")
    count = 0
    isizes_raw = []
    for read in fbam:
        if read.is_proper_pair:
            isizes_raw.append(np.abs(read.template_length))
            count += 1
            if count == 50000:
                break
    n99 = np.percentile(isizes_raw, 99)
    n1 = np.percentile(isizes_raw, 1)
    isizes = []
    for isize in isizes_raw:
        if n1 < isize < n99:
            isizes.append(isize)
    fbam.close()
    return int(np.round(np.mean(np.round(isizes))))


def getChromLength(reference, chrom):
    """
    Retrieve the length of chromosome chrom from .fai file
    :param reference: the genome fasta file
    :param chrom: the chromosome of interest
    :type reference: file
    :type chrom: str
    :return: chromosome length
    :rtype: int
    """
    reference_fai = str(reference) + ".fai"
    chroms = {}
    if reference is not None and os.path.isfile(reference_fai):
        with open(reference_fai) as fai_file:
            for line in fai_file:
                line_items = line.strip().split("\t")
                name, length = line_items[0:2]
                name = name.split(" ")[0]
                chroms[name] = int(length)
    if chrom not in chroms:
        raise KeyError('the chromosome '+chrom+' is not in the *.fai file....')
    return chroms[chrom]


def fetchId(bamfile):
    """
    Fetch sample id in a bam file
    :param bamfile: the bam file
    :type bamfile: file
    :return: sample name
    :rtype: str
    """
    bamfile_fin = AlignmentFile(bamfile, 'rb')
    name = bamfile_fin.header['RG'][0]['SM']
    bamfile_fin.close()
    return name


_Contig = collections.namedtuple('Contig', ['name', 'length'])


def get_contigs(reference):
    """
    Retrieve the contigs (chromosomes) from the genome fasta file
    :param reference: the genome fasta file
    :type reference: fasta filefile
    :return: list of contigs in the format specified by PyCvf
    :rtype: list
    """
    contigs = []
    reference_fai = str(reference) + ".fai"
    if reference is not None and os.path.isfile(reference_fai):
        with open(reference_fai) as fai_file:
            for line in fai_file:
                line_items = line.strip().split("\t")
                name, length = line_items[0:2]
                name = name.split(" ")[0]
                contigs.append(_Contig(name, int(length)))
    return contigs


def chromNum(chrom):
    m = re.search(r"[cC]hr(?P<num>[\dXY]+)", chrom)
    if m is not None:
        if m.group('num') == 'X':
            num = 100
        elif m.group('num') == 'Y':
            num = 101
        else:
            num = int(m.group('num'))
        return num
    else:
        return chrom


def addContigInfosTovcf(vcffile, outvcffile, reference):
    """
    Add contig infos to vcf
    :param vcffile: input vcffile
    :param outvcffile: output vcffile
    :param reference: the reference genome fasta filefile
    :type vcffile: file
    :type outvcffile: vcffile
    :type reference: fasta filefile
    :return: nothing
    """
    # compute contig information
    contigs = get_contigs(reference)
    filename, extension = os.path.splitext(outvcffile)
    if extension == ".gz":
        outputfile = filename
        gzip = True
    else:
        outputfile = outvcffile
        gzip = False
    with open(outputfile, "w") as fout:
        with smart_open(vcffile) as fd:
            for line in fd:
                if re.search(r"^##contig", line) is not None:
                    continue
                if re.search(r"^#CHROM", line):
                    for ctg in sorted(contigs, key=lambda x: chromNum(x.name)):
                        fout.write('##contig=<ID=%s,length=%d>\n'
                                   % (ctg.name, ctg.length))
                fout.write(line)
    # bgzip file
    if gzip:
        cmd = "bgzip -f "+outputfile
        call(cmd, shell=True)
    cmd = "tabix -p vcf " + outvcffile
    call(cmd, shell=True)


def sorting(records, reference_contigs):
    if reference_contigs:
        contigs_order_dict = defaultdict()
        for (index, contig) in enumerate(reference_contigs):
            contigs_order_dict[contig.name] = index
        records.sort(
            key=lambda x: (contigs_order_dict[x.chrom], x.start))
    else:
        records.sort(key=lambda x: (x.chrom, x.start))
    return records


def vcf_to_pybed(vcf_records):
    """
    Convert vcf records to bed records
    :param vcf_records: the vcf records
    :type vcfrecords: a list of vcf records
    :return: the corresponding betools records
    :rtype: a bedtools object
    """
    intervals = []
    for record in vcf_records:
        chrom = record.chrom
        start = record.start - 1
        end = record.end
        name = record.id
        interval_list = list(map(str, [chrom, start, end, name]))
        intervals.append(create_interval_from_list(interval_list))
    return BedTool(intervals).sort()


# A small wrapper to print to stderr
def eprint(*args, **kwargs):
    if VERBOSE:
        print(*args, file=sys.stderr, **kwargs)


def warn(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)


def log(objs):
    print("-- log -- ", objs, file=sys.stderr)


def warning(*objs):
    print("WARNING: ", *objs, file=sys.stderr)


def smart_open(file_name):
    """
    Opening compressed or uncompressed file according to the extension (.gz)
    :param file_name: the file to open
    :type file_name: str
    :return: the corresponding file handle
    :rtype: a file handle
    """
    if file_name is not None:
        if file_name.endswith(".gz"):
            file_fd = gzip.open(file_name, "rt")
        else:
            file_fd = open(file_name)
    else:
        file_fd = sys.stdin
    return file_fd


def flatten(l):
    for el in l:
        if isinstance(el, collections.Iterable) and not isinstance(el, str):
            for sub in flatten(el):
                yield sub
        else:
            yield el