from __future__ import print_function import sys import os import collections import gzip from subprocess import call import re from collections import defaultdict from pybedtools import BedTool, create_interval_from_list from pysam import AlignmentFile import numpy as np # import natsort VERBOSE = True def get_insert_size(bam): """ Compute mean insert size :param bam: bam file :type bam: str :return: mean insert size :rtype: int """ fbam = AlignmentFile(bam, "rb") count = 0 isizes_raw = [] for read in fbam: if read.is_proper_pair: isizes_raw.append(np.abs(read.template_length)) count += 1 if count == 50000: break n99 = np.percentile(isizes_raw, 99) n1 = np.percentile(isizes_raw, 1) isizes = [] for isize in isizes_raw: if n1 < isize < n99: isizes.append(isize) fbam.close() return int(np.round(np.mean(np.round(isizes)))) def getChromLength(reference, chrom): """ Retrieve the length of chromosome chrom from .fai file :param reference: the genome fasta file :param chrom: the chromosome of interest :type reference: file :type chrom: str :return: chromosome length :rtype: int """ reference_fai = str(reference) + ".fai" chroms = {} if reference is not None and os.path.isfile(reference_fai): with open(reference_fai) as fai_file: for line in fai_file: line_items = line.strip().split("\t") name, length = line_items[0:2] name = name.split(" ")[0] chroms[name] = int(length) if chrom not in chroms: raise KeyError('the chromosome '+chrom+' is not in the *.fai file....') return chroms[chrom] def fetchId(bamfile): """ Fetch sample id in a bam file :param bamfile: the bam file :type bamfile: file :return: sample name :rtype: str """ bamfile_fin = AlignmentFile(bamfile, 'rb') name = bamfile_fin.header['RG'][0]['SM'] bamfile_fin.close() return name _Contig = collections.namedtuple('Contig', ['name', 'length']) def get_contigs(reference): """ Retrieve the contigs (chromosomes) from the genome fasta file :param reference: the genome fasta file :type reference: fasta filefile :return: list of contigs in the format specified by PyCvf :rtype: list """ contigs = [] reference_fai = str(reference) + ".fai" if reference is not None and os.path.isfile(reference_fai): with open(reference_fai) as fai_file: for line in fai_file: line_items = line.strip().split("\t") name, length = line_items[0:2] name = name.split(" ")[0] contigs.append(_Contig(name, int(length))) return contigs # def chromNum(chrom): # m = re.search(r"[cC]hr(?P<num>[\dXY]+)", chrom) # if m is not None: # if m.group('num') == 'X': # num = 100 # elif m.group('num') == 'Y': # num = 101 # else: # num = int(m.group('num')) # return num # else: # return chrom def addContigInfosTovcf(vcffile, outvcffile, reference): """ Add contig infos to vcf :param vcffile: input vcffile :param outvcffile: output vcffile :param reference: the reference genome fasta filefile :type vcffile: file :type outvcffile: vcffile :type reference: fasta filefile :return: nothing """ # compute contig information contigs = get_contigs(reference) filename, extension = os.path.splitext(outvcffile) if extension == ".gz": outputfile = filename gzip = True else: outputfile = outvcffile gzip = False with open(outputfile, "w") as fout: with smart_open(vcffile) as fd: for line in fd: if re.search(r"^##contig", line) is not None: continue if re.search(r"^#CHROM", line): for ctg in contigs: fout.write('##contig=<ID=%s,length=%d>\n' % (ctg.name, ctg.length)) # for ctg in sorted(contigs, # key=lambda x: chromNum(x.name)): fout.write(line) # bgzip file if gzip: cmd = "bgzip -f "+outputfile call(cmd, shell=True) cmd = "tabix -p vcf " + outvcffile call(cmd, shell=True) def sorting(records, reference_contigs): if reference_contigs: contigs_order_dict = defaultdict() for (index, contig) in enumerate(reference_contigs): contigs_order_dict[contig.name] = index records.sort( key=lambda x: (contigs_order_dict[x.chrom], x.start)) else: records.sort(key=lambda x: (x.chrom, x.start)) return records def vcf_to_pybed(vcf_records): """ Convert vcf records to bed records :param vcf_records: the vcf records :type vcfrecords: a list of vcf records in pysam format :return: the corresponding betools records :rtype: a bedtools object """ intervals = [] for record in vcf_records: chrom = record.chrom start = record.start # pysam start is 0-based inclusive just like bed end = record.end # pysam stop is 0-based exclusive just like bed name = record.id interval_list = list(map(str, [chrom, start, end, name])) intervals.append(create_interval_from_list(interval_list)) return BedTool(intervals).sort() # A small wrapper to print to stderr def eprint(*args, **kwargs): if VERBOSE: print(*args, file=sys.stderr, **kwargs) def warn(*args, **kwargs): print(*args, file=sys.stderr, **kwargs) def log(objs): print("-- log -- ", objs, file=sys.stderr) def warning(*objs): print("WARNING: ", *objs, file=sys.stderr) def smart_open(file_name): """ Opening compressed or uncompressed file according to the extension (.gz) :param file_name: the file to open :type file_name: str :return: the corresponding file handle :rtype: a file handle """ if file_name is not None: if file_name.endswith(".gz"): file_fd = gzip.open(file_name, "rt") else: file_fd = open(file_name) else: file_fd = sys.stdin return file_fd def flatten(l): for el in l: if isinstance(el, collections.Iterable) and not isinstance(el, str): for sub in flatten(el): yield sub else: yield el