Skip to content
Snippets Groups Projects
detect_regions_to_exclude.py 7.04 KiB
Newer Older
Thomas Faraut's avatar
Thomas Faraut committed
#!/usr/bin/env python3

import argparse
import os
from tempfile import NamedTemporaryFile, mkdtemp
from shutil import rmtree

import multiprocessing
from functools import partial
Thomas Faraut's avatar
Thomas Faraut committed
import numpy as np

import pysam
import pybedtools

Thomas Faraut's avatar
Thomas Faraut committed
from svrunner_utils import eprint

Thomas Faraut's avatar
Thomas Faraut committed

Thomas Faraut's avatar
Thomas Faraut committed
def compute_mad(a):
    """
    Compute *Median Absolute Deviation* of an array.
    """
    med = np.median(a)
    mad = np.median(np.absolute(a - med))  # MAD
    return mad


def isSplitRead(read):
    if not read.has_tag('SA'):
        return False
    tag = read.get_tag('SA')
    num_alt_align = len(tag.split(";"))
    if num_alt_align > 0:
        return True
    else:
        return False


def get_bamcoverage(bamfile, fafile, chrom, window_size=500,
                    start=None, end=None):
    """
      pure pysam implementation of pysamstats.load_binned,
      same signature as get_pysamstatsbamcoverage (originally get_bamcoverage)
    """
    mybam = pysam.AlignmentFile(bamfile)
    fa = pysam.FastaFile(fafile)
Thomas Faraut's avatar
Thomas Faraut committed
    if start is None:
        start = 0
    if end is None:
        end = len(fa.fetch(chrom))
Thomas Faraut's avatar
Thomas Faraut committed
    for i in range(start, end, window_size):
        pos = i + int(window_size/2)
        counts = mybam.count(chrom, i, i+window_size, read_callback="all")
        coverage.append((chrom.encode('utf8'), pos, counts))
Thomas Faraut's avatar
Thomas Faraut committed
        # print(chrom, pos, counts)
    a = np.array(coverage, dtype=[('chrom', 'S22'),
                                  ('pos', np.int64), ('reads_all', 'i8')])
    a = a.view(np.recarray)
    return a


# def get_pysamstatsbamcoverage(bamfile, fafile, chrom, window_size=500):
#     mybam = pysam.AlignmentFile(bamfile)
#     a = pysamstats.load_binned("coverage", mybam,
#                                fafile=fafile,
#                                chrom=chrom,
#                                window_size=window_size)
Thomas Faraut's avatar
Thomas Faraut committed
    return a


def get_coverage_cutoff(coverage):
    """
        Identify the cutoff value corresponding to high coverage regions
        using mode + 3 stdev as recomended by speedseq
    """
Thomas Faraut's avatar
Thomas Faraut committed
    counts = coverage.reads_all[np.where(coverage.reads_all)]
Thomas Faraut's avatar
Thomas Faraut committed
    median = np.median(counts)
    # stdev = np.std(counts)
    mad = compute_mad(counts)
Thomas Faraut's avatar
Thomas Faraut committed
    # recommanded by SpeedSeq
Thomas Faraut's avatar
Thomas Faraut committed
    # return median + 3 * stdev
    # we use mad to be more robust to outliers
    return median + 7 * mad
Thomas Faraut's avatar
Thomas Faraut committed
def dump_overcovered_to_bed(overcovered_regions, fout, window_size):
    """
        From the numpy record array filtered by the cutoff
        dump the corresponding regions to the bed outputfile
    """
Thomas Faraut's avatar
Thomas Faraut committed
    for chrom, pos in zip(overcovered_regions.chrom,
                          overcovered_regions.pos):
        fout.write("%s\t%d\t%d\n" % (chrom.decode("utf-8"),
                                     pos - int(window_size/2),
                                     pos + int(window_size/2)))


Thomas Faraut's avatar
Thomas Faraut committed
def dump_illcovered_to_bed(overcovered_regions, fout, window_size):
    """
        From the numpy record array filtered by the cutoff
        dump the corresponding regions to the bed outputfile
    """
    extension = 10000
    for chrom, pos in zip(overcovered_regions.chrom,
                          overcovered_regions.pos):
        fout.write("%s\t%d\t%d\n" % (chrom.decode("utf-8"),
                                     max(0, pos - int(window_size/2) - extension),
                                     pos + int(window_size/2) + extension))


Thomas Faraut's avatar
Thomas Faraut committed
def detect_high_coverage(chrom, bamfile, fastafile, tempdir,
Thomas Faraut's avatar
Thomas Faraut committed
                         window_size=500, cutoff=None):
    """
        Detecting high coverage regions using get_bamcoverage
        dumping bed output in a tempfile
    """
    eprint("Detecting high coverage regions in %s for %s" % (bamfile, chrom))
Thomas Faraut's avatar
Thomas Faraut committed
    with NamedTemporaryFile(dir=tempdir, mode="w", delete=False) as output:
Thomas Faraut's avatar
Thomas Faraut committed
        # TODO test for empty bam file (use pysam following
        # https://github.com/pysam-developers/pysam/issues/27)
Thomas Faraut's avatar
Thomas Faraut committed
        coverage = get_bamcoverage(bamfile, fastafile, chrom, window_size)
Thomas Faraut's avatar
Thomas Faraut committed
                                   # start=7040000, end=7140000)
Thomas Faraut's avatar
Thomas Faraut committed
        if cutoff is None:
            cutoff = get_coverage_cutoff(coverage)
        overcovered_regions = coverage[np.where(coverage.reads_all > cutoff)]
Thomas Faraut's avatar
Thomas Faraut committed
        illcovered_regions = coverage[np.where(coverage.reads_all > 10*cutoff)]
        dump_overcovered_to_bed(overcovered_regions, output, window_size)
        dump_illcovered_to_bed(illcovered_regions, output, window_size)

Thomas Faraut's avatar
Thomas Faraut committed
    return output.name, cutoff


def parallel_runs(chromosomes, bamfile, fastafile, tempdir,
                  window_size, cutoff, cores=1):
    """
        Detecting high coverage with a parallelization on chromosomes
    """
Thomas Faraut's avatar
Thomas Faraut committed
    pool = multiprocessing.Pool(processes=cores)
    launch_chrom = partial(detect_high_coverage,
                           bamfile=bamfile,
                           fastafile=fastafile,
                           tempdir=tempdir,
                           window_size=window_size,
                           cutoff=cutoff)  # only remaining param is chrom
    result_list = pool.map(launch_chrom, chromosomes)
    return [r[0] for r in result_list]


def main(args):
    bamfile = args.bam
    fastafile = args.fasta
    chroms = args.chroms
    output = args.output
    window_size = args.window_size
    threads = args.threads
    chromosomes = chroms.split(',')

    tempdir = mkdtemp(dir=".")

    outputfiles = []
    # pop fist chromosome
    chrom = chromosomes.pop(0)
Thomas Faraut's avatar
Thomas Faraut committed
    outputfile, cutoff = detect_high_coverage(chrom, bamfile, fastafile,
                                              tempdir, window_size=window_size)
    outputfiles.append(outputfile)

    if len(chromosomes):
        outfiles = parallel_runs(chromosomes, bamfile, fastafile,
                                 tempdir, window_size, cutoff,
                                 cores=threads)
        outputfiles.extend(outfiles)
Thomas Faraut's avatar
Thomas Faraut committed

    with NamedTemporaryFile(dir=tempdir, mode="w", delete=False) as bedout:
        for fname in outputfiles:
            with open(fname) as infile:
                for line in infile:
                    bedout.write(line)
            os.remove(fname)
    bed = pybedtools.BedTool(bedout.name)
Thomas Faraut's avatar
Thomas Faraut committed
    merged_bed = bed.sort().merge()
Thomas Faraut's avatar
Thomas Faraut committed
    os.remove(bedout.name)
    merged_bed.saveas(output)

    rmtree(tempdir)

Thomas Faraut's avatar
Thomas Faraut committed
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Extract entries for a "
                                     "specific chromosome from a bam file")
    parser.add_argument('-b', '--bam', required=True,
                        help='the bam file')
    parser.add_argument('-f', '--fasta', required=True,
                        help='the reference genome')
    parser.add_argument('-c', '--chroms', required=True,
                        help='comma separated list of chromosomes')
    parser.add_argument('-o', '--output', required=True,
                        help='output file')
    parser.add_argument('-w', '--window_size', type=int, default=500,
Thomas Faraut's avatar
Thomas Faraut committed
                        help='the size of the window for coverage estimation')
    parser.add_argument('-t', '--threads',  type=int, default=1,
                        help='number of threads')
    args = parser.parse_args()
    main(args)