Newer
Older
#!/usr/bin/env python3
import argparse
import os
from tempfile import NamedTemporaryFile, mkdtemp
from shutil import rmtree
import multiprocessing
from functools import partial
import numpy as np
import pysam
import pysamstats
import pybedtools
def compute_mad(a):
"""
Compute *Median Absolute Deviation* of an array.
"""
med = np.median(a)
mad = np.median(np.absolute(a - med)) # MAD
return mad
def isSplitRead(read):
if not read.has_tag('SA'):
return False
tag = read.get_tag('SA')
num_alt_align = len(tag.split(";"))
if num_alt_align > 0:
return True
else:
return False
def get_bamcoverage(bamfile, fafile, chrom, window_size=500,
start=None, end=None):
Thomas Faraut
committed
"""
pure pysam implementation of pysamstats.load_binned,
same signature as get_pysamstatsbamcoverage (originally get_bamcoverage)
"""
mybam = pysam.AlignmentFile(bamfile)
fa = pysam.FastaFile(fafile)
if start is None:
start = 0
if end is None:
end = len(fa.fetch(chrom))
Thomas Faraut
committed
coverage = []
Thomas Faraut
committed
pos = i + int(window_size/2)
counts = mybam.count(chrom, i, i+window_size, read_callback="all")
coverage.append((chrom.encode('utf8'), pos, counts))
Thomas Faraut
committed
a = np.array(coverage, dtype=[('chrom', 'S22'),
('pos', np.int64), ('reads_all', 'i8')])
a = a.view(np.recarray)
return a
def get_pysamstatsbamcoverage(bamfile, fafile, chrom, window_size=500):
mybam = pysam.AlignmentFile(bamfile)
a = pysamstats.load_binned("coverage", mybam,
fafile=fafile,
chrom=chrom,
window_size=window_size)
return a
def get_coverage_cutoff(coverage):
Thomas Faraut
committed
"""
Identify the cutoff value corresponding to high coverage regions
using mode + 3 stdev as recomended by speedseq
"""
counts = coverage.reads_all[np.where(coverage.reads_all)]
median = np.median(counts)
# stdev = np.std(counts)
mad = compute_mad(counts)
# return median + 3 * stdev
# we use mad to be more robust to outliers
return median + 7 * mad
def dump_overcovered_to_bed(overcovered_regions, fout, window_size):
Thomas Faraut
committed
"""
From the numpy record array filtered by the cutoff
dump the corresponding regions to the bed outputfile
"""
for chrom, pos in zip(overcovered_regions.chrom,
overcovered_regions.pos):
fout.write("%s\t%d\t%d\n" % (chrom.decode("utf-8"),
pos - int(window_size/2),
pos + int(window_size/2)))
def dump_illcovered_to_bed(overcovered_regions, fout, window_size):
"""
From the numpy record array filtered by the cutoff
dump the corresponding regions to the bed outputfile
"""
extension = 10000
for chrom, pos in zip(overcovered_regions.chrom,
overcovered_regions.pos):
fout.write("%s\t%d\t%d\n" % (chrom.decode("utf-8"),
max(0, pos - int(window_size/2) - extension),
pos + int(window_size/2) + extension))
def detect_high_coverage(chrom, bamfile, fastafile, tempdir,
Thomas Faraut
committed
"""
Detecting high coverage regions using get_bamcoverage
dumping bed output in a tempfile
"""
eprint("Detecting high coverage regions in %s for %s" % (bamfile, chrom))
with NamedTemporaryFile(dir=tempdir, mode="w", delete=False) as output:
# TODO test for empty bam file (use pysam following
# https://github.com/pysam-developers/pysam/issues/27)
coverage = get_bamcoverage(bamfile, fastafile, chrom, window_size)
if cutoff is None:
cutoff = get_coverage_cutoff(coverage)
overcovered_regions = coverage[np.where(coverage.reads_all > cutoff)]
illcovered_regions = coverage[np.where(coverage.reads_all > 10*cutoff)]
dump_overcovered_to_bed(overcovered_regions, output, window_size)
dump_illcovered_to_bed(illcovered_regions, output, window_size)
return output.name, cutoff
def parallel_runs(chromosomes, bamfile, fastafile, tempdir,
window_size, cutoff, cores=1):
Thomas Faraut
committed
"""
Detecting high coverage with a parallelization on chromosomes
"""
pool = multiprocessing.Pool(processes=cores)
launch_chrom = partial(detect_high_coverage,
bamfile=bamfile,
fastafile=fastafile,
tempdir=tempdir,
window_size=window_size,
cutoff=cutoff) # only remaining param is chrom
result_list = pool.map(launch_chrom, chromosomes)
return [r[0] for r in result_list]
def main(args):
bamfile = args.bam
fastafile = args.fasta
chroms = args.chroms
output = args.output
window_size = args.window_size
threads = args.threads
chromosomes = chroms.split(',')
tempdir = mkdtemp(dir=".")
outputfiles = []
# pop fist chromosome
chrom = chromosomes.pop(0)
Thomas Faraut
committed
outputfile, cutoff = detect_high_coverage(chrom, bamfile, fastafile,
tempdir, window_size=window_size)
outputfiles.append(outputfile)
Thomas Faraut
committed
if len(chromosomes):
outfiles = parallel_runs(chromosomes, bamfile, fastafile,
tempdir, window_size, cutoff,
cores=threads)
outputfiles.extend(outfiles)
with NamedTemporaryFile(dir=tempdir, mode="w", delete=False) as bedout:
for fname in outputfiles:
with open(fname) as infile:
for line in infile:
bedout.write(line)
os.remove(fname)
bed = pybedtools.BedTool(bedout.name)
os.remove(bedout.name)
merged_bed.saveas(output)
rmtree(tempdir)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Extract entries for a "
"specific chromosome from a bam file")
parser.add_argument('-b', '--bam', required=True,
help='the bam file')
parser.add_argument('-f', '--fasta', required=True,
help='the reference genome')
parser.add_argument('-c', '--chroms', required=True,
help='comma separated list of chromosomes')
parser.add_argument('-o', '--output', required=True,
help='output file')
Thomas Faraut
committed
parser.add_argument('-w', '--window_size', type=int, default=500,
help='the size of the window for coverage estimation')
parser.add_argument('-t', '--threads', type=int, default=1,
help='number of threads')
args = parser.parse_args()
main(args)