Skip to content
Snippets Groups Projects
Commit cd6a9624 authored by Thomas Faraut's avatar Thomas Faraut
Browse files

- implementing a pure pysam bacoverage instead of the pysamstats implementation

- window width for the detection of high coverage is now 500bp
- simplifying gzip of pindel files
parent 13fc368c
No related branches found
No related tags found
No related merge requests found
......@@ -8,6 +8,8 @@ from shutil import rmtree
import multiprocessing
from functools import partial
from svrunner_utils import eprint
import numpy as np
from scipy import stats
......@@ -16,7 +18,26 @@ import pysamstats
import pybedtools
def get_bamcoverage(bamfile, fafile, chrom, window_size=20):
def get_bamcoverage(bamfile, fafile, chrom, window_size=200):
"""
pure pysam implementation of pysamstats.load_binned,
same signature as get_pysamstatsbamcoverage (originally get_bamcoverage)
"""
mybam = pysam.AlignmentFile(bamfile)
fa = pysam.FastaFile(fafile)
len_chr = len(fa.fetch(chrom))
coverage = []
for i in range(0, len_chr, window_size):
pos = i + int(window_size/2)
counts = mybam.count(chrom, i, i+window_size, read_callback="all")
coverage.append((chrom.encode('utf8'), pos, counts))
a = np.array(coverage, dtype=[('chrom', 'S22'),
('pos', np.int64), ('reads_all', 'i8')])
a = a.view(np.recarray)
return a
def get_pysamstatsbamcoverage(bamfile, fafile, chrom, window_size=200):
mybam = pysam.AlignmentFile(bamfile)
a = pysamstats.load_binned("coverage", mybam,
fafile=fafile,
......@@ -26,6 +47,10 @@ def get_bamcoverage(bamfile, fafile, chrom, window_size=20):
def get_coverage_cutoff(coverage):
"""
Identify the cutoff value corresponding to high coverage regions
using mode + 3 stdev as recomended by speedseq
"""
counts = coverage.reads_all[np.where(coverage.reads_all)]
mode = stats.mode(counts)
......@@ -35,6 +60,10 @@ def get_coverage_cutoff(coverage):
def dump_to_bed(overcovered_regions, fout, window_size):
"""
From the numpy record array filtered by the cutoff
dump the corresponding regions to the bed outputfile
"""
for chrom, pos in zip(overcovered_regions.chrom,
overcovered_regions.pos):
fout.write("%s\t%d\t%d\n" % (chrom.decode("utf-8"),
......@@ -44,7 +73,11 @@ def dump_to_bed(overcovered_regions, fout, window_size):
def detect_high_coverage(chrom, bamfile, fastafile, tempdir,
window_size=20, cutoff=None):
print("Detecting high coverage regions in %s for %s" % (bamfile, chrom))
"""
Detecting high coverage regions using get_bamcoverage
dumping bed output in a tempfile
"""
eprint("Detecting high coverage regions in %s for %s" % (bamfile, chrom))
with NamedTemporaryFile(dir=tempdir, mode="w", delete=False) as output:
coverage = get_bamcoverage(bamfile, fastafile, chrom, window_size)
if cutoff is None:
......@@ -56,6 +89,9 @@ def detect_high_coverage(chrom, bamfile, fastafile, tempdir,
def parallel_runs(chromosomes, bamfile, fastafile, tempdir,
window_size, cutoff, cores=1):
"""
Detecting high coverage with a parallelization on chromosomes
"""
pool = multiprocessing.Pool(processes=cores)
launch_chrom = partial(detect_high_coverage,
bamfile=bamfile,
......@@ -81,14 +117,16 @@ def main(args):
outputfiles = []
# pop fist chromosome
chrom = chromosomes.pop(0)
outputfile, cutoff = detect_high_coverage(chrom, bamfile, fastafile,
tempdir, window_size=window_size)
outputfiles.append(outputfile)
outfiles = parallel_runs(chromosomes, bamfile, fastafile,
tempdir, window_size, cutoff,
cores=threads)
outputfiles.extend(outfiles)
if len(chromosomes):
outfiles = parallel_runs(chromosomes, bamfile, fastafile,
tempdir, window_size, cutoff,
cores=threads)
outputfiles.extend(outfiles)
with NamedTemporaryFile(dir=tempdir, mode="w", delete=False) as bedout:
for fname in outputfiles:
......@@ -114,7 +152,7 @@ if __name__ == '__main__':
help='comma separated list of chromosomes')
parser.add_argument('-o', '--output', required=True,
help='output file')
parser.add_argument('-w', '--window_size', type=int, default=20,
parser.add_argument('-w', '--window_size', type=int, default=500,
help='the size of the window for coverage estimation')
parser.add_argument('-t', '--threads', type=int, default=1,
help='number of threads')
......
......@@ -47,20 +47,21 @@ def get_excluded(config):
else:
return ""
def get_chr_batches(ref_file, chr):
def get_chr_batches(ref_file, chr, chr_batchsize=5000000, overlap=50000):
fa = pysam.FastaFile(ref_file)
len_chr = len(fa.fetch(chr))
# Make ranges:
groups = []
start = 1
end = min(start + 9999999, len_chr)
while (len_chr - end + 1 >= 9950000) or (len_chr + 1 == end):
end = min(start + chr_batchsize - 1, len_chr)
while (len_chr - end + 1 >= chr_batchsize - overlap) or (len_chr + 1 == end):
groups.append((start, end))
if len_chr + 1 == end:
break
start = end - 49999
end = min(start + 9999999, len_chr + 1)
start = end - overlap + 1
end = min(start + chr_batchsize - 1, len_chr + 1)
last_group = (start, len_chr + 1)
if last_group not in groups:
groups.append(last_group)
......@@ -93,7 +94,10 @@ chromosomes = config["chromosomes"]
chr_batches = {}
ref_chr = "reference_raw.fasta" if not os.path.exists(REFERENCE) and os.path.exists("reference_raw.fasta") else REFERENCE
for chromosome in chromosomes:
chr_batches[chromosome] = get_chr_batches(ref_chr, chromosome)
chr_batches[chromosome] = get_chr_batches(ref_chr,
chromosome,
chr_batchsize=5000000,
overlap=50000)
# list of variants to be detected
varianttypes = ['DEL', 'INV', 'DUP', 'mCNV']
......
......@@ -48,10 +48,7 @@ rule pindel:
" -o {wildcards.batch}/pindel/{wildcards.chrom}/{wildcards.chrbatch}/pindel_{wildcards.chrom}"
" 1>{log.stdout} 2>{log.stderr}"
"""
for svtype in D TD INV RP SI LI BP CloseEndMapped INT_final;
do
gzip -f {wildcards.batch}/pindel/{wildcards.chrom}/{wildcards.chrbatch}/pindel_{wildcards.chrom}_${{svtype}}
done
gzip -f {wildcards.batch}/pindel/{wildcards.chrom}/{wildcards.chrbatch}/*
"""
rule mergepindelbatches:
......@@ -70,4 +67,4 @@ rule mergepindelbatches:
"mergepindelbatches.py {output.D} {input.D} 1>{log.stdout} 2>{log.stderr};\n"
"mergepindelbatches.py {output.INV} {input.INV} 1>>{log.stdout} 2>>{log.stderr};\n"
"mergepindelbatches.py {output.TD} {input.TD} 1>>{log.stdout} 2>>{log.stderr};\n"
"rm -rf {wildcards.batch}/pindel/{wildcards.chrom}"
\ No newline at end of file
"rm -rf {wildcards.batch}/pindel/{wildcards.chrom}"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment