diff --git a/snakecnv/bin/detect_regions_to_exclude.py b/snakecnv/bin/detect_regions_to_exclude.py index c19e1ab50746eb902a1ee95cd0bea2c15086bf4f..289a15461b1d852163217121b0607d3767cf2acb 100755 --- a/snakecnv/bin/detect_regions_to_exclude.py +++ b/snakecnv/bin/detect_regions_to_exclude.py @@ -8,6 +8,8 @@ from shutil import rmtree import multiprocessing from functools import partial +from svrunner_utils import eprint + import numpy as np from scipy import stats @@ -16,7 +18,26 @@ import pysamstats import pybedtools -def get_bamcoverage(bamfile, fafile, chrom, window_size=20): +def get_bamcoverage(bamfile, fafile, chrom, window_size=200): + """ + pure pysam implementation of pysamstats.load_binned, + same signature as get_pysamstatsbamcoverage (originally get_bamcoverage) + """ + mybam = pysam.AlignmentFile(bamfile) + fa = pysam.FastaFile(fafile) + len_chr = len(fa.fetch(chrom)) + coverage = [] + for i in range(0, len_chr, window_size): + pos = i + int(window_size/2) + counts = mybam.count(chrom, i, i+window_size, read_callback="all") + coverage.append((chrom.encode('utf8'), pos, counts)) + a = np.array(coverage, dtype=[('chrom', 'S22'), + ('pos', np.int64), ('reads_all', 'i8')]) + a = a.view(np.recarray) + return a + + +def get_pysamstatsbamcoverage(bamfile, fafile, chrom, window_size=200): mybam = pysam.AlignmentFile(bamfile) a = pysamstats.load_binned("coverage", mybam, fafile=fafile, @@ -26,6 +47,10 @@ def get_bamcoverage(bamfile, fafile, chrom, window_size=20): def get_coverage_cutoff(coverage): + """ + Identify the cutoff value corresponding to high coverage regions + using mode + 3 stdev as recomended by speedseq + """ counts = coverage.reads_all[np.where(coverage.reads_all)] mode = stats.mode(counts) @@ -35,6 +60,10 @@ def get_coverage_cutoff(coverage): def dump_to_bed(overcovered_regions, fout, window_size): + """ + From the numpy record array filtered by the cutoff + dump the corresponding regions to the bed outputfile + """ for chrom, pos in zip(overcovered_regions.chrom, overcovered_regions.pos): fout.write("%s\t%d\t%d\n" % (chrom.decode("utf-8"), @@ -44,7 +73,11 @@ def dump_to_bed(overcovered_regions, fout, window_size): def detect_high_coverage(chrom, bamfile, fastafile, tempdir, window_size=20, cutoff=None): - print("Detecting high coverage regions in %s for %s" % (bamfile, chrom)) + """ + Detecting high coverage regions using get_bamcoverage + dumping bed output in a tempfile + """ + eprint("Detecting high coverage regions in %s for %s" % (bamfile, chrom)) with NamedTemporaryFile(dir=tempdir, mode="w", delete=False) as output: coverage = get_bamcoverage(bamfile, fastafile, chrom, window_size) if cutoff is None: @@ -56,6 +89,9 @@ def detect_high_coverage(chrom, bamfile, fastafile, tempdir, def parallel_runs(chromosomes, bamfile, fastafile, tempdir, window_size, cutoff, cores=1): + """ + Detecting high coverage with a parallelization on chromosomes + """ pool = multiprocessing.Pool(processes=cores) launch_chrom = partial(detect_high_coverage, bamfile=bamfile, @@ -81,14 +117,16 @@ def main(args): outputfiles = [] # pop fist chromosome chrom = chromosomes.pop(0) + outputfile, cutoff = detect_high_coverage(chrom, bamfile, fastafile, tempdir, window_size=window_size) outputfiles.append(outputfile) - outfiles = parallel_runs(chromosomes, bamfile, fastafile, - tempdir, window_size, cutoff, - cores=threads) - outputfiles.extend(outfiles) + if len(chromosomes): + outfiles = parallel_runs(chromosomes, bamfile, fastafile, + tempdir, window_size, cutoff, + cores=threads) + outputfiles.extend(outfiles) with NamedTemporaryFile(dir=tempdir, mode="w", delete=False) as bedout: for fname in outputfiles: @@ -114,7 +152,7 @@ if __name__ == '__main__': help='comma separated list of chromosomes') parser.add_argument('-o', '--output', required=True, help='output file') - parser.add_argument('-w', '--window_size', type=int, default=20, + parser.add_argument('-w', '--window_size', type=int, default=500, help='the size of the window for coverage estimation') parser.add_argument('-t', '--threads', type=int, default=1, help='number of threads') diff --git a/snakecnv/detection.snk b/snakecnv/detection.snk index 17028d1acb8325ca036e73c8ef9c3aa3a7df2026..b978084dd7e6fbeb29b93026e0d5bb07d184bd19 100644 --- a/snakecnv/detection.snk +++ b/snakecnv/detection.snk @@ -47,20 +47,21 @@ def get_excluded(config): else: return "" -def get_chr_batches(ref_file, chr): + +def get_chr_batches(ref_file, chr, chr_batchsize=5000000, overlap=50000): fa = pysam.FastaFile(ref_file) len_chr = len(fa.fetch(chr)) # Make ranges: groups = [] start = 1 - end = min(start + 9999999, len_chr) - while (len_chr - end + 1 >= 9950000) or (len_chr + 1 == end): + end = min(start + chr_batchsize - 1, len_chr) + while (len_chr - end + 1 >= chr_batchsize - overlap) or (len_chr + 1 == end): groups.append((start, end)) if len_chr + 1 == end: break - start = end - 49999 - end = min(start + 9999999, len_chr + 1) + start = end - overlap + 1 + end = min(start + chr_batchsize - 1, len_chr + 1) last_group = (start, len_chr + 1) if last_group not in groups: groups.append(last_group) @@ -93,7 +94,10 @@ chromosomes = config["chromosomes"] chr_batches = {} ref_chr = "reference_raw.fasta" if not os.path.exists(REFERENCE) and os.path.exists("reference_raw.fasta") else REFERENCE for chromosome in chromosomes: - chr_batches[chromosome] = get_chr_batches(ref_chr, chromosome) + chr_batches[chromosome] = get_chr_batches(ref_chr, + chromosome, + chr_batchsize=5000000, + overlap=50000) # list of variants to be detected varianttypes = ['DEL', 'INV', 'DUP', 'mCNV'] diff --git a/snakecnv/tools/pindel.snk b/snakecnv/tools/pindel.snk index 269432c0350af89540c0ac1b39a5e24b7384d6a5..4498c9227048fd5808a71717c4b79eaf4f234d85 100644 --- a/snakecnv/tools/pindel.snk +++ b/snakecnv/tools/pindel.snk @@ -48,10 +48,7 @@ rule pindel: " -o {wildcards.batch}/pindel/{wildcards.chrom}/{wildcards.chrbatch}/pindel_{wildcards.chrom}" " 1>{log.stdout} 2>{log.stderr}" """ - for svtype in D TD INV RP SI LI BP CloseEndMapped INT_final; - do - gzip -f {wildcards.batch}/pindel/{wildcards.chrom}/{wildcards.chrbatch}/pindel_{wildcards.chrom}_${{svtype}} - done + gzip -f {wildcards.batch}/pindel/{wildcards.chrom}/{wildcards.chrbatch}/* """ rule mergepindelbatches: @@ -70,4 +67,4 @@ rule mergepindelbatches: "mergepindelbatches.py {output.D} {input.D} 1>{log.stdout} 2>{log.stderr};\n" "mergepindelbatches.py {output.INV} {input.INV} 1>>{log.stdout} 2>>{log.stderr};\n" "mergepindelbatches.py {output.TD} {input.TD} 1>>{log.stdout} 2>>{log.stderr};\n" - "rm -rf {wildcards.batch}/pindel/{wildcards.chrom}" \ No newline at end of file + "rm -rf {wildcards.batch}/pindel/{wildcards.chrom}"