From 5225e199ea7b1b886ad292adf958b17aa608144b Mon Sep 17 00:00:00 2001 From: tfaraut <Thomas.Faraut@inra.fr> Date: Tue, 4 Dec 2018 16:44:00 +0100 Subject: [PATCH] - delly ex is now mine - get_regions to exclude uses not mad --- snakecnv/bin/delly.py | 11 ++-- snakecnv/bin/detect_regions_to_exclude.py | 77 ++++++++++++++++++----- snakecnv/bin/filter.py | 2 +- snakecnv/svsnakemake_utils.py | 1 - 4 files changed, 69 insertions(+), 22 deletions(-) diff --git a/snakecnv/bin/delly.py b/snakecnv/bin/delly.py index f6d6d77..69ee6dc 100755 --- a/snakecnv/bin/delly.py +++ b/snakecnv/bin/delly.py @@ -12,9 +12,6 @@ from subprocess import run import pysam - - - def eprint(*args, **kwargs): ''' A simple to sys.stderr printer wrapper ''' print(*args, file=sys.stderr, **kwargs) @@ -89,12 +86,16 @@ def runDelly(args): delly_raw = "delly_raw.bcf" # construct delly string - delly_call_str = "delly call " + # TODO warning the genologin delly is not multiprocess + delly_exe = "/home/faraut/save/Softwares/delly_v7.9_src/delly/src/delly" + # delly_call_str = "delly call " + delly_call_str = delly_exe + " call " delly_call_str += "-g %s " % genome delly_call_str += "-x %s " % exclusion_file delly_call_str += "-t %s " % svtype delly_call_str += "-o %s " % delly_raw delly_call_str += " ".join(bamfiles) + completed = run(delly_call_str, shell=True) if completed: @@ -110,7 +111,7 @@ def runDelly(args): else: command = "bcftools convert -O b -o {output}.bcf {template};\n".format(output=os.path.splitext(output)[0], - template=template) + template=template) completed = run(command, shell=True) os.chdir(oldir) diff --git a/snakecnv/bin/detect_regions_to_exclude.py b/snakecnv/bin/detect_regions_to_exclude.py index 0ef3acf..d46f5a3 100755 --- a/snakecnv/bin/detect_regions_to_exclude.py +++ b/snakecnv/bin/detect_regions_to_exclude.py @@ -4,40 +4,65 @@ import argparse import os from tempfile import NamedTemporaryFile, mkdtemp from shutil import rmtree +from itertools import izip import multiprocessing from functools import partial -from svrunner_utils import eprint - import numpy as np -from scipy import stats import pysam import pysamstats import pybedtools +from svrunner_utils import eprint + -def get_bamcoverage(bamfile, fafile, chrom, window_size=200): +def compute_mad(a): + """ + Compute *Median Absolute Deviation* of an array. + """ + med = np.median(a) + mad = np.median(np.absolute(a - med)) # MAD + return mad + + +def isSplitRead(read): + if not read.has_tag('SA'): + return False + tag = read.get_tag('SA') + num_alt_align = len(tag.split(";")) + if num_alt_align > 0: + return True + else: + return False + + +def get_bamcoverage(bamfile, fafile, chrom, window_size=500, + start=None, end=None): """ pure pysam implementation of pysamstats.load_binned, same signature as get_pysamstatsbamcoverage (originally get_bamcoverage) """ mybam = pysam.AlignmentFile(bamfile) fa = pysam.FastaFile(fafile) - len_chr = len(fa.fetch(chrom)) + if start is None: + start = 0 + if end is None: + end = len(fa.fetch(chrom)) coverage = [] - for i in range(0, len_chr, window_size): + for i in range(start, end, window_size): pos = i + int(window_size/2) counts = mybam.count(chrom, i, i+window_size, read_callback="all") coverage.append((chrom.encode('utf8'), pos, counts)) + # print(chrom, pos, counts) a = np.array(coverage, dtype=[('chrom', 'S22'), ('pos', np.int64), ('reads_all', 'i8')]) a = a.view(np.recarray) return a -def get_pysamstatsbamcoverage(bamfile, fafile, chrom, window_size=200): +def get_pysamstatsbamcoverage(bamfile, fafile, chrom, window_size=500): mybam = pysam.AlignmentFile(bamfile) a = pysamstats.load_binned("coverage", mybam, fafile=fafile, @@ -54,14 +79,17 @@ def get_coverage_cutoff(coverage): counts = coverage.reads_all[np.where(coverage.reads_all)] - mode = stats.mode(counts) - stdev = np.std(counts) + median = np.median(counts) + # stdev = np.std(counts) + mad = compute_mad(counts) # recommanded by SpeedSeq - return mode[0] + 3 * stdev + # return median + 3 * stdev + # we use mad to be more robust to outliers + return median + 7 * mad -def dump_to_bed(overcovered_regions, fout, window_size): +def dump_overcovered_to_bed(overcovered_regions, fout, window_size): """ From the numpy record array filtered by the cutoff dump the corresponding regions to the bed outputfile @@ -73,20 +101,38 @@ def dump_to_bed(overcovered_regions, fout, window_size): pos + int(window_size/2))) +def dump_illcovered_to_bed(overcovered_regions, fout, window_size): + """ + From the numpy record array filtered by the cutoff + dump the corresponding regions to the bed outputfile + """ + extension = 10000 + for chrom, pos in zip(overcovered_regions.chrom, + overcovered_regions.pos): + fout.write("%s\t%d\t%d\n" % (chrom.decode("utf-8"), + max(0, pos - int(window_size/2) - extension), + pos + int(window_size/2) + extension)) + + def detect_high_coverage(chrom, bamfile, fastafile, tempdir, - window_size=20, cutoff=None): + window_size=500, cutoff=None): """ Detecting high coverage regions using get_bamcoverage dumping bed output in a tempfile """ eprint("Detecting high coverage regions in %s for %s" % (bamfile, chrom)) with NamedTemporaryFile(dir=tempdir, mode="w", delete=False) as output: - # TODO test for empty bam file (use pysam following https://github.com/pysam-developers/pysam/issues/27) + # TODO test for empty bam file (use pysam following + # https://github.com/pysam-developers/pysam/issues/27) coverage = get_bamcoverage(bamfile, fastafile, chrom, window_size) + # start=7040000, end=7140000) if cutoff is None: cutoff = get_coverage_cutoff(coverage) overcovered_regions = coverage[np.where(coverage.reads_all > cutoff)] - dump_to_bed(overcovered_regions, output, window_size) + illcovered_regions = coverage[np.where(coverage.reads_all > 10*cutoff)] + dump_overcovered_to_bed(overcovered_regions, output, window_size) + dump_illcovered_to_bed(illcovered_regions, output, window_size) + return output.name, cutoff @@ -138,12 +184,13 @@ def main(args): bedout.write(line) os.remove(fname) bed = pybedtools.BedTool(bedout.name) - merged_bed = bed.merge() + merged_bed = bed.sort().merge() os.remove(bedout.name) merged_bed.saveas(output) rmtree(tempdir) + if __name__ == '__main__': parser = argparse.ArgumentParser(description="Extract entries for a " "specific chromosome from a bam file") diff --git a/snakecnv/bin/filter.py b/snakecnv/bin/filter.py index 9f3f7a0..efc484f 100755 --- a/snakecnv/bin/filter.py +++ b/snakecnv/bin/filter.py @@ -72,7 +72,7 @@ def filtering(inputfile, outputfile, genotyper, add_infos=True, overlap_cutoff=0 SVSet.append(record) eprint("Working with " + str(len(SVSet)) + " records") - # PASS and "." will be now marker PASS + # PASS and "." will be now marked PASS UnifiedPassAnnotation(SVSet) # Redundancy annotation diff --git a/snakecnv/svsnakemake_utils.py b/snakecnv/svsnakemake_utils.py index cc46fb1..fcebced 100644 --- a/snakecnv/svsnakemake_utils.py +++ b/snakecnv/svsnakemake_utils.py @@ -80,7 +80,6 @@ class SnakemakeUtils: )) return inputs - def get_inputs_bams(self, wildcards): inputs = [] indexes = [] -- GitLab