Skip to content
Snippets Groups Projects
Commit 5225e199 authored by Thomas Faraut's avatar Thomas Faraut
Browse files

- delly ex is now mine

- get_regions to exclude uses not mad
parent fd66bc96
No related branches found
No related tags found
No related merge requests found
......@@ -12,9 +12,6 @@ from subprocess import run
import pysam
def eprint(*args, **kwargs):
''' A simple to sys.stderr printer wrapper '''
print(*args, file=sys.stderr, **kwargs)
......@@ -89,12 +86,16 @@ def runDelly(args):
delly_raw = "delly_raw.bcf"
# construct delly string
delly_call_str = "delly call "
# TODO warning the genologin delly is not multiprocess
delly_exe = "/home/faraut/save/Softwares/delly_v7.9_src/delly/src/delly"
# delly_call_str = "delly call "
delly_call_str = delly_exe + " call "
delly_call_str += "-g %s " % genome
delly_call_str += "-x %s " % exclusion_file
delly_call_str += "-t %s " % svtype
delly_call_str += "-o %s " % delly_raw
delly_call_str += " ".join(bamfiles)
completed = run(delly_call_str, shell=True)
if completed:
......@@ -110,7 +111,7 @@ def runDelly(args):
else:
command = "bcftools convert -O b -o {output}.bcf {template};\n".format(output=os.path.splitext(output)[0],
template=template)
template=template)
completed = run(command, shell=True)
os.chdir(oldir)
......
......@@ -4,40 +4,65 @@ import argparse
import os
from tempfile import NamedTemporaryFile, mkdtemp
from shutil import rmtree
from itertools import izip
import multiprocessing
from functools import partial
from svrunner_utils import eprint
import numpy as np
from scipy import stats
import pysam
import pysamstats
import pybedtools
from svrunner_utils import eprint
def get_bamcoverage(bamfile, fafile, chrom, window_size=200):
def compute_mad(a):
"""
Compute *Median Absolute Deviation* of an array.
"""
med = np.median(a)
mad = np.median(np.absolute(a - med)) # MAD
return mad
def isSplitRead(read):
if not read.has_tag('SA'):
return False
tag = read.get_tag('SA')
num_alt_align = len(tag.split(";"))
if num_alt_align > 0:
return True
else:
return False
def get_bamcoverage(bamfile, fafile, chrom, window_size=500,
start=None, end=None):
"""
pure pysam implementation of pysamstats.load_binned,
same signature as get_pysamstatsbamcoverage (originally get_bamcoverage)
"""
mybam = pysam.AlignmentFile(bamfile)
fa = pysam.FastaFile(fafile)
len_chr = len(fa.fetch(chrom))
if start is None:
start = 0
if end is None:
end = len(fa.fetch(chrom))
coverage = []
for i in range(0, len_chr, window_size):
for i in range(start, end, window_size):
pos = i + int(window_size/2)
counts = mybam.count(chrom, i, i+window_size, read_callback="all")
coverage.append((chrom.encode('utf8'), pos, counts))
# print(chrom, pos, counts)
a = np.array(coverage, dtype=[('chrom', 'S22'),
('pos', np.int64), ('reads_all', 'i8')])
a = a.view(np.recarray)
return a
def get_pysamstatsbamcoverage(bamfile, fafile, chrom, window_size=200):
def get_pysamstatsbamcoverage(bamfile, fafile, chrom, window_size=500):
mybam = pysam.AlignmentFile(bamfile)
a = pysamstats.load_binned("coverage", mybam,
fafile=fafile,
......@@ -54,14 +79,17 @@ def get_coverage_cutoff(coverage):
counts = coverage.reads_all[np.where(coverage.reads_all)]
mode = stats.mode(counts)
stdev = np.std(counts)
median = np.median(counts)
# stdev = np.std(counts)
mad = compute_mad(counts)
# recommanded by SpeedSeq
return mode[0] + 3 * stdev
# return median + 3 * stdev
# we use mad to be more robust to outliers
return median + 7 * mad
def dump_to_bed(overcovered_regions, fout, window_size):
def dump_overcovered_to_bed(overcovered_regions, fout, window_size):
"""
From the numpy record array filtered by the cutoff
dump the corresponding regions to the bed outputfile
......@@ -73,20 +101,38 @@ def dump_to_bed(overcovered_regions, fout, window_size):
pos + int(window_size/2)))
def dump_illcovered_to_bed(overcovered_regions, fout, window_size):
"""
From the numpy record array filtered by the cutoff
dump the corresponding regions to the bed outputfile
"""
extension = 10000
for chrom, pos in zip(overcovered_regions.chrom,
overcovered_regions.pos):
fout.write("%s\t%d\t%d\n" % (chrom.decode("utf-8"),
max(0, pos - int(window_size/2) - extension),
pos + int(window_size/2) + extension))
def detect_high_coverage(chrom, bamfile, fastafile, tempdir,
window_size=20, cutoff=None):
window_size=500, cutoff=None):
"""
Detecting high coverage regions using get_bamcoverage
dumping bed output in a tempfile
"""
eprint("Detecting high coverage regions in %s for %s" % (bamfile, chrom))
with NamedTemporaryFile(dir=tempdir, mode="w", delete=False) as output:
# TODO test for empty bam file (use pysam following https://github.com/pysam-developers/pysam/issues/27)
# TODO test for empty bam file (use pysam following
# https://github.com/pysam-developers/pysam/issues/27)
coverage = get_bamcoverage(bamfile, fastafile, chrom, window_size)
# start=7040000, end=7140000)
if cutoff is None:
cutoff = get_coverage_cutoff(coverage)
overcovered_regions = coverage[np.where(coverage.reads_all > cutoff)]
dump_to_bed(overcovered_regions, output, window_size)
illcovered_regions = coverage[np.where(coverage.reads_all > 10*cutoff)]
dump_overcovered_to_bed(overcovered_regions, output, window_size)
dump_illcovered_to_bed(illcovered_regions, output, window_size)
return output.name, cutoff
......@@ -138,12 +184,13 @@ def main(args):
bedout.write(line)
os.remove(fname)
bed = pybedtools.BedTool(bedout.name)
merged_bed = bed.merge()
merged_bed = bed.sort().merge()
os.remove(bedout.name)
merged_bed.saveas(output)
rmtree(tempdir)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Extract entries for a "
"specific chromosome from a bam file")
......
......@@ -72,7 +72,7 @@ def filtering(inputfile, outputfile, genotyper, add_infos=True, overlap_cutoff=0
SVSet.append(record)
eprint("Working with " + str(len(SVSet)) + " records")
# PASS and "." will be now marker PASS
# PASS and "." will be now marked PASS
UnifiedPassAnnotation(SVSet)
# Redundancy annotation
......
......@@ -80,7 +80,6 @@ class SnakemakeUtils:
))
return inputs
def get_inputs_bams(self, wildcards):
inputs = []
indexes = []
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment