Commit 22f3bb62 authored by Thomas Faraut

fixing stdout and stderr for delly and lumpy and other details

parent 45bbd834
@@ -102,7 +102,7 @@ else:
     chromosomes = config["chromosomes"].split(",")
 
 # list of variants to be detected
-varianttypes = ['DEL', 'INV']
+varianttypes = ['DEL', 'INV', 'DUP']
 
 snakemake_utils = SnakemakeUtils(tools)
@@ -156,11 +156,10 @@ rule merging:
 rule parsing:
     input:
         unpack(snakemake_utils.getToolOuputFile),
+        gaps = "exclusion/gaps.bed",
         reference = REFERENCE
     output:
-        parseoutput = "parse/{tool}/{tool}_{chrom}_{svtype}_parsed.vcf.gz"
-    params:
-        filterbed = FILTERBED
+        parseoutput = "parse/{svtype}/{tool}/{tool}_{chrom}_{svtype}_parsed.vcf.gz"
     threads:
         1
     script:
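
For readers unfamiliar with the pattern above: Snakemake's unpack() lets an input function return a dict whose keys become named inputs of the rule, which is how getToolOuputFile (see the SnakemakeUtils hunk below) routes each tool's per-chromosome output into rule parsing. A minimal, self-contained sketch of the idiom; the rule name, paths, and parse_tool command here are illustrative, not from this repository:

def tool_output(wildcards):
    # hypothetical svtype-first layout, mirroring this commit
    return {'tooloutput': "%s/%s/%s_%s.vcf" % (wildcards.svtype,
            wildcards.tool, wildcards.tool, wildcards.chrom)}

rule example_parsing:
    input:
        unpack(tool_output),
        reference = "genome.fa"   # named inputs can be mixed with unpack()
    output:
        "parse/{svtype}/{tool}/{tool}_{chrom}_parsed.vcf"
    shell:
        "parse_tool {input.tooloutput} -r {input.reference} -o {output}"
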
@@ -199,6 +198,21 @@ rule excluded:
         " -t {threads}"
         " 1>{log.stdout} 2>{log.stderr}"
 
+rule nstretches:
+    input:
+        reference = REFERENCE
+    output:
+        gaps = "exclusion/gaps.bed"
+    threads:
+        12
+    log:
+        stdout = "logs/exclusion/nstretch.o",
+        stderr = "logs/exclusion/nstretch.e"
+    shell:
+        "findNstretches.py -f {input} -o {output} -t {threads}"
+        " 1>{log.stdout} 2>{log.stderr}"
+
 rule bamlist:
     input:
         bams = expand("data/bams/{sample}.bam", sample=samples)
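The new nstretches rule materialises the assembly gaps once, as exclusion/gaps.bed, so downstream parsing no longer needs a FILTERBED parameter. Per dumpgaps() in the findNstretches.py rewrite below, each line carries five tab-separated columns: chrom, start, end, the literal base 'N', and the stretch length. A minimal sketch of a consumer, assuming that layout (the path is the one declared above):

def read_gaps(path="exclusion/gaps.bed"):
    """Load (chrom, start, end) triples from the 5-column gaps file."""
    gaps = []
    with open(path) as fin:
        for line in fin:
            chrom, start, end, base, length = line.rstrip("\n").split("\t")
            gaps.append((chrom, int(start), int(end)))
    return gaps
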
@@ -2,39 +2,62 @@
 import sys
 import argparse
-from Bio import SeqIO
+import re
+import multiprocessing
+from functools import partial
+from pysam import Fastafile
 
-def main(args):
-    infile = args.infile
-    cutoff = int(args.minsize)
-    handle = open(infile, "rU")
-    for record in SeqIO.parse(handle, "fasta"):
-        print(record.id, file=sys.stderr)
-        parse_fasta(record.id,record.seq,cutoff)
-    handle.close()
 
-def parse_fasta(seq_id,sequence,cutoff):
-    i=0
-    while i<len(sequence):
-        cur_base = sequence[i]
-        j = i+1
-        while j<len(sequence) and sequence[j] == cur_base:
-            j += 1
-        l = j-i
-        if l>cutoff and cur_base=='N':
-            print("%s\t%d\t%d\t%c\t%d" % (seq_id,i,j,cur_base,l))
-        i = j
+def eprint(*args, **kwargs):
+    print(*args, file=sys.stderr, **kwargs)
+
+
+def get_chromosomes(fastafile):
+    with Fastafile(fastafile) as fasta:
+        chromosomes = fasta.references
+    return chromosomes
+
+
+def find_chrom_Nstretches(chrom, fastafile, cutoff):
+    p = re.compile("N{%s,}" % cutoff)
+    with Fastafile(fastafile) as fasta:
+        sequence = fasta.fetch(chrom)
+    # m.start()/m.end() already delimit a half-open (BED) interval
+    gaps = [(chrom, m.start(), m.end()) for m in re.finditer(p, sequence)]
+    return gaps
+
+
+def findNstretches(infile, minsize, outfile, cores):
+    chromosomes = get_chromosomes(infile)
+    pool = multiprocessing.Pool(processes=cores)
+    launch_chrom = partial(find_chrom_Nstretches,
+                           fastafile=infile,
+                           cutoff=minsize)
+    results = pool.map(launch_chrom, chromosomes)
+    genome_gaps = [item for sublist in results for item in sublist]
+    dumpgaps(genome_gaps, outfile)
+
+
+def dumpgaps(gaps, outputfile):
+    with open(outputfile, "w") as fout:
+        for chrom, start, end in gaps:
+            fout.write("%s\t%d\t%d\tN\t%d\n" % (chrom, start, end, end - start))
+
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description="Construct a bed file describing the stretches of strictly greater than minsize N ")
-    parser.add_argument('-f', '--file', required=True, dest='infile', help='name fo the fasta file')
-    parser.add_argument('-m', '--minsize', required=True, help='minimum required stretch size of N')
+    parser = argparse.ArgumentParser(description="Construct a bed file "
+                                     "describing stretches of at least "
+                                     "minsize consecutive N's")
+    parser.add_argument('-f', '--file', required=True, dest="infile",
+                        help='name of the fasta file')
+    parser.add_argument('-o', '--output', required=True,
+                        help='name of the output file')
+    parser.add_argument('-m', '--minsize', type=int, default=100,
+                        help='minimum required stretch size of N')
+    parser.add_argument('-t', '--threads', type=int, default=1,
+                        help='number of threads')
     args = parser.parse_args()
-    main(args)
+    findNstretches(args.infile, args.minsize, args.output, args.threads)
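
The rewrite swaps the per-base Biopython scan for one compiled regular expression per chromosome, fanned out over a multiprocessing.Pool with functools.partial binding the fixed arguments. A toy, self-contained illustration of the same pattern on in-memory sequences (all names and data here are made up); note that m.start()/m.end() already form a half-open interval, matching BED conventions:

import re
import multiprocessing
from functools import partial

def find_stretches(chrom, sequences, cutoff):
    # N{cutoff,} matches runs of at least `cutoff` consecutive N's;
    # m.start()/m.end() are 0-based, half-open, i.e. BED coordinates.
    pattern = re.compile("N{%d,}" % cutoff)
    return [(chrom, m.start(), m.end())
            for m in pattern.finditer(sequences[chrom])]

if __name__ == "__main__":
    toy = {"chr1": "ACGTNNNNNNACGT", "chr2": "NNNNNNNNNNACGT"}
    with multiprocessing.Pool(processes=2) as pool:
        worker = partial(find_stretches, sequences=toy, cutoff=5)
        results = pool.map(worker, sorted(toy))
    print([g for sub in results for g in sub])
    # [('chr1', 4, 10), ('chr2', 0, 10)]
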
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # (c) 2012 - Ryan M. Layer
 # Hall Laboratory
 # Quinlan Laboratory
@@ -21,6 +21,7 @@ import pysam
 from svrunner_utils import fetchId
 
 def eprint(*args, **kwargs):
+    ''' A simple wrapper printing to sys.stderr '''
     print(*args, file=sys.stderr, **kwargs)
@@ -66,6 +67,9 @@ def insert_size_distro(bamfile, chrom, outdir,
         if read.is_proper_pair:
             isizes.append(np.abs(read.template_length))
+            rlen.append(read.infer_query_length())
+
+    readlength = max(rlen)
     a = np.array(isizes)
     # remove outliers
     L = a[(a < np.percentile(a, 99)) & (a > np.percentile(a, 1))]
@@ -92,8 +96,7 @@ def insert_size_distro(bamfile, chrom, outdir,
     for size, count in counts.items():
         fout.write("%d\t%f\n" % (size, float(count)/num))
 
-    # print('mean:' + str(mean) + '\tstdev:' + str(stdev))
-    return sample, mean, stdev, histo_file
+    return sample, mean, stdev, histo_file, readlength
 
 def parse_argumanets():
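With this change the read length is measured from the BAM itself inside insert_size_distro (the maximum infer_query_length() over sampled proper pairs) and returned alongside the insert-size statistics, instead of arriving via a --readlength flag. A condensed, standalone sketch of that measurement, assuming a coordinate-sorted, indexed BAM (file and function names here are illustrative):

import numpy as np
import pysam

def sample_bam_stats(bamfile, chrom, sample_size=10000):
    """Estimate read length and insert-size mean/stdev from proper pairs."""
    isizes, rlens = [], []
    with pysam.AlignmentFile(bamfile, "rb") as bam:
        for i, read in enumerate(bam.fetch(chrom)):
            if i >= sample_size:
                break
            if read.is_proper_pair and read.cigartuples:
                isizes.append(abs(read.template_length))
                rlens.append(read.infer_query_length())
    a = np.array(isizes)
    # trim the tails, as insert_size_distro does with its percentile mask
    trimmed = a[(a > np.percentile(a, 1)) & (a < np.percentile(a, 99))]
    return max(rlens), trimmed.mean(), trimmed.std()
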
@@ -42,13 +42,14 @@ def get_sample_desc_files(discsplit_files, insert_sizes):
     samples = dict()
     for sample, disc, split in discsplit_files:
         samples[sample] = {'disc': disc, 'split': split}
-    for sample, mean, stdev, histo_file in insert_sizes:
+    for sample, mean, stdev, histo_file, readlength in insert_sizes:
         if sample not in samples:
             eprint("sample %s absent from insert sizes" % sample)
             exit(1)
         samples[sample]['lumpy'] = {'id': sample,
                                     'mean': mean,
                                     'stdev': stdev,
+                                    'read_length': readlength,
                                     'histo_file': histo_file}
     return samples
@@ -57,7 +58,7 @@ def dict_str(k, v):
     return "%s:%s" % (k, v)
 
-def disc_string(disc_file, sample_info, read_length,
+def disc_string(disc_file, sample_info,
                 min_non_overlap=101, discordant_z=5,
                 back_distance=10, weight=1, min_mapping_threshold=20):
@@ -68,7 +69,6 @@ def disc_string(disc_file, sample_info,
                   'min_mapping_threshold': min_mapping_threshold}
     lumpy_dict['bam_file'] = disc_file
-    lumpy_dict['read_length'] = read_length
     lumpy_dict.update(sample_info)
     return ",".join([dict_str(k, v) for k, v in lumpy_dict.items()])
@@ -116,7 +116,7 @@ def runLumpy(args):
     temp_output = "lumpy.vcf"
     lumpy_str = "lumpy -mw 4 -tt 0 "
     for sample, infos in samples.items():
-        pe_str = disc_string(infos['disc'], infos['lumpy'], readlength)
+        pe_str = disc_string(infos['disc'], infos['lumpy'])
         sr_str = split_string(infos['split'], infos['lumpy'])
         lumpy_str += " -pe " + pe_str + " -sr " + sr_str
     lumpy_str += " | vcf-sort > " + temp_output
@@ -140,8 +140,6 @@ def parse_arguments():
                         help="A file with a list of BAM files")
     parser.add_argument("-c", "--chrom", required=True,
                         help="a chromosome")
-    parser.add_argument("-r", "--readlength", required=True,
-                        help="the read length")
     parser.add_argument("-o", "--output", required=True,
                         help="the output file")
     parser.add_argument("-e", "--excluded",
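LUMPY receives its per-sample paired-end configuration as a single comma-separated key:value string after -pe, which is what dict_str and disc_string assemble; after this commit, read_length simply rides along inside sample_info rather than being threaded through as an extra argument. A sketch of the string being built (the paths and numbers are made up; the keys are the -pe fields visible in the hunks above):

def lumpy_arg(d):
    # mirrors dict_str/disc_string: "key:value" pairs joined by commas
    return ",".join("%s:%s" % (k, v) for k, v in d.items())

pe_str = lumpy_arg({
    'bam_file': 'sample1.disc.bam',      # illustrative paths/values
    'histo_file': 'sample1.histo',
    'mean': 350, 'stdev': 50,
    'read_length': 101,                  # now supplied via sample_info
    'min_non_overlap': 101, 'discordant_z': 5,
    'back_distance': 10, 'weight': 1,
    'id': 'sample1', 'min_mapping_threshold': 20,
})
# pe_str -> "bam_file:sample1.disc.bam,histo_file:sample1.histo,mean:350,..."
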
@@ -16,11 +16,10 @@ tool_to_writer = {"pindel": PindelWriter, "delly": DellyWriter,
                   "lumpy": LumpyWriter, "genomestrip": GenomeSTRIPWriter}
 
-def convert_svtool_to_vcf(reference, inputfile, outputfile,
+def convert_svtool_to_vcf(reference, inputfile, gaps, outputfile,
                           toolname, svtype,
                           minlen=50, maxlen=5000000,
-                          doFilter=True,
-                          filterbed=None):
+                          doFilter=True):
 
     # The reference handle
     reference_handle = Fastafile(reference) if reference else None
@@ -46,7 +45,7 @@ def convert_svtool_to_vcf(reference, inputfile, outputfile,
     # Now filtering the records
     if doFilter:
         eprint("Now filtering")
-        records = SVReader.filtering(records, minlen, maxlen, filterbed)
+        records = SVReader.filtering(records, minlen, maxlen, gaps)
 
     # Sorting the vcf records
     records = sorting(records, reference_contigs)
@@ -60,16 +59,16 @@ def convert_svtool_to_vcf(reference, inputfile, outputfile,
     tabix_index(outputfile, force=True, preset='vcf')
 
-def parsing(reference, tooloutput, parseoutput, tool, svtype, filterbed=None):
-    print("\n".join([reference, tooloutput, parseoutput, tool, svtype, filterbed]))
-    convert_svtool_to_vcf(reference, tooloutput, parseoutput,
-                          tool, svtype, filterbed=filterbed)
+def parsing(reference, tooloutput, gaps, parseoutput, tool, svtype):
+    print("\n".join([reference, tooloutput, gaps, parseoutput, tool, svtype]))
+    convert_svtool_to_vcf(reference, tooloutput, gaps, parseoutput,
+                          tool, svtype)
 
 parsing(snakemake.input['reference'],
         snakemake.input['tooloutput'],
+        snakemake.input['gaps'],
        snakemake.output['parseoutput'],
         snakemake.wildcards['tool'],
-        snakemake.wildcards['svtype'],
-        filterbed=snakemake.params['filterbed'])
+        snakemake.wildcards['svtype'])
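
The bottom of this script shows the Snakemake script integration at work: a script launched through a rule's script: directive is handed a global snakemake object carrying the rule's input, output, params, and wildcards, so no argparse is involved. A minimal sketch of such a script (names illustrative); since gaps is now a declared input, Snakemake will run the nstretches rule before any parsing job:

# parse_example.py -- executed via `script:` from a rule, never directly;
# Snakemake injects the `snakemake` object into the script's namespace.
tooloutput = snakemake.input['tooloutput']     # named inputs by key
gaps = snakemake.input['gaps']
parseoutput = snakemake.output['parseoutput']
tool = snakemake.wildcards['tool']             # wildcards from the output path
svtype = snakemake.wildcards['svtype']
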
@@ -52,7 +52,8 @@ class SnakemakeUtils:
     def getToolOuputFile(self, wildcards):
         tool_prefix = self.get_tool_prefix(wildcards.tool,
                                            wildcards.chrom, wildcards.svtype)
-        return {'tooloutput': os.path.join(wildcards.tool, tool_prefix)}
+        return {'tooloutput': os.path.join(wildcards.svtype,
+                                           wildcards.tool, tool_prefix)}
 
     def set_tool_svtypes(self, tool, svtypes):
         self.tools_sv_types[tool] = svtypes
@@ -1,4 +1,4 @@
-delly_sv=['DEL', 'INV', 'DUP']
+delly_sv = ['DEL', 'INV', 'DUP']
 
 rule delly:
     input:
@@ -17,4 +17,5 @@ rule delly:
         "export OMP_NUM_THREADS={threads} ; "
         "delly.py -b {input.bamlist} -c {wildcards.chrom}"
         " -g {params.genome} -x {input.excluded} -t {wildcards.svtype}"
-        " -o {output} "
+        " -o {output}"
+        " 1>{log.stdout} 2>{log.stderr}"
@@ -15,4 +15,5 @@ rule lumpy:
         stderr = "logs/lumpy/{chrom}.e"
     shell:
         "lumpy.py -b {input.bamlist} -c {wildcards.chrom}"
-        " -r {params.readlength} -t {threads} -o {output} "
+        " -t {threads} -o {output}"
+        " 1>{log.stdout} 2>{log.stderr}"
@@ -39,7 +39,7 @@ rule pindel:
         stderr = "logs/pindel/{chrom}.e"
     shell:
         "pindel -f {input.genome} -i {input.statfile}"
-        " --min_perfect_match_around_BP 20 --max_range_index 3"
+        " --min_perfect_match_around_BP 20 --max_range_index 2"
         " --number_of_threads {threads}"
         " --report_interchromosomal_events false"
         " --minimum_support_for_event 3 -c {wildcards.chrom}"