Commit 22f3bb62 authored by Thomas Faraut

fixing stdout and stderr for delly and lumpy and other details

parent 45bbd834
@@ -102,7 +102,7 @@ else:
     chromosomes = config["chromosomes"].split(",")
 
 # list of variants to be detected
-varianttypes = ['DEL', 'INV']
+varianttypes = ['DEL', 'INV', 'DUP']
 
 snakemake_utils = SnakemakeUtils(tools)
@@ -156,11 +156,10 @@ rule merging:
 rule parsing:
     input:
         unpack(snakemake_utils.getToolOuputFile),
+        gaps = "exclusion/gaps.bed",
         reference = REFERENCE
     output:
-        parseoutput = "parse/{tool}/{tool}_{chrom}_{svtype}_parsed.vcf.gz"
-    params:
-        filterbed = FILTERBED
+        parseoutput = "parse/{svtype}/{tool}/{tool}_{chrom}_{svtype}_parsed.vcf.gz"
     threads:
         1
     script:
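
For readers unfamiliar with the pattern above: Snakemake's unpack() lets an input function return a dict whose keys become named inputs of the rule, which is how getToolOuputFile (see the SnakemakeUtils hunk below) routes each tool's per-chromosome output into rule parsing. A minimal, self-contained sketch of the idiom; the rule name, paths, and parse_tool command here are illustrative, not from this repository:

def tool_output(wildcards):
    # hypothetical svtype-first layout, mirroring this commit
    return {'tooloutput': "%s/%s/%s_%s.vcf" % (wildcards.svtype,
            wildcards.tool, wildcards.tool, wildcards.chrom)}

rule example_parsing:
    input:
        unpack(tool_output),
        reference = "genome.fa"   # named inputs can be mixed with unpack()
    output:
        "parse/{svtype}/{tool}/{tool}_{chrom}_parsed.vcf"
    shell:
        "parse_tool {input.tooloutput} -r {input.reference} -o {output}"
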
@@ -199,6 +198,21 @@ rule excluded:
         " -t {threads}"
         " 1>{log.stdout} 2>{log.stderr}"
 
+rule nstretches:
+    input:
+        reference = REFERENCE
+    output:
+        gaps = "exclusion/gaps.bed"
+    threads:
+        12
+    log:
+        stdout = "logs/exclusion/nstretch.o",
+        stderr = "logs/exclusion/nstretch.e"
+    shell:
+        "findNstretches.py -f {input} -o {output} -t {threads}"
+        " 1>{log.stdout} 2>{log.stderr}"
+
 rule bamlist:
     input:
         bams = expand("data/bams/{sample}.bam", sample=samples)
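The new nstretches rule materialises the assembly gaps once, as exclusion/gaps.bed, so downstream parsing no longer needs a FILTERBED parameter. Per dumpgaps() in the findNstretches.py rewrite below, each line carries five tab-separated columns: chrom, start, end, the literal base 'N', and the stretch length. A minimal sketch of a consumer, assuming that layout (the path is the one declared above):

def read_gaps(path="exclusion/gaps.bed"):
    """Load (chrom, start, end) triples from the 5-column gaps file."""
    gaps = []
    with open(path) as fin:
        for line in fin:
            chrom, start, end, base, length = line.rstrip("\n").split("\t")
            gaps.append((chrom, int(start), int(end)))
    return gaps
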
@@ -2,39 +2,62 @@
 import sys
 import argparse
-from Bio import SeqIO
+import re
+import multiprocessing
+from functools import partial
+from pysam import Fastafile
 
-def main(args):
-    infile = args.infile
-    cutoff = int(args.minsize)
-    handle = open(infile, "rU")
-    for record in SeqIO.parse(handle, "fasta"):
-        print(record.id, file=sys.stderr)
-        parse_fasta(record.id,record.seq,cutoff)
-    handle.close()
 
-def parse_fasta(seq_id,sequence,cutoff):
-    i=0
-    while i<len(sequence):
-        cur_base = sequence[i]
-        j = i+1
-        while j<len(sequence) and sequence[j] == cur_base:
-            j += 1
-        l = j-i
-        if l>cutoff and cur_base=='N':
-            print("%s\t%d\t%d\t%c\t%d" % (seq_id,i,j,cur_base,l))
-        i = j
+def eprint(*args, **kwargs):
+    print(*args, file=sys.stderr, **kwargs)
+
+
+def get_chromosomes(fastafile):
+    with Fastafile(fastafile) as fasta:
+        chromosomes = fasta.references
+    return chromosomes
+
+
+def find_chrom_Nstretches(chrom, fastafile, cutoff):
+    p = re.compile("N{%s,}" % cutoff)
+    with Fastafile(fastafile) as fasta:
+        sequence = fasta.fetch(chrom)
+    # m.start()/m.end() already delimit a half-open (BED) interval
+    gaps = [(chrom, m.start(), m.end()) for m in re.finditer(p, sequence)]
+    return gaps
+
+
+def findNstretches(infile, minsize, outfile, cores):
+    chromosomes = get_chromosomes(infile)
+    pool = multiprocessing.Pool(processes=cores)
+    launch_chrom = partial(find_chrom_Nstretches,
+                           fastafile=infile,
+                           cutoff=minsize)
+    results = pool.map(launch_chrom, chromosomes)
+    genome_gaps = [item for sublist in results for item in sublist]
+    dumpgaps(genome_gaps, outfile)
+
+
+def dumpgaps(gaps, outputfile):
+    with open(outputfile, "w") as fout:
+        for chrom, start, end in gaps:
+            fout.write("%s\t%d\t%d\tN\t%d\n" % (chrom, start, end, end - start))
+
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description="Construct a bed file describing the stretches of strictly greater than minsize N ")
-    parser.add_argument('-f', '--file', required=True, dest='infile', help='name fo the fasta file')
-    parser.add_argument('-m', '--minsize', required=True, help='minimum required stretch size of N')
+    parser = argparse.ArgumentParser(description="Construct a bed file "
+                                     "describing stretches of at least "
+                                     "minsize consecutive N's")
+    parser.add_argument('-f', '--file', required=True, dest="infile",
+                        help='name of the fasta file')
+    parser.add_argument('-o', '--output', required=True,
+                        help='name of the output file')
+    parser.add_argument('-m', '--minsize', type=int, default=100,
+                        help='minimum required stretch size of N')
+    parser.add_argument('-t', '--threads', type=int, default=1,
+                        help='number of threads')
     args = parser.parse_args()
-    main(args)
+    findNstretches(args.infile, args.minsize, args.output, args.threads)
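
The rewrite swaps the per-base Biopython scan for one compiled regular expression per chromosome, fanned out over a multiprocessing.Pool with functools.partial binding the fixed arguments. A toy, self-contained illustration of the same pattern on in-memory sequences (all names and data here are made up); note that m.start()/m.end() already form a half-open interval, matching BED conventions:

import re
import multiprocessing
from functools import partial

def find_stretches(chrom, sequences, cutoff):
    # N{cutoff,} matches runs of at least `cutoff` consecutive N's;
    # m.start()/m.end() are 0-based, half-open, i.e. BED coordinates.
    pattern = re.compile("N{%d,}" % cutoff)
    return [(chrom, m.start(), m.end())
            for m in pattern.finditer(sequences[chrom])]

if __name__ == "__main__":
    toy = {"chr1": "ACGTNNNNNNACGT", "chr2": "NNNNNNNNNNACGT"}
    with multiprocessing.Pool(processes=2) as pool:
        worker = partial(find_stretches, sequences=toy, cutoff=5)
        results = pool.map(worker, sorted(toy))
    print([g for sub in results for g in sub])
    # [('chr1', 4, 10), ('chr2', 0, 10)]
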
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # (c) 2012 - Ryan M. Layer
 # Hall Laboratory
 # Quinlan Laboratory
@@ -21,6 +21,7 @@ import pysam
 from svrunner_utils import fetchId
 
 def eprint(*args, **kwargs):
+    ''' A simple wrapper printing to sys.stderr '''
     print(*args, file=sys.stderr, **kwargs)
@@ -66,6 +67,9 @@ def insert_size_distro(bamfile, chrom, outdir,
         if read.is_proper_pair:
             isizes.append(np.abs(read.template_length))
+            rlen.append(read.infer_query_length())
+
+    readlength = max(rlen)
     a = np.array(isizes)
     # remove outliers
     L = a[(a < np.percentile(a, 99)) & (a > np.percentile(a, 1))]
@@ -92,8 +96,7 @@ def insert_size_distro(bamfile, chrom, outdir,
     for size, count in counts.items():
         fout.write("%d\t%f\n" % (size, float(count)/num))
 
-    # print('mean:' + str(mean) + '\tstdev:' + str(stdev))
-    return sample, mean, stdev, histo_file
+    return sample, mean, stdev, histo_file, readlength
 
 def parse_argumanets():
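With this change the read length is measured from the BAM itself inside insert_size_distro (the maximum infer_query_length() over sampled proper pairs) and returned alongside the insert-size statistics, instead of arriving via a --readlength flag. A condensed, standalone sketch of that measurement, assuming a coordinate-sorted, indexed BAM (file and function names here are illustrative):

import numpy as np
import pysam

def sample_bam_stats(bamfile, chrom, sample_size=10000):
    """Estimate read length and insert-size mean/stdev from proper pairs."""
    isizes, rlens = [], []
    with pysam.AlignmentFile(bamfile, "rb") as bam:
        for i, read in enumerate(bam.fetch(chrom)):
            if i >= sample_size:
                break
            if read.is_proper_pair and read.cigartuples:
                isizes.append(abs(read.template_length))
                rlens.append(read.infer_query_length())
    a = np.array(isizes)
    # trim the tails, as insert_size_distro does with its percentile mask
    trimmed = a[(a > np.percentile(a, 1)) & (a < np.percentile(a, 99))]
    return max(rlens), trimmed.mean(), trimmed.std()
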
@@ -42,13 +42,14 @@ def get_sample_desc_files(discsplit_files, insert_sizes):
     samples = dict()
     for sample, disc, split in discsplit_files:
         samples[sample] = {'disc': disc, 'split': split}
-    for sample, mean, stdev, histo_file in insert_sizes:
+    for sample, mean, stdev, histo_file, readlength in insert_sizes:
         if sample not in samples:
             eprint("sample %s absent from insert sizes" % sample)
             exit(1)
         samples[sample]['lumpy'] = {'id': sample,
                                     'mean': mean,
                                     'stdev': stdev,
+                                    'read_length': readlength,
                                     'histo_file': histo_file}
     return samples
@@ -57,7 +58,7 @@ def dict_str(k, v):
     return "%s:%s" % (k, v)
 
-def disc_string(disc_file, sample_info, read_length,
+def disc_string(disc_file, sample_info,
                 min_non_overlap=101, discordant_z=5,
                 back_distance=10, weight=1, min_mapping_threshold=20):
@@ -68,7 +69,6 @@ def disc_string(disc_file, sample_info,
                   'min_mapping_threshold': min_mapping_threshold}
     lumpy_dict['bam_file'] = disc_file
-    lumpy_dict['read_length'] = read_length
     lumpy_dict.update(sample_info)
     return ",".join([dict_str(k, v) for k, v in lumpy_dict.items()])
@@ -116,7 +116,7 @@ def runLumpy(args):
     temp_output = "lumpy.vcf"
     lumpy_str = "lumpy -mw 4 -tt 0 "
     for sample, infos in samples.items():
-        pe_str = disc_string(infos['disc'], infos['lumpy'], readlength)
+        pe_str = disc_string(infos['disc'], infos['lumpy'])
         sr_str = split_string(infos['split'], infos['lumpy'])
         lumpy_str += " -pe " + pe_str + " -sr " + sr_str
     lumpy_str += " | vcf-sort > " + temp_output
@@ -140,8 +140,6 @@ def parse_arguments():
                         help="A file with a list of BAM files")
     parser.add_argument("-c", "--chrom", required=True,
                         help="a chromosome")
-    parser.add_argument("-r", "--readlength", required=True,
-                        help="the read length")
     parser.add_argument("-o", "--output", required=True,
                         help="the output file")
     parser.add_argument("-e", "--excluded",
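LUMPY receives its per-sample paired-end configuration as a single comma-separated key:value string after -pe, which is what dict_str and disc_string assemble; after this commit, read_length simply rides along inside sample_info rather than being threaded through as an extra argument. A sketch of the string being built (the paths and numbers are made up; the keys are the -pe fields visible in the hunks above):

def lumpy_arg(d):
    # mirrors dict_str/disc_string: "key:value" pairs joined by commas
    return ",".join("%s:%s" % (k, v) for k, v in d.items())

pe_str = lumpy_arg({
    'bam_file': 'sample1.disc.bam',      # illustrative paths/values
    'histo_file': 'sample1.histo',
    'mean': 350, 'stdev': 50,
    'read_length': 101,                  # now supplied via sample_info
    'min_non_overlap': 101, 'discordant_z': 5,
    'back_distance': 10, 'weight': 1,
    'id': 'sample1', 'min_mapping_threshold': 20,
})
# pe_str -> "bam_file:sample1.disc.bam,histo_file:sample1.histo,mean:350,..."
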
@@ -16,11 +16,10 @@ tool_to_writer = {"pindel": PindelWriter, "delly": DellyWriter,
                   "lumpy": LumpyWriter, "genomestrip": GenomeSTRIPWriter}
 
-def convert_svtool_to_vcf(reference, inputfile, outputfile,
+def convert_svtool_to_vcf(reference, inputfile, gaps, outputfile,
                           toolname, svtype,
                           minlen=50, maxlen=5000000,
-                          doFilter=True,
-                          filterbed=None):
+                          doFilter=True):
 
     # The reference handle
     reference_handle = Fastafile(reference) if reference else None
@@ -46,7 +45,7 @@ def convert_svtool_to_vcf(reference, inputfile, outputfile,
     # Now filtering the records
     if doFilter:
         eprint("Now filtering")
-        records = SVReader.filtering(records, minlen, maxlen, filterbed)
+        records = SVReader.filtering(records, minlen, maxlen, gaps)
 
     # Sorting the vcf records
     records = sorting(records, reference_contigs)
@@ -60,16 +59,16 @@ def convert_svtool_to_vcf(reference, inputfile, outputfile,
     tabix_index(outputfile, force=True, preset='vcf')
 
-def parsing(reference, tooloutput, parseoutput, tool, svtype, filterbed=None):
-    print("\n".join([reference, tooloutput, parseoutput, tool, svtype, filterbed]))
-    convert_svtool_to_vcf(reference, tooloutput, parseoutput,
-                          tool, svtype, filterbed=filterbed)
+def parsing(reference, tooloutput, gaps, parseoutput, tool, svtype):
+    print("\n".join([reference, tooloutput, gaps, parseoutput, tool, svtype]))
+    convert_svtool_to_vcf(reference, tooloutput, gaps, parseoutput,
+                          tool, svtype)
 
 parsing(snakemake.input['reference'],
         snakemake.input['tooloutput'],
+        snakemake.input['gaps'],
        snakemake.output['parseoutput'],
         snakemake.wildcards['tool'],
-        snakemake.wildcards['svtype'],
-        filterbed=snakemake.params['filterbed'])
+        snakemake.wildcards['svtype'])
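
The bottom of this script shows the Snakemake script integration at work: a script launched through a rule's script: directive is handed a global snakemake object carrying the rule's input, output, params, and wildcards, so no argparse is involved. A minimal sketch of such a script (names illustrative); since gaps is now a declared input, Snakemake will run the nstretches rule before any parsing job:

# parse_example.py -- executed via `script:` from a rule, never directly;
# Snakemake injects the `snakemake` object into the script's namespace.
tooloutput = snakemake.input['tooloutput']     # named inputs by key
gaps = snakemake.input['gaps']
parseoutput = snakemake.output['parseoutput']
tool = snakemake.wildcards['tool']             # wildcards from the output path
svtype = snakemake.wildcards['svtype']
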
@@ -52,7 +52,8 @@ class SnakemakeUtils:
     def getToolOuputFile(self, wildcards):
         tool_prefix = self.get_tool_prefix(wildcards.tool,
                                            wildcards.chrom, wildcards.svtype)
-        return {'tooloutput': os.path.join(wildcards.tool, tool_prefix)}
+        return {'tooloutput': os.path.join(wildcards.svtype,
+                                           wildcards.tool, tool_prefix)}
 
     def set_tool_svtypes(self, tool, svtypes):
         self.tools_sv_types[tool] = svtypes
@@ -1,4 +1,4 @@
-delly_sv=['DEL', 'INV', 'DUP']
+delly_sv = ['DEL', 'INV', 'DUP']
 
 rule delly:
     input:
@@ -17,4 +17,5 @@ rule delly:
         "export OMP_NUM_THREADS={threads} ; "
         "delly.py -b {input.bamlist} -c {wildcards.chrom}"
         " -g {params.genome} -x {input.excluded} -t {wildcards.svtype}"
-        " -o {output} "
+        " -o {output}"
+        " 1>{log.stdout} 2>{log.stderr}"
@@ -15,4 +15,5 @@ rule lumpy:
         stderr = "logs/lumpy/{chrom}.e"
     shell:
         "lumpy.py -b {input.bamlist} -c {wildcards.chrom}"
-        " -r {params.readlength} -t {threads} -o {output} "
+        " -t {threads} -o {output}"
+        " 1>{log.stdout} 2>{log.stderr}"
@@ -39,7 +39,7 @@ rule pindel:
         stderr = "logs/pindel/{chrom}.e"
     shell:
         "pindel -f {input.genome} -i {input.statfile}"
-        " --min_perfect_match_around_BP 20 --max_range_index 3"
+        " --min_perfect_match_around_BP 20 --max_range_index 2"
         " --number_of_threads {threads}"
         " --report_interchromosomal_events false"
         " --minimum_support_for_event 3 -c {wildcards.chrom}"