installing genomestrip using conda

262743e3 · Thomas Faraut · b5c5e280 · 262743e3 · 262743e3 · 262743e3
Commit 262743e3 authored 5 years ago by Thomas Faraut
--- a/README.md
+++ b/README.md
@@ -35,8 +35,7 @@ $ conda create --name cnvpipeline --file requirements.yaml
 ### 4. Load new conda environment

 ```sh
-source activate cnv
-export PERL5LIB="$CONDA_HOME/envs/cnv/lib/perl5"
+source activate cnvpipeline
 ```   


@@ -57,20 +56,16 @@ Special case of genologin cluster (genotoul):

 For future logins, you must reactivate all conda environments. This means launching these commands:

-    export PATH=$CONDA_HOME/bin:$PATH
-    source activate cnv
-    export PERL5LIB="$CONDA_HOME/envs/cnv/lib/perl5"
-
-Where `$CONDA_HOME` is the folder in which you install miniconda in previous step.
-
-
+ ```sh
+    source activate cnvpipeline
+```
+ 
 ## Install for simulations

 To do simulations, you need to compile pirs, which is included as submodule of your cnvpipeline installation. Go into `cnvpipelines/popsim/pirs` and just run:

    make

-
 ## Configuration

 Configuration should be edited in `application.properties` file. Sections and parameters are described below.

--- a/snakecnv/bin/bwa/bwa
+++ b/snakecnv/bin/bwa/bwa
--- a/snakecnv/bin/bwa/libbwa.so
+++ b/snakecnv/bin/bwa/libbwa.so
--- a/snakecnv/tools/envs/repeatmasker.yaml
+++ b/snakecnv/tools/envs/repeatmasker.yaml
+name: try
+channels:
+  - anaconda
+  - conda-forge
+  - bioconda
+  - r
+  - defaults
+dependencies:
+  - repeatmasker
--- a/snakecnv/tools/refbundle.smk
+++ b/snakecnv/tools/refbundle.smk
+
+
+rule get_fasta:
+    input:
+        config['genome']
+    output:
+        "reference.fasta"
+    params:
+        chromosomes = config['chromosomes']
+    shell:
+        "samtools faidx {input} {params.chromosomes} > {output}; "
+        "samtools faidx {output}"
+
+
+rule preparesvmask:
+    input:
+        fasta = "reference.fasta",
+        fai = "reference.fasta.fai"
+    output:
+        fasta = "work/reference.fasta",
+        index = "work/reference.fasta.bwt"
+    log:
+        "logs/preparesvmask.log"
+    run:
+        svmask_index(input.fasta, output.fasta)
+
+
+rule mergesvmask:
+    input:
+        expand("work/svmask_{chrom}.fasta", chrom=config['chromosomes'])
+    output:
+        "reference.svmask.fasta"
+    shell:
+        "cat {input} > {output}"
+
+rule chromsvmask:
+    input:
+        "work/reference.fasta.bwt",
+        fa = "work/reference.fasta"
+    output:
+        "work/svmask_{chrom}.fasta"
+    params:
+        rl = config['read_len'],
+        classpath = config['CLASSPATH'],
+        sv_dir = config['SV_DIR'],
+        gs_bwa_dir = config['GS_BWA_DIR']
+    log:
+        "logs/svmask/{chrom}.log"
+    shell:
+        "export PATH={params.sv_dir}/bwa:\"$PATH\"; "
+        " export LD_LIBRARY_PATH={params.gs_bwa_dir}:\"$LD_LIBRARY_PATH\";"
+        " java -cp {params.classpath}"
+        " -Xmx4g org.broadinstitute.sv.apps.ComputeGenomeMask"
+        " -R {input.fa} -O work/svmask_{wildcards.chrom}.fasta"
+        " -readLength {params.rl}"
+        " -sequence {wildcards.chrom} 2> {log} "
+
+rule index:
+    input:
+        "{type}.fasta"
+    output:
+        "{type}.fasta.fai"
+    log:
+        "logs/index/{type}.log"
+    shell:
+        "samtools faidx {input} &> {log}"
+
+
+rule bed2fasta:
+    input:
+        "reference.fasta.fai",
+        ref = "reference.fasta",
+        bed = "{type}.bed"
+    output:
+        "{type}.fasta"
+    wildcard_constraints:
+        type = ".*(lc|sv|gc)mask"
+    params:
+        classpath = config['CLASSPATH']
+    shell:
+        " java -cp {params.classpath} -Xmx4g "
+        "     org.broadinstitute.sv.apps.BedFileToGenomeMask "
+        "    -I {input.bed} "
+        "    -R {input.ref}  "
+        "    -O {output}"
+
+rule repeatmask:
+    input:
+        fasta = "reference.fasta",
+        fai = "reference.fasta.fai"
+    output:
+        "reference.fasta.out"
+    params:
+        sp = config['species']
+    threads:
+        get_threads("repeatmask", 12)
+    conda:
+        "envs/repeatmasker.yaml"
+    shell:
+        """
+        RepeatMasker -pa {threads} -species {params.sp} -xsmall {input.fasta}
+         > {input.fasta}.repeatmasker.log
+        """
+
+rule length:
+    input:
+        fasta = "reference.fasta",
+        fai = "reference.fasta.fai"
+    output:
+        "reference.fasta.length"
+    shell:
+        "fastalength {input.fasta} > {output}"
+
+
+rule rdmask:
+    input:
+        length = "reference.fasta.length"
+    output:
+        bed = "reference.rdmask.bed"
+    run:
+        with open(input.length) as f:
+            with open(output.bed, "w") as fout:
+                fout.write("\t".join(["CHROM", "START", "END"])+"\n")
+                for line in f:
+                    line = line.rstrip()
+                    fields = line.split()
+                    fout.write("\t".join([fields[1], "0",
+                                          str(int(fields[0])-1)]) + "\n")
+
+rule lcmaskbed:
+    input:
+        repeats = "reference.fasta.out"
+    output:
+        bed = "reference.lcmask.bed"
+    run:
+        repeats = []
+        with open(input.repeats) as f:
+            for line in f:
+                if p_repeats.match(line):
+                    fields = line.rstrip().split()
+                    # append chrom start and end
+                    repeats.append([fields[4], fields[5], fields[6]])
+        sorted_repeats = sorted(repeats,
+                                key=lambda x: (x[0], int(x[1]), int(x[2])))
+        # Write to file
+        import csv
+        with open(output.bed, 'w') as f:
+            writer = csv.writer(f, delimiter='\t')
+            writer.writerows(sorted_repeats)
+
+
+rule gcmaskbed:
+    input:
+        repeats = "reference.fasta.out"
+    output:
+        bed = "reference.gcmask.bed"
+    run:
+        repeats = []
+        with open(input.repeats) as f:
+            for line in f:
+                if p_repeats_all.match(line):
+                    fields = line.rstrip().split()
+                    repeats.append([fields[4], fields[5], fields[6]])
+        sorted_repeats = sorted(repeats,
+                                key=lambda x: (x[0], int(x[1]), int(x[2])))
+
+        # Write to file
+        import csv
+        with open(output.bed, 'w') as f:
+            writer = csv.writer(f, delimiter='\t')
+            writer.writerows(sorted_repeats)
+
+
+rule dict:
+    input:
+        fasta = "reference.fasta",
+        fai = "reference.fasta.fai"
+    output:
+        "reference.dict"
+    shell:
+        "picard CreateSequenceDictionary "
+        " R={input.fasta} "
+        " O={output} "
+
+rule nstretches:
+    input:
+        fasta = "reference.fasta",
+        fai = "reference.fasta.fai"
+    output:
+        "reference.Nstretch.bed"
+    params:
+        nstrech = config['maxNstretches']
+    log:
+        "logs/nstretch.log"
+    threads:
+        1
+    shell:
+        "findNstretches.py -f {input.fasta} -m {params.nstrech} "
+        "-o {output} -t {threads} 2> {log}"
+
+rule ploidy:
+    output:
+        "reference.ploidymap.txt"
+    shell:
+        " printf \"*\t*\t*\t*\t2\n\" > {output}"
+
+rule gendermask:
+    output:
+        "reference.gendermask.bed"
+    shell:
+        " touch {output}"