diff --git a/README.md b/README.md
index 6443c71e3c6b6e507d01efd87a373fcd69bacdc2..270a02e5019d41f3d1662ce788c3561cd393fbdd 100644
--- a/README.md
+++ b/README.md
@@ -35,8 +35,7 @@ $ conda create --name cnvpipeline --file requirements.yaml
 
 ### 4. Load new conda environment
 ```sh
-source activate cnv
-export PERL5LIB="$CONDA_HOME/envs/cnv/lib/perl5"
+source activate cnvpipeline
 ```
 
 
@@ -57,20 +56,16 @@ Special case of genologin cluster (genotoul):
 For future logins, you must reactivate all conda environments. This means launching these commands:
 
-    export PATH=$CONDA_HOME/bin:$PATH
-    source activate cnv
-    export PERL5LIB="$CONDA_HOME/envs/cnv/lib/perl5"
-
-Where `$CONDA_HOME` is the folder in which you install miniconda in previous step.
-
-
+```sh
+source activate cnvpipeline
+```
+
 ## Install for simulations
 
 To do simulations, you need to compile pirs, which is included as submodule of your cnvpipeline installation.
 Go into `cnvpipelines/popsim/pirs` and just run:
 
     make
 
-
 ## Configuration
 
 Configuration should be edited in `application.properties` file. Sections and parameters are described below.
diff --git a/snakecnv/bin/bwa/bwa b/snakecnv/bin/bwa/bwa
new file mode 100644
index 0000000000000000000000000000000000000000..4906741c4aa8c92d9089861504ed8f02ae727f21
Binary files /dev/null and b/snakecnv/bin/bwa/bwa differ
diff --git a/snakecnv/bin/bwa/libbwa.so b/snakecnv/bin/bwa/libbwa.so
new file mode 100644
index 0000000000000000000000000000000000000000..44a52031b1f68039e5269c7de7210cf66614c684
Binary files /dev/null and b/snakecnv/bin/bwa/libbwa.so differ
diff --git a/snakecnv/tools/envs/repeatmasker.yaml b/snakecnv/tools/envs/repeatmasker.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..089d51b5efceb5b0b5a17efe0e338fd63f68088a
--- /dev/null
+++ b/snakecnv/tools/envs/repeatmasker.yaml
@@ -0,0 +1,9 @@
+name: repeatmasker
+channels:
+  - anaconda
+  - conda-forge
+  - bioconda
+  - r
+  - defaults
+dependencies:
+  - repeatmasker
diff --git a/snakecnv/tools/refbundle.smk b/snakecnv/tools/refbundle.smk
new file mode 100644
index 0000000000000000000000000000000000000000..8b12247f3f6acf0c49b490b8bdada70155176988
--- /dev/null
+++ b/snakecnv/tools/refbundle.smk
@@ -0,0 +1,211 @@
+
+
+rule get_fasta:
+    input:
+        config['genome']
+    output:
+        "reference.fasta"
+    params:
+        chromosomes = config['chromosomes']
+    shell:
+        "samtools faidx {input} {params.chromosomes} > {output}; "
+        "samtools faidx {output}"
+
+
+rule preparesvmask:
+    input:
+        fasta = "reference.fasta",
+        fai = "reference.fasta.fai"
+    output:
+        fasta = "work/reference.fasta",
+        index = "work/reference.fasta.bwt"
+    log:
+        "logs/preparesvmask.log"
+    run:
+        svmask_index(input.fasta, output.fasta)
+
+
+rule mergesvmask:
+    input:
+        expand("work/svmask_{chrom}.fasta", chrom=config['chromosomes'])
+    output:
+        "reference.svmask.fasta"
+    shell:
+        "cat {input} > {output}"
+
+rule chromsvmask:
+    input:
+        "work/reference.fasta.bwt",
+        fa = "work/reference.fasta"
+    output:
+        "work/svmask_{chrom}.fasta"
+    params:
+        rl = config['read_len'],
+        classpath = config['CLASSPATH'],
+        sv_dir = config['SV_DIR'],
+        gs_bwa_dir = config['GS_BWA_DIR']
+    log:
+        "logs/svmask/{chrom}.log"
+    shell:
+        "export PATH={params.sv_dir}/bwa:\"$PATH\"; "
+        " export LD_LIBRARY_PATH={params.gs_bwa_dir}:\"$LD_LIBRARY_PATH\";"
+        " java -cp {params.classpath}"
+        " -Xmx4g org.broadinstitute.sv.apps.ComputeGenomeMask"
+        " -R {input.fa} -O work/svmask_{wildcards.chrom}.fasta"
+        " -readLength {params.rl}"
+        " -sequence {wildcards.chrom} 2> {log} "
+
+rule index:
+    input:
+        "{type}.fasta"
+    output:
+        "{type}.fasta.fai"
+    log:
+        "logs/index/{type}.log"
+    shell:
+        "samtools faidx {input} &> {log}"
+
+
+rule bed2fasta:
+    input:
+        "reference.fasta.fai",
+        ref = "reference.fasta",
+        bed = "{type}.bed"
+    output:
+        "{type}.fasta"
+    wildcard_constraints:
+        type = ".*(lc|sv|gc)mask"
+    params:
+        classpath = config['CLASSPATH']
+    shell:
+        " java -cp {params.classpath} -Xmx4g "
+        " org.broadinstitute.sv.apps.BedFileToGenomeMask "
+        " -I {input.bed} "
+        " -R {input.ref} "
+        " -O {output}"
+
+rule repeatmask:
+    input:
+        fasta = "reference.fasta",
+        fai = "reference.fasta.fai"
+    output:
+        "reference.fasta.out"
+    params:
+        sp = config['species']
+    threads:
+        get_threads("repeatmask", 12)
+    conda:
+        "envs/repeatmasker.yaml"
+    shell:
+        """
+        RepeatMasker -pa {threads} -species {params.sp} -xsmall {input.fasta} \
+            > {input.fasta}.repeatmasker.log
+        """
+
+rule length:
+    input:
+        fasta = "reference.fasta",
+        fai = "reference.fasta.fai"
+    output:
+        "reference.fasta.length"
+    shell:
+        "fastalength {input.fasta} > {output}"
+
+
+rule rdmask:
+    input:
+        length = "reference.fasta.length"
+    output:
+        bed = "reference.rdmask.bed"
+    run:
+        with open(input.length) as f:
+            with open(output.bed, "w") as fout:
+                fout.write("\t".join(["CHROM", "START", "END"])+"\n")
+                for line in f:
+                    line = line.rstrip()
+                    fields = line.split()
+                    fout.write("\t".join([fields[1], "0",
+                                          str(int(fields[0])-1)]) + "\n")
+
+rule lcmaskbed:
+    input:
+        repeats = "reference.fasta.out"
+    output:
+        bed = "reference.lcmask.bed"
+    run:
+        repeats = []
+        with open(input.repeats) as f:
+            for line in f:
+                if p_repeats.match(line):
+                    fields = line.rstrip().split()
+                    # append chrom start and end
+                    repeats.append([fields[4], fields[5], fields[6]])
+        sorted_repeats = sorted(repeats,
+                                key=lambda x: (x[0], int(x[1]), int(x[2])))
+        # Write to file
+        import csv
+        with open(output.bed, 'w') as f:
+            writer = csv.writer(f, delimiter='\t')
+            writer.writerows(sorted_repeats)
+
+
+rule gcmaskbed:
+    input:
+        repeats = "reference.fasta.out"
+    output:
+        bed = "reference.gcmask.bed"
+    run:
+        repeats = []
+        with open(input.repeats) as f:
+            for line in f:
+                if p_repeats_all.match(line):
+                    fields = line.rstrip().split()
+                    repeats.append([fields[4], fields[5], fields[6]])
+        sorted_repeats = sorted(repeats,
+                                key=lambda x: (x[0], int(x[1]), int(x[2])))
+
+        # Write to file
+        import csv
+        with open(output.bed, 'w') as f:
+            writer = csv.writer(f, delimiter='\t')
+            writer.writerows(sorted_repeats)
+
+
+rule dict:
+    input:
+        fasta = "reference.fasta",
+        fai = "reference.fasta.fai"
+    output:
+        "reference.dict"
+    shell:
+        "picard CreateSequenceDictionary "
+        " R={input.fasta} "
+        " O={output} "
+
+rule nstretches:
+    input:
+        fasta = "reference.fasta",
+        fai = "reference.fasta.fai"
+    output:
+        "reference.Nstretch.bed"
+    params:
+        nstretch = config['maxNstretches']
+    log:
+        "logs/nstretch.log"
+    threads:
+        1
+    shell:
+        "findNstretches.py -f {input.fasta} -m {params.nstretch} "
+        "-o {output} -t {threads} 2> {log}"
+
+rule ploidy:
+    output:
+        "reference.ploidymap.txt"
+    shell:
+        " printf \"*\t*\t*\t*\t2\n\" > {output}"
+
+rule gendermask:
+    output:
+        "reference.gendermask.bed"
+    shell:
+        " touch {output}"