Skip to content
Snippets Groups Projects
Commit 262743e3 authored by Thomas Faraut's avatar Thomas Faraut
Browse files

installing genomestrip using conda

parent b5c5e280
No related branches found
No related tags found
1 merge request!14Newinstall
......@@ -35,8 +35,7 @@ $ conda create --name cnvpipeline --file requirements.yaml
### 4. Load new conda environment
```sh
source activate cnv
export PERL5LIB="$CONDA_HOME/envs/cnv/lib/perl5"
source activate cnvpipeline
```
......@@ -57,20 +56,16 @@ Special case of genologin cluster (genotoul):
For future logins, you must reactivate all conda environments. This means launching these commands:
export PATH=$CONDA_HOME/bin:$PATH
source activate cnv
export PERL5LIB="$CONDA_HOME/envs/cnv/lib/perl5"
Where `$CONDA_HOME` is the folder in which you installed miniconda in the previous step.
```sh
source activate cnvpipeline
```
## Install for simulations
To run simulations, you need to compile pirs, which is included as a submodule of your cnvpipeline installation. Go into `cnvpipelines/popsim/pirs` and just run:
make
## Configuration
Configuration should be edited in `application.properties` file. Sections and parameters are described below.
......
File added
File added
---
# Conda environment definition: a single dependency (repeatmasker)
# resolved against the channels below, in the listed order.
name: try
channels:
  - anaconda
  - conda-forge
  - bioconda
  - r
  - defaults
dependencies:
  - repeatmasker
rule get_fasta:
    # Extract the sequences named in config['chromosomes'] from the genome
    # given by config['genome'] into reference.fasta, then run
    # `samtools faidx` on the new file to index it.
    input:
        config['genome']
    output:
        'reference.fasta'
    params:
        chromosomes = config['chromosomes']
    shell:
        "samtools faidx {input} {params.chromosomes} > {output}; samtools faidx {output}"
rule preparesvmask:
    # Produce the work/ copy of the reference and its .bwt index by handing
    # the reference path and the target path to svmask_index(), a helper
    # defined outside this rule.
    # NOTE(review): the log file is declared but the run block never
    # references it -- confirm whether svmask_index() should receive it.
    input:
        fasta = 'reference.fasta',
        fai = 'reference.fasta.fai'
    output:
        fasta = 'work/reference.fasta',
        index = 'work/reference.fasta.bwt'
    log:
        'logs/preparesvmask.log'
    run:
        svmask_index(input.fasta, output.fasta)
rule mergesvmask:
    # Concatenate the per-chromosome SV masks, one per entry of
    # config['chromosomes'] and in that order, into a single mask file.
    input:
        expand('work/svmask_{chrom}.fasta', chrom=config['chromosomes'])
    output:
        'reference.svmask.fasta'
    shell:
        'cat {input} > {output}'
rule chromsvmask:
    # Compute the SV mask of one chromosome by running
    # org.broadinstitute.sv.apps.ComputeGenomeMask on the work/ copy of the
    # reference. The .bwt index is listed as an input only so that it is
    # built before this rule runs; the command itself takes the fasta.
    #
    # Environment set up for the java call:
    #   * PATH gets {sv_dir}/bwa prepended;
    #   * LD_LIBRARY_PATH gets {gs_bwa_dir} prepended. The previous value is
    #     appended through ${LD_LIBRARY_PATH:+...} because Snakemake runs
    #     shell commands in bash strict mode (set -u): expanding an unset
    #     LD_LIBRARY_PATH directly aborts the job with "unbound variable".
    #
    # The result is written to {output} instead of re-spelling the path, so
    # the declared output and the file actually written cannot drift apart.
    # stderr of the java process goes to the per-chromosome log.
    input:
        "work/reference.fasta.bwt",
        fa = "work/reference.fasta"
    output:
        "work/svmask_{chrom}.fasta"
    params:
        rl = config['read_len'],
        classpath = config['CLASSPATH'],
        sv_dir = config['SV_DIR'],
        gs_bwa_dir = config['GS_BWA_DIR']
    log:
        "logs/svmask/{chrom}.log"
    shell:
        "export PATH={params.sv_dir}/bwa:\"$PATH\"; "
        " export LD_LIBRARY_PATH="
        "\"{params.gs_bwa_dir}${{LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}}\";"
        " java -cp {params.classpath}"
        " -Xmx4g org.broadinstitute.sv.apps.ComputeGenomeMask"
        " -R {input.fa} -O {output}"
        " -readLength {params.rl}"
        " -sequence {wildcards.chrom} 2> {log} "
rule index:
    # Generic fasta indexing: any "<type>.fasta" gets a "<type>.fasta.fai"
    # via `samtools faidx`; stdout and stderr both go to the log.
    input:
        '{type}.fasta'
    output:
        '{type}.fasta.fai'
    log:
        'logs/index/{type}.log'
    shell:
        'samtools faidx {input} &> {log}'
rule bed2fasta:
    # Turn a mask BED file into a genome-mask fasta with
    # org.broadinstitute.sv.apps.BedFileToGenomeMask. The wildcard
    # constraint limits the rule to names ending in lcmask, svmask or
    # gcmask. The reference index is required as an input but is not
    # passed on the command line.
    input:
        'reference.fasta.fai',
        ref = 'reference.fasta',
        bed = '{type}.bed'
    output:
        '{type}.fasta'
    wildcard_constraints:
        type = '.*(lc|sv|gc)mask'
    params:
        classpath = config['CLASSPATH']
    shell:
        " java -cp {params.classpath} -Xmx4g  org.broadinstitute.sv.apps.BedFileToGenomeMask "
        " -I {input.bed}  -R {input.ref}  -O {output}"
rule repeatmask:
    # Run RepeatMasker on the reference for the species given by
    # config['species'], using the rule's thread count for -pa and the
    # dedicated conda environment. The declared output is the
    # "<fasta>.out" annotation table.
    #
    # The command is written as one logical line: with the redirection on
    # a line of its own, bash treats "> file" as a separate empty command,
    # which only truncates the log and leaves RepeatMasker's stdout
    # uncaptured.
    input:
        fasta = "reference.fasta",
        fai = "reference.fasta.fai"
    output:
        "reference.fasta.out"
    params:
        sp = config['species']
    threads:
        get_threads("repeatmask", 12)
    conda:
        "envs/repeatmasker.yaml"
    shell:
        "RepeatMasker -pa {threads} -species {params.sp} -xsmall {input.fasta} "
        "> {input.fasta}.repeatmasker.log"
rule length:
    # Write the output of `fastalength` for the reference to
    # reference.fasta.length.
    input:
        fasta = 'reference.fasta',
        fai = 'reference.fasta.fai'
    output:
        'reference.fasta.length'
    shell:
        'fastalength {input.fasta} > {output}'
rule rdmask:
    # Build the read-depth mask BED from the sequence-length table: a
    # CHROM/START/END header line, then one line per input record with
    # START fixed at 0 and END set to the first column minus one. The
    # second column of the table is used as the sequence name.
    input:
        length = 'reference.fasta.length'
    output:
        bed = 'reference.rdmask.bed'
    run:
        with open(input.length) as lengths, open(output.bed, "w") as bed:
            bed.write("CHROM\tSTART\tEND\n")
            for record in lengths:
                columns = record.split()
                chrom = columns[1]
                last = int(columns[0]) - 1
                bed.write("{}\t0\t{}\n".format(chrom, last))
rule lcmaskbed:
    # Build the low-complexity mask BED from the RepeatMasker table: keep
    # the lines accepted by the p_repeats pattern (defined outside this
    # rule), take columns 5-7 as chrom/start/end, and write them sorted by
    # chrom, then numerically by start and end.
    #
    # The writer uses lineterminator='\n' and the file is opened with
    # newline='' (as the csv module documentation recommends) so that the
    # BED file gets plain LF line endings; csv.writer's default terminator
    # is '\r\n'.
    input:
        repeats = "reference.fasta.out"
    output:
        bed = "reference.lcmask.bed"
    run:
        import csv

        repeats = []
        with open(input.repeats) as f:
            for line in f:
                if p_repeats.match(line):
                    fields = line.rstrip().split()
                    # append chrom start and end
                    repeats.append([fields[4], fields[5], fields[6]])
        sorted_repeats = sorted(repeats,
                                key=lambda x: (x[0], int(x[1]), int(x[2])))
        # Write to file
        with open(output.bed, 'w', newline='') as f:
            writer = csv.writer(f, delimiter='\t', lineterminator='\n')
            writer.writerows(sorted_repeats)
rule gcmaskbed:
    # Build the GC mask BED from the RepeatMasker table: keep the lines
    # accepted by the p_repeats_all pattern (defined outside this rule),
    # take columns 5-7 as chrom/start/end, and write them sorted by chrom,
    # then numerically by start and end.
    #
    # The writer uses lineterminator='\n' and the file is opened with
    # newline='' (as the csv module documentation recommends) so that the
    # BED file gets plain LF line endings; csv.writer's default terminator
    # is '\r\n'.
    input:
        repeats = "reference.fasta.out"
    output:
        bed = "reference.gcmask.bed"
    run:
        import csv

        repeats = []
        with open(input.repeats) as f:
            for line in f:
                if p_repeats_all.match(line):
                    fields = line.rstrip().split()
                    # chrom, start and end columns
                    repeats.append([fields[4], fields[5], fields[6]])
        sorted_repeats = sorted(repeats,
                                key=lambda x: (x[0], int(x[1]), int(x[2])))
        # Write to file
        with open(output.bed, 'w', newline='') as f:
            writer = csv.writer(f, delimiter='\t', lineterminator='\n')
            writer.writerows(sorted_repeats)
rule dict:
    # Create the sequence dictionary of the reference with
    # `picard CreateSequenceDictionary`.
    input:
        fasta = 'reference.fasta',
        fai = 'reference.fasta.fai'
    output:
        'reference.dict'
    shell:
        "picard CreateSequenceDictionary  R={input.fasta}  O={output} "
rule nstretches:
    # Run findNstretches.py on the reference, passing
    # config['maxNstretches'] as -m and the thread count as -t; the
    # result goes to a BED file and stderr to the log.
    input:
        fasta = 'reference.fasta',
        fai = 'reference.fasta.fai'
    output:
        'reference.Nstretch.bed'
    params:
        nstrech = config['maxNstretches']
    log:
        'logs/nstretch.log'
    threads: 1
    shell:
        "findNstretches.py -f {input.fasta} -m {params.nstrech} -o {output} -t {threads} 2> {log}"
rule ploidy:
    # Write a one-line ploidy map: four "*" fields and the value 2,
    # separated by tabs. The \t and \n escapes are resolved by Python, so
    # the shell receives literal tab and newline characters inside the
    # double-quoted printf format.
    output:
        'reference.ploidymap.txt'
    shell:
        ' printf "*\t*\t*\t*\t2\n" > {output}'
rule gendermask:
    # Create the gender mask BED with `touch`; no content is written by
    # this rule.
    output:
        'reference.gendermask.bed'
    shell:
        ' touch {output}'
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment