Week 1 :
Cluster
SSH connection :
- SSH key creation (local) (see the sketch after this list) :
- ssh-keygen
- alias in .bashrc
- SSH key creation (cluster) :
- ssh-keygen
- Cbib work on cortex :
- module load sinteractive
- sinteractive
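A minimal sketch of the SSH setup (the host name <cluster> and the alias name are placeholders; adapt them to your account) :
user@001:~$ ssh-keygen -t ed25519                                  # create the key pair locally
user@001:~$ ssh-copy-id $USER@<cluster>                            # install the public key on the cluster
user@001:~$ echo "alias cbib='ssh $USER@<cluster>'" >> ~/.bashrc   # login shortcut in .bashrc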
Cbib
- Home : /home (personal data), 40 TB, snapshots + replication
- Scratch : /scratch/user, temporary data to compute, 70 TB. Warning : no backup + automatic cleaning process
Access to databases : /mnt/cbib/bank, regularly kept up to date (Blast, GenBank, UniProt, RefSeq, ...)
Restoring your data : snapshots (/home/.snapshots/ or /mnt/cbib/.snapshots/) are sorted by date; browse them to view/copy/restore your data.
Data transfer (no git) : scp or rsync (Ex : rsync -e ssh -avz --progress ./rep $USER@<cluster>:rep/)
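For instance, to copy a folder to the cluster and bring results back (<cluster> is again a placeholder host) :
user@001:~$ scp -r ./rep $USER@<cluster>:rep/                          # push a folder to the cluster
user@001:~$ rsync -e ssh -avz --progress $USER@<cluster>:rep/ ./rep/   # pull results back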
Useful links :
https://services.cbib.u-bordeaux.fr/doc/infra/#module
https://services.cbib.u-bordeaux.fr/redmine/projects/documentation-publique/wiki/Table_des_mati%C3%A8res
Mcia curta
- FS_tmp : local temporary space on the compute nodes
- FS_home : data space for users' data
- FS_scratch : parallel file system for job data. NB : this space is not for storage; files are regularly cleaned by an automatic system
The default partition on Curta is compute : add #SBATCH --constraint=bigmem to target the bigmem nodes, as in the sketch below.
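A job script header requesting a bigmem node might then start like this (a sketch; job name and resources are illustrative) :
#!/bin/bash
#SBATCH -J myjob
#SBATCH --constraint=bigmem
#SBATCH --time=01:00:00
#SBATCH -c 1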
IRODS : #TODO
Useful links :
https://redmine.mcia.fr/projects/cluster-curta/wiki
https://redmine.mcia.fr/projects/irods/wiki/IRODS
Jobs
Useful commands list :
- sbatch [Job].sh
- squeue -u $USER
- scontrol show job [jobid]
- scancel [jobid]
Output : slurm-[jobid].out
Error output : slurm-[jobid].err (with the -e option; by default errors go to the .out file)
Modules :
- module av (available modules in the cluster)
- module load modulename
- module list (module loaded list)
- module purge (unload modules)
Slurm job script example (job.sh), running Snakemake with a singularity container :
#!/bin/bash
## Run Snakemake with a singularity container
## Author Domitille COQ--ETCHEGARAY
## 25/02/2020
#SBATCH -J snakemake_test
#SBATCH --time=00:05:00
#SBATCH -c 1
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --mem=128000M
#SBATCH -o snakemake.%N.%j.out
#SBATCH -e snakemake.%N.%j.err
#############################Module loading#############################
module load snakemake
########################################################################
## Cluster config file with parameters for each rule of the Snakefile
CLUSTER_CONFIG=config_files/cluster.json
## Slurm command used to launch each new job (i.e. rule instance) created by Snakemake.
## Argument values come from the cluster config file.
CLUSTER="sbatch --mem={cluster.mem} --ntasks-per-node {cluster.npernode} -t {cluster.time} -n {cluster.ntasks} -c {cluster.c} -J {cluster.jobname} -o snake_subjob_log/{cluster.jobname}.%N.%j.out -e snake_subjob_log/{cluster.jobname}.%N.%j.err"
## Use at most N cores in parallel (default: 1). If N is omitted or 'all', the limit is set to the number of available cores.
MAX_CORES=100
## Create a log directory for all the slurm output files
mkdir -p snake_subjob_log
## Create the directed acyclic graph (DAG) of the workflow
snakemake -s Snakefile --dag | dot -Tpng > dag.png
## Launch the workflow : --use-singularity runs each rule inside the container,
## --cluster-config gives the per-rule cluster configuration, --cluster the sbatch shell command
snakemake -s Snakefile --use-singularity -j $MAX_CORES --cluster-config $CLUSTER_CONFIG --cluster "$CLUSTER"
## Create a final report
snakemake -s Snakefile --report smk_report.html
## Useful information to print
echo '########################################'
echo 'Date:' $(date --iso-8601=seconds)
echo 'User:' $USER
echo 'Host:' $HOSTNAME
echo 'Job Name:' $SLURM_JOB_NAME
echo 'Job ID:' $SLURM_JOB_ID
echo 'Array task ID:' ${SLURM_ARRAY_TASK_ID}
echo 'Number of nodes assigned to job:' $SLURM_JOB_NUM_NODES
echo 'Total number of cores for job (?):' $SLURM_NTASKS
echo 'Number of requested cores per node:' $SLURM_NTASKS_PER_NODE
echo 'Nodes assigned to job:' $SLURM_JOB_NODELIST
echo 'Directory:' $(pwd)
## Detail Information:
echo 'scontrol show job:'
scontrol show job $SLURM_JOB_ID
echo '########################################'
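After the job finishes, the standard output and error are in the files named by the -o and -e directives above; %N and %j are replaced by the node name and the job id :
user@cluster001:~$ cat snakemake.<node>.<jobid>.out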
Conda environment
Conda 4.8.2
Create an environment from file : environment.yml
Example :
channels:
- conda-forge
- bioconda
dependencies:
- bioconda::snakemake-minimal =5.4.5
- python =3.6
- jinja2 =2.10
- networkx =2.1
- matplotlib =2.2.3
- graphviz =2.38.0
- bcftools =1.9
- samtools =1.9
- bwa =0.7.17
- pysam =0.15.0
- channels : the channels from which your packages will be installed
- dependencies : the packages you want installed in your environment.
user@001:~$ conda env create -n myEnv -f environment.yml
user@001:~$ conda activate myEnv
user@001:~$ conda env list
base /home/user/anaconda3
myEnv * /home/user/anaconda3/envs/myEnv
Useful commands list :
- conda env create (with yml file)
- conda activate
- conda deactivate
- conda remove --name myEnv --all (remove the selected environment)
- conda create
- conda list (package list)
- conda env list (conda envs list)
Environments are created by default in the folder /home/user/anaconda3/envs/. You can change the path with the --prefix argument, which can be useful to create an environment specific to a project, as in the sketch below.
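For instance (the path is hypothetical) :
user@001:~$ conda env create --prefix ./envs/tuto -f environment.yml
user@001:~$ conda activate ./envs/tuto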
Useful links :
https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html
Snakemake
Snakemake 5.8.2
Snakefile example :
##Author Domitille COQ--ETCHEGARAY
##25/02/2020
## Path to the config file, which defines parameters such as the paths of the first input files
configfile: "config_files/config.yml"
## Global directive telling all the rules to run their shell commands inside the singularity container
singularity: "img/snakemake_tuto.sif"
## Rule all (job 0) is always executed first and defines the target file of the workflow
rule all:
    input:
        "plots/quals.svg"

rule bwa_map:
    # Paths of the input files
    input:
        "data/genome.fa",
        lambda wildcards: config["samples"][wildcards.sample]
    # Path of the output file (created by Snakemake)
    output:
        "mapped_reads/{sample}.bam"
    # Additional parameters depending on the wildcard value :
    # annotate aligned reads with so-called read groups, which contain metadata like the sample name
    params:
        rg=r"@RG\tID:{sample}\tSM:{sample}"
    # Log of the rule, written to a file
    log:
        "tool_logs/bwa_mem/{sample}.log"
    # Extra information such as the wall clock time of the rule
    benchmark:
        "benchmarks/bwa_mem/{sample}.bwa.benchmark.txt"
    # Number of cores allowed for the rule
    threads: 8
    # Memory allowed for the rule
    resources:
        mem_mb=4000
    # Shell command executed by the rule
    shell:
        "(bwa mem -R '{params.rg}' -t {threads} {input} "
        "| samtools view -Sb - > {output}) 2> {log}"

rule samtools_sort:
    input:
        "mapped_reads/{sample}.bam"
    output:
        "sorted_reads/{sample}.bam"
    log:
        "tool_logs/samtools_sort/{sample}.log"
    benchmark:
        "benchmarks/samtools_sort/{sample}.sams.benchmark.txt"
    shell:
        "(samtools sort -T sorted_reads/{wildcards.sample} "
        "-O bam {input} > {output}) 2> {log}"

rule samtools_index:
    input:
        "sorted_reads/{sample}.bam"
    output:
        "sorted_reads/{sample}.bam.bai"
    log:
        "tool_logs/samtools_index/{sample}.log"
    benchmark:
        "benchmarks/samtools_idx/{sample}.sami.benchmark.txt"
    shell:
        "(samtools index {input}) 2> {log}"

rule bcftools_call:
    input:
        fa="data/genome.fa",
        bam=expand("sorted_reads/{sample}.bam", sample=config["samples"]),
        bai=expand("sorted_reads/{sample}.bam.bai", sample=config["samples"])
    output:
        "calls/all.vcf"
    log:
        "tool_logs/bcftools_call/bcf.log"
    benchmark:
        "benchmarks/bcftools_call/bcf.log"
    shell:
        "(bcftools mpileup -f {input.fa} {input.bam} "
        "| bcftools call -mv - > {output}) 2> {log}"

rule plot_quals:
    input:
        "calls/all.vcf"
    output:
        "plots/quals.svg"
    # Script executed by the rule
    script:
        "scripts/plot_quals.py"
Config file example (config_files/config.yml) :
samples:
    A: data/samples/A.fastq
    B: data/samples/B.fastq
    C: data/samples/C.fastq
The workflow is composed of rules defined in a Snakefile, as shown above. A rule is divided into different parts : input files, output files, and a shell command or a script. Many other directives can be added to a rule. Snakemake then determines the dependencies between rules from the file names and builds a DAG (directed acyclic graph) showing which jobs can be parallelized.
Specific rule :
- rule all : thanks to this rule you don't need to specify the target file in the snakemake shell command. Warning : this rule can't have wildcards ! If no target file is provided on the command line or in rule all, Snakemake takes the first rule as target.
In this example we use several rule directives :
- benchmark : provides information such as the wall clock time of a job.
- input : the paths of the input files
- log : can be used as input for other rules, like any output file. Useful to detect errors in rules; the output of each job is also kept in a file instead of only being printed in the terminal.
- output : the name of the output file (Snakemake will create it)
- params : specifies additional parameters depending on the wildcard values
- shell : a shell command that will be executed to create the output file.
- script : executes a script, given its path. This directive only works with R, Python and Julia scripts.
- threads : number of cores allowed for the rule
Global directives (used for all the rules) :
- configfile : (json or yml) defines a dictionary of parameters, for example the paths of the input files.
- singularity : path to a singularity container; the shell commands will be executed in this image.
- include : path to another snakefile; can be useful to split the workflow
Many other directives exist; see the Snakemake documentation linked below.
Some options :
- temp : marks an output file as temporary, e.g. temp("mapped_reads/{sample}.bam"); it is deleted once all the rules that use it have finished.
- protected : marks an output file as protected, e.g. protected("sorted_reads/{sample}.bam"), so it can't be deleted by mistake.
Useful command lines :
- snakemake --dag | dot -Tpng > dag.png
- snakemake -np (dry run)
- snakemake --forceall
- snakemake --delete-all-output (remove all created files, to begin from scratch)
WARNING : when a singularity image or a conda environment is used, don't forget !!
- snakemake --use-singularity
- snakemake --use-conda
Useful links :
https://snakemake.readthedocs.io/en/v5.8.2/index.html
https://snakemake.readthedocs.io/en/v5.8.2/snakefiles/rules.html
Cluster execution :
A snakemake workflow can be run on an HPC. For that you need to add different arguments to your snakemake command :
- --cluster "sbatch ..." : the slurm command submitted to the cluster for each new job (rule instance) of the workflow. In this argument you can reference parameters from a cluster config file (json or yml), described below.
- --cluster-config file : the cluster config file (json or yml) detailing the arguments of the slurm command given in --cluster. It works like an object : for example {cluster.jobname} corresponds to the value of "jobname" in your json file.
- -j : the maximum number of cores allowed to snakemake for the parallelization of jobs.
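Putting it together (a sketch; the flags mirror the job script shown earlier) :
snakemake -s Snakefile --use-singularity -j 100 \
    --cluster-config config_files/cluster.json \
    --cluster "sbatch --mem={cluster.mem} -c {cluster.c} -t {cluster.time} -J {cluster.jobname}"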
Singularity
Local installation : 3.5
Build Images from scratch :
Here we explain how to create a .sif container (not a sandbox).
Create a singularity definition file (.def) :
Your image will be built according to the definition written in this file.
Definition file example (sing.def) :
Bootstrap: docker
From: continuumio/miniconda3
IncludeCmd: yes
%files
environment.yml
%post
apt-get update && apt-get install -y procps && apt-get clean -y
/opt/conda/bin/conda env create -n myEnv -f /environment.yml
/opt/conda/bin/conda clean -a
%environment
export PATH=/opt/conda/bin:$PATH
. /opt/conda/etc/profile.d/conda.sh
conda activate myEnv
%runscript
echo "Hello World"
%help
Tools for Snakemake tutorial
%labels
Author Domitille COQ--ETCHEGARAY
How to create the image :
In your folder you need to have these particular files :
- a definition file (.def)
- an environment file (.yml) if you create a conda environment in your container
With this following command you will create your image :
user@001:~$ sudo singularity build snakemake_tuto.sif sing.def
Then, to try it, you can open a shell within the image you just created :
user@001:~$ singularity shell snakemake_tuto.sif
A definition file is separated into two parts : the header and the body.
Header : description of the core operating system to build.
- Bootstrap : determines the bootstrap agent used to create the base operating system. Ex : library pulls a container from the Container Library (https://cloud.sylabs.io/library), docker pulls from Docker Hub (https://hub.docker.com/).
- From : defines which base image will be installed, for example Ubuntu or Debian. Here we install a specific image for a conda environment (https://cloud.sylabs.io/library/_container/5e33375916506c7b1638e577).
- IncludeCmd : if included, and if no %runscript is specified, the CMD of the Docker image takes precedence and is used as runscript.
Body :
Sections (defined by a %) :
- files : copies files from your local folder into the container
- environment : definition of environment variables.
- post : where you can download from the internet, e.g. with apt-get. You can install new software and libraries, and create configuration files or new directories.
- runscript : content executed when the container image is run.
- help : metadata displayed by run-help.
- labels : section used to add metadata to your container, as general name-value pairs.
- Many other sections exist; see the Singularity documentation linked below.
In this case the singularity image is launched by a Snakefile : we use the shell of the image and the tools within it, so we don't need to run the image with the shell command ourselves. See the Singularity documentation for further container usage.
Useful commands :
- singularity run snakemake_tuto.sif
- singularity run-help snakemake_tuto.sif
- singularity inspect snakemake_tuto.sif
- singularity shell snakemake_tuto.sif
- sudo singularity build snakemake_tuto.sif sing.def
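For instance, singularity exec (not listed above) runs a single command inside the image, which is handy to check the tools it contains :
user@001:~$ singularity exec snakemake_tuto.sif snakemake --version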
Useful links :
https://sylabs.io/guides/3.5/user-guide/index.html
https://sylabs.io/guides/3.5/user-guide/build_a_container.html?highlight=definition%20file
Test Example
The following explanation shows how to build an example Snakemake workflow using a singularity container on an HPC cluster.
Local run
From scratch :
Create a project folder on your computer.
All the following files will be in it; also add your data to this folder.
You will create, step by step, all the files that you need :
- You can obtain the data of this example with these commands :
user@001:~$ wget https://github.com/snakemake/snakemake-tutorial-data/archive/v5.4.5.tar.gz
user@001:~$ tar -xf v5.4.5.tar.gz --strip 1
- Conda environment :
- Create an environment file (environment.yml) like in the example above.
- Singularity container :
- Create a definition file (sing.def) like in the example above.
Warning : if your container needs files, make sure they are in the same folder as your definition file, or give the right path.
- Build the image with the following command :
user@001:~$ sudo singularity build snakemake_tuto.sif sing.def
- Your image (snakemake_tuto.sif) is now built with your specific conda environment.
- Snakemake workflow :
- Create a Snakefile like in the example above.
As you can see in this example you need a config.yml file. This file is very dependent on your project and data; for our example we will use the config file shown earlier.
Warning : be careful with the path of the config file, and with the path of the singularity image if you use one.
- In this example you will use the following script to create a plot; don't forget to add it to your folder as scripts/plot_quals.py.
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from pysam import VariantFile

# histogram of the quality scores of the variant calls
quals = [record.qual for record in VariantFile(snakemake.input[0])]
plt.hist(quals)
plt.savefig(snakemake.output[0])
Your folder needs to have the following structure to work with our example.
user@001:~$ tree
.
├── config_files
│ └── config.yml
├── data
│ ├── genome.fa
│ ├── genome.fa.amb
│ ├── genome.fa.ann
│ ├── genome.fa.bwt
│ ├── genome.fa.fai
│ ├── genome.fa.pac
│ ├── genome.fa.sa
│ └── samples
│ ├── A.fastq
│ ├── B.fastq
│ └── C.fastq
├── img
│ ├── environment.yml
│ ├── sing.def
│ └── snakemake_tuto.sif
├── scripts
│ └── plot_quals.py
└── Snakefile
At this step you can already run the example locally with the following shell command.
user@001:~$ snakemake --use-singularity
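Tip : before the real run, a dry run (snakemake -np, from the command list above) shows what would be executed without running anything :
user@001:~$ snakemake -np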
HPC Run
To run our Snakemake workflow on an HPC rather than locally, we need to create two more files.
- Create a bash script (Slurm job, job.sh) like in the example above.
- Create a cluster config file (cluster.json) like in the following example.
{ "__default__" : { "jobname": "default", "c" : 1, "ntasks" : 1, "npernode" : 1, "mem": 4000, "time": "00:02:00" }, "bwa_map" : { "jobname": "bwa", "c": 8, "ntasks": 1, "npernode" : 1, "mem": 4000, "time": "00:02:00" }, "samtools_sort" : { "jobname": "samsort", "c": 1, "ntasks": 1, "npernode" : 1, "mem": 4000, "time": "00:02:00" }, "samtools_index" : { "jobname": "samidx", "c": 1, "ntasks": 1, "npernode" : 1, "mem": 4000, "time": "00:02:00" }, "bcftools_call" : { "jobname": "bcfcall", "c": 1, "ntasks": 1, "npernode" : 1, "mem": 4000, "time": "00:02:00" }, "plot_quals" : { "jobname": "plot", "c": 1, "ntasks": 1, "npernode" : 1, "mem": 4000, "time": "00:02:00" } }
user@cluster001:~$ tree
.
├── config_files
│   ├── cluster.json
│   └── config.yml
├── data
│   ├── genome.fa
│   ├── genome.fa.amb
│   ├── genome.fa.ann
│   ├── genome.fa.bwt
│   ├── genome.fa.fai
│   ├── genome.fa.pac
│   ├── genome.fa.sa
│   └── samples
│       ├── A.fastq
│       ├── B.fastq
│       └── C.fastq
├── img
│   ├── environment.yml
│   ├── sing.def
│   └── snakemake_tuto.sif
├── job.sh
├── scripts
│   └── plot_quals.py
└── Snakefile
Now you need to get your folder onto the HPC :
- You already created all the files on the HPC.
- You can use rsync.
- You can create a gitlab project. (Recommended)
HPC : CBIB
- It is recommended not to work on the head node. To avoid that, use the sinteractive module :
user@cluster001:~$ module load sinteractive
user@cluster001:~$ sinteractive
Now you can launch your workflow with the following command :
user@cluster001:~$ sbatch job.sh
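You can then follow the main job and the sub-jobs spawned by Snakemake :
user@cluster001:~$ squeue -u $USER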
DONE !!
ForgeMia
Container :
You can add containers to the container registry of your project.
For now, it works with Docker containers.
#TODO Work with a singularity container :
user : your gitlab username
passwd : a personal access token created on your gitlab
singularity push --docker-username user --docker-password passwd container.sif oras://gitlab-registry/user/project:latest
Useful links :
https://souchal.pages.in2p3.fr/hugo-perso/2019/09/20/tutorial-singularity-and-docker/
https://forgemia.inra.fr/adminforgemia/doc-public/-/wikis/Gitlab-Container-Registry
Data :
https://docs.gitlab.com/ce/administration/lfs/manage_large_binaries_with_git_lfs.html