Commit 38e2f9aa authored by Slaheddine Kastalli's avatar Slaheddine Kastalli
Browse files

Merge branch 'dev' of forgemia.inra.fr:cedric.midoux/dada2 into dev

parents 254025a7 4ee1731b
{
"__default__" :
{
"out" : "LOGS/",
"err" : "LOGS/",
"queue" : "short.q,long.q,maiage.q",
"cluster" : ""
}
}
{
"THREADS": 8,
"SAMPLES": ["4A", "5A", "6A", "7A", "8A", "9A", "10A", "11A", "12A"],
"FIVE_PRIMER": {"4A": "GTGYCAGCMGCCGCGGTA", "5A": "GTGYCAGCMGCCGCGGTA","6A": "GTGYCAGCMGCCGCGGTA","7A": "GTGYCAGCMGCCGCGGTA","8A": "GTGYCAGCMGCCGCGGTA","9A": "GTGYCAGCMGCCGCGGTA","10A": "GTGYCAGCMGCCGCGGTA","11A": "GTGYCAGCMGCCGCGGTA","12A": "GTGYCAGCMGCCGCGGTA"},
"THREE_PRIMER": {"4A": "ACTYAAAKGAATTGRCGGGG", "5A": "ACTYAAAKGAATTGRCGGGG","6A": "ACTYAAAKGAATTGRCGGGG","7A": "ACTYAAAKGAATTGRCGGGG","8A": "ACTYAAAKGAATTGRCGGGG","9A": "ACTYAAAKGAATTGRCGGGG","10A": "ACTYAAAKGAATTGRCGGGG","11A": "ACTYAAAKGAATTGRCGGGG","12A": "ACTYAAAKGAATTGRCGGGG"},
"DATABASE": "/db/frogs_databanks/assignation/16S/silva_132_16S_pintail100/silva_132_16S_pintail100.fasta"
}
{
"THREADS": 8,
"SAMPLES": ["4A", "5A", "6A", "7A", "8A", "9A", "10A", "11A", "12A"],
"FIVE_PRIMER": {"4A": "GTGYCAGCMGCCGCGGTA", "5A": "GTGYCAGCMGCCGCGGTA","6A": "GTGYCAGCMGCCGCGGTA","7A": "GTGYCAGCMGCCGCGGTA","8A": "GTGYCAGCMGCCGCGGTA","9A": "GTGYCAGCMGCCGCGGTA","10A": "GTGYCAGCMGCCGCGGTA","11A": "GTGYCAGCMGCCGCGGTA","12A": "GTGYCAGCMGCCGCGGTA"},
"THREE_PRIMER": {"4A": "ACTYAAAKGAATTGRCGGGG", "5A": "ACTYAAAKGAATTGRCGGGG","6A": "ACTYAAAKGAATTGRCGGGG","7A": "ACTYAAAKGAATTGRCGGGG","8A": "ACTYAAAKGAATTGRCGGGG","9A": "ACTYAAAKGAATTGRCGGGG","10A": "ACTYAAAKGAATTGRCGGGG","11A": "ACTYAAAKGAATTGRCGGGG","12A": "ACTYAAAKGAATTGRCGGGG"},
"DATABASE": "/db/frogs_databanks/assignation/16S/silva_132_16S_pintail100/silva_132_16S_pintail100.fasta"
}
#!/bin/bash
# Submit the Snakemake workflow to the SGE cluster.
# Usage: ./<this script> <Snakefile>
#   $1 - path to the Snakefile to execute

# Fail early with a usage message instead of passing an empty
# --snakefile argument to snakemake.
if [ -z "$1" ]; then
    echo "Usage: $0 <Snakefile>" >&2
    exit 1
fi

export PYTHONPATH=''
source /usr/local/genome/Anaconda2-5.1.0/etc/profile.d/conda.sh
conda activate snakemake-5.7.4

# Cluster job stdout/stderr go under LOGS/ (see cluster.json).
mkdir -p LOGS/

# "$1" is quoted so paths with spaces do not break the command line.
snakemake \
--snakefile "$1" \
--jobscript jobscript.sh \
--cluster-config cluster.json \
--cluster "qsub -V -cwd -N {rule} -o {cluster.out} -e {cluster.err} -q {cluster.queue} -pe thread {threads} {cluster.cluster}" \
--keep-going \
--restart-times 5 \
--jobs 80 \
--wait-for-files \
--latency-wait 150 \
--verbose \
--printshellcmds
# Taxonomic affiliation of the DADA2 ASVs with FROGS (affiliation_OTU.py),
# using the reference database configured under config["DATABASE"].
# NOTE(review): `conda activate` works here because the Snakefile sets a
# shell.prefix that sources conda first — confirm when running standalone.
rule addAffi:
input:
biom = "work/dada/dada.biom",
fasta = "work/dada/seqtab.fasta"
output:
biom = "work/FROGS/affiliation.biom",
html = "work/FROGS/affiliation.html",
log = "work/FROGS/affiliation.log"
threads:
config["THREADS"]
params:
database = config["DATABASE"]
shell:
"conda activate frogs-3.2.0 "
"&& "
"affiliation_OTU.py "
"--input-fasta {input.fasta} "
"--input-biom {input.biom} "
"--nb-cpu {threads} "
"--log-file {output.log} "
"--output-biom {output.biom} "
"--summary {output.html} "
"--reference {params.database} "
"&& "
"conda deactivate"
# Summary metrics on the affiliated BIOM: cluster statistics and
# affiliation statistics, both rendered as HTML reports by FROGS.
rule stats:
    input:
        biom = "work/FROGS/affiliation.biom"
    output:
        html_cluster = "work/FROGS/clusters_metrics.html",
        html_affiliations = "work/FROGS/affiliations_metrics.html"
    shell:
        # Each fragment ends with a trailing space so the concatenated
        # command stays well separated (the original was missing the
        # space before "&&" after {output.html_cluster} and after
        # perc_query_coverage, unlike every other rule in this workflow).
        "conda activate frogs-3.2.0 "
        "&& "
        "clusters_stat.py "
        "--input-biom {input.biom} "
        "--output-file {output.html_cluster} "
        "&& "
        "affiliations_stat.py "
        "--input-biom {input.biom} "
        "--output-file {output.html_affiliations} "
        "--taxonomic-ranks Domain Phylum Class Order Family Genus Species "
        "--rarefaction-ranks Genus "
        "--multiple-tag blast_affiliations "
        "--tax-consensus-tag blast_taxonomy "
        "--identity-tag perc_identity "
        "--coverage-tag perc_query_coverage "
        "&& "
        "conda deactivate"
# Build a phylogenetic tree of the ASV sequences with FROGS tree.py,
# producing a Newick file plus an HTML summary.
rule tree:
input:
fasta = "work/dada/seqtab.fasta",
biom = "work/FROGS/affiliation.biom"
output:
html = "work/FROGS/tree.html",
nwk = "work/FROGS/tree.nwk",
log = "work/FROGS/tree.log"
threads:
config["THREADS"]
shell:
"conda activate frogs-3.2.0 "
"&& "
"tree.py "
"--nb-cpus {threads} "
"--input-sequences {input.fasta} "
"--biom-file {input.biom} "
"--out-tree {output.nwk} "
"--html {output.html} "
"--log-file {output.log} "
"&& "
"conda deactivate"
# Convert the FROGS-enriched BIOM into a standard BIOM plus a separate
# TSV holding the blast metadata stripped out during standardisation.
rule stdBIOM:
    input:
        biom = "work/FROGS/affiliation.biom"
    output:
        biom = "work/FROGS/abundance.biom",
        tsv = "work/FROGS/blast_informations.std.tsv"
    shell:
        # Trailing space added after {output.tsv} (the original glued it
        # to "&&", unlike the other rules in this workflow).
        "conda activate frogs-3.2.0 "
        "&& "
        "biom_to_stdBiom.py "
        "--input-biom {input.biom} "
        "--output-biom {output.biom} "
        "--output-metadata {output.tsv} "
        "&& "
        "conda deactivate"
# Export the affiliated BIOM as a flat abundance TSV, with multi-hit
# affiliations written to a separate file.
rule biom_to_tsv:
    input:
        biom = "work/FROGS/affiliation.biom",
        fasta = "work/dada/seqtab.fasta"
    output:
        tsv = "work/FROGS/abundance.tsv",
        multi = "work/FROGS/multihits.tsv"
    shell:
        # Trailing space added after {output.multi} (the original glued
        # it to "&&", unlike the other rules in this workflow).
        "conda activate frogs-3.2.0 "
        "&& "
        "biom_to_tsv.py "
        "--input-biom {input.biom} "
        "--input-fasta {input.fasta} "
        "--output-tsv {output.tsv} "
        "--output-multi-affi {output.multi} "
        "&& "
        "conda deactivate"
# Copying is cheap: run it on the submit host rather than the cluster.
localrules: copy
# Copy any FROGS result file into report/ (cp -u: only when the source
# is newer than the destination; -v: log what was copied).
rule copy:
input:
"work/FROGS/{file}"
output:
"report/{file}"
shell:
"cp -uv {input} {output}"
# Denoise one filtered sample with DADA2; the actual work happens in
# dada2.R, which receives this rule's input/output/threads via the
# `snakemake` object.
rule dada2:
input:
"work/filter/{sample}.fastq.gz"
output:
"work/dada/{sample}.rds"
threads:
config["THREADS"]
script:
"dada2.R"
# Merge the per-sample DADA2 results into a single ASV table, remove
# chimeras, and export it as RDS, FASTA, TSV and BIOM (see
# makeSequenceTable.R).
rule makeSequenceTable:
input:
expand("work/dada/{sample}.rds", sample=SAMPLES)
output:
rds = "work/dada/seqtab.rds",
fasta = "work/dada/seqtab.fasta",
tsv = "work/dada/seqtab.tsv",
biom = "work/dada/dada.biom"
threads:
config["THREADS"]
script:
"makeSequenceTable.R"
# Per-sample summary table (raw reads, post-process reads, ASV count)
# built by metrics.R from the FastQC zips and the chimera-free table.
rule metrics:
input:
rds = "work/dada/seqtab.rds",
fastqc = expand("work/fastqc/{sample}_fastqc.zip", sample=SAMPLES)
output:
tsv = "work/dada/metrics.tsv"
script:
"metrics.R"
# Workflow entry point: global shell settings, configuration, the list
# of final targets, and the included rule modules.
shell.executable("/bin/bash")
# Every rule's shell command is prefixed with this, so rules can call
# `conda activate <env>` directly.
shell.prefix("source /usr/local/genome/Anaconda2-5.1.0/etc/profile.d/conda.sh;")
configfile: "./config.json"
SAMPLES=config["SAMPLES"]
# Pseudo-rule whose inputs are all final artefacts of the workflow.
rule all:
input:
"report/multiqc_report.html",
expand("work/dada/{sample}.rds", sample=SAMPLES),
"work/dada/seqtab.fasta",
"work/dada/seqtab.tsv",
"work/dada/metrics.tsv",
"work/FROGS/affiliation.biom",
"report/clusters_metrics.html",
"report/affiliations_metrics.html",
"report/abundance.biom",
"report/abundance.tsv",
# "report/tree.nwk"
include: "quality.smk"
include: "preprocess.smk"
include: "dada2.smk"
include: "affiliation.smk"
# Trim the 5' and 3' primers from the raw reads with cutadapt; reads in
# which neither primer is found are discarded. Primers are looked up
# per sample from config["FIVE_PRIMER"] / config["THREE_PRIMER"].
rule cutadapt:
input:
"DATA/{sample}.fastq.gz"
output:
"work/cutadapt/{sample}.fastq.gz"
params:
five = lambda wildcards: config["FIVE_PRIMER"][wildcards.sample],
three = lambda wildcards: config["THREE_PRIMER"][wildcards.sample]
shell:
"conda activate cutadapt-2.5 "
"&& "
"cutadapt "
"-g {params.five} "
"-a {params.three} "
"--error-rate 0.1 "
"--discard-untrimmed "
"--match-read-wildcards "
"-o {output} "
"{input} "
"&& "
"conda deactivate"
# Quality-filter the primer-trimmed reads with DADA2's filterAndTrim
# (see filterAndTrim.R).
rule filter:
input:
"work/cutadapt/{sample}.fastq.gz"
output:
"work/filter/{sample}.fastq.gz"
script:
"filterAndTrim.R"
# Run FastQC on each raw FASTQ; the zip output is later parsed by
# metrics.R and aggregated by MultiQC.
rule fastqc:
input:
"DATA/{sample}.fastq.gz"
output:
zip = "work/fastqc/{sample}_fastqc.zip",
html = "work/fastqc/{sample}_fastqc.html"
params:
output = "work/fastqc/"
shell:
"conda activate fastqc-0.11.8 "
"&& "
"fastqc "
"{input} "
"--noextract "
"--outdir {params.output} "
"&& "
"conda deactivate "
# Aggregate all FastQC reports into a single MultiQC HTML report
# placed directly in report/.
rule multiqc:
input:
expand("work/fastqc/{sample}_fastqc.zip", sample=SAMPLES)
output:
html = "report/multiqc_report.html",
params:
output = "report/"
shell:
"conda activate multiqc-1.8 "
"&& "
"multiqc "
"--no-data-dir "
"--outdir {params.output} "
"{input} "
"&& "
"conda deactivate "
Version: 1.0
RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX
#!/bin/bash
# Submit the Snakemake workflow to the SGE cluster.
# Usage: ./<this script> <Snakefile>
#   $1 - path to the Snakefile to execute

# Fail early with a usage message instead of passing an empty
# --snakefile argument to snakemake.
if [ -z "$1" ]; then
    echo "Usage: $0 <Snakefile>" >&2
    exit 1
fi

export PYTHONPATH=''
source /usr/local/genome/Anaconda2-5.1.0/etc/profile.d/conda.sh
conda activate snakemake-5.7.4

# Cluster job stdout/stderr go under LOGS/ (see cluster.json).
mkdir -p LOGS/

# "$1" is quoted so paths with spaces do not break the command line.
snakemake \
--snakefile "$1" \
--jobscript jobscript.sh \
--cluster-config cluster.json \
--cluster "qsub -V -cwd -N {rule} -o {cluster.out} -e {cluster.err} -q {cluster.queue} -pe thread {threads} {cluster.cluster}" \
--keep-going \
--restart-times 5 \
--jobs 80 \
--wait-for-files \
--latency-wait 150 \
--verbose \
--printshellcmds
#!/bin/bash
# Render the workflow DAG as a PDF under report/.
# Usage: ./<this script> <Snakefile>
#   $1 - path to the Snakefile whose DAG is drawn

if [ -z "$1" ]; then
    echo "Usage: $0 <Snakefile>" >&2
    exit 1
fi

export PYTHONPATH=''
source /usr/local/genome/Anaconda2-5.1.0/etc/profile.d/conda.sh
conda activate snakemake-5.7.4
mkdir -p report/

# $() replaces legacy backticks, and every expansion is quoted so a
# Snakefile path containing spaces still produces a valid output path.
snakemake \
--snakefile "$1" \
--dag \
| dot -Tpdf > "./report/$(basename "$1" .smk)_graph.pdf"
#!/bin/bash
# Dry-run the whole workflow (--forceall) and print the shell commands
# that would be executed, without submitting anything.
# Usage: ./<this script> <Snakefile>

if [ -z "$1" ]; then
    echo "Usage: $0 <Snakefile>" >&2
    exit 1
fi

export PYTHONPATH=''
source /usr/local/genome/Anaconda2-5.1.0/etc/profile.d/conda.sh
conda activate snakemake-5.7.4

# "$1" is quoted so paths with spaces do not break the command line.
snakemake \
--snakefile "$1" \
--jobscript jobscript.sh \
--cluster-config cluster.json \
--dryrun \
--forceall \
--printshellcmds \
--verbose
#!/bin/bash
# Dry-run the pending jobs of the workflow and print the shell commands
# that would be executed, without submitting anything.
# Usage: ./<this script> <Snakefile>

if [ -z "$1" ]; then
    echo "Usage: $0 <Snakefile>" >&2
    exit 1
fi

export PYTHONPATH=''
source /usr/local/genome/Anaconda2-5.1.0/etc/profile.d/conda.sh
conda activate snakemake-5.7.4

# "$1" is quoted so paths with spaces do not break the command line.
snakemake \
--snakefile "$1" \
--jobscript jobscript.sh \
--cluster-config cluster.json \
--dryrun \
--printshellcmds \
--verbose
#!/bin/bash
# Generate the Snakemake HTML run report under report/.
# Usage: ./<this script> <Snakefile>

if [ -z "$1" ]; then
    echo "Usage: $0 <Snakefile>" >&2
    exit 1
fi

export PYTHONPATH=''
source /usr/local/genome/Anaconda2-5.1.0/etc/profile.d/conda.sh
conda activate snakemake-5.7.4
mkdir -p report/

# $() replaces legacy backticks, and every expansion is quoted so a
# Snakefile path containing spaces still produces a valid report path.
snakemake \
--snakefile "$1" \
--jobscript jobscript.sh \
--cluster-config cluster.json \
--report "report/$(basename "$1" .smk)_report.html" \
--verbose
# Per-sample DADA2 denoising, driven by Snakemake's `snakemake` object:
# dereplicate one filtered FASTQ, learn the error model from the same
# file, run the core dada algorithm, and save the result as RDS for the
# downstream merge step (makeSequenceTable.R).
library(dada2)

dereplicated <- derepFastq(snakemake@input[[1]], verbose = TRUE)
error_model <- learnErrors(snakemake@input[[1]], multithread = snakemake@threads, verbose = TRUE)
denoised <- dada(dereplicated, err = error_model, multithread = snakemake@threads, verbose = TRUE)
saveRDS(denoised, snakemake@output[[1]])
# Quality filtering of one primer-trimmed FASTQ with DADA2, driven by
# Snakemake's `snakemake` object: reads containing any N and phiX reads
# are removed, and the output is written gzip-compressed.
library(dada2)

filterAndTrim(
  snakemake@input[[1]],
  snakemake@output[[1]],
  maxN = 0,          # discard reads with ambiguous bases
  rm.phix = TRUE,    # drop phiX spike-in reads
  compress = TRUE,
  verbose = TRUE
)
#!/bin/sh
# Snakemake cluster jobscript template: one instance of this script is
# generated and submitted via qsub for every cluster job.
# The {properties} and {exec_job} placeholders below are filled in by
# snakemake; they are not shell syntax and must not be edited.
# properties = {properties}
export PYTHONPATH=''
source /usr/local/genome/Anaconda2-5.1.0/etc/profile.d/conda.sh
conda activate snakemake-5.7.4
{exec_job}
# Merge all per-sample DADA2 results (one RDS per sample, provided by
# Snakemake) into a single ASV table, remove chimeras, and export the
# chimera-free table as RDS, FASTA, TSV and BIOM.
library(dada2)
library(biomformat)

# Read every per-sample result and name it after its file stem
# (e.g. "work/dada/4A.rds" -> "4A").
per_sample <- lapply(snakemake@input, readRDS)
names(per_sample) <- vapply(
  snakemake@input,
  function(path) basename(tools::file_path_sans_ext(path)),
  character(1)
)

asv_table <- makeSequenceTable(per_sample, orderBy = "abundance")
asv_nochim <- removeBimeraDenovo(asv_table, method = "consensus",
                                 multithread = snakemake@threads, verbose = TRUE)

saveRDS(asv_nochim, snakemake@output$rds)
# FASTA ids are the sequences themselves, matching the BIOM/TSV rows.
uniquesToFasta(asv_nochim, snakemake@output$fasta, ids = getSequences(asv_nochim))
write.table(t(asv_nochim), snakemake@output$tsv, sep = "\t", quote = FALSE)
write_biom(make_biom(t(asv_nochim)), snakemake@output$biom)
# Build the per-sample metrics table written by the `metrics` rule:
# raw read counts (parsed from the FastQC zip archives), post-process
# read counts and ASV counts (from the chimera-free sequence table).
library(dplyr)
library(tibble)

# Raw read count per sample, extracted from fastqc_data.txt inside each
# FastQC zip. vapply (instead of sapply) guarantees exactly one numeric
# value per sample and fails loudly if a report cannot be parsed,
# rather than silently changing the result's type.
reads_input <- vapply(snakemake@config$SAMPLES, function(x) {
  as.numeric(system(
    sprintf(
      "unzip -p work/fastqc/%s_fastqc.zip %s_fastqc/fastqc_data.txt | grep 'Total Sequences' | cut -f2",
      x, x),
    intern = TRUE
  ))
}, numeric(1)) %>% as.data.frame() %>% rownames_to_column("Sample")

seqtab.nochim <- readRDS(snakemake@input$rds)
# Reads that survived the whole pipeline, per sample.
reads_output <- rowSums(seqtab.nochim) %>% as.data.frame() %>% rownames_to_column("Sample")
# Number of distinct ASVs observed in each sample.
nb_asv <- rowSums(seqtab.nochim > 0) %>% as.data.frame() %>% rownames_to_column("Sample")

# Full joins keep samples that are missing from either side (NA cells).
metrics <- dplyr::full_join(x = reads_input, y = reads_output, by = "Sample") %>%
dplyr::full_join(y = nb_asv, by = "Sample")
colnames(metrics) <- c('Sample', 'Input reads', 'Post-process reads', 'Nb ASV')
write.table(metrics, snakemake@output$tsv, sep = "\t", quote = FALSE, row.names = FALSE)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment