Commit 05d051e0 authored by Celine Noirot's avatar Celine Noirot
Browse files

Merge branch 'dev_docs' into 'dev'

Dev docs

See merge request !14
parents 57e6e8b9 a689edb8
Pipeline #48008 skipped with stage
......@@ -2,6 +2,7 @@
from __future__ import print_function
from collections import OrderedDict
import re
import os
regexes = {
'metagWGS': ['v_pipeline.txt', r"(\S+)"],
......@@ -49,11 +50,12 @@ results['Eggnog-Mapper'] = '<span style="color:#999999;\">N/A</span>'
# Search each file using its regex
for k, v in regexes.items():
with open(v[0]) as x:
versions = x.read()
match = re.search(v[1], versions)
if match:
results[k] = "v{}".format(match.group(1))
if os.path.exists(v[0]):
with open(v[0]) as x:
versions = x.read()
match = re.search(v[1], versions)
if match:
results[k] = "v{}".format(match.group(1))
# Remove software set to false in results
for k in results:
......
......@@ -10,9 +10,9 @@
> ```
> sample,fastq_1,fastq_2
> a1,$DASTASET/a1_R1.fastq.gz,$DASTASET/a1_R2.fastq.gz
> a2,$DASTASET/a2_R1.fastq.gz,$DASTASET/a2_R2.fastq.gz
> c,$DASTASET/c_R1.fastq.gz,$DASTASET/c_R2.fastq.gz
> a1,$DATASET/a1_R1.fastq.gz,$DATASET/a1_R2.fastq.gz
> a2,$DATASET/a2_R1.fastq.gz,$DATASET/a2_R2.fastq.gz
> c,$DATASET/c_R1.fastq.gz,$DATASET/c_R2.fastq.gz
> ```
4. Run a basic script:
......@@ -33,7 +33,7 @@
> nextflow run -profile test_genotoul_workq metagwgs/main.nf \
> --type 'SR' \
> --input 'metagwgs-test-datasets/small/input/samplesheet.csv' \
> --skip_host_filter --skip_kaiju
> --skip_host_filter --skip_kaiju --stop_at_clean
> ```
> **NOTE:** you can change Nextflow and Singularity versions with other versions available on the cluster (see all versions with `search_module ToolName`). Nextflow version must be >= v20 and Singularity version must be >= v3.
......
......@@ -5,18 +5,17 @@
1. Install metagwgs as described here: [installation doc](../docs/installation.md)
2. Get datasets: two datasets are currently available for these functional tests at `https://forgemia.inra.fr/genotoul-bioinfo/metagwgs-test-datasets.git`
Replace "\<dataset\>" with either "small" or "mag":
```
git clone --branch <dataset> git@forgemia.inra.fr:genotoul-bioinfo/metagwgs-test-datasets.git
git clone git@forgemia.inra.fr:genotoul-bioinfo/metagwgs-test-datasets.git
or
wget https://forgemia.inra.fr/genotoul-bioinfo/metagwgs-test-datasets/-/archive/<dataset>/metagwgs-test-datasets-<dataset>.tar.gz
wget https://forgemia.inra.fr/genotoul-bioinfo/metagwgs-test-datasets.git
```
3. Get data banks: download [this archive](http://genoweb.toulouse.inra.fr/~choede/FT_banks_2021-12-16.tar.gz ) and decompress its contents in any folder. This archive contains data banks for:
3. Get data banks: download [this archive](http://genoweb.toulouse.inra.fr/~choede/FT_banks_2021-12-16.tar.gz) and decompress its contents in any folder. This archive contains data banks for:
- **Kaiju** (_kaijudb_refseq_2020-05-25_)
- **Diamond** (_refseq_bacteria_2021-05-20_)
- **NCBI Taxonomy** (_taxonomy_2021-08-23_)
- **NCBI Taxonomy** (_taxonomy_2021-12-7_)
- **Eggnog Mapper** (_eggnog-mapper-2.0.4-rf1_)
......@@ -30,15 +29,34 @@ To launch functional tests, you need to be located at the root of the folder whe
- by providing the results folder of a pipeline already executed
```
cd test_folder
python <metagwgs-src>/functional_tests/main.py -step 07_taxo_affi -exp_dir metagwgs-test-datasets/small/output -obs_dir ./results
export METAG_PATH="/path/to/sources"
export DATASET="/path/to/metagwgs-test-datasets"
python $METAG_PATH/functional_tests/main.py -step 07_taxo_affi -exp_dir $DATASET/small/output -obs_dir ./results
```
- by providing a script which will launch the nextflow pipeline [see example](./launch_example.sh) (this example is designed for the "small" dataset with --min_contigs_cpm>1000, using slurm)
```
mkdir test_folder
cd test_folder
cp <metagwgs-src>/functional_tests/launch_example.sh ./
python <metagwgs-src>/functional_tests/main.py -step 07_taxo_affi -exp_dir metagwgs-test-datasets/small/output -obs_dir ./results --script launch_example.sh
```
1. create working directory
```
mkdir test_folder
cd test_folder
```
2. set environment variables and load module
```
export METAG_PATH="/path/to/sources"
export DATASET="/path/to/metagwgs-test-datasets"
export DATABANK="/path/to/FT_banks_2021-10-19"
export EGGNOG_DB="$DATABANK/eggnog-mapper-2.0.4-rf1/data"
module load system/Python-3.7.4
```
3. launch functional test
```
cp $METAG_PATH/functional_tests/launch_example.sh ./
python $METAG_PATH/functional_tests/main.py -step 07_taxo_affi -exp_dir $DATASET/small/output -obs_dir ./results --script launch_example.sh
```
>**NOTE: more information on the command used to produce each dataset in [small](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs-test-datasets/-/tree/small) and [mag](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs-test-datasets/-/tree/mag) READMEs**
......@@ -132,7 +150,7 @@ To use it :
```
cut -f 1 $METAG_PATH/functional_tests/expected_processes_sr.tsv | tail -n +2 > $OUTDIR/cmd_sr.sh
```
> the commands use profile `test_genotoul_workq`
> the commands use profile `test,genotoul`
- replace path in the samplesheet :
```
sed -i -e "s,\$DATASET,$DATASET,g" $DATASET/small/input/samplesheet.csv
......
cmd outputdir DATABASES:INDEX_KAIJU DATABASES:DOWNLOAD_TAXONOMY_DB DATABASES:EGGNOG_MAPPER_DB SH:S04_FILTERED_QUAST SH:S04_STRUCTURAL_ANNOT:PROKKA SH:S04_STRUCTURAL_ANNOT:RENAME_CONTIGS_AND_GENES SH:S05_ALIGNMENT:DIAMOND SH:S05_ALIGNMENT:MINIMAP2 SH:S06_FUNC_ANNOT:BEST_HITS SH:S06_FUNC_ANNOT:CD_HIT:GLOBAL_CD_HIT SH:S06_FUNC_ANNOT:CD_HIT:INDIVIDUAL_CD_HIT SH:S06_FUNC_ANNOT:EGGNOG_MAPPER SH:S06_FUNC_ANNOT:FUNCTIONAL_ANNOT_TABLE SH:S06_FUNC_ANNOT:MERGE_QUANT_ANNOT_BEST SH:S06_FUNC_ANNOT:QUANTIFICATION:FEATURE_COUNTS SH:S06_FUNC_ANNOT:QUANTIFICATION:QUANTIFICATION_TABLE SH:S07_TAXO_AFFI:ASSIGN_TAXONOMY SH:S07_TAXO_AFFI:QUANTIF_AND_TAXONOMIC_TABLE_CONTIGS
mkdir $OUTDIR/hifi_all ; cd $OUTDIR/hifi_all ;nextflow run -profile test_genotoul_workq $METAG_PATH/main.nf --type HIFI --input `echo $DATASET`/hifi/input/samplesheet.csv --diamond_bank `echo $DATABANK`/refseq_bacteria_2021-05-20/refseq_bacteria.dmnd --eggnog_mapper_db_dir `echo $EGGNOG_DB` --taxonomy_dir `echo $DATABANK`/taxonomy_2021-08-23 $OUTDIR/hifi_all 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
mkdir $OUTDIR/hifi_stop_at_structural_annot;cd $OUTDIR/hifi_stop_at_structural_annot; nextflow run -profile test_genotoul_workq $METAG_PATH/main.nf --type HIFI --input `echo $DATASET`/hifi/input/samplesheet.csv --diamond_bank `echo $DATABANK`/refseq_bacteria_2021-05-20/refseq_bacteria.dmnd --stop_at_structural_annot $OUTDIR/hifi_stop_at_structural_annot 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0
mkdir $OUTDIR/skip_func_annot-skip_taxo_affi; cd $OUTDIR/skip_func_annot-skip_taxo_affi;cp ../nextflow.config .; nextflow run -profile test_genotoul_workq $METAG_PATH/main.nf --type HIFI --input `echo $DATASET`/hifi/input/samplesheet.csv --diamond_bank `echo $DATABANK`/refseq_bacteria_2021-05-20/refseq_bacteria.dmnd --skip_func_annot --skip_taxo_affi $OUTDIR/skip_func_annot-skip_taxo_affi 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
mkdir $OUTDIR/skip_func_annot ; cd $OUTDIR/skip_func_annot;cp ../nextflow.config .; nextflow run -profile test_genotoul_workq $METAG_PATH/main.nf --type HIFI --input `echo $DATASET`/hifi/input/samplesheet.csv --eggnog_mapper_db_dir `echo $EGGNOG_DB` --diamond_bank `echo $DATABANK`/refseq_bacteria_2021-05-20/refseq_bacteria.dmnd --taxonomy_dir `echo $DATABANK`/taxonomy_2021-08-23 –skip_func_annot $OUTDIR/skip_func_annot 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1
mkdir $OUTDIR/skip_taxo_affi; cd $OUTDIR/skip_taxo_affi;nextflow run -profile test_genotoul_workq $METAG_PATH/main.nf --type HIFI --input `echo $DATASET`/hifi/input/samplesheet.csv --diamond_bank `echo $DATABANK`/refseq_bacteria_2021-05-20/refseq_bacteria.dmnd --eggnog_mapper_db_dir `echo $EGGNOG_DB` --skip_taxo_affi $OUTDIR/skip_taxo_affi 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
mkdir $OUTDIR/hifi_all ; cd $OUTDIR/hifi_all ;nextflow run -profile test,genotoul $METAG_PATH/main.nf --type HIFI --input `echo $DATASET`/hifi/input/samplesheet.csv --diamond_bank `echo $DATABANK`/refseq_bacteria_2021-05-20/refseq_bacteria.dmnd --eggnog_mapper_db_dir `echo $EGGNOG_DB` --accession2taxid `echo $DATABANK`/taxonomy_2021-12-7/prot.accession2taxid.FULL --taxdump `echo $DATABANK`/taxonomy_2021-12-7/new_taxdump $OUTDIR/hifi_all 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
mkdir $OUTDIR/hifi_stop_at_structural_annot;cd $OUTDIR/hifi_stop_at_structural_annot; nextflow run -profile test,genotoul $METAG_PATH/main.nf --type HIFI --input `echo $DATASET`/hifi/input/samplesheet.csv --diamond_bank `echo $DATABANK`/refseq_bacteria_2021-05-20/refseq_bacteria.dmnd --stop_at_structural_annot $OUTDIR/hifi_stop_at_structural_annot 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0
mkdir $OUTDIR/skip_func_annot-skip_taxo_affi; cd $OUTDIR/skip_func_annot-skip_taxo_affi;cp ../nextflow.config .; nextflow run -profile test,genotoul $METAG_PATH/main.nf --type HIFI --input `echo $DATASET`/hifi/input/samplesheet.csv --diamond_bank `echo $DATABANK`/refseq_bacteria_2021-05-20/refseq_bacteria.dmnd --skip_func_annot --skip_taxo_affi $OUTDIR/skip_func_annot-skip_taxo_affi 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
mkdir $OUTDIR/skip_func_annot ; cd $OUTDIR/skip_func_annot;cp ../nextflow.config .; nextflow run -profile test,genotoul $METAG_PATH/main.nf --type HIFI --input `echo $DATASET`/hifi/input/samplesheet.csv --eggnog_mapper_db_dir `echo $EGGNOG_DB` --diamond_bank `echo $DATABANK`/refseq_bacteria_2021-05-20/refseq_bacteria.dmnd --accession2taxid `echo $DATABANK`/taxonomy_2021-12-7/prot.accession2taxid.FULL --taxdump `echo $DATABANK`/taxonomy_2021-12-7/new_taxdump --skip_func_annot $OUTDIR/skip_func_annot 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1
mkdir $OUTDIR/skip_taxo_affi; cd $OUTDIR/skip_taxo_affi;nextflow run -profile test,genotoul $METAG_PATH/main.nf --type HIFI --input `echo $DATASET`/hifi/input/samplesheet.csv --diamond_bank `echo $DATABANK`/refseq_bacteria_2021-05-20/refseq_bacteria.dmnd --eggnog_mapper_db_dir `echo $EGGNOG_DB` --skip_taxo_affi $OUTDIR/skip_taxo_affi 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
This diff is collapsed.
#!/bin/bash
sbatch -W -p workq -J functional_test --mem=6G \
--wrap="module load bioinfo/Nextflow-v21.04.1 ; module load system/singularity-3.7.3 ; nextflow run -profile test_genotoul_workq main.nf --type 'SR' --input 'metagwgs-test-datasets/small/input/samplesheet.csv' --host_fasta 'metagwgs-test-datasets/small/input/host/Homo_sapiens.GRCh38_chr21.fa' --host_index 'metagwgs-test-datasets/small/input/host/Homo_sapiens.GRCh38_chr21.fa.{amb,ann,bwt,pac,sa}' --kaiju_db_dir 'FT_banks_2021-10-19/kaijudb_refseq_2020-05-25' --min_contigs_cpm 1000 --diamond_bank 'FT_banks_2021-10-19/refseq_bacteria_2021-05-20/refseq_bacteria_100000.dmnd' --eggnog_mapper_db_dir 'FT_banks_2021-10-19/eggnog-mapper-2.0.4-rf1/data' --taxonomy_dir 'FT_banks_2021-10-19/taxonomy_2021-08-23' --stop_at_clean -with-report -with-timeline -with-trace -with-dag"
\ No newline at end of file
--wrap="module load bioinfo/Nextflow-v21.04.1 ; module load system/singularity-3.7.3 ; nextflow run -profile test,genotoul $METAG_PATH/main.nf --type 'SR' --input '$DATASET/small/input/samplesheet.csv' --host_fasta '$DATASET/small/input/host/Homo_sapiens.GRCh38_chr21.fa' --host_index '$DATASET/small/input/host/Homo_sapiens.GRCh38_chr21.fa.{amb,ann,bwt,pac,sa}' --kaiju_db_dir '$DATABANK/kaijudb_refseq_2020-05-25' --min_contigs_cpm 1000 --diamond_bank '$DATABANK/refseq_bacteria_2021-05-20/refseq_bacteria_100000.dmnd' --eggnog_mapper_db_dir '$EGGNOG_DB' --accession2taxid '$DATABANK/taxonomy_2021-12-7/prot.accession2taxid.FULL' --taxdump '$DATABANK/taxonomy_2021-12-7/new_taxdump' -with-report -with-timeline -with-trace -with-dag"
......@@ -165,7 +165,7 @@ workflow {
skip_clean = true
}
if ( !(params.stop_at_structural_annot) && !(params.diamond_bank) ) {
if ( !(params.stop_at_clean) && !(params.stop_at_assembly) && !(params.stop_at_filtering) && !(params.stop_at_structural_annot) && !(params.diamond_bank) ) {
exit 1, "You must specify --stop_at_structural_annot or specify a diamond bank with --diamond_bank"
}
header = getAndCheckHeader()
......
......@@ -12,6 +12,7 @@ process PROKKA {
"""
prokka --metagenome --noanno --rawproduct --outdir PROKKA_${sampleId} --prefix ${sampleId} ${assembly_file} --centre X --compliant --cpus ${task.cpus}
rm PROKKA_${sampleId}/*.gbk
gt gff3validator PROKKA_${sampleId}/${sampleId}.gff
"""
}
......
......@@ -24,8 +24,7 @@ process READS_DEDUPLICATION {
samtools idxstats ${sampleId}.filtered.bam > ${sampleId}.count_reads_on_contigs.idxstats
samtools flagstat ${sampleId}.filtered.bam > ${sampleId}.count_reads_on_contigs.flagstat
samtools sort -n -o ${sampleId}.filtered.sort.bam ${sampleId}.filtered.bam
bedtools bamtofastq -i ${sampleId}.filtered.sort.bam -fq ${sampleId}_R1_dedup.fastq -fq2 ${sampleId}_R2_dedup.fastq
gzip ${sampleId}_R1_dedup.fastq ; gzip ${sampleId}_R2_dedup.fastq
samtools fastq -N -1 ${sampleId}_R1_dedup.fastq.gz -2 ${sampleId}_R2_dedup.fastq.gz ${sampleId}.filtered.sort.bam
rm ${sampleId}.sort.bam
rm ${sampleId}.fixmate.bam
rm ${sampleId}.fixmate.positionsort.bam
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment