Commit 97ab5025 authored by Jean Mainguy

Merge branch 'dev' of forgemia.inra.fr:genotoul-bioinfo/metagwgs into dev

parents 6b2f2dda 814e6a51
@@ -73,27 +73,13 @@ At the end of the build, two files (`metagwgs.sif` and `eggnog_mapper.sif`) must
**WARNING:** to ensure Nextflow can find the _.sif_ files, we encourage you to change the _nextflow.config_ file in metagWGS to contain these lines:
```
process {
-   container = '<PATH>/metagwgs.sif'
+   container = '$SING_IMG_FOLDER/metagwgs.sif'
  withLabel: EGGNOG {
-     container = '<PATH>/eggnog_mapper.sif'
+     container = '$SING_IMG_FOLDER/eggnog_mapper.sif'
  }
}
```
- Where \<PATH\> leads to the directory where the singularity images are built/downloaded.
- **WARNING:** to ensure Nextflow can find the _.sif_ files, we encourage you to change the _nextflow.config_ file in metagWGS at these lines:
- ```
- process {
-   container = '<PATH>/metagwgs.sif'
-   withLabel: eggnog {
-     container = '<PATH>/eggnog_mapper.sif'
-   }
-   withLabel: mosdepth {
-     container = '<PATH>/mosdepth.sif'
-   }
- }
- ```
- Where \<PATH\> leads to the directory where the singularity images are built/downloaded.
+ Where $SING_IMG_FOLDER leads to the directory where the singularity images are built/downloaded.
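For example, the variable can be exported before launching Nextflow (a sketch; the path is a placeholder for wherever you built or downloaded the images):
```
export SING_IMG_FOLDER=/path/to/singularity/images
ls $SING_IMG_FOLDER   # should list metagwgs.sif and eggnog_mapper.sif
```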
## V. Use metagWGS
......
@@ -3,15 +3,24 @@
## I. Pre-requisites
1. Install metagwgs as described here: [installation doc](../docs/installation.md)
- 2. Get datasets: three datasets are currently available for these functional tests at `https://forgemia.inra.fr/genotoul-bioinfo/metagwgs-test-datasets.git`: small, mag and hifi ([descriptions here](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs-test-datasets/-/blob/master/README.md))
+ 2. Get datasets: two datasets are currently available for these functional tests at `https://forgemia.inra.fr/genotoul-bioinfo/metagwgs-test-datasets.git`.
+ Replace "\<dataset\>" with either "small" or "mag":
```
- git clone git@forgemia.inra.fr:genotoul-bioinfo/metagwgs-test-datasets.git
+ git clone --branch <dataset> git@forgemia.inra.fr:genotoul-bioinfo/metagwgs-test-datasets.git
or
- wget https://forgemia.inra.fr/genotoul-bioinfo/metagwgs-test-datasets/-/archive/metagwgs-test-datasets.tar.gz
+ wget https://forgemia.inra.fr/genotoul-bioinfo/metagwgs-test-datasets/-/archive/<dataset>/metagwgs-test-datasets-<dataset>.tar.gz
```
3. Get data banks: download [this archive](http://genoweb.toulouse.inra.fr/~choede/FT_banks_2021-10-19.tar.gz) and decompress its contents in any folder. This archive contains data banks for:
- **Kaiju** (_kaijudb_refseq_2020-05-25_)
- **Diamond** (_refseq_bacteria_2021-05-20_)
- **NCBI Taxonomy** (_taxonomy_2021-08-23_)
- **Eggnog Mapper** (_eggnog-mapper-2.0.4-rf1_)
> Use those banks to reproduce the outputs of functional tests.
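For instance, the archive can be fetched and unpacked as follows (a sketch; the destination folder is a placeholder and must exist beforehand):
```
wget http://genoweb.toulouse.inra.fr/~choede/FT_banks_2021-10-19.tar.gz
tar -xzf FT_banks_2021-10-19.tar.gz -C /path/to/banks
```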
## II. Run functional tests
@@ -23,7 +32,7 @@ To launch functional tests, you need to be located at the root of the folder whe
cd test_folder
python <metagwgs-src>/functional_tests/main.py -step 07_taxo_affi -exp_dir metagwgs-test-datasets/small/output -obs_dir ./results
```
- - by providing a script which will launch the nextflow pipeline [see example](./launch_example.sh)
+ - by providing a script which will launch the nextflow pipeline [see example](./launch_example.sh) (this example is designed for the "small" dataset with --min_contigs_cpm 1000, using slurm)
```
mkdir test_folder
cd test_folder
@@ -31,6 +40,8 @@ cp <metagwgs-src>/functional_tests/launch_example.sh ./
python <metagwgs-src>/functional_tests/main.py -step 07_taxo_affi -exp_dir metagwgs-test-datasets/small/output -obs_dir ./results --script launch_example.sh
```
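A sketch of what such a script could contain (hypothetical: the actual launch_example.sh shipped in the repository may differ, and the slurm profile name and parameter values are assumptions, not the pipeline's documented invocation):
```
#!/bin/bash
# Hypothetical launch script for the "small" dataset (sketch only).
# Add the input parameters for your samples as required by the pipeline.
nextflow run <metagwgs-src>/main.nf \
  -profile slurm \
  --type "SR" \
  --min_contigs_cpm 1000 \
  -resume
```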
+ >**NOTE: more information on the commands used to produce each dataset can be found in the [small](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs-test-datasets/-/tree/small) and [mag](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs-test-datasets/-/tree/mag) READMEs**
## III. Output
A ft_\[step\].log file is created for each step of metagwgs. It contains information about each test performed on the given files.
@@ -38,9 +49,9 @@ A ft_\[step\].log file is created for each step of metagwgs. It contains informa
Example with ft_01_clean_qc.log:
```
- Expected directory: /work/pmartin2/metaG/test_expected_logs/01_clean_qc
+ Expected directory: metagwgs-test-datasets/output/01_clean_qc
vs
- Observed directory: /work/pmartin2/metaG/refac_09_13.2/results/01_clean_qc
+ Observed directory: results/01_clean_qc
------------------------------------------------------------------------------
......
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Pierre MARTIN
# IE Genotoul
# MIAT - INRAe (Toulouse)
# 2021
"""----------------------------------------------------------------------------
Script Name: functions.py
Description: Functions for functional tests
Input files: Expected and observed folders
Created By: Pierre Martin
Date: 2021-12-16
-------------------------------------------------------------------------------
"""
# Metadata
__author__ = 'Pierre Martin \
- MIAT - PF Genotoul'
__copyright__ = 'Copyright (C) 2021 INRAe'
__license__ = 'GNU General Public License'
__version__ = '1'
__email__ = 'support.bioinfo.genotoul@inra.fr'
__status__ = 'dev'
# Functions of metagwgs functional_tests (main.py)
try:
@@ -112,8 +125,9 @@ def check_files(exp_dir, obs_dir, step, methods, verbose):
expected_path = path.join(expected_prefix, file_path)
observed_path = path.join(observed_prefix, file_path)
print("exp:\t",expected_path)
print("obs:\t",observed_path)
if verbose:
print("exp:\t",expected_path)
print("obs:\t",observed_path)
file_name = path.basename(file_path)
file_extension = path.splitext(file_name)[1]
......
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Pierre MARTIN
# IE Genotoul
# MIAT - INRAe (Toulouse)
# 2021
"""----------------------------------------------------------------------------
Script Name: main.py
Description: Do functional tests on observed results using expected results
Input files: Expected and observed folders
Created By: Pierre Martin
Date: 2021-12-16
-------------------------------------------------------------------------------
"""
# Metadata
__author__ = 'Pierre Martin \
- MIAT - PF Genotoul'
__copyright__ = 'Copyright (C) 2021 INRAe'
__license__ = 'GNU General Public License'
__version__ = '1'
__email__ = 'support.bioinfo.genotoul@inra.fr'
__status__ = 'dev'
# Usage
## cd [work_directory]
......
#!/usr/bin/env nextflow
/*
========================================================================================
metagWGS
========================================================================================
metagWGS Analysis Pipeline.
#### Homepage / Documentation
https://forgemia.inra.fr/genotoul-bioinfo/metagwgs/
----------------------------------------------------------------------------------------
*/
nextflow.enable.dsl = 2
include { SHARED as SH } from './subworkflows/shared'
@@ -51,7 +61,7 @@ include { MULTIQC } from './modules/multiqc'
S03_FILTERING options:
--stop_at_filtering Stop the pipeline at this step
--skip_filtering Skip this step
- --min_contigs_cpm [cutoff] CPM cutoff (Count Per Million) to filter contigs with a low number of reads. Default: 10.
+ --min_contigs_cpm [cutoff] CPM cutoff (Count Per Million) to filter contigs with a low number of reads. Default: 1.
S04_STRUCTURAL_ANNOT options:
--stop_at_structural_annot Stop the pipeline at this step
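For intuition, the CPM cutoff above scales a contig's read count to a million mapped reads, so it is independent of sequencing depth. A quick sanity check of the formula (an illustration only, not pipeline code):
```
# CPM = reads mapped to the contig / total mapped reads * 1e6
# With the new default cutoff of 1, a contig gathering 50 of 40,000,000
# mapped reads (CPM = 1.25) is kept; the old default of 10 would drop it.
awk 'BEGIN { printf "%.2f\n", 50 / 40000000 * 1000000 }'   # prints 1.25
```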
@@ -69,8 +79,8 @@ include { MULTIQC } from './modules/multiqc'
S07_TAXO_AFFI options:
--skip_taxo_affi Skip this step
- --accession2taxid FTP address of file prot.accession2taxid.gz. Default: "ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz".
- --taxdump FTP address of file taxdump.tar.gz. Default: "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz".
+ --accession2taxid FTP address of file prot.accession2taxid.FULL.gz. Default: "ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.FULL.gz".
+ --taxdump FTP address of file new_taxdump.tar.gz. Default: "ftp://ftp.ncbi.nih.gov/pub/taxonomy/new_taxdump.tar.gz".
--taxonomy_dir Directory where taxdump and accession2taxid have already been downloaded ("PATH/directory").
Other options:
@@ -224,7 +234,6 @@ workflow {
if ( params.type.toUpperCase() == "SR" ) {
ch_multiqc_config = file(params.sr_multiqc_config, checkIfExists: true)
println("Entering SR")
ch_inputs
.map { item -> [ item.sample, item.fastq_1, item.fastq_2 ] }
.set { ch_reads }
@@ -259,7 +268,6 @@ workflow {
else if ( params.type.toUpperCase() == "HIFI" ) {
ch_multiqc_config = file(params.hifi_multiqc_config, checkIfExists: true)
println("Entering HiFi")
ch_inputs.map { item -> [ item.sample, item.assembly ] } // [sample, assembly]
.set { ch_assembly }
......
process EGGNOG_MAPPER {
publishDir "${params.outdir}/06_func_annot/06_3_functional_annotation", mode: 'copy'
tag "${sampleId}"
label 'EGGNOG'
input:
......
@@ -20,7 +20,6 @@ process FEATURE_COUNTS {
// Create table with sum of reads for each global cluster of genes in each sample.
process QUANTIFICATION_TABLE {
publishDir "${params.outdir}/06_func_annot/06_2_quantification", mode: 'copy'
label 'PYTHON'
input:
......
@@ -3,7 +3,7 @@ process BWA_MEM {
publishDir "${params.outdir}/05_alignment/05_1_reads_alignment_on_contigs/${sampleId}", mode: 'copy'
input:
- tuple val(sampleId), path(fna), path(read1), path(read2), path(gff)
+ tuple val(sampleId), path(fna), path(read1), path(read2)
output:
tuple val(sampleId), path("${sampleId}.sort.bam"), path("${sampleId}.sort.bam.bai"), emit: bam
@@ -21,8 +21,6 @@ process BWA_MEM {
samtools coverage ${sampleId}.sort.bam > ${sampleId}_coverage.tsv
samtools idxstats ${sampleId}.sort.bam > ${sampleId}.sort.bam.idxstats
- # awk 'BEGIN {FS="\t"}; {print \$1 FS "0" FS \$2}' ${sampleId}.sort.bam.idxstats > ${sampleId}_contig.bed
"""
}
......
@@ -57,8 +57,8 @@ params {
// Others parameters.
outdir = "results"
databases = "databases"
accession2taxid = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz"
taxdump = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"
accession2taxid = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.FULL.gz"
taxdump = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/new_taxdump.tar.gz"
taxonomy_dir = false
hifi_multiqc_config = "$baseDir/assets/hifi_multiqc_config.yaml"
sr_multiqc_config = "$baseDir/assets/sr_multiqc_config.yaml"
......
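These defaults can be overridden at launch time, or bypassed entirely when the taxonomy files are already on disk; for example (a sketch using the --taxonomy_dir option from the help above, with a placeholder path):
```
# Reuse a local copy of taxdump and accession2taxid instead of downloading:
nextflow run main.nf --taxonomy_dir /path/to/taxonomy_2021-08-23
```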
@@ -7,7 +7,6 @@ workflow DATABASES {
ch_host_fasta = Channel.empty()
ch_host_index = Channel.empty()
if ( !skip_clean && !params.skip_host_filter ) {
println("Creating host db")
ch_host_fasta = Channel.value(file(params.host_fasta))
if ( !params.host_index ) {
INDEX_HOST(ch_host_fasta)
@@ -20,7 +19,6 @@
ch_kaiju_db = Channel.empty()
if ( !skip_clean && !params.skip_kaiju ) { //kaiju_db
println("Creating kaiju db")
if ( !params.kaiju_db_dir && params.kaiju_db_url ) {
INDEX_KAIJU(params.kaiju_db_url)
ch_kaiju_db = INDEX_KAIJU.out.kaiju_db
@@ -44,7 +42,6 @@
ch_eggnog = Channel.empty()
if ( !params.stop_at_clean && !params.stop_at_filtering && !params.stop_at_assembly && !params.stop_at_structural_annot && !params.skip_func_annot ) { //eggnog_mapper_db
println("Creating eggnog db")
if( params.eggnog_mapper_db_dir != "" ) {
ch_eggnog = Channel.fromPath(params.eggnog_mapper_db_dir, checkIfExists: true).first()
}
@@ -59,7 +56,6 @@
ch_taxonomy = Channel.empty()
if ( !params.stop_at_clean && !params.stop_at_filtering && !params.stop_at_assembly && !params.stop_at_structural_annot && !params.skip_taxo_affi ) {
println("Creating taxonomy db")
if( !params.taxonomy_dir ) {
ch_accession2taxid = Channel.value(params.accession2taxid)
ch_taxdump = Channel.value(params.taxdump)
......
@@ -21,7 +21,6 @@ workflow SHARED {
ch_prot_length = Channel.empty()
if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering ) {
println("S04_STRUCTURAL_ANNOT")
S04_STRUCTURAL_ANNOT ( assembly )
ch_prokka_ffn = S04_STRUCTURAL_ANNOT.out.ffn
ch_prokka_faa = S04_STRUCTURAL_ANNOT.out.faa
@@ -31,7 +30,6 @@ workflow SHARED {
ch_contigs_and_reads = ch_prokka_fna
.join(reads, remainder: true)
- .join(ch_prokka_gff, remainder: true)
ch_prot_length = S04_STRUCTURAL_ANNOT.out.prot_length
}
@@ -39,7 +37,6 @@ workflow SHARED {
ch_m8 = Channel.empty()
ch_sam_coverage = Channel.empty()
if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot ) {
println("S05_ALIGNMENT")
S05_ALIGNMENT ( ch_contigs_and_reads, ch_prokka_faa )
ch_bam = S05_ALIGNMENT.out.bam
ch_m8 = S05_ALIGNMENT.out.m8
@@ -49,14 +46,12 @@
ch_quant_report = Channel.empty()
ch_v_eggnogmapper = Channel.empty()
if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_func_annot ) {
println("S06_FUNC_ANNOT")
S06_FUNC_ANNOT ( ch_prokka_ffn, ch_prokka_faa, ch_prokka_gff, ch_bam, ch_m8, eggnog_db )
ch_quant_report = S06_FUNC_ANNOT.out.quant_report
ch_v_eggnogmapper = S06_FUNC_ANNOT.out.v_eggnogmapper
}
if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_taxo_affi ) {
println("S07_TAXO_AFFI")
S07_TAXO_AFFI ( taxonomy, ch_m8, ch_sam_coverage, ch_prot_length)
}
......
@@ -28,7 +28,6 @@ workflow SHORT_READS {
ch_filtered_report = Channel.empty()
if ( !params.skip_clean ) {
println("S01_CLEAN_QC")
S01_CLEAN_QC (
reads,
paired,
@@ -50,7 +49,6 @@
ch_dedup = Channel.empty()
if ( !params.stop_at_clean ) {
println("S02_ASSEMBLY")
S02_ASSEMBLY ( ch_preprocessed_reads )
ch_assembly = S02_ASSEMBLY.out.assembly
ch_dedup = S02_ASSEMBLY.out.dedup
@@ -60,7 +58,6 @@
}
if ( !params.stop_at_clean && !params.stop_at_assembly && !params.skip_filtering ) {
println("S03_FILTERING")
ch_min_contigs_cpm = Channel.value(params.min_contigs_cpm)
ch_assembly
......