Commit 924e29a2 authored by MARTIN Pierre's avatar MARTIN Pierre

Merge branch 'dev-test' into dev

parents 2f283a62 c910cba0
.nextflow*
work/
metagwgs.code-workspace
# Recipe for building the Singularity images and deploying them on the registry
image:
  name: quay.io/singularity/singularity:v3.4.0
  entrypoint: [""]

stages:
  - build
  - deploy

# Build the Singularity containers metagWGS.sif, eggnog_mapper.sif and mosdepth.sif
singularity-image:
  stage: build
  script:
    - singularity build metagWGS.sif env/Singularity_recipe_metagWGS
    - singularity build eggnog_mapper.sif env/Singularity_recipe_eggnog_mapper
    - singularity build mosdepth.sif env/Singularity_recipe_mosdepth
  artifacts:
    paths:
      - metagWGS.sif
      - eggnog_mapper.sif
      - mosdepth.sif
  only:
    changes:
      - .gitlab-ci.yml
      - env/*

# Push the images to the registry
deploy:
  stage: deploy
  script:
    - singularity push --docker-username "${CI_REGISTRY_USER}" --docker-password "${CI_REGISTRY_PASSWORD}" metagWGS.sif oras://"$CI_REGISTRY_IMAGE"/"$CI_PROJECT_NAME":"$CI_COMMIT_TAG"
    - singularity push --docker-username "${CI_REGISTRY_USER}" --docker-password "${CI_REGISTRY_PASSWORD}" eggnog_mapper.sif oras://"$CI_REGISTRY_IMAGE"/eggnog_mapper:"$CI_COMMIT_TAG"
    - singularity push --docker-username "${CI_REGISTRY_USER}" --docker-password "${CI_REGISTRY_PASSWORD}" mosdepth.sif oras://"$CI_REGISTRY_IMAGE"/mosdepth:"$CI_COMMIT_TAG"
  only:
    changes:
      - .gitlab-ci.yml
      - env/*
#!/usr/bin/env python3
"""----------------------------------------------------------------------------
Script Name: filter_diamond_hits.py
Description: Keep best diamond hits for each query gene/protein
based on best bitscore and filter out query with low identity and low coverage
Adapted from best_bitscore_diamond.py script of Joanna Fourquet
Input files: Diamond output file (.m8)
Created By: Jean Mainguy
Date: 2021-08-02
-------------------------------------------------------------------------------
"""
# Metadata
__author__ = 'Mainguy Jean - Plateforme bioinformatique Toulouse'
__copyright__ = 'Copyright (C) 2021 INRAE'
__license__ = 'GNU General Public License'
__version__ = '0.1'
__email__ = 'support.bioinfo.genotoul@inra.fr'
__status__ = 'dev'
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter, FileType
import logging
import sys
import csv


def get_hits_with_highest_bitscore(hits):
    highest_bitscore = max([float(hit['bitScore']) for hit in hits])
    return [hit for hit in hits if float(hit['bitScore']) == highest_bitscore]


def get_all_hits_per_query(blast_result_file, header_list):
    # Assumption: hits are already sorted by query in the diamond output.
    # Both commands should output the same number of lines:
    # cut -f1 blast_result_file | uniq | wc -l
    # cut -f1 blast_result_file | sort | uniq | wc -l
    with open(blast_result_file) as in_fl:
        result_reader = csv.DictReader(in_fl, delimiter='\t', fieldnames=header_list)
        query_ids_processed = []
        current_query_id = None
        hits = []
        for hit in result_reader:
            if not current_query_id:
                current_query_id = hit['queryId']
            if current_query_id and current_query_id != hit['queryId']:
                yield hits
                hits = []
                current_query_id = hit['queryId']
                assert current_query_id not in query_ids_processed, f"Queries are not sorted in blast result. Query {current_query_id} is found in different parts of the file."
                query_ids_processed.append(current_query_id)
            hits.append(hit)
        if current_query_id:
            yield hits


def is_identity_and_coverage_ok(hit, min_identity, min_coverage):
    # Query coverage of the hit, expressed as a percentage so it is comparable with min_coverage.
    qcovhsp = 100 * (int(hit["queryEnd"]) - int(hit["queryStart"]) + 1) / int(hit['queryLength'])
    if float(hit['percIdentity']) >= min_identity or qcovhsp >= min_coverage:
        return True
    return False


def parse_arguments():
    """Parse script arguments."""
    parser = ArgumentParser(description="...",
                            formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('aln_input_file',
                        help="File with blast/diamond matches expected format m8 "
                             "\nqueryId, subjectId, percIdentity, alnLength, mismatchCount, gapOpenCount, "
                             "queryStart, queryEnd, subjectStart, subjectEnd, eVal, bitScore")
    parser.add_argument('-o', '--output_file', type=str,
                        default="best_hit.tsv", help="string specifying output file path")
    parser.add_argument('-i', '--min_identity', default=60, type=float,
                        help="percentage of identity")
    parser.add_argument('-c', '--min_coverage', default=70, type=float,
                        help="percentage of coverage")
    parser.add_argument("-v", "--verbose", help="increase output verbosity",
                        action="store_true")
    args = parser.parse_args()
    return args


def main():
    args = parse_arguments()
    if args.verbose:
        logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.DEBUG)
        logging.info('Mode verbose ON')
    else:
        logging.basicConfig(format="%(levelname)s: %(message)s")

    headers = "queryId subjectId percIdentity alnLength mismatchCount gapOpenCount queryStart queryEnd subjectStart subjectEnd eVal bitScore queryLength subjectLength subjectTitle"
    header_list = headers.split(' ')

    blast_result = args.aln_input_file
    outfile = args.output_file
    min_coverage = args.min_coverage
    min_identity = args.min_identity

    best_hit_count = 0
    query_count_with_low_hit = 0
    with open(outfile, 'w') as out_fl:
        writer = csv.DictWriter(out_fl, fieldnames=header_list, delimiter='\t')
        for query_i, query_hits in enumerate(get_all_hits_per_query(blast_result, header_list)):
            if query_i % 10000 == 0:
                logging.info(f'{query_i} queries processed... ')
            correct_hits = [hit for hit in query_hits if is_identity_and_coverage_ok(
                hit, min_identity, min_coverage)]
            if not correct_hits:
                query_count_with_low_hit += 1
                continue
            best_hits = get_hits_with_highest_bitscore(correct_hits)
            for best_hit in best_hits:
                best_hit_count += 1
                writer.writerow(best_hit)

    logging.info(f'{query_count_with_low_hit} queries ({100*query_count_with_low_hit/(query_i+1):.2f}%) have only hits that do not pass the identity ({min_identity}%) or coverage ({min_coverage}%) thresholds')
    logging.info(f'{best_hit_count} best hits of {query_i + 1 - query_count_with_low_hit} queries have been written in {outfile}.')


if __name__ == '__main__':
    main()
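
As a quick illustration of the filtering logic above, here is a toy example (hypothetical gene IDs and values, not part of the pipeline) using the two helper functions defined in this script, with the default percentage thresholds:

```python
# Toy hits for a single query; the dicts use the same keys as the header list in main().
hits = [
    {'queryId': 'gene_1', 'percIdentity': '95.0', 'bitScore': '250',
     'queryStart': '1', 'queryEnd': '90', 'queryLength': '100'},
    {'queryId': 'gene_1', 'percIdentity': '55.0', 'bitScore': '300',
     'queryStart': '1', 'queryEnd': '40', 'queryLength': '100'},
]

# First hit: 95% identity, 90% coverage -> kept.
# Second hit: 55% identity, 40% coverage -> filtered out despite its higher bitscore.
kept = [h for h in hits if is_identity_and_coverage_ok(h, min_identity=60, min_coverage=70)]
best = get_hits_with_highest_bitscore(kept)
print([h['bitScore'] for h in best])  # ['250']
```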
......@@ -89,21 +89,21 @@ concat_diamond_files = pd.DataFrame()
# Concatenate diamond files.
for (diamond_idx,diamond_path) in enumerate(diamond_files):
diamond_file = pd.read_csv(diamond_path, delimiter='\t', decimal='.', header=None)
diamond_file.loc[:,1] = 'https://www.ncbi.nlm.nih.gov/protein/' + diamond_file.loc[:,1]
group_diamond_file = diamond_file.groupby(diamond_file.columns[0])\
.agg({diamond_file.columns[14] : ';'.join, diamond_file.columns[1] : ','.join})\
.reset_index()\
.reindex(columns=diamond_file.columns)
res_diamond_file = group_diamond_file.iloc[:,[0,1,14]]
diamond_columns = ["qseqid","sseqid","pident","length","mismatch","gapopen","qstart","qend","sstart","send","evalue","bitscore","qlen","slen","stitle"]
diamond_file = pd.read_csv(diamond_path, delimiter='\t', decimal='.', header=None, names=diamond_columns)
diamond_file.loc[:,"sseqid"] = 'https://www.ncbi.nlm.nih.gov/protein/' + diamond_file.loc[:,"sseqid"]
group_diamond_file = diamond_file.groupby("qseqid")\
.agg({"stitle" : ';'.join, "sseqid" : ','.join})\
.reset_index()\
.reindex(columns=diamond_file.columns)
res_diamond_file = group_diamond_file.loc[:,["qseqid","sseqid","stitle"]]
concat_diamond_files = pd.concat([concat_diamond_files, res_diamond_file])
# Merge counts, annotation and diamond results.
merge_annot = pd.merge(counts_file,concat_eggnog_mapper_files,left_on="seed_cluster",right_on='#query_name', how='left')
merge = pd.merge(merge_annot,concat_diamond_files,left_on="seed_cluster",right_on=concat_diamond_files.columns[0], how='left')
merge = pd.merge(merge_annot,concat_diamond_files,left_on="seed_cluster",right_on="qseqid", how='left')
merge.drop('#query_name', inplace=True, axis=1)
merge.drop(merge.columns[28], inplace=True, axis=1)
res_merge = merge.rename(columns = {1: 'diamond_db_id', 14: 'diamond_db_description'})
merge.drop("qseqid", inplace=True, axis=1)
# Write merge data frame in output file.
res_merge.to_csv(args.output_file, sep="\t", index=False)
merge.to_csv(args.output_file, sep="\t", index=False)
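
For context, here is a small standalone pandas sketch (toy, hypothetical protein IDs) of the groupby/agg/reindex pattern introduced above, restricted to the columns actually aggregated:

```python
import pandas as pd

# Toy diamond table limited to the columns used by the aggregation above.
diamond_file = pd.DataFrame({
    "qseqid": ["gene_1", "gene_1", "gene_2"],
    "sseqid": ["WP_000001.1", "WP_000002.1", "WP_000003.1"],
    "stitle": ["protein A", "protein B", "protein C"],
})

group_diamond_file = diamond_file.groupby("qseqid")\
                                 .agg({"stitle": ';'.join, "sseqid": ','.join})\
                                 .reset_index()\
                                 .reindex(columns=diamond_file.columns)
# gene_1 -> sseqid "WP_000001.1,WP_000002.1", stitle "protein A;protein B"
# gene_2 -> sseqid "WP_000003.1",             stitle "protein C"
print(group_diamond_file)
```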
......@@ -71,14 +71,19 @@ percontig = pd.read_csv(args.percontig_file, delimiter='\t', dtype=str)
# Merge idxstats and .percontig.tsv files.
merge = pd.merge(idxstats,percontig,left_on='contig',right_on='#contig', how='outer')
#add depth
# Add depth
merge = pd.merge(merge,mosdepth,left_on='contig',right_on='contig', how='outer')
# Fill NaN values to keep unmapped contigs.
merge['consensus_lineage'] = merge['consensus_lineage'].fillna('Unknown')
merge['tax_id_by_level'] = merge['tax_id_by_level'].fillna(1)
merge['consensus_tax_id'] = merge['consensus_tax_id'].fillna(1)
# Group by lineage and sum number of reads and contigs.
res = merge.groupby(['consensus_lineage','consensus_tax_id', 'tax_id_by_level']).agg({'contig' : [';'.join, 'count'], 'mapped': 'sum', 'depth': 'mean'}).reset_index()
res.columns=['lineage_by_level', 'consensus_tax_id', 'tax_id_by_level', 'name_contigs', 'nb_contigs', 'nb_reads', 'depth']
# Fill the NaN by 0.
# Fill NaN values with 0.
res.fillna(0, inplace=True)
# Split by taxonomic level
......
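
As a side note, a minimal standalone pandas sketch (toy lineages and hypothetical counts) of the multi-function aggregation used in the hunk above, showing why the columns are renamed afterwards:

```python
import pandas as pd

# Toy per-contig table with the columns aggregated above.
merge = pd.DataFrame({
    "consensus_lineage": ["Bacteria;Firmicutes", "Bacteria;Firmicutes", "Unknown"],
    "consensus_tax_id": ["1239", "1239", "1"],
    "tax_id_by_level": ["1239", "1239", "1"],
    "contig": ["contig_1", "contig_2", "contig_3"],
    "mapped": [100, 50, 7],
    "depth": [10.0, 20.0, 1.0],
})

res = merge.groupby(['consensus_lineage', 'consensus_tax_id', 'tax_id_by_level'])\
           .agg({'contig': [';'.join, 'count'], 'mapped': 'sum', 'depth': 'mean'})\
           .reset_index()
# Passing a list of functions for 'contig' produces hierarchical (MultiIndex) column
# names, hence the flat renaming that follows in the script.
res.columns = ['lineage_by_level', 'consensus_tax_id', 'tax_id_by_level',
               'name_contigs', 'nb_contigs', 'nb_reads', 'depth']
# Bacteria;Firmicutes -> name_contigs "contig_1;contig_2", nb_contigs 2, nb_reads 150, depth 15.0
```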
......@@ -57,7 +57,7 @@ with open(args.list_of_kaiju_files) as fkaiju_list:
kaiju_files = fkaiju_list.read().split()
# Merge kaiju results for all samples.
for (kaiju_idx,kaiju_path) in enumerate(kaiju_files):
for (kaiju_idx,kaiju_path) in enumerate(sorted(kaiju_files)):
print(kaiju_idx)
if(kaiju_idx==0):
merge = pd.read_csv(kaiju_path, delimiter='\t', dtype=str)
......
......@@ -58,8 +58,8 @@ with open(args.list_of_input_files) as finput_list:
sample_files = finput_list.read().split()
# Merge results for all samples by lineage.
for (sample_idx,sample_path) in enumerate(sample_files):
print("Read " + sample_path)
for (sample_idx,sample_path) in enumerate(sorted(sample_files)):
print(sample_idx)
if(sample_idx==0):
merge = pd.read_csv(sample_path, delimiter='\t', dtype=str)
sample_name = os.path.splitext(sample_path)[0]
......@@ -78,7 +78,7 @@ merge.rename(columns = {'name_contigs': 'name_contigs_' + sample_name, \
'nb_reads': 'nb_reads_' + sample_name,\
'depth': 'depth_' + sample_name},inplace=True)
# Fill the NaN by 0.
# Fill NaN values with 0.
merge.fillna(0, inplace=True)
print("Write " + args.output_file)
# Write merge data frame in output file.
......
......@@ -44,14 +44,10 @@ process {
memory = { 50.GB * task.attempt }
cpus = 25
}
withName: metaspades {
withName: assembly {
memory = { 110.GB * task.attempt }
cpus = 20
}
withName: megahit {
cpus = 20
memory = { 100.GB * task.attempt }
}
withName: quast {
cpus = 4
memory = { 8.GB * task.attempt }
......
......@@ -45,14 +45,10 @@ process {
memory = { 2.GB * task.attempt }
cpus = 20
}
withName: metaspades {
withName: assembly {
memory = { 60.GB * task.attempt }
cpus = 14
}
withName: megahit {
cpus = 20
memory = { 60.GB * task.attempt }
}
withName: quast {
cpus = 3
memory = { 8.GB * task.attempt }
......
......@@ -41,14 +41,10 @@ process {
memory = { 36.GB * task.attempt }
cpus = 4
}
withName: metaspades {
withName: assembly {
memory = { 10.GB * task.attempt }
cpus = 8
}
withName: megahit {
cpus = 8
memory = { 10.GB * task.attempt }
}
withName: quast {
cpus = 2
memory = { 2.GB * task.attempt }
......
process.executor = 'slurm'
includeConfig 'singularity.config'
singularity.runOptions = "-B /work/bank/ -B /bank -B /work2 -B /work -B /save -B /home -B /work/project -B /usr/local/bioinfo"
singularity.runOptions = "-B /work/bank/ -B /bank -B /work -B /work2 -B /save -B /home -B /work/project -B /usr/local/bioinfo"
process.queue = 'workq'
process {
......@@ -15,18 +15,18 @@ process {
maxErrors = '-1'
withName: cutadapt {
cpus = 3
memory = { 1.GB * task.attempt }
cpus = 3
memory = { 1.GB * task.attempt }
}
withName: sickle {
memory = { 1.GB * task.attempt }
memory = { 1.GB * task.attempt }
}
withLabel: fastqc {
cpus = 6
memory = { 1.GB * task.attempt }
cpus = 6
memory = { 1.GB * task.attempt }
}
withName: multiqc {
memory = { 2.GB * task.attempt }
memory = { 2.GB * task.attempt }
}
withName: host_filter {
memory = { 20.GB * task.attempt }
......@@ -38,17 +38,13 @@ process {
cpus = 6
}
withName: kaiju {
memory = { 100.GB * task.attempt }
memory = { 50.GB * task.attempt }
cpus = 4
}
withName: metaspades {
withName: assembly {
memory = { 10.GB * task.attempt }
cpus = 8
}
withName: megahit {
cpus = 8
memory = { 10.GB * task.attempt }
}
withName: quast {
cpus = 2
memory = { 2.GB * task.attempt }
......@@ -75,8 +71,8 @@ process {
memory = { 1.GB * task.attempt }
}
withName: diamond {
cpus = 2
memory = { 8.GB * task.attempt }
cpus = 8
memory = { 10.GB * task.attempt }
}
withName: get_software_versions {
memory = { 1.GB * task.attempt }
......
......@@ -37,14 +37,10 @@ process {
memory = { 10.GB * task.attempt }
cpus = 2
}
withName: metaspades {
withName: assembly {
memory = { 2.GB * task.attempt }
cpus = 3
}
withName: megahit {
cpus = 3
memory = { 2.GB * task.attempt }
}
withName: quast {
cpus = 2
memory = { 2.GB * task.attempt }
......
......@@ -45,7 +45,7 @@ A report html file is generated at the end of the workflow with [MultiQC](https:
The pipeline is built using [Nextflow](https://www.nextflow.io/docs/latest/index.html#), a bioinformatics workflow tool to run tasks across multiple compute infrastructures in a very portable manner.
Two [Singularity](https://sylabs.io/docs/) containers are available making installation trivial and results highly reproducible.
Three [Singularity](https://sylabs.io/docs/) containers are available, making installation trivial and results highly reproducible.
## Documentation
......
......@@ -32,7 +32,7 @@ A directory called `metagwgs` containing all source files of the pipeline have b
## III. Install Singularity
metagWGS needs two [Singularity](https://sylabs.io/docs/) containers to run: Singularity version 3 or above must be installed.
metagWGS needs three [Singularity](https://sylabs.io/docs/) containers to run: Singularity version 3 or above must be installed.
See [here](https://sylabs.io/guides/3.7/user-guide/quick_start.html#quick-installation-steps) how to install Singularity >=v3.
......@@ -43,9 +43,9 @@ See [here](https://sylabs.io/guides/3.7/user-guide/quick_start.html#quick-instal
## IV. Download or build Singularity containers
You can directly download the two Singularity containers (`Solution 1`, recommended) or build them (`Solution 2`).
You can directly download the three Singularity containers (`Solution 1`, recommended) or build them (`Solution 2`).
### Solution 1 (recommended): download the two containers
### Solution 1 (recommended): download the three containers
**In the directory you want to run the workflow**, where you have the directory `metagwgs` with the metagWGS source files, run these command lines:
......@@ -53,23 +53,26 @@ You can directly download the two Singularity containers (`Solution 1`, recommen
cd metagwgs/env/
singularity pull eggnog_mapper.sif oras://registry.forgemia.inra.fr/genotoul-bioinfo/metagwgs/eggnog_mapper:latest
singularity pull metagwgs.sif oras://registry.forgemia.inra.fr/genotoul-bioinfo/metagwgs/metagwgs:latest
singularity pull mosdepth.sif oras://registry.forgemia.inra.fr/genotoul-bioinfo/metagwgs/mosdepth:latest
singularity pull mosdepth.sif oras://registry.forgemia.inra.fr/genotoul-bioinfo/metagwgs/metagwgs:latest
```
Three files (`metagwgs.sif` , `mosdepth.sif` and `eggnog_mapper.sif`) must have been downloaded.
Three files (`metagwgs.sif`, `eggnog_mapper.sif` and `mosdepth.sif`) must have been downloaded.
### Solution 2: build the two containers.
### Solution 2: build the three containers.
**In the directory you want tu run the workflow**, where you have downloaded metagWGS source files, go to `metagwgs/env/` directory, and follow [these explanations](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs/-/wikis/Singularity%20container) to build the two containers. You need two files by container to build them. These files are into the `metagwgs/env/` folder and you can read them here:
**In the directory you want to run the workflow**, where you have downloaded the metagWGS source files, go to the `metagwgs/env/` directory, and follow [these explanations](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs/-/wikis/Singularity%20container) to build the three containers. You need two files per container to build them. These files are in the `metagwgs/env/` folder and you can read them here:
* metagwgs.sif container
* [metagWGS recipe file](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs/-/blob/dev/env/Singularity_recipe_metagWGS)
* [metagWGS.yml](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs/-/blob/dev/env/metagWGS.yml)
* eggnog_mapper container
* eggnog_mapper.sif container
* [eggnog_mapper recipe file](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs/-/blob/dev/env/Singularity_recipe_eggnog_mapper)
* [eggnog_mapper.yml](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs/-/blob/dev/env/eggnog_mapper.yml)
* mosdepth.sif container
* [mosdepth recipe file](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs/-/blob/dev/env/Singularity_recipe_mosdepth)
* [mosdepth.yml](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs/-/blob/dev/env/mosdepth.yml)
At the end of the build, two files (`metagwgs.sif` and `eggnog_mapper.sif`) must have been generated.
At the end of the build, three files (`metagwgs.sif`, `eggnog_mapper.sif` and `mosdepth.sif`) must have been generated.
## V. Use metagWGS
......
......@@ -2,7 +2,7 @@
## I. Basic usage
1. See [Installation page](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs/-/blob/dev/docs/installation.md) to install metagWGS. Make sure you are in the directory where you downloaded `metagwgs` source files and added into `metagwgs/dev` the two Singularity images `metagwgs.sif` and `eggnog_mapper.sif`.
1. See [Installation page](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs/-/blob/dev/docs/installation.md) to install metagWGS. Make sure you are in the directory where you downloaded the `metagwgs` source files and added the three Singularity images (`metagwgs.sif`, `eggnog_mapper.sif` and `mosdepth.sif`) into `metagwgs/env`.
2. metagWGS is still under development: you need to use the `dev` branch of the metagwgs repository.
......@@ -108,7 +108,7 @@ It allows you to choose the configuration profile among:
These profiles are associated with different configuration files developed [in this directory](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs/-/tree/dev/conf). The `base.config` file available in this directory is the base configuration, loaded first and then overridden by the settings of the profile you use. See [here](https://genotoul-bioinfo.pages.mia.inra.fr/use-nextflow-nfcore-course/nfcore/profiles.html) for more explanations.
### 2. Usefull options
### 2. Useful options
#### `-resume`
......@@ -223,7 +223,7 @@ No parameter available for this substep.
* `--kaiju_db_dir "PATH/directory"`: if you have already downloaded the kaiju database, indicate its directory. **WARNING:** you will not be able to use a kaiju database built with the `kaiju-makedb` command line. Default: `--kaiju_db_dir false`. See **WARNING 2**.
* `--kaiju_db "http://kaiju.binf.ku.dk/database/CHOOSEN_DATABASE.tgz"`: allows metagWGS to download kaiju database of your choice. The list of kaiju databases is available in [kaiju website](http://kaiju.binf.ku.dk/server), in the blue insert on the left side. Default: `--kaiju_db false`. See **WARNING 2**.
* `--kaiju_db "http://kaiju.binf.ku.dk/database/CHOOSEN_DATABASE.tgz"`: allows metagWGS to download the kaiju database of your choice. The list of kaiju databases is available on the [kaiju website](http://kaiju.binf.ku.dk/server), in the blue insert on the left side. Default: `--kaiju_db https://kaiju.binf.ku.dk/database/kaiju_db_refseq_2021-02-26.tgz`. See **WARNING 2**.
* `--skip_kaiju`: allows to skip taxonomic affiliation of reads with kaiju. Krona files will not be generated. Use: `--skip_kaiju`. See **WARNING 2**.
......@@ -237,7 +237,7 @@ No parameter available for this substep.
**WARNING 4:** the user has the choice between `metaspades` and `megahit` for the `--assembly` parameter. The choice can be based on CPU and memory availability: `metaspades` needs more CPUs and memory than `megahit`, but our tests showed that assembly metrics are better with `metaspades` than with `megahit`.
* `--metaspades_mem [memory_value]`: memory (in G) used by `metaspades` process. Default: `440`.
* `--metaspades_mem [memory_value]`: memory (in GB) used by the `metaspades` process. Default: `440`.
#### **`03_filtering` step:**
......@@ -288,6 +288,8 @@ No parameters.
* `--taxdump "FTP_PATH_TO_taxdump.tar.gz"`: indicates the FTP address of the NCBI file `taxdump.tar.gz`. Default: `"ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"`.
* `--taxonomy_dir "PATH/directory"`: if you have already downloaded the accession2taxid and taxdump databases, indicate their parent directory. Default: `--taxonomy_dir false`.
#### **`08_binning` step:**
**WARNING 13:** the `08_binning` step depends on the `01_clean_qc`, `02_assembly`, `03_filtering` (if you use it), `04_structural_annot` and `05_alignment` steps. You need to use the mandatory files of these steps to run `08_binning`. See [II. Input files](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs/-/blob/dev/docs/usage.md#ii-input-files) and WARNINGS from 1 to 9.
......
# Functional tests: Usage
## I. Pre-requisites
1. metagWGS is still under development: you need to use the `dev-test` branch of the metagwgs repository.
Run:
```bash
cd metagwgs
git checkout dev-test
git pull
cd functional_tests
```
2. Make sure you are in the directory where you downloaded the `metagwgs` source files and added the three mandatory Singularity images into `metagwgs/env`.
3. Make sure you downloaded all the required data files for metagwgs. If not, they will be downloaded by the pipeline each time you run it in a new project.
4. Download the test datasets (expected results + test fastq) from [link-to-test-datasets].
## II. Functional tests
Each step of metagwgs produces a series of files. We want to be able to determine if the modifications we perform on metagwgs have an impact on any of these files (presence, contents, format, ...).
Two datasets are currently available for these functional tests: test (from [metagwgs/test](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs/-/tree/master/test)) and MAG (from [nf-core/test-datasets](https://github.com/nf-core/test-datasets/tree/mag/test_data)).
When launching the functional test script, the files contained in *exp_dir* (in ./test_expected_logs) are scanned and, for each possible file extension, a test is performed on each expected file against its observed version (in ./results), as sketched after the list of test methods below.
### Test methods
5 simple test methods are used:

* `diff`: simple bash difference between two files
  `diff exp_path obs_path`
* `zdiff`: simple bash difference between two gzipped files
  `zdiff exp_path obs_path`
* `no_header_diff`: difference after removing the headers of .annotations and .seed_orthologs files
  `diff <(grep -w "^?#" exp_path) <(grep -w "^?#" obs_path)`
* `cut_diff`: exception for the cutadapt.log file (the first 5 lines are ignored)
  `diff <(tail -n+6 exp_path) <(tail -n+6 obs_path)`
* `not_empty`: in Python, check that the file is not empty
  `test = path.getsize(obs_path) > 0`
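
To make the dispatch concrete, here is a minimal sketch of how an extension-to-method mapping can drive the comparison of expected and observed files. This is not the actual `functional_tests/main.py` (its code is not shown here); the extensions and the mapping below are assumptions for illustration only:

```python
import subprocess
from os import path

# Hypothetical extension-to-method mapping; the real functional_tests/main.py may differ.
TESTS_BY_EXTENSION = {'.tsv': 'diff', '.gz': 'zdiff', '.log': 'cut_diff',
                      '.annotations': 'no_header_diff', '.bam': 'not_empty'}

def compare(method, exp_path, obs_path):
    """Return True when the observed file passes the given test method."""
    if method == 'not_empty':
        return path.getsize(obs_path) > 0
    commands = {
        'diff': f'diff {exp_path} {obs_path}',
        'zdiff': f'zdiff {exp_path} {obs_path}',
        'cut_diff': f'diff <(tail -n+6 {exp_path}) <(tail -n+6 {obs_path})',
        'no_header_diff': f'diff <(grep -w "^?#" {exp_path}) <(grep -w "^?#" {obs_path})',
    }
    # Process substitution requires bash rather than the default /bin/sh.
    return subprocess.run(['bash', '-c', commands[method]], capture_output=True).returncode == 0

# Example (hypothetical paths):
# compare(TESTS_BY_EXTENSION['.tsv'], 'test_expected_logs/abundances.tsv', 'results/abundances.tsv')
```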
## III. Launch test
Nextflow metagwgs can be launched on any cluster manager (sge, slurm, ...). The functional test script can use a provided shell script containing the command used to launch Nextflow on a cluster.
Examples below use the slurm job manager and launch all 7 steps of metagwgs to ensure all parts of main.nf work as intended.
### Launch with script
Create a new directory (project-directory) containing a shell script to be used by functional tests:
```
#!/bin/bash
sbatch -W -p workq -J metagwgs --mem=6G \
--wrap="module load bioinfo/Nextflow-v21.04.1 ; module load system/singularity-3.7.3 ; nextflow run -profile test_genotoul_workq [work_dir]/metaG/metagwgs/main.nf --step '01_clean_qc,02_assembly,03_filtering,04_structural_annot,05_alignment,06_func_annot,07_taxo_affi' --reads '../metagwgs/test/*_{R1,R2}.fastq.gz' --host_fasta '[work_dir]/human_ref/Homo_sapiens.GRCh38_chr21.fa' --host_bwa_index '[work_dir]/human_ref/Homo_sapiens.GRCh38_chr21.fa.{amb,ann,bwt,pac,sa}' --kaiju_db_dir '/bank/kaijudb/kaijudb_refseq_2020-05-25' --taxonomy_dir '[work_dir]/taxonomy' --eggnog_mapper_db_dir '/bank/eggnog-mapper/eggnog-mapper-2.0.4-rf1/data' --assembly metaspades --diamond_bank "[work_dir]/refseq_bacteria_2021-05-20/refseq_bacteria.dmnd" -with-report -with-timeline -with-trace -with-dag -resume"
```
Then launch this command:
```
cd project-directory
python [work_dir]/metaG/metagwgs/functional_tests/main.py -step 07_taxo_affi -exp_dir [work_dir]/test_expected_logs -obs_dir ./results --script launch_07_taxo_affi.sh
```
### Launch without script
If you have already launched metagwgs on the test data (see the metagwgs README and usage documentation):
```
cd project-directory
python [work_dir]/metaG/metagwgs/functional_tests/main.py -step 07_taxo_affi -exp_dir [work_dir]/test_expected_logs -obs_dir ./results
```
## IV. Output
A ft_\[step\].log file is created for each step of metagwgs. It contains information about each test performed on given files.
Example with ft_01_clean_qc.log:
```
Expected directory: /work/pmartin2/metaG/test_expected_logs/01_clean_qc
vs
Observed directory: /work/pmartin2/metaG/refac_09_13.2/results/01_clean_qc
------------------------------------------------------------------------------
File: 01_1_cleaned_reads/cleaned_a_R1.fastq.gz
Test method: zdiff
Test result: Passed