
Commit 1b2c8ebc authored by MARTIN Pierre

Merge branch 'long-reads' of forgemia.inra.fr:genotoul-bioinfo/metagwgs into long-reads

parents 4ffd1c3b 1e9e4470
*.config linguist-language=nextflow
*.nf linguist-language=nextflow
\ No newline at end of file
*.nf gitlab-language=groovy
# Recipe for building the Singularity images and deploying them to the registry
image:
  name: quay.io/singularity/singularity:v3.4.0
  entrypoint: [""]

stages:
  - build
  - deploy

# Build the Singularity containers metagWGS.sif and eggnog_mapper.sif
singularity-image:
  stage: build
  script:
    - singularity build metagWGS.sif env/Singularity_recipe_metagWGS
    - singularity build eggnog_mapper.sif env/Singularity_recipe_eggnog_mapper
  artifacts:
    paths:
      - metagWGS.sif
      - eggnog_mapper.sif
  only:
    changes:
      - .gitlab-ci.yml
      - env/*
  when: manual
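# The .sif files are kept as artifacts so the deploy job below can push them;
# both jobs are manual and only triggered when the CI file or env/ recipes change.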
# Push the images metagWGS.sif and eggnog_mapper.sif to the registry
deploy:
  stage: deploy
  script:
    - singularity push --docker-username "${CI_REGISTRY_USER}" --docker-password "${CI_REGISTRY_PASSWORD}" metagWGS.sif oras://"$CI_REGISTRY_IMAGE"/"$CI_PROJECT_NAME":"$CI_COMMIT_TAG"
    - singularity push --docker-username "${CI_REGISTRY_USER}" --docker-password "${CI_REGISTRY_PASSWORD}" eggnog_mapper.sif oras://"$CI_REGISTRY_IMAGE"/eggnog_mapper:"$CI_COMMIT_TAG"
  only:
    changes:
      - .gitlab-ci.yml
      - env/*
  when: manual
\ No newline at end of file
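Once deployed, an image can be pulled back with `singularity pull oras://<registry>/<project>/<name>:<tag>`; the bracketed placeholders are illustrative, the real path being assembled from the CI variables used above.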
@@ -57,6 +57,11 @@ metagWGS is distributed under the GNU General Public License v3.
## Copyright
2021 INRAE
## Funded by
Anti-Selfish (Labex ECOFECT – N° 00002455-CT15000562)
France Génomique National Infrastructure (funded as part of Investissement d’avenir program managed by Agence Nationale de la Recherche, contract ANR-10-INBS-09)
With participation of SeqOccIn members financed by FEDER-FSE MIDI-PYRENEES ET GARONNE 2014-2020.
## Citation
metagWGS has been presented at JOBIM 2020:
......
@@ -42,27 +42,27 @@ except ImportError as error:
print(str(datetime.now()))
# Manage parameters.
-parser = argparse.ArgumentParser(description = 'Script which join \
+parser = argparse.ArgumentParser(description='Script which joins \
quantification table by gene and tables by samples \
with functional annotations')
-parser.add_argument('-t', '--table_of_abundances', required = True, \
-                    help = 'Table containing counts \
+parser.add_argument('-t', '--table_of_abundances', required=True,
+                    help='Table containing counts \
for each global gene id in each sample.')
-parser.add_argument('-f', '--list_of_file_annotations', required = True, \
-                    help = 'List of files storing functional annotation for each gene per sample.')
+parser.add_argument('-f', '--list_of_file_annotations', required=True,
+                    help='List of files storing functional annotation for each gene per sample.')
-parser.add_argument('-d', '--list_of_file_diamond', required = True, \
-                    help = 'List of files storing diamond results with best bitscore \
+parser.add_argument('-d', '--list_of_file_diamond', required=True,
+                    help='List of files storing diamond results with best bitscore \
for each gene per sample.')
-parser.add_argument('-o', '--output_file', required = True, \
-                    help = 'Name of output file containing counts \
+parser.add_argument('-o', '--output_file', required=True,
+                    help='Name of output file containing counts \
for each global gene id and its functional annotation.')
-parser.add_argument('-v', '--version', action = 'version', \
-                    version = __version__)
+parser.add_argument('-v', '--version', action='version',
+                    version=__version__)
args = parser.parse_args()
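# Example invocation (the script and file names here are hypothetical): the -f
# and -d arguments are text files listing one annotation/diamond path per line.
#   merge_script.py -t counts.tsv -f annot_files.txt -d diamond_files.txt -o merged.tsv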
@@ -80,28 +80,33 @@ with open(args.list_of_file_diamond) as fdiamond_list:
concat_eggnog_mapper_files = pd.DataFrame()
# Concatenate annotation files.
-for (annotations_idx,annotations_path) in enumerate(files_of_annotations):
-    eggnog_mapper_file = pd.read_csv(annotations_path, delimiter='\t', decimal='.',skiprows=4)
+for (annotations_idx, annotations_path) in enumerate(files_of_annotations):
+    eggnog_mapper_file = pd.read_csv(annotations_path, delimiter='\t', decimal='.', skiprows=4)
    concat_eggnog_mapper_files = pd.concat([concat_eggnog_mapper_files, eggnog_mapper_file])
# Create a new empty dataframe for diamond results.
concat_diamond_files = pd.DataFrame()
# Concatenate diamond files.
-for (diamond_idx,diamond_path) in enumerate(diamond_files):
-    diamond_columns = ["qseqid","sseqid","pident","length","mismatch","gapopen","qstart","qend","sstart","send","evalue","bitscore","qlen","slen","stitle"]
-    diamond_file = pd.read_csv(diamond_path, delimiter='\t', decimal='.', header=None, names=diamond_columns)
-    diamond_file.loc[:,"sseqid"] = 'https://www.ncbi.nlm.nih.gov/protein/' + diamond_file.loc[:,"sseqid"]
+for (diamond_idx, diamond_path) in enumerate(diamond_files):
+    diamond_columns = ["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
+                       "qstart", "qend", "sstart", "send", "evalue", "bitscore", "qlen", "slen", "stitle"]
+    diamond_file = pd.read_csv(diamond_path, delimiter='\t', decimal='.',
+                               header=None, names=diamond_columns)
+    diamond_file.loc[:, "sseqid"] = 'https://www.ncbi.nlm.nih.gov/protein/' + \
+        diamond_file.loc[:, "sseqid"]
    group_diamond_file = diamond_file.groupby("qseqid")\
-        .agg({"stitle" : ';'.join, "sseqid" : ','.join})\
+        .agg({"stitle": ';'.join, "sseqid": ','.join})\
        .reset_index()\
        .reindex(columns=diamond_file.columns)
-    res_diamond_file = group_diamond_file.loc[:,["qseqid","sseqid","stitle"]]
+    res_diamond_file = group_diamond_file.loc[:, ["qseqid", "sseqid", "stitle"]]
    concat_diamond_files = pd.concat([concat_diamond_files, res_diamond_file])
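# Toy illustration of the groupby/agg above: two diamond hits for gene g1
# ("hitA", "hitB") collapse into a single row with stitle "hitA;hitB" and the
# two NCBI protein URLs joined by "," in sseqid.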
# Merge counts, annotation and diamond results.
-merge_annot = pd.merge(counts_file,concat_eggnog_mapper_files,left_on="seed_cluster",right_on='#query_name', how='left')
-merge = pd.merge(merge_annot,concat_diamond_files,left_on="seed_cluster",right_on="qseqid", how='left')
+merge_annot = pd.merge(counts_file, concat_eggnog_mapper_files,
+                       left_on="seed_cluster", right_on='#query_name', how='left')
+merge = pd.merge(merge_annot, concat_diamond_files,
+                 left_on="seed_cluster", right_on="qseqid", how='left')
merge.drop('#query_name', inplace=True, axis=1)
merge.drop("qseqid", inplace=True, axis=1)
......
@@ -12,7 +12,7 @@ process {
cpus = { 1 * task.attempt }
memory = { 2.GB * task.attempt }
errorStrategy = 'finish'
//{ task.exitStatus in [1,143,137,104,134,139] ? 'retry' : 'finish' }
maxRetries = 1
maxErrors = '-1'
@@ -109,7 +109,7 @@ process {
withLabel: eggnog {
container = 'file://metagwgs/env/eggnog_mapper.sif'
}
-withLabel: mosdepth {
+withName: depth_on_contigs {
container = 'file://metagwgs/env/mosdepth.sif'
}
}
process.executor = 'slurm'
includeConfig 'singularity.config'
singularity.runOptions = "-B /work/bank/ -B /bank -B /work2 -B /work -B /save -B /home -B /work/project"
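// Each -B option bind-mounts a cluster path into the container so that data
// and reference banks stored outside the image stay visible to the tools.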
process.queue = 'workq'
process {
    // Process-specific resource requirements
    cpus = { 1 * task.attempt }
    memory = { 2.GB * task.attempt }
    errorStrategy = { task.exitStatus in [1,143,137,104,134,139] ? 'retry' : 'finish' }
    maxRetries = 4
    maxErrors = '-1'
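    // With 'retry', the resource closures are re-evaluated on every attempt:
    // memory requests double on attempt 2 (2.GB * 2), triple on attempt 3, and
    // so on up to maxRetries. The listed exit codes cover common failures and
    // kill signals (e.g. 137 is often an out-of-memory kill).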
    withName: multiqc {
        memory = { 8.GB * task.attempt }
    }
    withName: quast {
        cpus = 3
        memory = { 8.GB * task.attempt }
    }
    withName: prokka {
        memory = { 45.GB * task.attempt }
        cpus = 8
    }
    withName: rename_contigs_genes {
        memory = { 2.GB * task.attempt }
    }
    withLabel: cd_hit {
        memory = { 50.GB * task.attempt }
        cpus = 16
    }
    withName: quantification {
        memory = { 50.GB * task.attempt }
    }
    withName: quantification_table {
        memory = { 100.GB * task.attempt }
    }
    withName: diamond {
        cpus = 8
        memory = { 22.GB * task.attempt }
    }
    withName: get_software_versions {
        memory = { 1.GB * task.attempt }
    }
    withLabel: binning {
        memory = { 5.GB * task.attempt }
    }
    withName: cat {
        cpus = 8
        memory = { 16.GB * task.attempt }
    }
    withName: eggnog_mapper_db {
        cpus = 2
        memory = { 2.GB * task.attempt }
    }
    withName: eggnog_mapper {
        cpus = 4
        memory = { 20.GB * task.attempt }
    }
    withName: merge_quantif_and_functional_annot {
        cpus = 1
        memory = { 50.GB * task.attempt }
    }
    withName: make_functional_annotation_tables {
        cpus = 1
        memory = { 50.GB * task.attempt }
    }
    withName: reads_alignment_on_contigs {
        cpus = 8
        memory = { 15.GB * task.attempt }
    }
}
includeConfig 'singularity.config'
includeConfig 'singularity.config'
singularity.runOptions = "-B /work/bank/ -B /bank -B /work2 -B /work -B /save -B /home -B /work/project -B /usr/local/bioinfo"
process.queue = 'workq'
process {
......
@@ -268,9 +268,11 @@ No parameters.
* `--diamond_bank "PATH/bank.dmnd"`: path to the diamond bank used to align protein sequences of genes. This bank must be built beforehand with [diamond makedb](https://github.com/bbuchfink/diamond/wiki). Default: `""`.
**WARNING 10:** You need to use an NCBI reference to have functional links in the output file _Quantifications_and_functional_annotations.tsv_ of the `06_func_annot` step.
#### **`06_func_annot` step:**
-**WARNING 10:** `06_func_annot` step depends on `01_clean_qc`, `02_assembly`, `03_filtering` (if you use it), `04_structural_annot` and `05_alignment` steps. You need to use mandatory files of these six steps to run `06_func_annot`. See [II. Input files](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs/-/blob/dev/docs/usage.md#ii-input-files) and WARNINGS from 1 to 9.
+**WARNING 11:** `06_func_annot` step depends on `01_clean_qc`, `02_assembly`, `03_filtering` (if you use it), `04_structural_annot` and `05_alignment` steps. You need to use mandatory files of these six steps to run `06_func_annot`. See [II. Input files](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs/-/blob/dev/docs/usage.md#ii-input-files) and WARNINGS from 1 to 9.
* `--percentage_identity [number]`: corresponds to the cd-hit-est `-c` option, the sequence identity threshold used to cluster genes. Default: `0.95`, i.e. 95% sequence identity. `number` must be between 0 and 1, written with `.` as the decimal separator.
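For example, `--percentage_identity 0.98` clusters genes at 98% sequence identity.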
@@ -278,11 +280,11 @@ No parameters.
* `--eggnog_mapper_db_dir "PATH/database_directory/"`: path to the eggNOG-mapper database if you have already downloaded it. If you run the `06_func_annot` step in several metagenomics projects, downloading the eggNOG-mapper database once before running metagWGS avoids storing several copies of it and saves disk space. See **WARNING 6**.
-**WARNING 11**: you need to use `--eggnogmapper_db` or `--eggnog_mapper_db_dir`. If it is not the case, an error message will occur.
+**WARNING 12:** You need to use either `--eggnogmapper_db` or `--eggnog_mapper_db_dir`; otherwise an error will be raised.
#### **`07_taxo_affi` step:**
-**WARNING 12:** `07_taxo_affi` step depends on `01_clean_qc`, `02_assembly`, `03_filtering` (if you use it), `04_structural_annot` and `05_alignment` steps. You need to use mandatory files of these six steps to run `07_taxo_affi`. See [II. Input files](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs/-/blob/dev/docs/usage.md#ii-input-files) and WARNINGS from 1 to 9.
+**WARNING 13:** `07_taxo_affi` step depends on `01_clean_qc`, `02_assembly`, `03_filtering` (if you use it), `04_structural_annot` and `05_alignment` steps. You need to use mandatory files of these six steps to run `07_taxo_affi`. See [II. Input files](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs/-/blob/dev/docs/usage.md#ii-input-files) and WARNINGS from 1 to 9.
* `--accession2taxid "FTP_PATH_TO_prot.accession2taxid.gz"`: FTP address of the NCBI file `prot.accession2taxid.gz`. Default: `"ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz"`.
@@ -292,13 +294,13 @@ No parameters.
#### **`08_binning` step:**
-**WARNING 13:** `08_binning` step depends on `01_clean_qc`, `02_assembly`, `03_filtering` (if you use it), `04_structural_annot` and `05_alignment` steps. You need to use mandatory files of these six steps to run `08_binning`. See [II. Input files](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs/-/blob/dev/docs/usage.md#ii-input-files) and WARNINGS from 1 to 9.
+**WARNING 14:** `08_binning` step depends on `01_clean_qc`, `02_assembly`, `03_filtering` (if you use it), `04_structural_annot` and `05_alignment` steps. You need to use mandatory files of these six steps to run `08_binning`. See [II. Input files](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs/-/blob/dev/docs/usage.md#ii-input-files) and WARNINGS from 1 to 9.
* `--min_contig_size [cutoff_length]`: contig length cutoff to filter contigs before binning. Must be greater than `1500`. Default: `1500`.
-* `--busco_reference "PATH/file_db"`: path to BUSCO database. Default: `"https://busco-archive.ezlab.org/v3/datasets/bacteria_odb9.tar.gz"`. **WARNING 14:** We use BUSCO v3 from the `metagWGS.sif` Singularity container. Be careful not to use the BUSCO reference of other BUSCO versions.
+* `--busco_reference "PATH/file_db"`: path to BUSCO database. Default: `"https://busco-archive.ezlab.org/v3/datasets/bacteria_odb9.tar.gz"`. **WARNING 15:** We use BUSCO v3 from the `metagWGS.sif` Singularity container. Be careful not to use the BUSCO reference of other BUSCO versions.
-* `--cat_db "PATH/CAT_prepare_20190108.tar.gz"`: path to CAT/BAT database. Default: `false`. **WARNING 15:** you need to download this database before running metagWGS `08_binning` step. Download it with: `wget tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz`.
+* `--cat_db "PATH/CAT_prepare_20190108.tar.gz"`: path to CAT/BAT database. Default: `false`. **WARNING 16:** You need to download this database before running the metagWGS `08_binning` step. Download it with: `wget tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz`.
#### Other parameters
@@ -316,4 +318,4 @@ See the description of output files in [this part](https://forgemia.inra.fr/geno
> If you have an account on the [genologin cluster](http://bioinfo.genotoul.fr/) and would like to familiarise yourself with metagWGS, see the tutorial on the [use case documentation page](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs/-/blob/dev/docs/use_case.md). It lets you analyze a larger test dataset with metagWGS.
-**WARNING:** the test dataset into `metagwgs/test` directory used in [I. Basic Usage](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs/-/blob/dev/docs/usage.md#i-basic-usage) is a small test dataset which does not allow to test all steps (`08_binning` doesn't work with this dataset).
+**WARNING 17:** The test dataset in the `metagwgs/test` directory used in [I. Basic Usage](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs/-/blob/dev/docs/usage.md#i-basic-usage) is small and does not allow testing all steps (`08_binning` does not work with this dataset).
#!/usr/bin/env nextflow
nextflow.enable.dsl=2
include { PROKKA_AND_RENAME } from './modules/prokka'
include { quast } from './modules/metaquast'
include { diamond } from './modules/diamond'
include { CD_HIT } from './modules/cd_hit'
include { ALIGN_HIFI_READS } from './modules/read_alignment'
include { FEATURE_COUNTS } from './modules/feature_counts'
Channel.fromPath(params.assemblies)
    .map { it -> [ file(it).getSimpleName(), it ] }
    .set { assembly_ch }

Channel.fromPath(params.reads)
    .map { it -> [ file(it).getSimpleName(), it ] }
    .set { reads_ch }
diamond_bank_ch = Channel.value(params.diamond_bank)
percentage_identity_ch = Channel.value(params.percentage_identity)
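// Channel.value creates value channels: unlike the queue channels above, their
// single value can be read by every downstream task without being consumed.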
workflow {
    quast(assembly_ch)

    PROKKA_AND_RENAME( assembly_ch )
    fna_ch = PROKKA_AND_RENAME.out.fna
    gff_ch = PROKKA_AND_RENAME.out.gff
    faa_ch = PROKKA_AND_RENAME.out.faa
    ffn_ch = PROKKA_AND_RENAME.out.ffn

    reads_and_contigs_ch = fna_ch.join(reads_ch, remainder: false)
    ALIGN_HIFI_READS(reads_and_contigs_ch)
    reads_to_contigs_bam_ch = ALIGN_HIFI_READS.out.bam
    idxstats_ch = ALIGN_HIFI_READS.out.idxstats
    contig_bed_ch = ALIGN_HIFI_READS.out.contig_bed

    diamond(faa_ch, diamond_bank_ch)
    diamond_result_ch = diamond.out.diamond_result

    CD_HIT(ffn_ch, percentage_identity_ch)
    individual_clstr_table_ch = CD_HIT.out.individual_clstr_table
    global_clstr_table_ch = CD_HIT.out.global_clstr_table

    FEATURE_COUNTS(gff_ch, reads_to_contigs_bam_ch, individual_clstr_table_ch, global_clstr_table_ch)
    quantification_table_ch = FEATURE_COUNTS.out.quantification_table
}
process INDIVIDUAL_CD_HIT {
tag "${replicateId}"
publishDir "${params.outdir}/06_func_annot/06_1_clustering", mode: 'copy'
label 'cd_hit'
input:
tuple val(replicateId), file(assembly_ffn_file)
val percentage_identity_cdhit
output:
path("${replicateId}.cd-hit-est.${percentage_identity_cdhit}.fasta"), emit: clstr_fasta
path("${replicateId}.cd-hit-est.${percentage_identity_cdhit}.table_cluster_contigs.txt"), emit: individual_clstr_table
path("${replicateId}.cd-hit-est.${percentage_identity_cdhit}.fasta.clstr")
// when: ('06_func_annot' in step)
script:
"""
cd-hit-est -c ${percentage_identity_cdhit} -i ${assembly_ffn_file} -o ${replicateId}.cd-hit-est.${percentage_identity_cdhit}.fasta -T ${task.cpus} -M ${task.mem} -d 150
cat ${replicateId}.cd-hit-est.${percentage_identity_cdhit}.fasta.clstr | cd_hit_produce_table_clstr.py > ${replicateId}.cd-hit-est.${percentage_identity_cdhit}.table_cluster_contigs.txt
"""
}
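// Clustering is hierarchical: each sample is clustered on its own above, then
// the per-sample representative sequences are pooled and re-clustered globally below.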
// Global clustering with CD-HIT.
process GLOBAL_CD_HIT {
publishDir "${params.outdir}/06_func_annot/06_1_clustering", mode: 'copy'
label 'cd_hit'
input:
file "*.fasta"
val percentage_identity_cdhit
output:
path "All-cd-hit-est.${percentage_identity_cdhit}.fasta"
path "All-cd-hit-est.${percentage_identity_cdhit}.fasta.clstr"
path "table_clstr.txt", emit: global_clstr_table
// when: ('06_func_annot' in step)
script:
"""
cat * > All-cd-hit-est.${percentage_identity_cdhit}
cd-hit-est -c ${percentage_identity_cdhit} -i All-cd-hit-est.${percentage_identity_cdhit} -o All-cd-hit-est.${percentage_identity_cdhit}.fasta -T ${task.cpus} -M {task.mem} -d 150
cat All-cd-hit-est.${percentage_identity_cdhit}.fasta.clstr | cd_hit_produce_table_clstr.py > table_clstr.txt
"""
}
workflow CD_HIT {
    take:
    assembly_ch            // channel: [ val(sampleid), path(genes_ffn) ]
    percentage_identity_ch // channel: val

    main:
    INDIVIDUAL_CD_HIT( assembly_ch, percentage_identity_ch )
    GLOBAL_CD_HIT( INDIVIDUAL_CD_HIT.out.clstr_fasta.collect(), percentage_identity_ch )

    emit:
    individual_clstr_table = INDIVIDUAL_CD_HIT.out.individual_clstr_table
    global_clstr_table = GLOBAL_CD_HIT.out.global_clstr_table
}
process diamond {
publishDir "${params.outdir}/05_alignment/05_2_database_alignment/$replicateId", mode: 'copy'
tag "${replicateId}"
// when: ('05_alignment' in step || '06_func_annot' in step || '07_taxo_affi' in step || '08_binning' in step)
input:
tuple val(replicateId), file(renamed_prokka_faa)
val diamond_bank
output:
tuple val(replicateId), path("${replicateId}_aln_diamond.m8"), emit: diamond_result
script:
"""
diamond blastp -p ${task.cpus} -d ${diamond_bank} -q ${renamed_prokka_faa} -o ${replicateId}_aln_diamond.m8 -f 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen stitle
"""
}
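// -f 6 with an explicit field list yields a 15-column tabular output whose
// column order matches diamond_columns in the merge script earlier in this diff.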
// Quantification of reads on each gene in each sample.
process quantification {
tag "${replicateId}"
publishDir "${params.outdir}/06_func_annot/06_2_quantification", mode: 'copy'
input:
tuple val(replicateId), file(gff_prokka), file(bam), file(bam_index)
output:
path "${replicateId}.featureCounts.tsv", emit: count_table
path "${replicateId}.featureCounts.tsv.summary", emit: summary
path "${replicateId}.featureCounts.stdout"
// when: ('06_func_annot' in step)
script:
"""
featureCounts -T ${task.cpus} -p -O -t gene -g ID -a ${gff_prokka} -o ${replicateId}.featureCounts.tsv ${bam} &> ${replicateId}.featureCounts.stdout
"""
}
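// featureCounts counts reads over 'gene' features keyed by the GFF ID
// attribute (-t gene -g ID); -p enables paired-end mode and -O assigns reads
// to every feature they overlap instead of discarding multi-overlapping reads.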
// Create table with sum of reads for each global cluster of genes in each sample.
process quantification_table {
publishDir "${params.outdir}/06_func_annot/06_2_quantification", mode: 'copy'
label 'python'
input:
path clusters_contigs
path global_clusters_clusters
path counts_files
output:
path "Clusters_Count_table_all_samples.txt", emit: quantification_table
path "Correspondence_global_clstr_genes.txt"
// when: ('06_func_annot' in step)
script:
"""
ls ${clusters_contigs} | cat > List_of_contigs_files.txt
ls ${counts_files} | cat > List_of_count_files.txt
Quantification_clusters.py -t ${global_clusters_clusters} -l List_of_contigs_files.txt -c List_of_count_files.txt -oc Clusters_Count_table_all_samples.txt -oid Correspondence_global_clstr_genes.txt
"""
}
workflow FEATURE_COUNTS {
    take:
    gff_ch // channel: [ val(sampleid), path(gff) ]
    bam_ch // channel: [ val(sampleid), path(bam), path(bam_index) ]
    individual_clstr_table_ch
    global_clstr_table_ch

    main:
    gff_and_bam_ch = gff_ch.join(bam_ch, remainder: false)
    quantification(gff_and_bam_ch)
    count_table_ch = quantification.out.count_table.collect()
    quantification_table(individual_clstr_table_ch.collect(), global_clstr_table_ch.collect(), count_table_ch)

    emit:
    quantification_table = quantification_table.out.quantification_table
}
// Assembly metrics.
process quast {
publishDir "${params.outdir}/02_assembly", mode: 'copy'
input:
tuple val(replicateId), file(assembly_file)
output:
path "${replicateId}_all_contigs_QC/*" // into quast_assembly_ch
path "${replicateId}_all_contigs_QC/report.tsv", emit: quast_assembly_for_multiqc_ch
// when: ('02_assembly' in step || '03_filtering' in step || '04_structural_annot' in step || '05_alignment' in step || '06_func_annot' in step || '07_taxo_affi' in step || '08_binning' in step)
script:
"""
mkdir ${replicateId}_all_contigs_QC/
touch ${replicateId}_all_contigs_QC/report.tsv
metaquast.py --threads "${task.cpus}" --rna-finding --max-ref-number 0 --min-contig 0 "${assembly_file}" -o "${replicateId}_all_contigs_QC"
"""
}
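// The mkdir/touch above pre-create report.tsv, presumably so the declared
// output exists even when metaquast produces no report for an assembly.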
process prokka {
tag "${replicateId}"
input:
// set replicateId, file(assembly_file) from assembly_ch
tuple val(replicateId), file(assembly_file)
output:
tuple val(replicateId), path("*"), emit: prokka_results
//tuple val(replicateId), path("PROKKA_${replicateId}/${replicateId}.txt") , emit: prokka_for_multiqc_ch
// when: ('04_structural_annot' in step || '05_alignment' in step || '06_func_annot' in step || '07_taxo_affi' in step || '08_binning' in step)
script:
//replicateId = "$replicateId"
"""
prokka --metagenome --noanno --rawproduct --outdir PROKKA_${replicateId} --prefix ${replicateId} ${assembly_file} --centre X --compliant --cpus ${task.cpus}
"""
}
process RENAME_CONTIGS_AND_GENES {
tag "${replicateId}"
publishDir "${params.outdir}/04_structural_annot", mode: 'copy'
label 'python'
input:
// set replicateId, file(assembly_file) from prokka_ch
tuple val(replicateId), file(prokka_results)
output:
tuple val(replicateId), path("${replicateId}.annotated.fna"), emit: fna
tuple val(replicateId), path("${replicateId}.annotated.ffn"), emit: ffn
tuple val(replicateId), path("${replicateId}.annotated.faa"), emit: faa
tuple val(replicateId), path("${replicateId}.annotated.gff"), emit: gff
tuple val(replicateId), path("${replicateId}_prot.len"), emit: contigs_length
// when: ('04_structural_annot' in step || '05_alignment' in step || '06_func_annot' in step || '07_taxo_affi' in step || '08_binning' in step)
script:
"""
grep "^gnl" ${prokka_results}/${replicateId}.gff > ${replicateId}_only_gnl.gff
Rename_contigs_and_genes.py -f ${replicateId}_only_gnl.gff -faa ${prokka_results}/${replicateId}.faa \
-ffn ${prokka_results}/${replicateId}.ffn -fna ${prokka_results}/${replicateId}.fna \
-p ${replicateId} -oGFF ${replicateId}.annotated.gff -oFAA ${replicateId}.annotated.faa \
-oFFN ${replicateId}.annotated.ffn -oFNA ${replicateId}.annotated.fna
samtools faidx ${replicateId}.annotated.faa; cut -f 1,2 ${replicateId}.annotated.faa.fai > ${replicateId}_prot.len
"""
}
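// samtools faidx indexes the renamed protein FASTA; columns 1-2 of the .fai
// (sequence name, length) become the _prot.len table emitted as contigs_length.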
workflow PROKKA_AND_RENAME {
    take:
    assembly_ch // channel: [ val(sampleid), path(assemblyfasta) ]

    main:
    prokka( assembly_ch )
    RENAME_CONTIGS_AND_GENES(prokka.out.prokka_results)

    emit:
    fna = RENAME_CONTIGS_AND_GENES.out.fna
    ffn = RENAME_CONTIGS_AND_GENES.out.ffn
    gff = RENAME_CONTIGS_AND_GENES.out.gff
    faa = RENAME_CONTIGS_AND_GENES.out.faa
    contigs_length = RENAME_CONTIGS_AND_GENES.out.contigs_length
}
\ No newline at end of file
process ALIGN_HIFI_READS {