main.nf 57.9 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
#!/usr/bin/env nextflow
/*
========================================================================================
                         metagWGS
========================================================================================
 metagWGS Analysis Pipeline.
 #### Homepage / Documentation
 https://forgemia.inra.fr/genotoul-bioinfo/metagwgs/
----------------------------------------------------------------------------------------
*/


13
/*
14
 * Define helpMessage
15
16
 */

17
 // Print the pipeline usage/help text (shown when --help is given).
 def helpMessage() {
   log.info"""
   Usage:

     The typical command for running the pipeline is as follows:
       nextflow run -profile standard main.nf --reads '*_{R1,R2}.fastq.gz' --skip_removal_host --skip_kaiju

     Mandatory arguments:
       --reads [path]                Path to input data (must be surrounded with quotes).

     Options:

       --step                        Choose step(s) into: "01_clean_qc", "02_assembly", "03_filtering",
                                     "04_structural_annot", "05_alignment", "06_func_annot", "07_taxo_affi", "08_binning".
                                     i. You can directly indicate the final step that is important to you. For example,
                                     if you are interested in binning (and the taxonomic affiliation of bins), just use --step "08_binning".
                                     It runs the previous steps automatically (except "03_filtering", see ii).
                                     ii. "03_filtering" is automatically skipped for the next steps
                                     "04_structural_annot", "05_alignment", "06_func_annot", "07_taxo_affi" and "08_binning".
                                     If you want to filter your assembly before doing one of these steps, you must use --step "03_filtering,the_step",
                                     for example --step "03_filtering,04_structural_annot".
                                     iii. When you run one of the three steps "06_func_annot", "07_taxo_affi" or "08_binning" during a first analysis
                                     and then another of these steps interests you and you run metagWGS again to get the result of this other step,
                                     you have to indicate --step "the_first_step,the_second_step". This will allow you to have a final MultiQC html report
                                     that will take into account the metrics of both analyses performed. If the third of these steps interests you and you run again
                                     metagWGS for this step, you also have to indicate --step "the_first_step,the_second_step,the_third,step" for the same reasons.

     01_clean_qc options:
       --skip_01_clean_qc            Skip 01_clean_qc step.
       --adapter1                    Sequence of adapter 1. Default: Illumina TruSeq adapter.
       --adapter2                    Sequence of adapter 2. Default: Illumina TruSeq adapter.
       --skip_sickle                 Skip sickle process.
       --quality_type                Type of quality values for sickle (solexa (CASAVA < 1.3), illumina (CASAVA 1.3 to 1.7), sanger (which is CASAVA >= 1.8)). Default: 'sanger'.
       --skip_removal_host           Skip filter host reads process.
       --host_fasta                  Full path to fasta of host genome ("PATH/name_genome.fasta").
       --host_bwa_index              Full path to directory containing BWA index including base name i.e ("PATH/name_genome.{amb,ann,bwt,pac,sa}").
       You need to use --skip_removal_host or --host_fasta or --skip_01_clean_qc. If it is not the case, an error message will occur.
       --skip_kaiju                  Skip taxonomic affiliation of reads with kaiju.
       --kaiju_db_dir                Directory with kaiju database already built ("PATH/directory").
       --kaiju_db                    Indicate kaiju database you want to build. Use: --kaiju_db "http://kaiju.binf.ku.dk/database/CHOOSEN_DATABASE.tgz".
       You need to use --kaiju_db_dir or --kaiju_db or --skip_kaiju. If it is not the case, an error message will occur.

     02_assembly options:
       --assembly                    Indicate the assembly tool ["metaspades" or "megahit"]. Default: "metaspades".
       --metaspades_mem [mem_value]  Memory (in G) used by metaspades process. Default: 440.

     03_filtering options:
       --min_contigs_cpm [cutoff]    CPM cutoff (Count Per Million) to filter contigs with low number of reads. Default: 10.

     05_alignment options:
       --diamond_bank                Path to diamond bank used to align protein sequence of genes: "PATH/bank.dmnd".
                                     This bank must be previously built with diamond makedb.

     06_func_annot options:
       --percentage_identity [nb]    Sequence identity threshold. Default: 0.95 corresponding to 95%. Use a number between 0 and 1.
       --eggnogmapper_db             Use: --eggnogmapper_db if you want that metagwgs build the database. Default false: metagWGS didn't build this database.
       --eggnog_mapper_db_dir        Path to eggnog-mapper database "PATH/database_directory/" if it is already built. Default: false.
       You need to use --eggnogmapper_db or --eggnog_mapper_db_dir. If it is not the case, an error message will occur.

     07_taxo_affi options:
       --accession2taxid             FTP adress of file prot.accession2taxid.gz. Default: "ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz".
       --taxdump                     FTP adress of file taxdump.tar.gz. Default "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz".

     08_binning options:
       --min_contig_size [cutoff]    Contig length cutoff to filter contigs before binning. Must be greater than 1500. Default: 1500.
       --busco_reference             BUSCO v3 database. Default: "https://busco-archive.ezlab.org/v3/datasets/bacteria_odb9.tar.gz".
       --cat_db                      CAT/BAT database "PATH/CAT_prepare_20190108.tar.gz". Default: false. Must be previously downloaded.

     Other options:
       --outdir                      The output directory where the results will be saved: "dir_name". Default "results".
       --help                        Show this message and exit.
       --multiqc_config              If you want to change the configuration of multiqc report: "PATH/multiqc.yaml". Default "$baseDir/assets/multiqc_config.yaml".

       -profile                      Configuration profile to use.
                                     Available: singularity (use of Singularity containers), conda (use of conda environments), genotoul (use of metagWGS on genologin cluster),
                                     different test profiles (test_genotoul_workq, test_genotoul_testq, big_test_genotoul, test_local) and debug profile.
   """.stripIndent()
 }
95

96
97
98
99
 /*
  * SET UP CONFIGURATION VARIABLES.
  */

// Show help message and exit when --help is given.
if (params.help){
  helpMessage()
  exit 0
}

107
108
// Define list of available steps.
// Returns the ordered list of valid values for the --step parameter.
def defineStepList() {
    return [
        '01_clean_qc',
        '02_assembly',
        '03_filtering',
        '04_structural_annot',
        '05_alignment',
        '06_func_annot',
        '07_taxo_affi',
        '08_binning'
    ]
}

122
// Check step existence.
// Returns true when every element of list_it belongs to list; otherwise
// logs one warning per unknown element and returns false.
def checkParameterExistence(list_it, list) {
    nb_false_step = 0
    for(it in list_it) {
        if (!list.contains(it)) {
            log.warn "Unknown parameter: ${it}"
            nb_false_step = nb_false_step + 1
        }
    }
    if(nb_false_step > 0) {return false}
    else {return true}
}

136
// Check number of steps.

// Set up parameters: parse the comma-separated --step value and validate it.
step = params.step.split(",")
stepList = defineStepList()
if (!checkParameterExistence(step, stepList)) exit 1, "Unknown step(s) upon ${step}, see --help for more information"

// Validate the assembly tool choice.
// NOTE: the error message previously said "Invalid aligner option" although
// the check is on the assembly tool; fixed for accuracy.
if (!['metaspades','megahit'].contains(params.assembly)){
    exit 1, "Invalid assembly option: ${params.assembly}. Valid options: 'metaspades', 'megahit'"
}

// Validate the sickle quality encoding.
if (!["solexa","illumina","sanger"].contains(params.quality_type)){
    exit 1, "Invalid quality_type option: ${params.quality_type}. Valid options:'solexa','illumina','sanger'"
}
151
152

/*
 * Create channels for adapters, qualityType, alignment genome, mode value.
 */

adapter1_ch = Channel.value(params.adapter1)
adapter2_ch = Channel.value(params.adapter2)

// BUSCO reference database (used unless BUSCO is skipped).
if(!params.skip_busco){
    Channel
        .fromPath( "${params.busco_reference}", checkIfExists: true )
        .set { file_busco_db }
} else {
    file_busco_db = Channel.from()
}

// CAT/BAT database archive for taxonomic affiliation of bins.
if(params.cat_db){
    Channel
        .fromPath( "${params.cat_db}", checkIfExists: true )
        .set { file_cat_db }
} else {
    file_cat_db = Channel.from()
}

// Host genome fasta for the host-read removal step.
if(params.host_fasta){
    bwa_fasta_ch = Channel.value(file(params.host_fasta))
} else {
    bwa_fasta_ch = Channel.from()
}

diamond_bank_ch = Channel.value(params.diamond_bank)
accession2taxid_ch = Channel.value(params.accession2taxid)
taxdump_ch = Channel.value(params.taxdump)
percentage_identity_ch = Channel.value(params.percentage_identity)
min_contigs_cpm_ch = Channel.value(params.min_contigs_cpm)
metaspades_mem_ch = Channel.value(params.metaspades_mem)

multiqc_config_ch = file(params.multiqc_config, checkIfExists: true)

/*
 * Create channels for input read files.
 */
 Channel
     .fromFilePairs( params.reads, size: params.single_end ? 1 : 2, flat: true )
     .ifEmpty { exit 1, "Cannot find any reads matching: ${params.reads}\nNB: Path needs to be enclosed in quotes!\nNB: Path requires at least one * wildcard!\nIf this is single-end data, please specify --singleEnd on the command line." }
     .into { raw_reads_fastqc; raw_reads_cutadapt; raw_reads_assembly_ch; raw_reads_dedup_ch}

// Taxonomic ranks used by kaiju summaries and by contig affiliation.
taxons = "phylum class order family genus species"
taxons_affi_taxo_contigs = "all superkingdom phylum class order family genus species"
199

Joanna Fourquet's avatar
Joanna Fourquet committed
200
// index host if needed: build a BWA index for the host genome when a fasta
// is provided without a prebuilt index and host removal will actually run.
if (params.host_fasta && !(params.host_bwa_index) && "01_clean_qc" in step && !(params.skip_removal_host)) {
    lastPath = params.host_fasta.lastIndexOf(File.separator)
    bwa_base = params.host_fasta.substring(lastPath+1)
    fasta_ch = file(params.host_fasta, checkIfExists: true)

    process BWAIndex {
        publishDir "${params.outdir}/01_clean_qc/01_1_cleaned_reads/bwa_index" , mode: 'copy'

        input:
        file fasta from fasta_ch

        output:
        file("${fasta}.*") into bwa_built

        script:
        """
        bwa index -a bwtsw $fasta
        """
    }
}
221

222
223
// A host fasta (or an explicit skip) is mandatory when host removal is enabled.
if (!(params.skip_removal_host) && !(params.host_fasta) && !(params.skip_01_clean_qc)){
    exit 1, "You must specify --host_fasta or skip cleaning host step with option --skip_removal_host or skip all clean and qc modules with --skip_01_clean_qc"
}

// Use the user-provided BWA index when given, else the one built by BWAIndex.
if (!(params.skip_removal_host) && !(params.skip_01_clean_qc)){
    bwa_index_ch = params.host_bwa_index ? Channel.value(file(params.host_bwa_index)) : bwa_built
}
Joanna Fourquet's avatar
Joanna Fourquet committed
229

230
/*
 * CLEANING.
 */

// Cutadapt: adapter trimming plus quality/length filtering of raw reads.
process cutadapt {
  tag "$replicateId"
  publishDir "${params.outdir}/01_clean_qc/01_1_cleaned_reads/", mode: 'copy', pattern: 'cleaned_*.fastq.gz'
  publishDir "${params.outdir}/01_clean_qc/01_1_cleaned_reads/logs", mode: 'copy', pattern: '*_cutadapt.log'

  input:
  set replicateId, file(read1), file(read2) from raw_reads_cutadapt
  val adapter1_ch
  val adapter2_ch

  output:
  set replicateId, file("*${replicateId}*_R1.fastq.gz"), file("*${replicateId}*_R2.fastq.gz") into (cutadapt_reads_ch, cutadapt2_reads_ch, cutadapt3_reads_ch)
  file("${replicateId}_cutadapt.log") into cutadapt_log_ch_for_multiqc

  when: ('01_clean_qc' in step || '02_assembly' in step || '03_filtering' in step || '04_structural_annot' in step || '05_alignment' in step || '06_func_annot' in step || '07_taxo_affi' in step || '08_binning' in step) && (!params.skip_01_clean_qc)

  script:
  if(params.skip_sickle & params.skip_removal_host) {
    // output are final cleaned files
    output_files = "-o cleaned_${replicateId}_R1.fastq.gz -p cleaned_${replicateId}_R2.fastq.gz"
  } else {
    //tempory files not saved in publish dir
    output_files = "-o ${replicateId}_cutadapt_R1.fastq.gz -p ${replicateId}_cutadapt_R2.fastq.gz"
  }
  """
  cutadapt -a $adapter1_ch -A $adapter2_ch $output_files -m 36 --trim-n -q 20,20 --max-n 0 \
  --cores=${task.cpus} ${read1} ${read2} > ${replicateId}_cutadapt.log
  """
}
264

265
// Sickle: sliding-window quality trimming of the cutadapt output.
process sickle {
  tag "$replicateId"
  publishDir "${params.outdir}/01_clean_qc/01_1_cleaned_reads/", mode: 'copy', pattern: 'cleaned_*.fastq.gz'
  publishDir "${params.outdir}/01_clean_qc/01_1_cleaned_reads/logs", mode: 'copy', pattern: '*_sickle.log'

  when:
  (!params.skip_sickle) && ('01_clean_qc' in step || '02_assembly' in step || '03_filtering' in step || '04_structural_annot' in step || '05_alignment' in step || '06_func_annot' in step || '07_taxo_affi' in step || '08_binning' in step) && (!params.skip_01_clean_qc)

  input:
  set replicateId, file(cutadapt_reads_R1), file(cutadapt_reads_R2) from cutadapt_reads_ch

  output:
  set replicateId, file("*${replicateId}*_R1.fastq.gz"), file("*${replicateId}*_R2.fastq.gz") into sickle_reads_ch, sickle2_reads_ch
  file("${replicateId}_single_sickle.fastq.gz") into sickle_single_ch
  file("${replicateId}_sickle.log") into sickle_log_ch_for_multiqc

  script:
  mode = params.single_end ? 'se' : 'pe'

  if(params.skip_removal_host) {
    // output are final cleaned files
    options = "-o cleaned_${replicateId}_R1.fastq.gz -p cleaned_${replicateId}_R2.fastq.gz"
  } else {
    //tempory files not saved in publish dir
    options = "-o ${replicateId}_sickle_R1.fastq.gz -p ${replicateId}_sickle_R2.fastq.gz"
  }
  options += " -t " + params.quality_type
  """
  sickle ${mode} -f ${cutadapt_reads_R1} -r ${cutadapt_reads_R2} $options \
  -s ${replicateId}_single_sickle.fastq.gz -g > ${replicateId}_sickle.log
  """
}

299
// Pick intermediate cleaned reads: sickle output when sickle ran, else cutadapt output.
if (!params.skip_sickle) {
  sickle_reads_ch.set{intermediate_cleaned_ch}
}
else {
  cutadapt2_reads_ch.set{intermediate_cleaned_ch}
}

306
// WARNING: use bioinfo_bwa_samtools module.
// Host-read removal: map reads on the host genome with BWA and keep only
// pairs where both mates are unmapped (samtools -f 12).
if (!params.skip_removal_host && ('01_clean_qc' in step || '02_assembly' in step || '03_filtering' in step || '04_structural_annot' in step || '05_alignment' in step || '06_func_annot' in step || '07_taxo_affi' in step || '08_binning' in step) && (!params.skip_01_clean_qc)) {
  process host_filter {
    tag "${replicateId}"
    publishDir "${params.outdir}/01_clean_qc/01_1_cleaned_reads/", mode: 'copy', pattern: 'cleaned_*.fastq.gz'
    publishDir "${params.outdir}/01_clean_qc/01_1_cleaned_reads/", mode: 'copy', pattern: '*.bam'
    publishDir "${params.outdir}/01_clean_qc/01_1_cleaned_reads/logs", mode: 'copy',
        saveAs: {filename ->
        if (filename.indexOf(".flagstat") > 0 ) "$filename"
        else null}
    publishDir "${params.outdir}/01_clean_qc/01_1_cleaned_reads/logs", mode: 'copy',
        saveAs: {filename ->
        if (filename.indexOf(".nb_bases") > 0 ) "$filename"
        else null}

    input:
    set replicateId, file(trimmed_reads_R1), file(trimmed_reads_R2) from intermediate_cleaned_ch
    file index from bwa_index_ch
    file fasta from bwa_fasta_ch

    output:
    set replicateId, file("cleaned_${replicateId}_R1.fastq.gz"), file("cleaned_${replicateId}_R2.fastq.gz") into filter_reads_ch
    file("host_filter_flagstat/${replicateId}.host_filter.flagstat") into flagstat_after_host_filter_for_multiqc_ch
    file("${replicateId}.no_filter.flagstat") into flagstat_before_filter_for_multiqc_ch
    file("${replicateId}_cleaned_R1.nb_bases")
    file("${replicateId}_cleaned_R2.nb_bases")

    """
    bwa mem -t ${task.cpus} ${fasta} ${trimmed_reads_R1} ${trimmed_reads_R2} > ${replicateId}.bam
    samtools view -bhS -f 12 ${replicateId}.bam > ${replicateId}.without_host.bam
    mkdir host_filter_flagstat
    samtools flagstat ${replicateId}.bam > ${replicateId}.no_filter.flagstat
    samtools flagstat ${replicateId}.without_host.bam >> host_filter_flagstat/${replicateId}.host_filter.flagstat
    bamToFastq -i ${replicateId}.without_host.bam -fq cleaned_${replicateId}_R1.fastq -fq2 cleaned_${replicateId}_R2.fastq
    gzip cleaned_${replicateId}_R1.fastq
    gzip cleaned_${replicateId}_R2.fastq
    zcat "cleaned_${replicateId}_R1.fastq.gz" | paste - - - - | cut -f 2 | tr -d '\n' | wc -c > ${replicateId}_cleaned_R1.nb_bases
    zcat "cleaned_${replicateId}_R2.fastq.gz" | paste - - - - | cut -f 2 | tr -d '\n' | wc -c > ${replicateId}_cleaned_R2.nb_bases
    rm ${replicateId}.bam
    rm ${replicateId}.without_host.bam
    """
  }
  filter_reads_ch.set{preprocessed_reads_ch}
}
else {
  intermediate_cleaned_ch.set{preprocessed_reads_ch}
  Channel.empty().set{flagstat_after_host_filter_for_multiqc_ch}
  Channel.empty().set{flagstat_before_filter_for_multiqc_ch}
}

356
357
358
359
// Fan out the preprocessed reads to the downstream consumers
// (FastQC, kaiju, assembly, deduplication).
preprocessed_reads_ch.into{
  clean_reads_for_fastqc_ch
  clean_reads_for_kaiju_ch
  clean_reads_for_assembly_ch
  clean_reads_for_dedup_ch
}
362

363

364
365
// FastQC on raw data
process fastqc_raw {
  tag "${replicateId}"
  publishDir "${params.outdir}/01_clean_qc/01_2_qc/fastqc_raw/", mode: 'copy'

  input:
  set replicateId, file(read1), file(read2) from raw_reads_fastqc

  output:
  file("${replicateId}/*.zip") into fastqc_raw_for_multiqc_ch
  file("${replicateId}/*.html") into fastqc_raw_ch

  when: ('01_clean_qc' in step || '02_assembly' in step || '03_filtering' in step || '04_structural_annot' in step || '05_alignment' in step || '06_func_annot' in step || '07_taxo_affi' in step || '08_binning' in step) && (!params.skip_01_clean_qc)

  script:
  """
    mkdir ${replicateId} ; fastqc --nogroup --quiet -o ${replicateId} --threads ${task.cpus} ${read1} ${read2}
  """
}
383

384
385
386
// FastQC on cleaned data
process fastqc_cleaned {
  tag "${replicateId}"
  publishDir "${params.outdir}/01_clean_qc/01_2_qc/fastqc_cleaned", mode: 'copy'

  input:
  set replicateId, file(read1), file(read2) from clean_reads_for_fastqc_ch

  output:
  file("cleaned_${replicateId}/*.zip") into fastqc_cleaned_for_multiqc_ch
  file("cleaned_${replicateId}/*.html") into fastqc_cleaned_ch

  // FIX: the condition previously tested "'structural' in step", a value that
  // never appears in the step list, so this process was silently skipped when
  // only 04_structural_annot was requested. Aligned with every other when: clause.
  when: ('01_clean_qc' in step || '02_assembly' in step || '03_filtering' in step || '04_structural_annot' in step || '05_alignment' in step || '06_func_annot' in step || '07_taxo_affi' in step || '08_binning' in step) && (!params.skip_01_clean_qc)

  script:
  """
  mkdir cleaned_${replicateId}; fastqc --nogroup --quiet -o cleaned_${replicateId} --threads ${task.cpus} ${read1} ${read2}
  """
}
403
404
405
406
407

/*
 * TAXONOMIC CLASSIFICATION.
 */

if (!params.skip_kaiju && ('01_clean_qc' in step || '02_assembly' in step || '03_filtering' in step || '04_structural_annot' in step || '05_alignment' in step || '06_func_annot' in step || '07_taxo_affi' in step || '08_binning' in step) && (!params.skip_01_clean_qc)) {

  if(params.kaiju_db) {
    // Built kaiju database index: download and unpack the archive given by --kaiju_db.
    process index_db_kaiju {
      publishDir "${params.outdir}/01_clean_qc/01_3_taxonomic_affiliation_reads/index", mode: 'copy'

      input:
      val database from params.kaiju_db

      output:
      file("nodes.dmp") into index_kaiju_nodes_final_ch
      file("kaiju_db_refseq.fmi") into index_kaiju_db_final_ch
      file("names.dmp") into index_kaiju_names_final_ch

      script:
      """
      wget ${database}
      file='${database}'
      fileNameDatabase=\${file##*/}
      echo \$fileNameDatabase
      tar -zxvf \$fileNameDatabase
      """
    }
  } else {
      if(params.kaiju_db_dir) {
          // Reuse a prebuilt kaiju database directory (--kaiju_db_dir).
          kaiju_totalPath = params.kaiju_db_dir.lastIndexOf(File.separator)
          kaiju_Path = params.kaiju_db_dir.substring(0,kaiju_totalPath)
          kaiju_Path_db = params.kaiju_db_dir.substring(kaiju_totalPath + 1)
          index_kaiju_nodes_final_ch  = Channel
              .value(kaiju_Path + "/" + kaiju_Path_db + "/nodes.dmp")
          index_kaiju_db_final_ch = Channel
              .value(kaiju_Path + "/" + kaiju_Path_db + "/kaiju_db_refseq.fmi")
          index_kaiju_names_final_ch = Channel
              .value(kaiju_Path + "/" + kaiju_Path_db + "/names.dmp")
      }
      else {
          if (!(params.kaiju_db) & !(params.kaiju_db_dir) & (!params.skip_kaiju)){
              exit 1, "You must specify --kaiju_db or --kaiju_db_dir or --skip_kaiju"
          }
      }
  }

  // Kaiju.
  process kaiju {
    tag "${replicateId}"
    publishDir "${params.outdir}/01_clean_qc/01_3_taxonomic_affiliation_reads", mode: 'copy', pattern: '*.krona.html'

    input:
    set replicateId, file(preprocessed_reads_R1), file(preprocessed_reads_R2) from clean_reads_for_kaiju_ch
    val (index_kaiju_nodes_final) from index_kaiju_nodes_final_ch
    val (index_kaiju_db_final) from index_kaiju_db_final_ch
    val (index_kaiju_names_final) from index_kaiju_names_final_ch

    output:
    file("${replicateId}_kaiju_MEM_verbose.out") into kaiju_MEM_verbose_ch
    file("${replicateId}.krona.html") into kaiju_krona_ch
    file("*.summary_*") into kaiju_summary_for_multiqc_ch
    file("*.summary_species") into kaiju_summary_species_ch
    file("*.summary_genus") into kaiju_summary_genus_ch
    file("*.summary_family") into kaiju_summary_family_ch
    file("*.summary_class") into kaiju_summary_class_ch
    file("*.summary_order") into kaiju_summary_order_ch
    file("*.summary_phylum") into kaiju_summary_phylum_ch
    file("*.pdf") into plot_ch

    script:
    """
    kaiju -z ${task.cpus} -t ${index_kaiju_nodes_final} -f ${index_kaiju_db_final} -i ${preprocessed_reads_R1} -j ${preprocessed_reads_R2} -o ${replicateId}_kaiju_MEM_verbose.out -a mem -v
    kaiju2krona -t ${index_kaiju_nodes_final} -n ${index_kaiju_names_final} -i ${replicateId}_kaiju_MEM_verbose.out -o ${replicateId}_kaiju_MEM_without_unassigned.out.krona -u
    ktImportText -o ${replicateId}.krona.html ${replicateId}_kaiju_MEM_without_unassigned.out.krona
    for i in ${taxons} ;
    do
    kaiju2table -t ${index_kaiju_nodes_final} -n ${index_kaiju_names_final} -r \$i -o ${replicateId}_kaiju_MEM.out.summary_\$i ${replicateId}_kaiju_MEM_verbose.out
    done
    Generate_barplot_kaiju.py -i ${replicateId}_kaiju_MEM_verbose.out
    """
  }

  // Merge kaiju results by species
  process kaiju_merge{
    tag "${replicateId}"
    publishDir "${params.outdir}/01_clean_qc/01_3_taxonomic_affiliation_reads", mode: 'copy'

    input:
    file(kaiju_species) from kaiju_summary_species_ch.collect()
    file(kaiju_genus) from kaiju_summary_genus_ch.collect()
    file(kaiju_family) from kaiju_summary_family_ch.collect()
    file(kaiju_class) from kaiju_summary_class_ch.collect()
    file(kaiju_order) from kaiju_summary_order_ch.collect()
    file(kaiju_phylum) from kaiju_summary_phylum_ch.collect()

    output:
    file("taxo_affi_reads_*.tsv") into merge_files_kaiju_ch

    script:
    """
    echo "${kaiju_phylum}" > phylum.txt
    echo "${kaiju_order}" > order.txt
    echo "${kaiju_class}" > class.txt
    echo "${kaiju_family}" > family.txt
    echo "${kaiju_genus}" > genus.txt
    echo "${kaiju_species}" > species.txt
    for i in ${taxons} ;
    do
    merge_kaiju_results.py -f \$i".txt" -o taxo_affi_reads_\$i".tsv"
    done
    """
  }
}
else {
Channel.empty().set{kaiju_summary_for_multiqc_ch}
}
518

519
// When 01_clean_qc is skipped, feed the raw reads directly to assembly and
// deduplication; otherwise use the cleaned reads.
if(params.skip_01_clean_qc) {
  raw_reads_assembly_ch.set{
   input_reads_for_assembly_ch}
  raw_reads_dedup_ch.set{
   input_reads_for_dedup_ch}
}
else {
  clean_reads_for_assembly_ch.set{
   input_reads_for_assembly_ch}
  clean_reads_for_dedup_ch.set{
   input_reads_for_dedup_ch}
}

532
533
534
535
536
537
538
539
540
/*
 * ASSEMBLY.
 */

// Metaspades.
if(params.assembly=='metaspades') {
  megahit_ch = Channel.from(false)
  process metaspades {
    tag "${replicateId}"
    publishDir "${params.outdir}/02_assembly", mode: 'copy'

    input:
    set replicateId, file(preprocessed_reads_R1), file(preprocessed_reads_R2) from input_reads_for_assembly_ch
    val spades_mem from metaspades_mem_ch

    output:
    set replicateId, file("${replicateId}_metaspades/${replicateId}_scaffolds.fasta") into assembly_for_quast_ch, assembly_for_dedup_ch, assembly_for_filter_ch, assembly_no_filter_ch
    set replicateId, file("${replicateId}_metaspades/${replicateId}_spades_log.txt"), file("${replicateId}_metaspades/params.txt") into logs_metaspades_ch

    when: ('02_assembly' in step || '03_filtering' in step || '04_structural_annot' in step || '05_alignment' in step || '06_func_annot' in step || '07_taxo_affi' in step || '08_binning' in step)

    script:
    """
    metaspades.py -t ${task.cpus} -m ${spades_mem} -1 ${preprocessed_reads_R1} -2 ${preprocessed_reads_R2} -o ${replicateId}_metaspades 
    mv ${replicateId}_metaspades/scaffolds.fasta ${replicateId}_metaspades/${replicateId}_scaffolds.fasta
    mv ${replicateId}_metaspades/spades.log ${replicateId}_metaspades/${replicateId}_spades_log.txt
    """
  }
} else { // Megahit.
  if(params.assembly=='megahit') {
    metaspades_ch = Channel.from(false)
    process megahit {
      tag "${replicateId}"
      publishDir "${params.outdir}/02_assembly", mode: 'copy'

      input:
      set replicateId, file(preprocessed_reads_R1), file(preprocessed_reads_R2) from input_reads_for_assembly_ch

      output:
      set replicateId, file("${replicateId}_megahit/${replicateId}.contigs.fa") into assembly_for_quast_ch, assembly_for_dedup_ch, assembly_for_filter_ch, assembly_no_filter_ch
      set replicateId, file("${replicateId}_megahit/${replicateId}.log"), file("${replicateId}_megahit/options.json") into logs_megahit_ch

      when: ('02_assembly' in step || '03_filtering' in step || '04_structural_annot' in step || '05_alignment' in step || '06_func_annot' in step || '07_taxo_affi' in step || '08_binning' in step)

      script:
      """
      megahit -t ${task.cpus} -1 ${preprocessed_reads_R1} -2 ${preprocessed_reads_R2} -o ${replicateId}_megahit --out-prefix "${replicateId}"
      """
    }
  }
}

// Assembly metrics: run metaQUAST on the raw assembly of each sample.
process quast {
    tag "${replicateId}"
    publishDir "${params.outdir}/02_assembly", mode: 'copy'

    input:
    set replicateId, file(assembly_file) from assembly_for_quast_ch

    output:
    file("${replicateId}_all_contigs_QC/*") into quast_assembly_ch
    // report.tsv is also collected separately for MultiQC aggregation.
    file("${replicateId}_all_contigs_QC/report.tsv") into quast_assembly_for_multiqc_ch

    when: ('02_assembly' in step || '03_filtering' in step || '04_structural_annot' in step || '05_alignment' in step || '06_func_annot' in step || '07_taxo_affi' in step || '08_binning' in step)

    script:
    """
    metaquast.py --threads "${task.cpus}" --rna-finding --max-ref-number 0 --min-contig 0 "${assembly_file}" -o "${replicateId}_all_contigs_QC"
    """
}

// Reads deduplication: map each sample's reads back onto its own assembly,
// remove duplicate read pairs with samtools markdup, then export the
// deduplicated reads as gzipped fastq plus mapping statistics.
assembly_and_reads_ch = assembly_for_dedup_ch.join(input_reads_for_dedup_ch, remainder: true)

process reads_deduplication {
    tag "${replicateId}"
    publishDir "${params.outdir}/02_assembly", mode: 'copy', pattern: '*.fastq.gz'
    publishDir "${params.outdir}/02_assembly/logs", mode: 'copy', pattern: '*.idxstats'
    publishDir "${params.outdir}/02_assembly/logs", mode: 'copy', pattern: '*.flagstat'
    publishDir "${params.outdir}/02_assembly/logs", mode: 'copy', pattern: '*.nb_bases'

    input:
    set replicateId, file(assembly_file), file(preprocessed_reads_R1), file(preprocessed_reads_R2) from assembly_and_reads_ch

    output:
    set replicateId, file("${replicateId}_R1_dedup.fastq.gz"), file("${replicateId}_R2_dedup.fastq.gz") into deduplicated_reads_ch, deduplicated_reads_copy_ch
    // Per-contig read counts are reused by the "03_filtering" step and MultiQC.
    set replicateId, file("${replicateId}.count_reads_on_contigs.idxstats") into (idxstats_filter_logs_ch, idxstats_filter_logs_for_multiqc_ch)
    set replicateId, file("${replicateId}.count_reads_on_contigs.flagstat") into flagstat_filter_logs_ch
    file("${replicateId}.count_reads_on_contigs.flagstat") into flagstat_after_dedup_reads_for_multiqc_ch
    file("${replicateId}_dedup_R1.nb_bases")
    file("${replicateId}_dedup_R2.nb_bases")

    when: ('02_assembly' in step || '03_filtering' in step || '04_structural_annot' in step || '05_alignment' in step || '06_func_annot' in step || '07_taxo_affi' in step || '08_binning' in step)

    script:
    """
    mkdir logs
    bwa index ${assembly_file} -p ${assembly_file}
    bwa mem ${assembly_file} ${preprocessed_reads_R1} ${preprocessed_reads_R2} | samtools view -bS - | samtools sort -n -o ${replicateId}.sort.bam -
    samtools fixmate -m ${replicateId}.sort.bam ${replicateId}.fixmate.bam
    samtools sort -o ${replicateId}.fixmate.positionsort.bam ${replicateId}.fixmate.bam
    samtools markdup -r -S -s -f ${replicateId}.stats ${replicateId}.fixmate.positionsort.bam ${replicateId}.filtered.bam
    samtools index ${replicateId}.filtered.bam
    samtools idxstats ${replicateId}.filtered.bam > ${replicateId}.count_reads_on_contigs.idxstats
    samtools flagstat ${replicateId}.filtered.bam > ${replicateId}.count_reads_on_contigs.flagstat
    samtools sort -n -o ${replicateId}.filtered.sort.bam ${replicateId}.filtered.bam
    bedtools bamtofastq -i ${replicateId}.filtered.sort.bam -fq ${replicateId}_R1_dedup.fastq -fq2 ${replicateId}_R2_dedup.fastq
    gzip ${replicateId}_R1_dedup.fastq ; gzip ${replicateId}_R2_dedup.fastq
    zcat "${replicateId}_R1_dedup.fastq.gz" | paste - - - - | cut -f 2 | tr -d '\n' | wc -c > ${replicateId}_dedup_R1.nb_bases
    zcat "${replicateId}_R2_dedup.fastq.gz" | paste - - - - | cut -f 2 | tr -d '\n' | wc -c > ${replicateId}_dedup_R2.nb_bases
    """
}

// Assembly filter: discard contigs whose coverage (counts per million mapped
// reads) is below --min_contigs_cpm, then QC the retained contigs.
assembly_and_logs_ch = assembly_for_filter_ch.join(idxstats_filter_logs_ch, remainder: true)

process assembly_filter {
    tag "${replicateId}"
    publishDir "${params.outdir}/03_filtering", mode: 'copy'

    input:
    set replicateId, file(assembly_file), file(idxstats) from assembly_and_logs_ch
    val min_cpm from min_contigs_cpm_ch

    output:
    set replicateId, file("${replicateId}_select_contigs_cpm${min_cpm}.fasta") into select_assembly_ch
    set replicateId, file("${replicateId}_discard_contigs_cpm${min_cpm}.fasta") into discard_assembly_ch
    file("${replicateId}_select_contigs_QC/*") into quast_select_contigs_ch
    file("${replicateId}_select_contigs_QC/report.tsv") into quast_select_contigs_for_multiqc_ch

    // Only runs when the filtering step is explicitly requested.
    when: ('03_filtering' in step)

    script:
    """
    Filter_contig_per_cpm.py -i ${idxstats} -f ${assembly_file} -c ${min_cpm} -s ${replicateId}_select_contigs_cpm${min_cpm}.fasta -d ${replicateId}_discard_contigs_cpm${min_cpm}.fasta
    metaquast.py --threads "${task.cpus}" --rna-finding --max-ref-number 0 --min-contig 0 "${replicateId}_select_contigs_cpm${min_cpm}.fasta" -o "${replicateId}_select_contigs_QC"
    """
}

// When "03_filtering" is not requested, feed the unfiltered assembly straight
// to downstream steps under the channel name the filter would have produced.
if(!('03_filtering' in step)) {
  assembly_no_filter_ch.into{
   select_assembly_ch }
}

/*
 * ANNOTATION with Prokka: structural annotation of the (possibly filtered)
 * assembly of each sample.
 */

process prokka {
  tag "${replicateId}"

  input:
  set replicateId, file(assembly_file) from select_assembly_ch

  output:
  // The whole Prokka output directory; renamed/cleaned by the next process.
  set replicateId, file("*") into prokka_ch
  // The .txt summary is collected for MultiQC.
  set replicateId, file("PROKKA_${replicateId}/${replicateId}.txt") into prokka_for_multiqc_ch

  when: ('04_structural_annot' in step || '05_alignment' in step || '06_func_annot' in step || '07_taxo_affi' in step || '08_binning' in step)

  script:
  """
  prokka --metagenome --noanno --rawproduct --outdir PROKKA_${replicateId} --prefix ${replicateId} ${assembly_file} --centre X --compliant --cpus ${task.cpus}
  """
}

/*
 * RENAME contigs and genes: rewrite Prokka identifiers into
 * sample-prefixed names, consistently across gff/faa/ffn/fna files.
 */

process rename_contigs_genes {
  tag "${replicateId}"
  publishDir "${params.outdir}/04_structural_annot", mode: 'copy'
  label 'python'

  input:
  set replicateId, file(assembly_file) from prokka_ch

  output:
  set replicateId, file("${replicateId}.annotated.fna") into prokka_renamed_fna_ch, prokka_renamed_fna_ch2, prokka_renamed_fna_for_metabat2_ch
  set replicateId, file("${replicateId}.annotated.ffn") into prokka_renamed_ffn_ch
  set replicateId, file("${replicateId}.annotated.faa") into prokka_renamed_faa_ch
  set replicateId, file("${replicateId}.annotated.gff") into prokka_renamed_gff_ch, prokka_renamed_gff_ch2

  when: ('04_structural_annot' in step || '05_alignment' in step || '06_func_annot' in step || '07_taxo_affi' in step || '08_binning' in step)

  script:
  """
  grep "^gnl" ${assembly_file}/${replicateId}.gff > ${replicateId}_only_gnl.gff
  Rename_contigs_and_genes.py -f ${replicateId}_only_gnl.gff -faa ${assembly_file}/${replicateId}.faa -ffn ${assembly_file}/${replicateId}.ffn -fna ${assembly_file}/${replicateId}.fna -p ${replicateId} -oGFF ${replicateId}.annotated.gff -oFAA ${replicateId}.annotated.faa -oFFN ${replicateId}.annotated.ffn -oFNA ${replicateId}.annotated.fna
  """
}

// ALIGNMENT OF READS AGAINST CONTIGS: map each sample's deduplicated reads
// onto its renamed annotated contigs with bwa mem.
prokka_reads_ch = prokka_renamed_gff_ch.join(prokka_renamed_fna_ch, remainder: true).join(deduplicated_reads_ch, remainder: true)

process reads_alignment_on_contigs {
   tag "${replicateId}"
   publishDir "${params.outdir}/05_alignment/05_1_reads_alignment_on_contigs/$replicateId", mode: 'copy'

   input:
   set replicateId, file(gff_prokka), file(fna_prokka), file(deduplicated_reads_R1), file(deduplicated_reads_R2) from prokka_reads_ch

   output:
   // Sorted, indexed BAM used by quantification and binning.
   set val(replicateId), file("${replicateId}.sort.bam"), file("${replicateId}.sort.bam.bai") into reads_assembly_ch, reads_assembly_ch_for_metabat2
   set val(replicateId), file("${replicateId}.sort.bam.idxstats") into idxstats_ch

   when: ('05_alignment' in step || '06_func_annot' in step || '07_taxo_affi' in step || '08_binning' in step)

   script:
   """
   bwa index ${fna_prokka} -p ${fna_prokka}
   bwa mem ${fna_prokka} ${deduplicated_reads_R1} ${deduplicated_reads_R2} | samtools view -bS - | samtools sort - -o ${replicateId}.sort.bam
   samtools index ${replicateId}.sort.bam
   samtools idxstats ${replicateId}.sort.bam > ${replicateId}.sort.bam.idxstats
   """
}

// ALIGNMENT AGAINST PROTEIN DATABASE: DIAMOND blastp of predicted proteins.
prokka_renamed_faa_ch.into{prokka_renamed_faa_ch2; prokka_renamed_faa_ch3; prokka_renamed_faa_ch4}

process diamond {
    tag "${replicateId}"
    publishDir "${params.outdir}/05_alignment/05_2_database_alignment/$replicateId", mode: 'copy'

    input:
    set replicateId, file(renamed_prokka_faa) from prokka_renamed_faa_ch2
    val diamond_bank from diamond_bank_ch

    output:
    set replicateId, file("${replicateId}_aln_diamond.m8") into diamond_result_ch, diamond_result_for_annot_ch

    when: ('05_alignment' in step || '06_func_annot' in step || '07_taxo_affi' in step || '08_binning' in step)

    script:
    // -f 6 with an explicit column list; diamond_header below writes the
    // matching header line for these columns.
    """
    diamond blastp -p ${task.cpus} -d ${diamond_bank} -q ${renamed_prokka_faa} -o ${replicateId}_aln_diamond.m8 -f 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen stitle
    """
}

// Write the header line matching the DIAMOND tabular (-f 6) column list above.
process diamond_header {
    publishDir "${params.outdir}/05_alignment/05_2_database_alignment/", mode: 'copy'

    output:
    file("head.m8") into diamond_header_ch

    when: ('05_alignment' in step || '06_func_annot' in step || '07_taxo_affi' in step || '08_binning' in step)

    script:
    """
    echo "qseqid	sseqid	pident	length	mismatch	gapopen	qstart	qend	sstart	send	evalue	bitscore	qlen	slen	stitle" > head.m8
    """
}

/*
 * CLUSTERING.
 */

// Per-sample clustering of predicted genes with CD-HIT-EST.
process individual_cd_hit {
  tag "${replicateId}"
  publishDir "${params.outdir}/06_func_annot/06_1_clustering", mode: 'copy'
  label 'cd_hit'

  input:
  set replicateId, file(assembly_ffn_file) from prokka_renamed_ffn_ch
  val percentage_identity_cdhit from percentage_identity_ch

  output:
  file("${replicateId}.cd-hit-est.${percentage_identity_cdhit}.fasta") into individual_cd_hit_ch
  file("${replicateId}.cd-hit-est.${percentage_identity_cdhit}.fasta.clstr") into individual_cd_hit_cluster_ch
  file("${replicateId}.cd-hit-est.${percentage_identity_cdhit}.table_cluster_contigs.txt") into table_clstr_contigs_ch

  when: ('06_func_annot' in step)

  script:
  // NOTE(review): `task.mem` is not a standard Nextflow task property
  // (the standard one is `task.memory`) — confirm it resolves under the
  // pipeline's configuration.
  """
  cd-hit-est -c ${percentage_identity_cdhit} -i ${assembly_ffn_file} -o ${replicateId}.cd-hit-est.${percentage_identity_cdhit}.fasta -T ${task.cpus} -M ${task.mem} -d 150
  cat ${replicateId}.cd-hit-est.${percentage_identity_cdhit}.fasta.clstr | cd_hit_produce_table_clstr.py > ${replicateId}.cd-hit-est.${percentage_identity_cdhit}.table_cluster_contigs.txt
  """
}

// Global clustering with CD-HIT-EST over the concatenation of all
// per-sample cluster representatives.
process global_cd_hit {
  publishDir "${params.outdir}/06_func_annot/06_1_clustering", mode: 'copy'
  label 'cd_hit'

  input:
  file "*" from individual_cd_hit_ch.collect()
  val percentage_identity_cdhit from percentage_identity_ch

  output:
  file("All-cd-hit-est.${percentage_identity_cdhit}.fasta") into concatenation_individual_cd_hit_ch
  file("All-cd-hit-est.${percentage_identity_cdhit}.fasta.clstr") into global_cd_hit_ch
  file("table_clstr.txt") into table_global_clstr_clstr_ch

  when: ('06_func_annot' in step)

  script:
  // BUG FIX: "-M {task.mem}" was missing the '$', passing the literal text
  // "{task.mem}" to cd-hit-est instead of the interpolated value
  // (individual_cd_hit uses ${task.mem}).
  // NOTE(review): `task.mem` is not a standard Nextflow task property
  // (the standard one is `task.memory`) — confirm it resolves under the
  // pipeline's configuration.
  """
  cat * > All-cd-hit-est.${percentage_identity_cdhit}
  cd-hit-est -c ${percentage_identity_cdhit} -i All-cd-hit-est.${percentage_identity_cdhit} -o All-cd-hit-est.${percentage_identity_cdhit}.fasta -T ${task.cpus} -M ${task.mem} -d 150
  cat All-cd-hit-est.${percentage_identity_cdhit}.fasta.clstr | cd_hit_produce_table_clstr.py > table_clstr.txt
  """
}

/*
 * Create channel with sample id, renamed annotation files and bam and index
 * bam files (gff + fna + sorted/indexed BAM, joined per sample).
 */
prokka_bam_ch = prokka_renamed_gff_ch2.join(prokka_renamed_fna_ch2, remainder: true).join(reads_assembly_ch, remainder: true)

/*
 * QUANTIFICATION
 */

// Count reads on each gene of each sample with featureCounts.
process quantification {
   tag "${replicateId}"
   publishDir "${params.outdir}/06_func_annot/06_2_quantification", mode: 'copy'

   input:
   set replicateId, file(gff_prokka), file(fna_prokka), file(bam), file(bam_index) from prokka_bam_ch

   output:
   file("${replicateId}.featureCounts.tsv") into counts_ch
   file("${replicateId}.featureCounts.tsv.summary") into featureCounts_out_ch_for_multiqc
   file("${replicateId}.featureCounts.stdout") into featureCounts_error_ch

   when: ('06_func_annot' in step)

   script:
   // -p: paired-end; -O: count reads overlapping multiple features;
   // -t gene -g ID: count per gene using the GFF "ID" attribute.
   """
   featureCounts -T ${task.cpus} -p -O -t gene -g ID -a ${gff_prokka} -o ${replicateId}.featureCounts.tsv ${bam} &> ${replicateId}.featureCounts.stdout
   """
}
// Create table with sum of reads for each global cluster of genes in each sample.
process quantification_table {
   publishDir "${params.outdir}/06_func_annot/06_2_quantification", mode: 'copy'
   label 'python'

   input:
   file(clusters_contigs) from table_clstr_contigs_ch.collect()
   file(global_clusters_clusters) from table_global_clstr_clstr_ch
   file(counts_files) from counts_ch.collect()

   output:
   file("Correspondence_global_clstr_genes.txt") into table_global_clstr_genes_ch
   file("Clusters_Count_table_all_samples.txt") into quantification_table_ch

   when: ('06_func_annot' in step)

   script:
   // The helper script takes list files rather than the files themselves,
   // so first write the staged file names into two list files.
   """
   ls ${clusters_contigs} | cat > List_of_contigs_files.txt
   ls ${counts_files} | cat > List_of_count_files.txt
   Quantification_clusters.py -t ${global_clusters_clusters} -l List_of_contigs_files.txt -c List_of_count_files.txt -oc Clusters_Count_table_all_samples.txt -oid Correspondence_global_clstr_genes.txt
   """
}

891
892
893
894
/*
 * FUNCTIONAL ANNOTATION OF GENES.
 */

895
if(params.eggnogmapper_db) {
896
897
898
    // Built eggNOG-mapper database.
    process eggnog_mapper_db {

Joanna Fourquet's avatar
Joanna Fourquet committed