Assembly filter chunks
Change the assembly filter step to split each FASTA into chunks and process them in parallel, making this step significantly faster.
CODE
// Split each assembly into chunks of 100k sequences so contig filtering can
// run in parallel, then pair every chunk with its sample's idxstats log.
assembly_for_filter_ch
    .splitFasta(by: 100000, file: true)
    .set { chunk_assembly_for_filter_ch }

// combine(by: 0) joins on the first tuple element (sampleId), yielding
// [sampleId, assembly_chunk, idxstats] tuples for assembly_filter.
chunk_assembly_for_filter_ch
    .combine(idxstats_filter_logs_ch, by: 0)
    .set { assembly_and_logs_ch }
// Filter contigs of one assembly chunk by CPM (counts per million), using the
// per-sample idxstats restricted to the contigs present in this chunk.
// NOTE(review): reconstructed from a garbled paste — the original text lost
// '$', '_' and '*' characters to markdown rendering; verify filenames against
// Filter_contig_per_cpm.py's expected -s/-d arguments.
process assembly_filter {
    publishDir "${params.outdir}/03_filtering/", mode: 'copy'

    input:
    set sampleId, file(assembly_file), file(idxstats) from assembly_and_logs_ch
    val min_cpm from min_contigs_cpm_ch

    output:
    set sampleId, file("${sampleId}_select_contigs_cpm_${min_cpm}.fasta") into select_assembly_ch, select_assembly_for_quast_ch
    set sampleId, file("${sampleId}_discard_contigs_cpm_${min_cpm}.fasta") into discard_assembly_ch

    when: ('03_filtering' in step)

    shell:
    '''
    chunk_name=`basename !{assembly_file} .fa`
    # Collect the contig names present in this chunk (header up to first space).
    grep "^>" !{assembly_file} | cut -f 1 -d " " | sed 's/^>//g' | awk '{print($1"\t")}' > $chunk_name'.names'
    # Restrict the sample-wide idxstats to this chunk's contigs.
    grep -f $chunk_name'.names' !{idxstats} > $chunk_name'_chunk.idxstats'
    # Split the chunk into selected/discarded contigs by the CPM threshold.
    Filter_contig_per_cpm.py -i $chunk_name'_chunk.idxstats' -f !{assembly_file} -c !{min_cpm} -s "!{sampleId}_"$chunk_name"_select_contigs_cpm_!{min_cpm}.fasta" -d "!{sampleId}_"$chunk_name"_discard_contigs_cpm_!{min_cpm}.fasta"
    # Rename per-chunk results to the filenames declared in output:.
    cat !{sampleId}_*_select_contigs_cpm_!{min_cpm}.fasta > !{sampleId}_select_contigs_cpm_!{min_cpm}.fasta
    cat !{sampleId}_*_discard_contigs_cpm_!{min_cpm}.fasta > !{sampleId}_discard_contigs_cpm_!{min_cpm}.fasta
    '''
}
// Run metaQUAST QC on the CPM-selected contigs of each sample; report.tsv
// feeds MultiQC, the full QC directory is published for inspection.
process quast_filtered {
    publishDir "${params.outdir}/03_filtering/", mode: 'copy'

    input:
    set sampleId, file(fasta) from select_assembly_for_quast_ch

    output:
    set sampleId, file("${sampleId}_select_contigs_QC/report.tsv") into quast_select_contigs_for_multiqc_ch
    file("${sampleId}_select_contigs_QC/*") into quast_select_contigs_ch

    when: ('03_filtering' in step)

    script:
    """
    metaquast.py --threads ${task.cpus} --rna-finding --max-ref-number 0 --min-contig 0 ${fasta} -o "${sampleId}_select_contigs_QC"
    """
}
- idxstats is read only once per sample to calculate CPM