Commit 2295377c authored by MARTIN Pierre's avatar MARTIN Pierre
Browse files

json file listing; tests on all 7 steps of pipeline; added testing methods;...

json file listing; tests on all 7 steps of pipeline; added testing methods; refactoring main.nf from beggining to process kaiju_index; eggnog fix; diamond fix
parent fe79fddd
......@@ -13,8 +13,8 @@ try:
import sys
import re
import os
import os.path
from collections import defaultdict
import os.path as path
import json
import subprocess
except ImportError as error:
......@@ -39,8 +39,10 @@ def parse_arguments():
help = '(required) expected logs dir containing logs from a healthy metagwgs workflow')
parser.add_argument('-obs_dir', type = str, \
help = '(required) observed logs dir containing logs from the metagwgs workflow you wish to test')
parser.add_argument('-sampleIds', type = str, \
help = '(required) list of sample ids used as input for metagWGS (format: a,b,c,...')
parser.add_argument('--script', type = str, \
help = '(optional) script file containing metagwgs Nextflow launch command ')
help = '(optional) script file containing metagwgs Nextflow launch command')
if len(sys.argv) == 1:
parser.print_usage(sys.stderr)
......@@ -52,8 +54,8 @@ def parse_arguments():
### Doesn't work right now, must launch .sh separately and then run functional tests on produced results
def launch_nextflow(script):
script_file = os.path.abspath(script)
script_dir = os.path.dirname(script_file)
script_file = path.abspath(script)
script_dir = path.dirname(script_file)
print('Launching test run with the provided file:\n\n{}\n'.format(script_file))
......@@ -61,156 +63,49 @@ def launch_nextflow(script):
print('Test run completed')
# Lists of output files to compare
## 01_clean_qc:
global _01_clean_qc = [
('01_1_cleaned_reads/logs/a.no_filter.flagstat', 'diff'),
('01_1_cleaned_reads/logs/c.no_filter.flagstat', 'diff'),
('01_1_cleaned_reads/logs/a_cutadapt.log', 'cut_diff'),
('01_1_cleaned_reads/logs/c_cutadapt.log', 'cut_diff'),
('01_1_cleaned_reads/logs/a_sickle.log', 'diff'),
('01_1_cleaned_reads/logs/c_sickle.log', 'diff'),
('01_1_cleaned_reads/logs/host_filter_flagstat/a.host_filter.flagstat', 'diff'),
('01_1_cleaned_reads/logs/host_filter_flagstat/c.host_filter.flagstat', 'diff'),
('01_2_qc/fastqc_cleaned/cleaned_a/cleaned_a_R1_fastqc.zip', 'not_empty'),
('01_2_qc/fastqc_cleaned/cleaned_c/cleaned_c_R1_fastqc.zip', 'not_empty'),
('01_2_qc/fastqc_cleaned/cleaned_a/cleaned_a_R2_fastqc.zip', 'not_empty'),
('01_2_qc/fastqc_cleaned/cleaned_c/cleaned_c_R2_fastqc.zip', 'not_empty'),
('01_2_qc/fastqc_cleaned/cleaned_a/cleaned_a_R1_fastqc.html', 'not_empty'),
('01_2_qc/fastqc_cleaned/cleaned_c/cleaned_c_R1_fastqc.html', 'not_empty'),
('01_2_qc/fastqc_cleaned/cleaned_a/cleaned_a_R2_fastqc.html', 'not_empty'),
('01_2_qc/fastqc_cleaned/cleaned_c/cleaned_c_R2_fastqc.html', 'not_empty'),
('01_2_qc/fastqc_raw/a/a_R1_fastqc.zip', 'not_empty'),
('01_2_qc/fastqc_raw/c/c_R1_fastqc.zip', 'not_empty'),
('01_2_qc/fastqc_raw/a/a_R2_fastqc.zip', 'not_empty'),
('01_2_qc/fastqc_raw/c/c_R2_fastqc.zip', 'not_empty'),
('01_2_qc/fastqc_raw/a/a_R1_fastqc.html', 'not_empty'),
('01_2_qc/fastqc_raw/c/c_R1_fastqc.html', 'not_empty'),
('01_2_qc/fastqc_raw/a/a_R2_fastqc.html', 'not_empty'),
('01_2_qc/fastqc_raw/c/c_R2_fastqc.html', 'not_empty'),
('01_3_taxonomic_affiliation_reads/a.krona.html', 'not_empty'),
('01_3_taxonomic_affiliation_reads/c.krona.html', 'not_empty'),
('01_3_taxonomic_affiliation_reads/taxo_affi_reads_class.tsv', 'diff'),
('01_3_taxonomic_affiliation_reads/taxo_affi_reads_family.tsv', 'diff'),
('01_3_taxonomic_affiliation_reads/taxo_affi_reads_genus.tsv', 'diff'),
('01_3_taxonomic_affiliation_reads/taxo_affi_reads_order.tsv', 'diff'),
('01_3_taxonomic_affiliation_reads/taxo_affi_reads_phylum.tsv', 'diff'),
('01_3_taxonomic_affiliation_reads/taxo_affi_reads_species.tsv', 'diff')
]
## 02_assembly:
global _02_assembly
_02_assembly = [
('logs/a.count_reads_on_contigs.flagstat', 'diff'),
('logs/c.count_reads_on_contigs.flagstat', 'diff'),
('logs/a.count_reads_on_contigs.idxstats', 'diff'),
('logs/c.count_reads_on_contigs.idxstats', 'diff'),
('a_metaspades/a_scaffolds.fasta', 'diff'),
('c_metaspades/c_scaffolds.fasta', 'diff'),
('a_megahit/a_scaffolds.fasta', 'diff'),
('c_megahit/c_scaffolds.fasta', 'diff'),
('a_all_contigs_QC/report.tsv', 'diff'),
('c_all_contigs_QC/report.tsv', 'diff'),
('a_all_contigs_QC/report.html', 'not_empty'),
('c_all_contigs_QC/report.html', 'not_empty')
]
## 03_filtering
global _03_filtering
_03_filtering = [
('a_select_contigs_cpm10.fasta', 'diff'),
('c_select_contigs_cpm10.fasta', 'diff'),
('a_discard_contigs_cpm10.fasta', 'diff'),
('c_discard_contigs_cpm10.fasta', 'diff'),
('a_select_contigs_QC/report.tsv', 'diff'),
('c_select_contigs_QC/report.tsv', 'diff'),
('a_select_contigs_QC/report.html', 'not_empty'),
('c_select_contigs_QC/report.html', 'not_empty')
]
## 04_structural_annot
global _04_structural_annot
_04_structural_annot = [
('a.annotated.faa', 'diff'),
('c.annotated.faa', 'diff'),
('a.annotated.ffn', 'diff'),
('c.annotated.ffn', 'diff'),
('a.annotated.fna', 'diff'),
('c.annotated.fna', 'diff'),
('a.annotated.gff', 'diff'),
('c.annotated.gff', 'diff'),
('a_prot.len', 'diff'),
('c_prot.len', 'diff')
]
## 05_alignment
global _05_alignment
_05_alignment = [
('05_1_reads_alignment_on_contigs/a/a_contig.bed', 'diff'),
('05_1_reads_alignment_on_contigs/c/c_contig.bed', 'diff'),
('05_1_reads_alignment_on_contigs/a/a.sort.bam.idxstats', 'diff'),
('05_1_reads_alignment_on_contigs/c/c.sort.bam.idxstats', 'diff'),
('05_1_reads_alignment_on_contigs/a/a.regions.bed.gz', 'zdiff'),
('05_1_reads_alignment_on_contigs/c/c.regions.bed.gz', 'zdiff'),
('05_2_database_alignment/a/a_aln_diamond.m8', 'diff'),
('05_2_database_alignment/c/c_aln_diamond.m8', 'diff')
]
# ## 06_func_annot
# global _06_func_annot = [
# ]
# ## 07_taxo_affi
# global _07_taxo_affi = [
# ]
# Load JSON file containing list of files to check
## (this .json file must be located in the same directory as 'functions.py')
def json_load(step):
# Do file comparisons for given step and write output
def check_files(exp_dir, obs_dir, step):
if not os.path.exists(exp_dir) or not os.path.exists(obs_dir):
sys.exit('{} or {} folder(s) do not exist, please check --exp_dir and --obs_dir parameters'.format(exp_dir, obs_dir))
if step == '01_clean_qc':
files_list = _01_clean_qc
elif step == '02_assembly':
files_list = _02_assembly
elif step == '03_filtering':
files_list = _03_filtering
with open(path.join(path.dirname(path.realpath(sys.argv[0])), 'files_to_test.json'), 'r') as json_file:
json_obj = json.load(json_file)
json_file.close()
elif step == '04_structural_annot':
files_list = json_obj[step]
files_list = _04_structural_annot
return files_list
elif step == '05_alignment':
files_list = _05_alignment
# elif step == '06_func_annot':
# files_list = _06_func_annot
# Do file comparisons for given step and write output
def check_files(exp_dir, obs_dir, step, sampleIds):
# elif step == '07_taxo_affi':
# Check existence of expected and observed directories
if not path.exists(exp_dir) or not path.exists(obs_dir):
sys.exit('{a} or {b} folder(s) do not exist, please check --exp_dir and --obs_dir parameters'.format(a = exp_dir, b = obs_dir))
# files_list = _07_taxo_affi
# Load files and methods list from JSON file
files_list = json_load(step)
# print files_list, len(files_list)
# sys.exit()
# elif step == '08_binning':
# Load list of sample ids given by user
sampleIds_list = sampleIds.split(',')
# files_list = _08_binning
# Initiate log for untested files (removes already existing logs)
not_tested_log = 'ft_{}.not_tested'.format(step)
os.remove(not_tested_log) if path.exists(not_tested_log) else None
log = open('ft_{}.log'.format(step), 'w+')
expected_prefix = '{}/{}/'.format(os.path.abspath(exp_dir), step)
observed_prefix = '{}/{}/'.format(os.path.abspath(obs_dir), step)
expected_prefix = path.join(path.abspath(exp_dir), step)
observed_prefix = path.join(path.abspath(obs_dir), step)
log.write('Expected directory: {}\nvs\nObserved directory: {}\n'.format(expected_prefix, observed_prefix))
log.write('Expected directory: {a}\nvs\nObserved directory: {b}\n'.format(a = expected_prefix, b = observed_prefix))
# r = re.compile('sampleId')
# samples_cnt = filter(r.match, sampleIds_list)
# sys.exit(samples_cnt)
# other_cnt
# max_cnt = len(files_list) * len(sampleIds_list)
max_cnt = len(files_list)
......@@ -221,59 +116,43 @@ def check_files(exp_dir, obs_dir, step):
print('\nLaunching {}...\n'.format(step))
for file in files_list:
for file_method in files_list:
file_path = file[0]
file_usage = file[1]
file_path = file_method['file']
file_method = file_method['method']
expected_path = '{}{}'.format(expected_prefix, file_path)
observed_path = '{}{}'.format(observed_prefix, file_path)
not_tested_log = 'ft_{}.not_tested'.format(step)
expected_path = path.join(expected_prefix, file_path)
observed_path = path.join(observed_prefix, file_path)
# Make log of non existing files
if not os.path.exists(expected_path):
nt = open(not_tested_log, 'a')
nt.write('Exp:\t' + expected_path + '\n')
nt.close()
nt_cnt += 1
max_cnt -= 1
file_out = '''
------------------------------------------------------------------------------
File: {}
Not tested (list in {})
'''.format(file_path, not_tested_log)
log.write(file_out)
print(file_out)
elif not os.path.exists(observed_path):
if not path.exists(observed_path):
nt = open(not_tested_log, 'a')
nt.write('Obs:\t' + observed_path + '\n')
nt.write(observed_path + '\n')
nt.close()
nt_cnt += 1
max_cnt -= 1
if max_cnt > 0:
max_cnt -= 1
file_out = '''
------------------------------------------------------------------------------
File: {}
Not tested (list in {})
'''.format(file_path, not_tested_log)
File: {a}
Not tested (list in {b})
'''.format(a = file_path, b = not_tested_log)
log.write(file_out)
print(file_out)
else:
if path.exists(expected_path) and path.exists(observed_path):
test, out = test_method(expected_path, observed_path, file_usage)
test, out = test_method(expected_path, observed_path, file_method)
if test == False:
file_out = '''
------------------------------------------------------------------------------
File: {}
Test method: {}
File: {a}
Test method: {b}
'''.format(file_path, file_usage)
'''.format(a = file_path, b = file_method)
log.write(file_out)
print(file_out)
log.write(out)
......@@ -283,9 +162,9 @@ Test method: {}
file_out = '''
------------------------------------------------------------------------------
File: {}
Test method: {}
'''.format(file_path, file_usage)
File: {a}
Test method: {b}
'''.format(a = file_path, b = file_method)
log.write(file_out)
print(file_out)
......@@ -294,10 +173,17 @@ Test method: {}
continue
true_perc = round((float(true_cnt) / float(max_cnt) * 100), 2)
if false_cnt != 0:
false_perc = round(100 - (float(true_cnt) / float(max_cnt) * 100), 2)
if max_cnt != 0:
if true_cnt != 0:
true_perc = round((float(true_cnt) / float(max_cnt) * 100), 2)
else:
true_perc = float(0)
if false_cnt != 0:
false_perc = round(100 - (float(true_cnt) / float(max_cnt) * 100), 2)
else:
false_perc = float(0)
else:
true_perc = float(0)
false_perc = float(0)
out = '''
......@@ -334,10 +220,18 @@ def test_method(exp_path, obs_path, method):
YES = 1
command = 'diff <(tail -n+6 {}) <(tail -n+6 {})'.format(exp_path, obs_path)
elif method == 'zdiff':
elif method == 'no_header_diff':
YES = 2
command = 'diff <(grep -w "^#" {}) <(grep -w "^#" {})'.format(exp_path, obs_path)
elif method == 'zdiff':
YES = 3
command = 'zdiff {} {}'.format(exp_path, obs_path)
elif method == 'taxo_diff':
YES = 4
command = 'diff <(cut -f1 {} | sort) <(cut -f1 {} | sort)'.format(exp_path, obs_path)
process = subprocess.Popen(command, stdout = subprocess.PIPE, shell = True, executable = '/bin/bash')
diff_out, error = process.communicate()
......@@ -359,8 +253,8 @@ def test_method(exp_path, obs_path, method):
return test, out
elif method == 'not_empty':
YES = 3
test = os.path.getsize(obs_path) > 0
YES = 5
test = path.getsize(obs_path) > 0
if test:
test = True
......
......@@ -63,7 +63,7 @@ def main():
if steps_list[step] <= steps_list[args.step]:
# Launch test on expected and observed files from this step
out = check_files(args.exp_dir, args.obs_dir, step)
out = check_files(args.exp_dir, args.obs_dir, step, args.sampleIds)
outs.append(out)
print(''.join(outs))
......
......@@ -75,7 +75,8 @@
07_taxo_affi options:
--accession2taxid FTP adress of file prot.accession2taxid.gz. Default: "ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz".
--taxdump FTP adress of file taxdump.tar.gz. Default "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz".
--taxdump FTP adress of file taxdump.tar.gz. Default: "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz".
--taxonomy_dir Directory if taxdump and accession2taxid already downloaded ("PATH/directory").
08_binning options:
--min_contig_size [cutoff] Contig length cutoff to filter contigs before binning. Must be greater than 1500. Default: 1500.
......@@ -408,13 +409,9 @@ if (!params.skip_kaiju && ('01_clean_qc' in step || '02_assembly' in step || '03
output:
file("nodes.dmp") into index_kaiju_nodes_ch
file("kaiju_db*.fmi") into index_kaiju_db_ch, index_kaiju_db_size_ch
file("*.fmi") into index_kaiju_db_ch
file("names.dmp") into index_kaiju_names_ch
// file("kaiju_db(?:_refseq|).fmi") into index_kaiju_db_ch
// .value(kaiju_Path + "/" + kaiju_Path_db + "/kaiju_db(?:_refseq|).fmi")
script:
"""
wget ${database}
......@@ -425,12 +422,17 @@ if (!params.skip_kaiju && ('01_clean_qc' in step || '02_assembly' in step || '03
"""
} } else {
if(params.kaiju_db_dir) {
index_kaiju_nodes_ch = Channel
.value(params.kaiju_db_dir + "/nodes.dmp")
index_kaiju_db_ch = Channel
.value(params.kaiju_db_dir + "/kaiju_db*.fmi")
index_kaiju_names_ch = Channel
.value(params.kaiju_db_dir + "/names.dmp")
if(file(params.kaiju_db_dir + "/kaiju_db*.fmi").size == 1) {
index_kaiju_nodes_ch = Channel
.value(params.kaiju_db_dir + "/nodes.dmp")
index_kaiju_db_ch = Channel
.value(params.kaiju_db_dir + "/kaiju_db*.fmi")
index_kaiju_names_ch = Channel
.value(params.kaiju_db_dir + "/names.dmp")
}
else {
exit 1, "There are more than one file ending with .fmi in ${params.kaiju_db_dir}"
}
}
else {
if (!(params.kaiju_db) & !(params.kaiju_db_dir) & (!params.skip_kaiju)){
......@@ -439,13 +441,13 @@ if (!params.skip_kaiju && ('01_clean_qc' in step || '02_assembly' in step || '03
}
}
// if {
// if {
// items = Channel.from(analysis_config.items.keySet())
// for (String item : items) {
// list_of_items_to_process << item
// }
// }
// }
// Kaiju.
process kaiju {
......@@ -482,7 +484,7 @@ if (!params.skip_kaiju && ('01_clean_qc' in step || '02_assembly' in step || '03
Generate_barplot_kaiju.py -i ${sampleId}_kaiju_MEM_verbose.out
"""
}
// Merge kaiju results by species
// Merge kaiju results by taxonomic ranks
process kaiju_merge{
tag "${sampleId}"
publishDir "${params.outdir}/01_clean_qc/01_3_taxonomic_affiliation_reads", mode: 'copy'
......@@ -536,7 +538,7 @@ else {
// Metaspades.
if(params.assembly=='metaspades') {
megahit_ch = Channel.from(false)
// megahit_ch = Channel.from(false)
process metaspades {
tag "${sampleId}"
publishDir "${params.outdir}/02_assembly", mode: 'copy'
......@@ -560,7 +562,7 @@ if(params.assembly=='metaspades') {
}
} else { // Megahit.
if(params.assembly=='megahit') {
metaspades_ch = Channel.from(false)
// metaspades_ch = Channel.from(false)
process megahit {
tag "${sampleId}"
publishDir "${params.outdir}/02_assembly", mode: 'copy'
......@@ -925,11 +927,11 @@ if(params.eggnogmapper_db) {
script:
"""
mkdir db_eggnog_mapper
/eggnog-mapper-2.0.4-rf1/download_eggnog_data.py -P -M -f -y --data_dir db_eggnog_mapper
/eggnog-mapper-2.0.4-rf1/download_eggnog_data.py -P -f -y --data_dir db_eggnog_mapper
"""
} } else {
if(params.eggnog_mapper_db_dir) {
functional_annot_db_ch = Channel.value(params.eggnog_mapper_db_dir)
functional_annot_db_ch = Channel.fromPath(params.eggnog_mapper_db_dir).first()
}
else {
if (!(params.eggnogmapper_db) & !(params.eggnog_mapper_db_dir) & "06_func_annot" in step){
......@@ -974,7 +976,7 @@ process best_hits_diamond {
script:
"""
best_bitscore_diamond.py -o ${sampleId}.best_hit ${diamond_file}
filter_diamond_hits.py -o ${sampleId}.best_hit ${diamond_file}
"""
}
// Merge eggNOG-mapper output files and quantification table.
......@@ -1035,35 +1037,44 @@ process make_functional_annotation_tables {
// TAXONOMIC AFFILIATION OF CONTIGS.
// Load taxonomy.
// Download taxonomy if taxonomy_dir is not given.
if(!params.taxonomy_dir) {
process download_taxonomy_db {
when:
('07_taxo_affi' in step)
input:
val accession2taxid_ch
val taxdump_ch
output:
set file("*taxid*"), file ("*taxdump*") into taxonomy_ch
when:
('07_taxo_affi' in step)
script:
"""
wget ${accession2taxid_ch}
file='${accession2taxid_ch}'
fileName=\${file##*/}
echo \$fileName
gunzip \$fileName
wget ${taxdump_ch}
file_taxdump='${taxdump_ch}'
fileName_taxdump=\${file_taxdump##*/}
echo \$fileName_taxdump
mkdir taxdump; mv \$fileName_taxdump taxdump; cd taxdump ; tar xzvf \$fileName_taxdump
"""
}
input:
val accession2taxid_ch
val taxdump_ch
output:
set file("*taxid*"), file ("*taxdump*") into taxonomy_ch
script:
"""
wget ${accession2taxid_ch}
file='${accession2taxid_ch}'
fileName=\${file##*/}
echo \$fileName
gunzip \$fileName
wget ${taxdump_ch}
file_taxdump='${taxdump_ch}'
fileName_taxdump=\${file_taxdump##*/}
echo \$fileName_taxdump
mkdir taxdump; mv \$fileName_taxdump taxdump; cd taxdump ; tar xzvf \$fileName_taxdump
"""
}
}
else if(params.taxonomy_dir) {
assert file(params.taxonomy_dir + '/prot.accession2taxid').exists()
assert file(params.taxonomy_dir + '/taxdump').exists()
taxonomy_ch = Channel
.tuple(file(params.taxonomy_dir + '/prot.accession2taxid'), file(params.taxonomy_dir + '/taxdump'))
}
else {
exit 1, "You must specify --accession2taxid and --taxdump or --taxonomy_dir"
}
/*
* Create channel with sample id, diamond files and desman length files, idxstats and depth files.
......@@ -1080,7 +1091,7 @@ process diamond_parser {
when: ('07_taxo_affi' in step)
input:
set file(prot_accession2taxid), file(taxonomy) from taxonomy_ch
set file(accession2taxid), file(taxdump) from taxonomy_ch
set sampleId, file(diamond_file), file(prot_len), file(idxstats), file(depth) from diamond_parser_input_ch
output:
......@@ -1099,7 +1110,7 @@ process diamond_parser {
script:
"""
aln2taxaffi.py -a ${prot_accession2taxid} --taxonomy ${taxonomy} -o ${sampleId} ${diamond_file} ${prot_len}
aln2taxaffi.py -a ${accession2taxid} --taxonomy ${taxdump} -o ${sampleId} ${diamond_file} ${prot_len}
merge_contig_quantif_perlineage.py -i ${idxstats} -c ${sampleId}.percontig.tsv -m ${depth} -o ${sampleId}_quantif_percontig
"""
}
......
......@@ -36,6 +36,7 @@ params {
diamond_bank = ""
accession2taxid = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz"
taxdump = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"
taxonomy_dir = false
eggnogmapper_db = false
eggnog_mapper_db_dir = false
multiqc_config = "$baseDir/assets/multiqc_config.yaml"
......
......@@ -4,4 +4,4 @@ TTTCCGCATCTAAAANATATCAGTATTGCTTGTGTCNGACTGATATAATAAAAATAGCTGACAATCAAAAGATTATCAGC
CATTCTGTGTACCCGAGATCGTGCTCGAACCAACTACGGTGTTACACCACTAGAACCTGATACTAGCGCT
GCACACACGGTGGAATGAATGTGCCGGCAGACAAATGTGTTGAGTATAACACGATGGATGCAATCGCAAA
TGGAATGAGTA
AGACAAAAATAGACAATCAAGATAATTCTGTAGATCCTAAAACTAAAAACGGACACGGACAGAGGTA
AGACAAAAATAGACAATCAAGATAATTCTGTAGATCCTAAAACTAAAAACGGACACGGACAGAGGTA
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment