Commit 93c467b2 authored by MARTIN Pierre

Functional tests v1 finished; automatically fetch files in exp_dir and test using methods in main.nf
parent 6924e6be
#!/usr/bin/env python3
"""----------------------------------------------------------------------------
Script Name: filter_diamond_hits.py
Description: Keep the best diamond hits for each query gene/protein
based on bitscore, and filter out queries with low identity and low coverage
Adapted from the best_bitscore_diamond.py script by Joanna Fourquet
Input files: Diamond output file (.m8)
Created By: Jean Mainguy
Date: 2021-08-02
-------------------------------------------------------------------------------
"""
# Metadata
__author__ = 'Mainguy Jean - Plateforme bioinformatique Toulouse'
__copyright__ = 'Copyright (C) 2021 INRAE'
__license__ = 'GNU General Public License'
__version__ = '0.1'
__email__ = 'support.bioinfo.genotoul@inra.fr'
__status__ = 'dev'
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter, FileType
import logging
import sys
import csv
def get_hits_with_highest_bitscore(hits):
    """Return the hits of a query that share its highest bitscore."""
    highest_bitscore = max(float(hit['bitScore']) for hit in hits)
    return [hit for hit in hits if float(hit['bitScore']) == highest_bitscore]
def get_all_hits_per_query(blast_result_file, header_list):
    """Yield the list of hits of each query, assuming hits are grouped by query."""
    # Assumption: hits are already grouped by query in the diamond output.
    # Both commands should output the same number of lines:
    # cut -f1 blast_result_file | uniq | wc -l
    # cut -f1 blast_result_file | sort | uniq | wc -l
    with open(blast_result_file) as in_fl:
        result_reader = csv.DictReader(in_fl, delimiter='\t', fieldnames=header_list)
        query_ids_processed = set()
        current_query_id = None
        hits = []
        for hit in result_reader:
            if current_query_id is None:
                current_query_id = hit['queryId']
                query_ids_processed.add(current_query_id)
            if current_query_id != hit['queryId']:
                yield hits
                hits = []
                current_query_id = hit['queryId']
                assert current_query_id not in query_ids_processed, (
                    f"Queries are not sorted in the blast result: query {current_query_id} "
                    "appears in different parts of the file.")
                query_ids_processed.add(current_query_id)
            hits.append(hit)
        if current_query_id:
            yield hits
def is_identity_and_coverage_ok(hit, min_identity, min_coverage):
    """Return True when the hit passes the identity or the coverage threshold."""
    # Query coverage of the HSP, expressed as a percentage so it is comparable
    # with the min_coverage threshold (e.g. 70 for 70%)
    qcovhsp = 100 * (int(hit["queryEnd"]) - int(hit["queryStart"]) + 1) / int(hit['queryLength'])
    return float(hit['percIdentity']) >= min_identity or qcovhsp >= min_coverage
def parse_arguments():
"""Parse script arguments."""
    parser = ArgumentParser(description="Keep the best diamond hits for each query "
                            "based on bitscore and filter out queries with low "
                            "identity or low coverage.",
                            formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('aln_input_file',
                        help="blast/diamond matches in m8 format: queryId, subjectId, "
                             "percIdentity, alnLength, mismatchCount, gapOpenCount, "
                             "queryStart, queryEnd, subjectStart, subjectEnd, eVal, bitScore")
    parser.add_argument('-o', '--output_file', type=str, default="best_hit.tsv",
                        help="output file path")
    parser.add_argument('-i', '--min_identity', default=60, type=float,
                        help="minimum percentage of identity")
    parser.add_argument('-c', '--min_coverage', default=70, type=float,
                        help="minimum percentage of coverage")
parser.add_argument("-v", "--verbose", help="increase output verbosity",
action="store_true")
args = parser.parse_args()
return args
def main():
args = parse_arguments()
if args.verbose:
logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.DEBUG)
        logging.info('Verbose mode ON')
else:
logging.basicConfig(format="%(levelname)s: %(message)s")
headers = "queryId subjectId percIdentity alnLength mismatchCount gapOpenCount queryStart queryEnd subjectStart subjectEnd eVal bitScore queryLength subjectLength subjectTitle"
header_list = headers.split(' ')
blast_result = args.aln_input_file
outfile = args.output_file
min_coverage = args.min_coverage
min_identity = args.min_identity
best_hit_count = 0
query_count_with_low_hit = 0
with open(outfile, 'w') as out_fl:
writer = csv.DictWriter(out_fl, fieldnames=header_list, delimiter='\t')
for query_i, query_hits in enumerate(get_all_hits_per_query(blast_result, header_list)):
if query_i % 10000 == 0:
logging.info(f'{query_i} queries processed... ')
correct_hits = [hit for hit in query_hits if is_identity_and_coverage_ok(
hit, min_identity, min_coverage)]
if not correct_hits:
query_count_with_low_hit += 1
continue
best_hits = get_hits_with_highest_bitscore(correct_hits)
for best_hit in best_hits:
best_hit_count += 1
writer.writerow(best_hit)
    logging.info(f'{query_count_with_low_hit} queries ({100*query_count_with_low_hit/(query_i+1):.2f}%) have no hit passing the identity ({min_identity}%) or coverage ({min_coverage}%) thresholds')
    logging.info(f'{best_hit_count} best hits of {query_i + 1 - query_count_with_low_hit} queries have been written to {outfile}.')
if __name__ == '__main__':
main()
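For illustration, here is a minimal sketch of the filtering rule above applied to a toy hit; the dictionary values are invented and the thresholds are the script defaults:

# Toy m8 row as parsed by csv.DictReader (values are invented)
hit = {'queryId': 'gene_1', 'percIdentity': '75.3', 'bitScore': '210.0',
       'queryStart': '11', 'queryEnd': '90', 'queryLength': '100'}
# Query coverage of the HSP: 100 * (90 - 11 + 1) / 100 = 80%
# identity 75.3% >= 60% and coverage 80% >= 70%, so the hit is kept
assert is_identity_and_coverage_ok(hit, min_identity=60, min_coverage=70)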
#!/usr/bin/env python
"""--------------------------------------------------------------------
Script Name: merge_contig_quantif_perlineage.py
Description: merge quantifications and lineage into one matrice for one sample.
Input files: idxstats file, depth from mosdepth (bed.gz) and lineage percontig.tsv file.
Created By: Joanna Fourquet
Date: 2021-01-19
-----------------------------------------------------------------------
"""
# Metadata.
__author__ = 'Joanna Fourquet \
- GenPhySE - NED'
__copyright__ = 'Copyright (C) 2021 INRAE'
__license__ = 'GNU General Public License'
__version__ = '0.1'
__email__ = 'support.bioinfo.genotoul@inra.fr'
__status__ = 'dev'
# Module imports.
try:
import argparse
import re
import sys
import pandas as pd
import numpy as np
from datetime import datetime
except ImportError as error:
print(error)
exit(1)
# Print time.
print(str(datetime.now()))
# Manage parameters.
parser = argparse.ArgumentParser(description = 'Script which \
merges quantifications and lineages into one matrix for one sample.')
parser.add_argument('-i', '--idxstats_file', required = True, \
help = 'idxstats file.')
parser.add_argument('-m', '--mosdepth_file', required = True, \
help = 'depth per contigs from mosdepth (regions.bed.gz).')
parser.add_argument('-c', '--percontig_file', required = True, \
help = '.percontig.tsv file.')
parser.add_argument('-o', '--output_name', required = True, \
help = 'Name of output file containing counts of contigs and reads \
for each lineage.')
parser.add_argument('-v', '--version', action = 'version', \
version = __version__)
args = parser.parse_args()
# Read the idxstats file.
idxstats = pd.read_csv(args.idxstats_file, delimiter='\t', header=None)
idxstats.columns = ["contig","len","mapped","unmapped"]
# Read the mosdepth file (regions.bed.gz) and remove the start/end columns.
mosdepth = pd.read_csv(args.mosdepth_file, delimiter='\t', header=None,compression='gzip')
mosdepth.columns = ["contig","start","end","depth"]
mosdepth.drop(["start","end"], inplace=True,axis=1)
# Read the .percontig.tsv file.
percontig = pd.read_csv(args.percontig_file, delimiter='\t', dtype=str)
# Merge idxstats and .percontig.tsv files.
merge = pd.merge(idxstats,percontig,left_on='contig',right_on='#contig', how='outer')
# Add depth.
merge = pd.merge(merge,mosdepth,left_on='contig',right_on='contig', how='outer')
# Group by lineage and sum number of reads and contigs.
res = merge.groupby(['consensus_lineage','consensus_tax_id', 'tax_id_by_level']).agg({'contig' : [';'.join, 'count'], 'mapped': 'sum', 'depth': 'mean'}).reset_index()
res.columns=['lineage_by_level', 'consensus_tax_id', 'tax_id_by_level', 'name_contigs', 'nb_contigs', 'nb_reads', 'depth']
# Fill NaN values with 0.
res.fillna(0, inplace=True)
# Split by taxonomic level
res_split_tax_id = res.join(res['tax_id_by_level'].str.split(pat=";",expand=True))
res_split_tax_id.columns=['consensus_lineage', 'consensus_taxid', 'tax_id_by_level', 'name_contigs', 'nb_contigs', 'nb_reads', 'depth', "superkingdom_tax_id", "phylum_tax_id", "class_tax_id", "order_tax_id", "family_tax_id", "genus_tax_id", "species_tax_id"]
res_split_tax_id.fillna(value='no_affi', inplace = True)
res_split = res_split_tax_id.join(res_split_tax_id['consensus_lineage'].str.split(pat=";",expand=True))
res_split.columns=['consensus_lineage', 'consensus_taxid', 'tax_id_by_level', 'name_contigs', 'nb_contigs', 'nb_reads', 'depth', "superkingdom_tax_id", "phylum_tax_id", "class_tax_id", "order_tax_id", "family_tax_id", "genus_tax_id", "species_tax_id", "superkingdom_lineage", "phylum_lineage", "class_lineage", "order_lineage", "family_lineage", "genus_lineage", "species_lineage"]
res_split.fillna(value='no_affi', inplace = True)
levels_columns=['tax_id_by_level','lineage_by_level','name_contigs','nb_contigs', 'nb_reads', 'depth']
level_superkingdom = res_split.groupby(['superkingdom_tax_id','superkingdom_lineage']).agg({'name_contigs' : [';'.join], 'nb_contigs' : 'sum', 'nb_reads' : 'sum', 'depth': 'mean'}).reset_index()
level_superkingdom.columns=levels_columns
level_phylum = res_split.groupby(['phylum_tax_id','phylum_lineage']).agg({'name_contigs' : [';'.join], 'nb_contigs' : 'sum', 'nb_reads' : 'sum', 'depth': 'mean'}).reset_index()
level_phylum.columns=levels_columns
level_order = res_split.groupby(['order_tax_id','order_lineage']).agg({'name_contigs' : [';'.join], 'nb_contigs' : 'sum', 'nb_reads' : 'sum', 'depth': 'mean'}).reset_index()
level_order.columns=levels_columns
level_class = res_split.groupby(['class_tax_id','class_lineage']).agg({'name_contigs' : [';'.join], 'nb_contigs' : 'sum', 'nb_reads' : 'sum', 'depth': 'mean'}).reset_index()
level_class.columns=levels_columns
level_family = res_split.groupby(['family_tax_id','family_lineage']).agg({'name_contigs' : [';'.join], 'nb_contigs' : 'sum', 'nb_reads' : 'sum', 'depth': 'mean'}).reset_index()
level_family.columns=levels_columns
level_genus = res_split.groupby(['genus_tax_id','genus_lineage']).agg({'name_contigs' : [';'.join], 'nb_contigs' : 'sum', 'nb_reads' : 'sum', 'depth': 'mean'}).reset_index()
level_genus.columns=levels_columns
level_species = res_split.groupby(['species_tax_id','species_lineage']).agg({'name_contigs' : [';'.join], 'nb_contigs' : 'sum', 'nb_reads' : 'sum', 'depth': 'mean'}).reset_index()
level_species.columns=levels_columns
# Write the merged data frames to the output files.
res.to_csv(args.output_name + ".tsv", sep="\t", index=False)
level_superkingdom.to_csv(args.output_name + "_by_superkingdom.tsv", sep="\t", index=False)
level_phylum.to_csv(args.output_name + "_by_phylum.tsv", sep="\t", index=False)
level_order.to_csv(args.output_name + "_by_order.tsv", sep="\t", index=False)
level_class.to_csv(args.output_name + "_by_class.tsv", sep="\t", index=False)
level_family.to_csv(args.output_name + "_by_family.tsv", sep="\t", index=False)
level_genus.to_csv(args.output_name + "_by_genus.tsv", sep="\t", index=False)
level_species.to_csv(args.output_name + "_by_species.tsv", sep="\t", index=False)
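The seven per-level blocks above are identical apart from the grouping columns; for reference, a compact equivalent expressed as a loop (a sketch that reuses the res_split frame, levels_columns and args defined above):

levels = ["superkingdom", "phylum", "class", "order", "family", "genus", "species"]
for level in levels:
    level_df = res_split.groupby([level + '_tax_id', level + '_lineage']).agg(
        {'name_contigs': [';'.join], 'nb_contigs': 'sum',
         'nb_reads': 'sum', 'depth': 'mean'}).reset_index()
    level_df.columns = levels_columns
    level_df.to_csv(args.output_name + "_by_" + level + ".tsv", sep="\t", index=False)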
{
"01_clean_qc":
[
{"file":"01_1_cleaned_reads/logs/a.no_filter.flagstat", "method":"diff"},
{"file":"01_1_cleaned_reads/logs/c.no_filter.flagstat", "method":"diff"},
{"file":"01_1_cleaned_reads/logs/a_cutadapt.log", "method":"cut_diff"},
{"file":"01_1_cleaned_reads/logs/c_cutadapt.log", "method":"cut_diff"},
{"file":"01_1_cleaned_reads/logs/a_sickle.log", "method":"diff"},
{"file":"01_1_cleaned_reads/logs/c_sickle.log", "method":"diff"},
{"file":"01_1_cleaned_reads/logs/host_filter_flagstat/a.host_filter.flagstat", "method":"diff"},
{"file":"01_1_cleaned_reads/logs/host_filter_flagstat/c.host_filter.flagstat", "method":"diff"},
{"file":"01_2_qc/fastqc_cleaned/cleaned_a/cleaned_a_R1_fastqc.zip", "method":"not_empty"},
{"file":"01_2_qc/fastqc_cleaned/cleaned_c/cleaned_c_R1_fastqc.zip", "method":"not_empty"},
{"file":"01_2_qc/fastqc_cleaned/cleaned_a/cleaned_a_R2_fastqc.zip", "method":"not_empty"},
{"file":"01_2_qc/fastqc_cleaned/cleaned_c/cleaned_c_R2_fastqc.zip", "method":"not_empty"},
{"file":"01_2_qc/fastqc_cleaned/cleaned_a/cleaned_a_R1_fastqc.html", "method":"not_empty"},
{"file":"01_2_qc/fastqc_cleaned/cleaned_c/cleaned_c_R1_fastqc.html", "method":"not_empty"},
{"file":"01_2_qc/fastqc_cleaned/cleaned_a/cleaned_a_R2_fastqc.html", "method":"not_empty"},
{"file":"01_2_qc/fastqc_cleaned/cleaned_c/cleaned_c_R2_fastqc.html", "method":"not_empty"},
{"file":"01_2_qc/fastqc_raw/a/a_R1_fastqc.zip", "method":"not_empty"},
{"file":"01_2_qc/fastqc_raw/c/c_R1_fastqc.zip", "method":"not_empty"},
{"file":"01_2_qc/fastqc_raw/a/a_R2_fastqc.zip", "method":"not_empty"},
{"file":"01_2_qc/fastqc_raw/c/c_R2_fastqc.zip", "method":"not_empty"},
{"file":"01_2_qc/fastqc_raw/a/a_R1_fastqc.html", "method":"not_empty"},
{"file":"01_2_qc/fastqc_raw/c/c_R1_fastqc.html", "method":"not_empty"},
{"file":"01_2_qc/fastqc_raw/a/a_R2_fastqc.html", "method":"not_empty"},
{"file":"01_2_qc/fastqc_raw/c/c_R2_fastqc.html", "method":"not_empty"},
{"file":"01_3_taxonomic_affiliation_reads/a.krona.html", "method":"not_empty"},
{"file":"01_3_taxonomic_affiliation_reads/c.krona.html", "method":"not_empty"},
{"file":"01_3_taxonomic_affiliation_reads/taxo_affi_reads_class.tsv", "method":"taxo_diff"},
{"file":"01_3_taxonomic_affiliation_reads/taxo_affi_reads_family.tsv", "method":"taxo_diff"},
{"file":"01_3_taxonomic_affiliation_reads/taxo_affi_reads_genus.tsv", "method":"taxo_diff"},
{"file":"01_3_taxonomic_affiliation_reads/taxo_affi_reads_order.tsv", "method":"taxo_diff"},
{"file":"01_3_taxonomic_affiliation_reads/taxo_affi_reads_phylum.tsv", "method":"taxo_diff"},
{"file":"01_3_taxonomic_affiliation_reads/taxo_affi_reads_species.tsv", "method":"taxo_diff"}
],
"02_assembly":
[
{"file":"logs/a.count_reads_on_contigs.flagstat", "method":"diff"},
{"file":"logs/c.count_reads_on_contigs.flagstat", "method":"diff"},
{"file":"logs/a.count_reads_on_contigs.idxstats", "method":"diff"},
{"file":"logs/c.count_reads_on_contigs.idxstats", "method":"diff"},
{"file":"a_metaspades/a_scaffolds.fasta", "method":"diff"},
{"file":"c_metaspades/c_scaffolds.fasta", "method":"diff"},
{"file":"a_megahit/a_contigs.fa", "method":"diff"},
{"file":"c_megahit/c_contigs.fa", "method":"diff"},
{"file":"a_all_contigs_QC/report.tsv", "method":"diff"},
{"file":"c_all_contigs_QC/report.tsv", "method":"diff"},
{"file":"a_all_contigs_QC/report.html", "method":"not_empty"},
{"file":"c_all_contigs_QC/report.html", "method":"not_empty"}
],
"03_filtering":
[
{"file":"a_select_contigs_cpm10.fasta", "method":"diff"},
{"file":"c_select_contigs_cpm10.fasta", "method":"diff"},
{"file":"a_discard_contigs_cpm10.fasta", "method":"diff"},
{"file":"c_discard_contigs_cpm10.fasta", "method":"diff"},
{"file":"a_select_contigs_QC/report.tsv", "method":"diff"},
{"file":"c_select_contigs_QC/report.tsv", "method":"diff"},
{"file":"a_select_contigs_QC/report.html", "method":"not_empty"},
{"file":"c_select_contigs_QC/report.html", "method":"not_empty"}
],
"04_structural_annot":
[
{"file":"a.annotated.faa", "method":"diff"},
{"file":"c.annotated.faa", "method":"diff"},
{"file":"a.annotated.ffn", "method":"diff"},
{"file":"c.annotated.ffn", "method":"diff"},
{"file":"a.annotated.fna", "method":"diff"},
{"file":"c.annotated.fna", "method":"diff"},
{"file":"a.annotated.gff", "method":"diff"},
{"file":"c.annotated.gff", "method":"diff"},
{"file":"a_prot.len", "method":"diff"},
{"file":"c_prot.len", "method":"diff"}
],
"05_alignment":
[
{"file":"05_1_reads_alignment_on_contigs/a/a_contig.bed", "method":"diff"},
{"file":"05_1_reads_alignment_on_contigs/c/c_contig.bed", "method":"diff"},
{"file":"05_1_reads_alignment_on_contigs/a/a.sort.bam.idxstats", "method":"diff"},
{"file":"05_1_reads_alignment_on_contigs/c/c.sort.bam.idxstats", "method":"diff"},
{"file":"05_1_reads_alignment_on_contigs/a/a.regions.bed.gz", "method":"zdiff"},
{"file":"05_1_reads_alignment_on_contigs/c/c.regions.bed.gz", "method":"zdiff"},
{"file":"05_2_database_alignment/a/a_aln_diamond.m8", "method":"diff"},
{"file":"05_2_database_alignment/c/c_aln_diamond.m8", "method":"diff"}
],
"06_func_annot":
[
{"file":"06_1_clustering/a.cd-hit-est.0.95.fasta", "method":"diff"},
{"file":"06_1_clustering/c.cd-hit-est.0.95.fasta", "method":"diff"},
{"file":"06_1_clustering/a.cd-hit-est.0.95.fasta.clstr", "method":"diff"},
{"file":"06_1_clustering/c.cd-hit-est.0.95.fasta.clstr", "method":"diff"},
{"file":"06_1_clustering/a.cd-hit-est.0.95.table_cluster_contigs.txt", "method":"diff"},
{"file":"06_1_clustering/c.cd-hit-est.0.95.table_cluster_contigs.txt", "method":"diff"},
{"file":"06_1_clustering/All-cd-hit-est.0.95.fasta", "method":"diff"},
{"file":"06_1_clustering/All-cd-hit-est.0.95.fasta.clstr", "method":"diff"},
{"file":"06_1_clustering/table_clstr.txt", "method":"diff"},
{"file":"06_2_quantification/c.featureCounts.tsv", "method":"diff"},
{"file":"06_2_quantification/a.featureCounts.tsv", "method":"diff"},
{"file":"06_2_quantification/c.featureCounts.tsv.summary", "method":"diff"},
{"file":"06_2_quantification/a.featureCounts.tsv.summary", "method":"diff"},
{"file":"06_2_quantification/Clusters_Count_table_all_samples.txt", "method":"diff"},
{"file":"06_2_quantification/Correspondence_global_clstr_genes.txt", "method":"diff"},
{"file":"06_3_functional_annotation/a.best_hit", "method":"diff"},
{"file":"06_3_functional_annotation/c.best_hit", "method":"diff"},
{"file":"06_3_functional_annotation/a_diamond_one2one.emapper.annotations", "method":"no_header_diff"},
{"file":"06_3_functional_annotation/c_diamond_one2one.emapper.annotations", "method":"no_header_diff"},
{"file":"06_3_functional_annotation/a_diamond_one2one.emapper.seed_orthologs", "method":"no_header_diff"},
{"file":"06_3_functional_annotation/c_diamond_one2one.emapper.seed_orthologs", "method":"no_header_diff"},
{"file":"06_3_functional_annotation/GOs_abundance.tsv", "method":"diff"},
{"file":"06_3_functional_annotation/KEGG_ko_abundance.tsv", "method":"diff"},
{"file":"06_3_functional_annotation/KEGG_Module_abundance.tsv", "method":"diff"},
{"file":"06_3_functional_annotation/KEGG_Pathway_abundance.tsv", "method":"diff"},
{"file":"06_3_functional_annotation/PFAM_abundance.tsv", "method":"diff"},
{"file":"06_3_functional_annotation/Quantifications_and_functional_annotations.tsv", "method":"diff"}
],
"07_taxo_affi":
[
{"file":"quantification_by_contig_lineage_all.tsv", "method":"diff"},
{"file":"quantification_by_contig_lineage_class.tsv", "method":"diff"},
{"file":"quantification_by_contig_lineage_family.tsv", "method":"diff"},
{"file":"quantification_by_contig_lineage_genus.tsv", "method":"diff"},
{"file":"quantification_by_contig_lineage_order.tsv", "method":"diff"},
{"file":"quantification_by_contig_lineage_phylum.tsv", "method":"diff"},
{"file":"quantification_by_contig_lineage_species.tsv", "method":"diff"},
{"file":"quantification_by_contig_lineage_superkingdom.tsv", "method":"diff"},
{"file":"a/a.percontig.tsv", "method":"diff"},
{"file":"a/a.pergene.tsv", "method":"diff"},
{"file":"a/a_quantif_percontig_by_class.tsv", "method":"diff"},
{"file":"a/a_quantif_percontig_by_family.tsv", "method":"diff"},
{"file":"a/a_quantif_percontig_by_genus.tsv", "method":"diff"},
{"file":"a/a_quantif_percontig_by_order.tsv", "method":"diff"},
{"file":"a/a_quantif_percontig_by_phylum.tsv", "method":"diff"},
{"file":"a/a_quantif_percontig_by_species.tsv", "method":"diff"},
{"file":"a/a_quantif_percontig_by_superkingdom.tsv", "method":"diff"},
{"file":"a/a_quantif_percontig.tsv", "method":"diff"},
{"file":"a/a.warn.tsv", "method":"diff"},
{"file":"a/graphs/a_aln_diamond.m8_contig_taxonomy_level.pdf", "method":"not_empty"},
{"file":"a/graphs/a_aln_diamond.m8_nb_prot_annotated_and_assigned.pdf", "method":"not_empty"},
{"file":"a/graphs/a_aln_diamond.m8_prot_taxonomy_level.pdf", "method":"not_empty"},
{"file":"c/c.percontig.tsv", "method":"diff"},
{"file":"c/c.pergene.tsv", "method":"diff"},
{"file":"c/c_quantif_percontig_by_class.tsv", "method":"diff"},
{"file":"c/c_quantif_percontig_by_family.tsv", "method":"diff"},
{"file":"c/c_quantif_percontig_by_genus.tsv", "method":"diff"},
{"file":"c/c_quantif_percontig_by_order.tsv", "method":"diff"},
{"file":"c/c_quantif_percontig_by_phylum.tsv", "method":"diff"},
{"file":"c/c_quantif_percontig_by_species.tsv", "method":"diff"},
{"file":"c/c_quantif_percontig_by_superkingdom.tsv", "method":"diff"},
{"file":"c/c_quantif_percontig.tsv", "method":"diff"},
{"file":"c/c.warn.tsv", "method":"diff"},
{"file":"c/graphs/c_aln_diamond.m8_contig_taxonomy_level.pdf", "method":"not_empty"},
{"file":"c/graphs/c_aln_diamond.m8_nb_prot_annotated_and_assigned.pdf", "method":"not_empty"},
{"file":"c/graphs/c_aln_diamond.m8_prot_taxonomy_level.pdf", "method":"not_empty"}
]
}
\ No newline at end of file
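For reference, a minimal sketch of how such a manifest can drive file comparisons; the comparator implementations and directory names below are illustrative assumptions, not the project's functions.py (the cut_diff, taxo_diff and no_header_diff methods are omitted):

import filecmp
import gzip
import json
from os import path

def diff(expected, observed):
    # Byte-wise comparison of the expected and observed files
    return filecmp.cmp(expected, observed, shallow=False)

def zdiff(expected, observed):
    # Compare gzipped files on their decompressed content
    with gzip.open(expected) as exp_fl, gzip.open(observed) as obs_fl:
        return exp_fl.read() == obs_fl.read()

def not_empty(expected, observed):
    # Only check that the observed file exists and is not empty
    return path.isfile(observed) and path.getsize(observed) > 0

comparators = {'diff': diff, 'zdiff': zdiff, 'not_empty': not_empty}

with open('files_to_test.json') as json_file:
    manifest = json.load(json_file)
for entry in manifest['02_assembly']:
    compare = comparators.get(entry['method'])
    if compare is not None:
        ok = compare(path.join('expected/02_assembly', entry['file']),
                     path.join('observed/02_assembly', entry['file']))
        print(entry['file'], 'PASS' if ok else 'FAIL')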
@@ -7,7 +7,6 @@
# 2021
# Functions of metagwgs functional_tests (main.py)
try:
import argparse
import sys
@@ -22,14 +21,15 @@ except ImportError as error:
print(error)
exit(1)
# Arguments parsing (3 required / 2 optional)
## Usage example:
#
# python functional_tests.py \
# -step 01_clean_qc \
# -exp_dir ../../functional_tests/expected \
# -obs_dir ../../functional_tests/observed/results \
# --script ../../functional_tests/test.sh \
# --verbose
def parse_arguments():
parser = argparse.ArgumentParser()
@@ -40,10 +40,10 @@ def parse_arguments():
help = '(required) expected logs dir containing logs from a healthy metagwgs workflow')
parser.add_argument('-obs_dir', type = str, \
help = '(required) observed logs dir containing logs from the metagwgs workflow you wish to test')
parser.add_argument('--script', type = str, \
help = '(optional) script file containing metagwgs Nextflow launch command')
parser.add_argument('--verbose', action = "store_true", \
help = '(optional) print test results in stdout (default: false)')
if len(sys.argv) == 1:
parser.print_usage(sys.stderr)
@@ -52,7 +52,6 @@ def parse_arguments():
return parser.parse_args()
# Launch nextflow from script if given by user
### Doesn't work right now, must launch .sh separately and then run functional tests on produced results
def launch_nextflow(script):
script_file = path.abspath(script)
@@ -64,18 +63,7 @@ def launch_nextflow(script):
print('Test run completed')
# Load JSON file containing list of files to check
## (this .json file must be located in the same directory as 'functions.py')
def json_load(step):
with open(path.join(path.dirname(path.realpath(sys.argv[0])), 'files_to_test.json'), 'r') as json_file:
json_obj = json.load(json_file)
json_file.close()
files_list = json_obj[step]
return files_list
# Find files in expected directory to test
def files_load(exp_dir, step):
files_list = []
@@ -85,23 +73,15 @@ def files_load(exp_dir, step):
return sorted(files_list)
# Do file comparisons for given step (and write output in stdout if --verbose)
def check_files(exp_dir, obs_dir, step, methods, verbose):
# Check existence of expected and observed directories
if not path.exists(exp_dir) or not path.exists(obs_dir):
sys.exit('{a} or {b} folder(s) do not exist, please check --exp_dir and --obs_dir parameters'.format(a = exp_dir, b = obs_dir))
# Load files and methods list from JSON file
# files_list = json_load(step)
# Load files list to test from exp_dir folder
files_list = files_load(exp_dir, step)
# Load list of sample ids given by user
# sampleIds_list = sampleIds.split(',')
# Initiate log for untested files (removes already existing logs)
not_tested_log = 'ft_{}.not_tested'.format(step)
@@ -114,28 +94,45 @@ def check_files(exp_dir, obs_dir, step):
log.write('Expected directory: {a}\nvs\nObserved directory: {b}\n'.format(a = expected_prefix, b = observed_prefix))
# r = re.compile('sampleId')
# samples_cnt = filter(r.match, sampleIds_list)
# sys.exit(samples_cnt)
# other_cnt
# max_cnt = len(files_list) * len(sampleIds_list)
# Passed and failed tests count initialization
global true_cnt, false_cnt
max_cnt = len(files_list)
true_cnt = 0
false_cnt = 0
nt_cnt = 0
if verbose: print('\nLaunching {}...\n'.format(step))
    for file_path in files_list:
expected_path = path.join(expected_prefix, file_path)
observed_path = path.join(observed_prefix, file_path)
file_name = path.basename(file_path)
file_extension = path.splitext(file_name)[1]
if re.search(r"taxo_affi_reads_.*\.tsv", file_name):
method = "taxo_diff"
elif re.search(r".*_sickle\.log", file_name):
method = "diff"
elif re.search(r".*_cutadapt\.log", file_name):
method = "cut_diff"
elif file_extension in methods["diff"]:
method = "diff"
elif file_extension in methods["not_empty"]:
method = "not_empty"
elif file_extension in methods["no_header_diff"]:
method = "no_header_diff"
elif file_extension in methods["zdiff"]:
method = "zdiff"