Commit fe79fddd authored by MARTIN Pierre's avatar MARTIN Pierre
Browse files

changes from july 6 to july 20 (see WIP issue for detail on operations)

parent 61f579b9
#!/usr/bin/env python
"""--------------------------------------------------------------------
Script Name: merge_idxstats_percontig_lineage.py
Description: merge idxstats and .percontig.tsv files for one sample.
Input files: idxstats file and percontig.tsv file.
Created By: Joanna Fourquet
Date: 2021-01-19
-----------------------------------------------------------------------
"""

# Metadata.
__author__ = 'Joanna Fourquet \
- GenPhySE - NED'
__copyright__ = 'Copyright (C) 2021 INRAE'
__license__ = 'GNU General Public License'
__version__ = '0.1'
__email__ = 'support.bioinfo.genotoul@inra.fr'
__status__ = 'dev'

# Modules importation.
try:
    import argparse
    import re
    import sys
    import pandas as pd
    import numpy as np
    from datetime import datetime
except ImportError as error:
    print(error)
    # SystemExit needs no import, so this works even if 'sys' failed above.
    raise SystemExit(1)

# Taxonomic levels, in the exact order the upstream affiliation step writes
# them inside 'tax_id_by_level' and 'consensus_lineage'.
# NOTE(review): 'order' comes before 'class' here, which is the reverse of
# the usual taxonomic ranking -- kept as-is to match the original column
# layout; confirm against the producer of the .percontig.tsv file.
LEVELS = ('superkingdom', 'phylum', 'order', 'class',
          'family', 'genus', 'species')


def parse_args(argv=None):
    """Parse command-line arguments (argv defaults to sys.argv[1:])."""
    parser = argparse.ArgumentParser(description='Script which \
merge idxstats and .percontig.tsv files for one sample.')
    parser.add_argument('-i', '--idxstats_file', required=True,
                        help='idxstats file.')
    parser.add_argument('-c', '--percontig_file', required=True,
                        help='.percontig.tsv file.')
    parser.add_argument('-o', '--output_name', required=True,
                        help='Name of output file containing counts of \
contigs and reads for each lineage.')
    parser.add_argument('-v', '--version', action='version',
                        version=__version__)
    return parser.parse_args(argv)


def merge_counts(idxstats_path, percontig_path):
    """Merge an idxstats table with a .percontig.tsv table and aggregate
    contig names, contig count and read count per consensus lineage.

    Returns a DataFrame with columns: lineage_by_level, consensus_tax_id,
    tax_id_by_level, name_contigs, nb_contigs, nb_reads.
    """
    # idxstats has no header: column 0 = contig name, column 2 = mapped reads.
    idxstats = pd.read_csv(idxstats_path, delimiter='\t', header=None)
    percontig = pd.read_csv(percontig_path, delimiter='\t', dtype=str)
    # Outer merge keeps contigs present in only one of the two inputs.
    # NOTE(review): merged rows without a consensus_lineage get NaN group
    # keys and are silently dropped by the groupby below -- confirm that
    # this is the intended handling of unaffiliated contigs.
    merged = pd.merge(idxstats, percontig, left_on=0,
                      right_on='#contig', how='outer')
    # Group by lineage; join contig names, count contigs, sum read counts.
    res = (merged
           .groupby(['consensus_lineage', 'consensus_tax_id',
                     'tax_id_by_level'])
           .agg({0: [';'.join, 'count'], 2: 'sum'})
           .reset_index())
    res.columns = ['lineage_by_level', 'consensus_tax_id', 'tax_id_by_level',
                   'name_contigs', 'nb_contigs', 'nb_reads']
    print(res.head())
    # Fill the NaN by 0.
    res.fillna(0, inplace=True)
    return res


def split_by_level(res):
    """Expand the ';'-separated 'tax_id_by_level' and 'consensus_lineage'
    columns into one '<level>_tax_id' and one '<level>_lineage' column per
    taxonomic level; missing fields become 'no_affi'."""
    tax_id_cols = ['{}_tax_id'.format(level) for level in LEVELS]
    lineage_cols = ['{}_lineage'.format(level) for level in LEVELS]
    base_cols = ['consensus_lineage', 'consensus_taxid', 'tax_id_by_level',
                 'name_contigs', 'nb_contigs', 'nb_reads']
    # Split tax ids by taxonomic level.
    split = res.join(res['tax_id_by_level'].str.split(pat=";", expand=True))
    split.columns = base_cols + tax_id_cols
    split.fillna(value='no_affi', inplace=True)
    print(split.head())
    # Split lineage names by taxonomic level.
    split = split.join(
        split['consensus_lineage'].str.split(pat=";", expand=True))
    split.columns = base_cols + tax_id_cols + lineage_cols
    split.fillna(value='no_affi', inplace=True)
    return split


def aggregate_level(res_split, level):
    """Join contig names and sum contig/read counts grouped on a single
    taxonomic level; returns a 5-column DataFrame ready to write."""
    grouped = (res_split
               .groupby(['{}_tax_id'.format(level),
                         '{}_lineage'.format(level)])
               .agg({'name_contigs': [';'.join],
                     'nb_contigs': 'sum',
                     'nb_reads': 'sum'})
               .reset_index())
    grouped.columns = ['tax_id_by_level', 'lineage_by_level',
                       'name_contigs', 'nb_contigs', 'nb_reads']
    return grouped


def main(argv=None):
    """Entry point: merge the two input tables, then write one global .tsv
    plus one '<output_name>_by_<level>.tsv' per taxonomic level."""
    # Print time.
    print(str(datetime.now()))
    args = parse_args(argv)
    res = merge_counts(args.idxstats_file, args.percontig_file)
    res_split = split_by_level(res)
    # Write merged data frames to the output files.
    res.to_csv(args.output_name + ".tsv", sep="\t", index=False)
    for level in LEVELS:
        aggregate_level(res_split, level).to_csv(
            '{}_by_{}.tsv'.format(args.output_name, level),
            sep="\t", index=False)


if __name__ == '__main__':
    main()
......@@ -16,7 +16,7 @@ process {
//{ task.exitStatus in [1,143,137,104,134,139] ? 'retry' : 'finish' }
maxRetries = 1
maxErrors = '-1'
container = '/home/pmartin2/work/metaG/singularity_img/metagwgs.sif'
container = 'file://metagwgs/env/metagwgs.sif'
withName: cutadapt {
cpus = 8
memory = { 8.GB * task.attempt }
......@@ -111,9 +111,9 @@ process {
memory = { 50.GB * task.attempt }
}
withLabel: eggnog {
container = '/home/pmartin2/work/metaG/singularity_img/eggnog_mapper.sif'
container = 'file://metagwgs/env/eggnog_mapper.sif'
}
withLabel: mosdepth {
container = '/home/pmartin2/work/metaG/singularity_img/mosdepth.sif'
container = 'file://metagwgs/env/mosdepth.sif'
}
}
......@@ -15,18 +15,18 @@ process {
maxErrors = '-1'
withName: cutadapt {
cpus = 3
memory = { 1.GB * task.attempt }
cpus = 3
memory = { 1.GB * task.attempt }
}
withName: sickle {
memory = { 1.GB * task.attempt }
memory = { 1.GB * task.attempt }
}
withLabel: fastqc {
cpus = 6
memory = { 1.GB * task.attempt }
cpus = 6
memory = { 1.GB * task.attempt }
}
withName: multiqc {
memory = { 2.GB * task.attempt }
memory = { 2.GB * task.attempt }
}
withName: host_filter {
memory = { 20.GB * task.attempt }
......@@ -75,8 +75,8 @@ process {
memory = { 1.GB * task.attempt }
}
withName: diamond {
cpus = 2
memory = { 8.GB * task.attempt }
cpus = 8
memory = { 10.GB * task.attempt }
}
withName: get_software_versions {
memory = { 1.GB * task.attempt }
......
#!/usr/bin/env python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Pierre MARTIN
......@@ -14,6 +14,7 @@ try:
import re
import os
import os.path
from collections import defaultdict
import subprocess
except ImportError as error:
......@@ -56,52 +57,39 @@ def launch_nextflow(script):
print('Launching test run with the provided file:\n\n{}\n'.format(script_file))
process = subprocess.Popen(['sbatch', '{}'.format(script_file)], cwd = script_dir, stdout = subprocess.PIPE, shell=True, executable = '/bin/bash')
output, error = process.communicate()
process = subprocess.Popen('sh' + ' {}'.format(script_file), cwd = script_dir, shell = True).wait()
print('Test run completed')
# Lists of output files to compare
## 01_clean_qc:
global _01_1_cleaned_reads, _01_2_qc, _01_3_taxonomic_affiliation_reads
_01_1_cleaned_reads = [
global _01_clean_qc = [
('01_1_cleaned_reads/logs/a.no_filter.flagstat', 'diff'),
('01_1_cleaned_reads/logs/c.no_filter.flagstat', 'diff'),
('01_1_cleaned_reads/logs/a_cutadapt.log', 'cut_diff'),
('01_1_cleaned_reads/logs/c_cutadapt.log', 'cut_diff'),
('01_1_cleaned_reads/logs/a_sickle.log', 'diff'),
('01_1_cleaned_reads/logs/c_sickle.log', 'diff'),
('01_1_cleaned_reads/logs/host_filter_flagstat/a.host_filter.flagstat', 'diff'),
('01_1_cleaned_reads/logs/host_filter_flagstat/c.host_filter.flagstat', 'diff')
]
_01_2_qc = [
('01_2_qc/fastqc_cleaned/cleaned_a/cleaned_a_R1_fastqc.html', 'not_empty'),
('01_1_cleaned_reads/logs/host_filter_flagstat/c.host_filter.flagstat', 'diff'),
('01_2_qc/fastqc_cleaned/cleaned_a/cleaned_a_R1_fastqc.zip', 'not_empty'),
('01_2_qc/fastqc_cleaned/cleaned_a/cleaned_a_R2_fastqc.html', 'not_empty'),
('01_2_qc/fastqc_cleaned/cleaned_a/cleaned_a_R1_fastqc.zip', 'not_empty'),
('01_2_qc/fastqc_cleaned/cleaned_c/cleaned_c_R1_fastqc.html', 'not_empty'),
('01_2_qc/fastqc_cleaned/cleaned_c/cleaned_c_R1_fastqc.zip', 'not_empty'),
('01_2_qc/fastqc_cleaned/cleaned_a/cleaned_a_R2_fastqc.zip', 'not_empty'),
('01_2_qc/fastqc_cleaned/cleaned_c/cleaned_c_R2_fastqc.zip', 'not_empty'),
('01_2_qc/fastqc_cleaned/cleaned_a/cleaned_a_R1_fastqc.html', 'not_empty'),
('01_2_qc/fastqc_cleaned/cleaned_c/cleaned_c_R1_fastqc.html', 'not_empty'),
('01_2_qc/fastqc_cleaned/cleaned_a/cleaned_a_R2_fastqc.html', 'not_empty'),
('01_2_qc/fastqc_cleaned/cleaned_c/cleaned_c_R2_fastqc.html', 'not_empty'),
('01_2_qc/fastqc_cleaned/cleaned_c/cleaned_c_R1_fastqc.zip', 'not_empty'),
('01_2_qc/fastqc_raw/a/a_R1_fastqc.html', 'not_empty'),
('01_2_qc/fastqc_raw/a/a_R1_fastqc.zip', 'not_empty'),
('01_2_qc/fastqc_raw/a/a_R2_fastqc.html', 'not_empty'),
('01_2_qc/fastqc_raw/c/c_R1_fastqc.zip', 'not_empty'),
('01_2_qc/fastqc_raw/a/a_R2_fastqc.zip', 'not_empty'),
('01_2_qc/fastqc_raw/c/c_R2_fastqc.zip', 'not_empty'),
('01_2_qc/fastqc_raw/a/a_R1_fastqc.html', 'not_empty'),
('01_2_qc/fastqc_raw/c/c_R1_fastqc.html', 'not_empty'),
('01_2_qc/fastqc_raw/c/c_R1_fastqc.zip', 'not_empty'),
('01_2_qc/fastqc_raw/a/a_R2_fastqc.html', 'not_empty'),
('01_2_qc/fastqc_raw/c/c_R2_fastqc.html', 'not_empty'),
('01_2_qc/fastqc_raw/c/c_R2_fastqc.zip', 'not_empty')
]
_01_3_taxonomic_affiliation_reads = [
('01_3_taxonomic_affiliation_reads/a.krona.html', 'not_empty'),
('01_3_taxonomic_affiliation_reads/c.krona.html', 'not_empty'),
('01_3_taxonomic_affiliation_reads/taxo_affi_reads_class.tsv', 'diff'),
('01_3_taxonomic_affiliation_reads/taxo_affi_reads_family.tsv', 'diff'),
('01_3_taxonomic_affiliation_reads/taxo_affi_reads_genus.tsv', 'diff'),
......@@ -111,30 +99,29 @@ _01_3_taxonomic_affiliation_reads = [
]
## 02_assembly:
global _02_assembly
global _02_assembly
_02_assembly = [
('logs/a.count_reads_on_contigs.flagstat', 'diff'),
('logs/c.count_reads_on_contigs.flagstat', 'diff'),
('logs/a.count_reads_on_contigs.idxstats', 'diff'),
('logs/c.count_reads_on_contigs.idxstats', 'diff'),
('a_metaspades/a_scaffolds.fasta', 'diff'),
('c_metaspades/c_scaffolds.fasta', 'diff'),
('a_megahit/a_scaffolds.fasta', 'diff'),
('c_megahit/c_scaffolds.fasta', 'diff'),
('a_all_contigs_QC/report.tsv', 'diff'),
('c_all_contigs_QC/report.tsv', 'diff'),
('a_all_contigs_QC/report.html', 'not_empty'),
('a_all_contigs_QC/report.html', 'not_empty')
('c_all_contigs_QC/report.html', 'not_empty')
]
## 03_filtering
global _03_filtering
global _03_filtering
_03_filtering = [
('a_select_contigs_cpm10.fasta', 'diff'),
('c_select_contigs_cpm10.fasta', 'diff'),
('a_discard_contigs_cpm10.fasta', 'diff'),
('c_discard_contigs_cpm10.fasta', 'diff'),
('a_select_contigs_QC/report.tsv', 'diff'),
('c_select_contigs_QC/report.tsv', 'diff'),
('a_select_contigs_QC/report.html', 'not_empty'),
......@@ -146,44 +133,49 @@ global _04_structural_annot
_04_structural_annot = [
('a.annotated.faa', 'diff'),
('c.annotated.faa', 'diff'),
('a.annotated.ffn', 'diff'),
('c.annotated.ffn', 'diff'),
('a.annotated.fna', 'diff'),
('c.annotated.fna', 'diff'),
('a.annotated.gff', 'diff'),
('c.annotated.gff', 'diff'),
('a_prot.len', 'diff'),
('c_prot.len', 'diff')
]
## 05_alignment
global _05_1_reads_alignment_on_contigs, _05_2_database_alignment
_05_1_reads_alignment_on_contigs = [
global _05_alignment
_05_alignment = [
('05_1_reads_alignment_on_contigs/a/a_contig.bed', 'diff'),
('05_1_reads_alignment_on_contigs/c/c_contig.bed', 'diff'),
('05_1_reads_alignment_on_contigs/a/a.sort.bam.idxstats', 'diff'),
('05_1_reads_alignment_on_contigs/c/c.sort.bam.idxstats', 'diff'),
('05_1_reads_alignment_on_contigs/a/a.regions.bed.gz', 'gz_diff'),
('05_1_reads_alignment_on_contigs/c/c.regions.bed.gz', 'gz_diff')
]
_05_2_database_alignment = [
('05_1_reads_alignment_on_contigs/a/a.regions.bed.gz', 'zdiff'),
('05_1_reads_alignment_on_contigs/c/c.regions.bed.gz', 'zdiff'),
('05_2_database_alignment/a/a_aln_diamond.m8', 'diff'),
('05_2_database_alignment/c/c_aln_diamond.m8', 'diff')
]
# ## 06_func_annot
# global _06_func_annot = [
# ]
# ## 07_taxo_affi
# global _07_taxo_affi = [
# ]
# Do file comparisons for given step and write output
def check_files(exp_dir, obs_dir, step):
if not os.path.exists(exp_dir) or not os.path.exists(obs_dir):
sys.exit('{} or {} folder(s) do not exist, please check --exp_dir and --obs_dir parameters'.format(exp_dir, obs_dir))
if step == '01_clean_qc':
files_list = _01_1_cleaned_reads + _01_2_qc + _01_3_taxonomic_affiliation_reads
files_list = _01_clean_qc
elif step == '02_assembly':
......@@ -199,7 +191,7 @@ def check_files(exp_dir, obs_dir, step):
elif step == '05_alignment':
files_list = _05_1_reads_alignment_on_contigs + _05_2_database_alignment
files_list = _05_alignment
# elif step == '06_func_annot':
......@@ -213,18 +205,19 @@ def check_files(exp_dir, obs_dir, step):
# files_list = _08_binning
f = open('ft_{}.log'.format(step), 'w+')
log = open('ft_{}.log'.format(step), 'w+')
expected_prefix = '{}/{}/'.format(os.path.abspath(exp_dir), step)
observed_prefix = '{}/{}/'.format(os.path.abspath(obs_dir), step)
f.write('Expected directory: {}\nvs\nObserved directory: {}\n'.format(expected_prefix, observed_prefix))
log.write('Expected directory: {}\nvs\nObserved directory: {}\n'.format(expected_prefix, observed_prefix))
max_cnt = len(files_list)
global true_cnt, false_cnt
true_cnt = 0
false_cnt = 0
nt_cnt = 0
print('\nLaunching {}...\n'.format(step))
......@@ -235,19 +228,39 @@ def check_files(exp_dir, obs_dir, step):
expected_path = '{}{}'.format(expected_prefix, file_path)
observed_path = '{}{}'.format(observed_prefix, file_path)
not_tested_log = 'ft_{}.not_tested'.format(step)
# Make log of non existing files
if not os.path.exists('{}'.format(expected_path)):
sys.exit(
'\n{} doesn\'t exist\n\nPlease check expected/ directory for any missing file\n'.format(expected_path)
)
if not os.path.exists(expected_path):
nt = open(not_tested_log, 'a')
nt.write('Exp:\t' + expected_path + '\n')
nt.close()
nt_cnt += 1
max_cnt -= 1
file_out = '''
------------------------------------------------------------------------------
elif not os.path.exists('{}'.format(observed_path)):
File: {}
Not tested (list in {})
'''.format(file_path, not_tested_log)
log.write(file_out)
print(file_out)
elif not os.path.exists(observed_path):
nt = open(not_tested_log, 'a')
nt.write('Obs:\t' + observed_path + '\n')
nt.close()
nt_cnt += 1
max_cnt -= 1
file_out = '''
------------------------------------------------------------------------------
sys.exit(
'\n{} doesn\'t exist\n\nPlease check observed/ directory for any missing file\n'.format(observed_path)
)
File: {}
Not tested (list in {})
'''.format(file_path, not_tested_log)
log.write(file_out)
print(file_out)
else:
......@@ -261,13 +274,12 @@ File: {}
Test method: {}
'''.format(file_path, file_usage)
f.write(file_out)
log.write(file_out)
print(file_out)
f.write(out)
log.write(out)
print(out)
else:
elif test == True:
file_out = '''
------------------------------------------------------------------------------
......@@ -275,18 +287,18 @@ File: {}
Test method: {}
'''.format(file_path, file_usage)
f.write(file_out)
log.write(file_out)
print(file_out)
f.write(out)
log.write(out)
print(out)
continue
true_perc = (float(true_cnt) / float(max_cnt) * 100)
true_perc = round((float(true_cnt) / float(max_cnt) * 100), 2)
if false_cnt != 0:
false_perc = 100 - (float(true_cnt) / float(max_cnt) * 100)
false_perc = round(100 - (float(true_cnt) / float(max_cnt) * 100), 2)
else:
false_perc = 0
false_perc = float(0)
out = '''
=========================================
......@@ -297,13 +309,14 @@ Testing the {a} step of metagWGS:
Total: {b}
Passed: {c} ({d}%)
Missed: {e} ({f}%)
Not tested: {g}
-----------------------------------------
=========================================
'''.format(a = step, b = max_cnt, c = true_cnt, d = true_perc, e = false_cnt, f = false_perc)
'''.format(a = step, b = max_cnt, c = true_cnt, d = true_perc, e = false_cnt, f = false_perc, g = nt_cnt)
f.write(out)
f.close()
log.write(out)
log.close()
return out
......@@ -311,42 +324,42 @@ def test_method(exp_path, obs_path, method):
global true_cnt, false_cnt
if method == 'diff' or method == 'cut_diff' or method == 'gz_diff':
if re.search('diff', method):
if method == 'diff':
YES = 0
command = 'diff {} {}'.format(exp_path, obs_path)
elif method == 'cut_diff':
YES = 1
command = 'diff <(tail -n+6 {}) <(tail -n+6 {})'.format(exp_path, obs_path)
elif method == 'gz_diff':
command = 'diff <(gunzip -c {}) <(gunzip -c {})'.format(exp_path, obs_path)
elif method == 'zdiff':
YES = 2
command = 'zdiff {} {}'.format(exp_path, obs_path)
process = subprocess.Popen(command, stdout = subprocess.PIPE, shell = True)
process = subprocess.Popen(command, stdout = subprocess.PIPE, shell = True, executable = '/bin/bash')
diff_out, error = process.communicate()
if not error:
if diff_out != '':
if diff_out.decode('ascii') != '':
test = False
out = 'Test result: {}\nDifferences:\n{}\n'.format(test, diff_out)
out = 'Test result: Failed\nDifferences:\n{}\n'.format(diff_out)
false_cnt += 1
elif diff_out == '':
elif diff_out.decode('ascii') == '':
test = True
out = 'Test result: Passed\n'
true_cnt += 1
else:
test = False
out = 'Test result: {}\nError: {}\nCommand: {}'.format(test, error, command)
out = 'Test result: Failed\nSubprocess error: {}\nCommand: {}'.format(error, command)
false_cnt += 1
return test, out
elif method == 'not_empty':
YES = 3
test = os.path.getsize(obs_path) > 0
if test:
......@@ -355,7 +368,7 @@ def test_method(exp_path, obs_path, method):
true_cnt += 1
else:
test = False
out = 'Test result: {}\n'.format('Failed')
out = 'Test result: Failed\n'
false_cnt += 1
return test, out
\ No newline at end of file
#!/usr/bin/env python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Pierre MARTIN
......@@ -8,7 +8,7 @@
# Usage
## cd [work_directory]
## python ./metagwgs/functional_tests/main.py -step [step] -exp_dir ./expected_logs -obs_dir ./[work_directory]/results [optional: --script ./[work_directory]/launch_nextflow.sh]
## python metagwgs/functional_tests/main.py -step [step] -exp_dir ./test_expected_logs -obs_dir ./[work_directory]/results [optional: --script ./[work_directory]/launch_nextflow.sh]
try:
import sys
......@@ -31,8 +31,7 @@ steps_list = OrderedDict([
("04_structural_annot", 4),
("05_alignment", 5),
("06_func_annot", 6),
("07_taxo_affi", 7),
("08_binning", 8)
("07_taxo_affi", 7)
])
# __main__
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment