Commit 5edf721e authored by Jean Mainguy's avatar Jean Mainguy
Browse files

new plot for kaiju match length distribution

parent f9d6f56c
Pipeline #60789 skipped with stage
#!/usr/bin/env python
"""----------------------------------------------------------------------------------------------------------------------------------------------------------
Script Name: Generate_barplot_kaiju.py
Description: Generates histogram of the distribution of reads according
to the size of the matches
Input files:
Verbose file generated by Kaiju.
Created By: Joanna Fourquet
Date: 2019-09-09
----------------------------------------------------------------------------------------------------------------------------------------------------------
"""
# Metadata
__author__ = 'Joanna Fourquet - Plateforme bioinformatique Toulouse'
__copyright__ = 'Copyright (C) 2019 INRA'
__license__ = 'GNU General Public License'
__version__ = '0.1'
__email__ = 'support.bioinfo.genotoul@inra.fr'
__status__ = 'dev'
# Status: dev
# Modules importation
try:
import argparse
import re
import sys
import os
from collections import Counter
from collections import OrderedDict
from matplotlib import pyplot
except ImportError as error:
print(error)
exit(1)
# Manage parameters
# Command-line interface: one required option, -i/--input, pointing at the
# verbose output file produced by `kaiju ... -v`.
parser = argparse.ArgumentParser(description = \
'Generates histogram of the distribution of reads according \
to the size of the matches.')
parser.add_argument('-i', '--input', required = True, help = \
'Verbose file generated by Kaiju.')
# NOTE: parsed at import time; `args` is then read as a module-level global
# by main() and passed to make_figure() below.
args = parser.parse_args()
def make_figure(ord_dictionary, name_sample, type_figure):
    """Draw a bar chart of reads per match length and save it as a PDF.

    ord_dictionary -- mapping match length -> number of reads, already
                      ordered by increasing match length
    name_sample    -- sample identifier, used as plot title and in the
                      output file name
    type_figure    -- y-axis label suffix and file-name tag ("counts" or
                      "normalized")
    """
    bar_labels = list(ord_dictionary.keys())
    bar_heights = ord_dictionary.values()
    pyplot.bar(bar_labels, bar_heights)
    # Put a tick on every 5th bar; labels run from 15 to 50.
    current_axes = pyplot.gca()
    current_axes.xaxis.set_ticks(range(4, 44, 5))
    current_axes.xaxis.set_ticklabels(range(15, 51, 5))
    pyplot.xlabel("Match length")
    pyplot.ylabel("Number of reads (" + type_figure + ")")
    pyplot.title(name_sample)
    pyplot.savefig(name_sample + "_Kaiju_MEM_" + type_figure + ".pdf")
    pyplot.close()
def main(argv):
    """Read the Kaiju verbose file named in `args.input` and plot the
    match-length distribution of classified reads.

    Produces two bar charts via make_figure(): raw read counts and
    counts normalized by the total number of classified reads.
    """
    d_nb_reads_by_match_length = Counter()
    with open(args.input) as kaiju_file:
        for entire_line_kaiju_file in kaiju_file:
            line_kaiju_file = entire_line_kaiju_file.split()
            # Skip blank/short lines; only classified reads ("C") count.
            if line_kaiju_file and line_kaiju_file[0] == "C":
                # Field 3 of a verbose Kaiju line is the match length.
                d_nb_reads_by_match_length[line_kaiju_file[3]] += 1
    # Ensure every match length from 11 to 50 has an entry so the bars of
    # different samples line up even when some lengths are absent.
    for i in range(11, 51):
        if str(i) not in d_nb_reads_by_match_length:
            d_nb_reads_by_match_length[str(i)] = 0
    # Sort numerically: a plain string sort would misplace lengths >= 100
    # (possible with long HiFi reads).
    d_nb_reads_by_match_length_ord = OrderedDict(
        sorted(d_nb_reads_by_match_length.items(), key=lambda t: int(t[0])))
    print(d_nb_reads_by_match_length_ord)
    # Figure with raw counts.
    make_figure(d_nb_reads_by_match_length_ord, args.input, "counts")
    # Normalize by the total number of classified reads; guard against an
    # input file with no classified read at all (sum == 0).
    sum_reads = sum(d_nb_reads_by_match_length_ord.values())
    d_norm_reads_by_match_length_ord = OrderedDict()
    if sum_reads > 0:
        for match_length, nb_reads in d_nb_reads_by_match_length_ord.items():
            d_norm_reads_by_match_length_ord[match_length] = nb_reads / sum_reads
    print(d_norm_reads_by_match_length_ord)
    # Figure with normalized counts.
    make_figure(d_norm_reads_by_match_length_ord, args.input, "normalized")


if __name__ == "__main__":
    main(sys.argv[1:])
#!/usr/bin/env python
"""----------------------------------------------------------------------------------------------------------------------------------------------------------
Script Name: plot_kaiju_stat.py
Description: Generates density plot distribution of kaiju match length
Input files:
Verbose files generated by Kaiju for each sample.
Created By: Jean Mainguy
Date: 2022-06-10
----------------------------------------------------------------------------------------------------------------------------------------------------------
"""
# Metadata
__author__ = 'Jean Mainguy - Plateforme bioinformatique Toulouse'
__copyright__ = 'Copyright (C) 2022 INRAE'
__license__ = 'GNU General Public License'
__version__ = '0.1'
__email__ = 'support.bioinfo.genotoul@inra.fr'
__status__ = 'dev'
# Status: dev
# Modules importation
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter, FileType
import gzip
import os
import pandas as pd
from collections import defaultdict
import plotly.express as px
from scipy.stats import gaussian_kde
import numpy as np
import logging
import sys
from plotly.express.colors import qualitative
def parse_arguments():
    """Parse script arguments.

    Returns the argparse Namespace with: kaiju_results (list of input
    files), output (html path), smoothing_val (KDE bandwidth) and
    verbose (bool).
    """
    parser = ArgumentParser(description="Generates density plot distribution of kaiju match length",
                            formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('kaiju_results', nargs='+', help='Verbose files generated by Kaiju')
    # Fixed a user-facing typo ("kiaju") in the description above and added
    # help text for -o so it shows up in --help with its default.
    parser.add_argument('-o', '--output', default='match_length_kaiju_distribution.html',
                        help='Output html file')
    parser.add_argument("--smoothing_val", type=float, default=.3,
                        help="smoothing parameter for the density plot")
    parser.add_argument("-v", "--verbose", help="increase output verbosity",
                        action="store_true")
    args = parser.parse_args()
    return args
def parse_matchlen_from_kaiju_output(kaiju_output):
    """Tally Kaiju match lengths from one result file.

    kaiju_output -- path to a Kaiju verbose result file, optionally
                    gzip-compressed (detected from the .gz suffix).
    Returns a DataFrame with columns 'match_length', 'reads' and
    'reads %' (percentage of the file's total reads).
    """
    open_fct = gzip.open if kaiju_output.endswith('.gz') else open
    length_counts = defaultdict(int)
    with open_fct(kaiju_output, "rt") as handle:
        for line in handle:
            # The 4th tab-separated field holds the match length.
            length_counts[int(line.split('\t')[3])] += 1
    table = pd.DataFrame(length_counts.items(), columns=['match_length', "reads"])
    total_reads = table['reads'].sum()
    table['reads %'] = 100 * table['reads'] / total_reads
    return table
def main():
    """Parse all Kaiju verbose outputs and write an HTML density plot of
    match lengths, one colored curve per sample."""
    args = parse_arguments()
    if args.verbose:
        logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.DEBUG)
        logging.info('Mode verbose ON')
    else:
        logging.basicConfig(format="%(levelname)s: %(message)s")
    smoothing_parameter = args.smoothing_val
    kaiju_results = args.kaiju_results
    output = args.output
    # Concatenate several qualitative palettes so that many samples still
    # get visually distinct colors.
    colors = qualitative.Bold + qualitative.G10 + qualitative.Pastel + qualitative.Antique + qualitative.Safe + qualitative.Vivid + qualitative.Dark24
    density_df_samples = []
    max_match_length = 0
    sample2dfmatchlength = {}
    logging.info(f'Parsing kaiju results')
    for kaiju_result in kaiju_results:
        # Sample name = file name without the pipeline's fixed suffix.
        sample_name = kaiju_result.replace('_kaiju.out.gz', "")
        df_matchlen = parse_matchlen_from_kaiju_output(kaiju_result)
        max_match_length = max((max_match_length, df_matchlen['match_length'].max()))
        sample2dfmatchlength[sample_name] = df_matchlen
    # Evaluation grid for the densities, shared by all samples so the
    # curves are plotted over the same x range.
    x_vals = np.linspace(0,max_match_length,max_match_length) # Specifying the limits of our data
    logging.info(f'Maximum match length is {max_match_length}')
    logging.info(f'Computing density for each sample')
    for sample_name, df_matchlen in sample2dfmatchlength.items():
        # KDE over match lengths, weighted by the read count per length.
        density = gaussian_kde(df_matchlen['match_length'], weights = df_matchlen['reads'])
        # Force a fixed bandwidth instead of scipy's automatic rule; the
        # private _compute_covariance() call refreshes the cached
        # covariance after the override (scipy-documented idiom).
        density.covariance_factor = lambda : smoothing_parameter #Smoothing parameter
        density._compute_covariance()
        density_info = {"Match length":x_vals, "Density":density(x_vals), 'sample':sample_name}
        df_density_sample = pd.DataFrame(data=density_info)
        density_df_samples.append(df_density_sample)
    # One long-format frame: plotly draws one line per 'sample' value.
    df_density = pd.concat(density_df_samples)
    logging.info(f'Plot creation')
    fig = px.line(df_density, x='Match length', y='Density', color='sample',
                  title="Kaiju match length distribution", color_discrete_sequence=colors)
    fig.update_traces(line=dict(width=2), opacity=.7)
    logging.info(f'Writing output html in {output}')
    fig.write_html(output)


if __name__ == '__main__':
    main()
......@@ -4,17 +4,15 @@ process KAIJU {
tag "${sampleId}"
label "KAIJU"
publishDir "${params.outdir}/01_clean_qc/01_3_taxonomic_affiliation_reads", mode: 'copy', pattern: '*.krona.html'
publishDir "${params.outdir}/01_clean_qc/01_3_taxonomic_affiliation_reads", mode: 'copy', pattern: '*_kaiju_MEM_verbose_only_classified.out.gz'
publishDir "${params.outdir}/01_clean_qc/01_3_taxonomic_affiliation_reads", mode: 'copy', pattern: '*.pdf'
input:
tuple val(sampleId), path(read1), path(read2)
tuple path(nodes), path(fmi), path(names)
output:
path "${sampleId}_kaiju_MEM_verbose_only_classified.out.gz"
path "${sampleId}.krona.html"
path "${sampleId}_kaiju.out.gz", emit: kaiju_result
path "${sampleId}.krona", emit: krona_tab_file
path "*.summary_*", emit: k_all
path "*.summary_species", emit: k_species
path "*.summary_genus", emit: k_genus
......@@ -22,21 +20,20 @@ process KAIJU {
path "*.summary_class", emit: k_class
path "*.summary_order", emit: k_order
path "*.summary_phylum", emit: k_phylum
path "*.pdf"
script:
"""
kaiju -z ${task.cpus} -t ${nodes} -f ${fmi} -i ${read1} -j ${read2} -o ${sampleId}_kaiju_MEM_verbose.out -a mem -v
kaiju2krona -t ${nodes} -n ${names} -i ${sampleId}_kaiju_MEM_verbose.out -o ${sampleId}_kaiju_MEM_without_unassigned.out.krona -u
ktImportText -o ${sampleId}.krona.html ${sampleId}_kaiju_MEM_without_unassigned.out.krona
kaiju2krona -t ${nodes} -n ${names} -i ${sampleId}_kaiju_MEM_verbose.out -o ${sampleId}.krona -u
for i in ${taxon_levels} ;
do
kaiju2table -t ${nodes} -n ${names} -r \$i -o ${sampleId}_kaiju_MEM.out.summary_\$i ${sampleId}_kaiju_MEM_verbose.out
kaiju2table -t ${nodes} -n ${names} -r \$i -o ${sampleId}_kaiju_MEM.out.summary_\$i ${sampleId}_kaiju_MEM_verbose.out
done
Generate_barplot_kaiju.py -i ${sampleId}_kaiju_MEM_verbose.out
grep -v U ${sampleId}_kaiju_MEM_verbose.out > ${sampleId}_kaiju_MEM_verbose_only_classified.out
gzip ${sampleId}_kaiju_MEM_verbose_only_classified.out
grep -v U ${sampleId}_kaiju_MEM_verbose.out | gzip > ${sampleId}_kaiju.out.gz
rm ${sampleId}_kaiju_MEM_verbose.out
"""
}
......@@ -54,7 +51,7 @@ process KAIJU_HIFI {
tuple path(nodes), path(fmi), path(names)
output:
path "${sampleId}_kaiju_MEM_verbose_only_classified.out.gz"
path "${sampleId}_kaiju.out.gz", emit: kaiju_result
path "${sampleId}.krona", emit: krona_tab_file
path "*.summary_*", emit: k_all
path "*.summary_species", emit: k_species
......@@ -63,30 +60,25 @@ process KAIJU_HIFI {
path "*.summary_class", emit: k_class
path "*.summary_order", emit: k_order
path "*.summary_phylum", emit: k_phylum
path "*.pdf"
script:
"""
kaiju -z ${task.cpus} -t ${nodes} -f ${fmi} -i ${reads} -o ${sampleId}_kaiju_MEM_verbose.out -a mem -v
kaiju2krona -t ${nodes} -n ${names} -i ${sampleId}_kaiju_MEM_verbose.out -o ${sampleId}.krona -u
# ktImportText -o ${sampleId}.krona.html ${sampleId}_kaiju_MEM_without_unassigned.out.krona
for i in ${taxon_levels} ;
do
kaiju2table -t ${nodes} -n ${names} -r \$i -o ${sampleId}_kaiju_MEM.out.summary_\$i ${sampleId}_kaiju_MEM_verbose.out
done
Generate_barplot_kaiju.py -i ${sampleId}_kaiju_MEM_verbose.out
grep -v U ${sampleId}_kaiju_MEM_verbose.out | gzip > ${sampleId}_kaiju_MEM_verbose_only_classified.out.gz
# rm ${sampleId}_kaiju_MEM_verbose.out
grep -v U ${sampleId}_kaiju_MEM_verbose.out | gzip > ${sampleId}_kaiju.out.gz
rm ${sampleId}_kaiju_MEM_verbose.out
"""
}
process KAIJU_TO_KRONA {
tag "${sampleId}"
publishDir "${params.outdir}/01_clean_qc/01_3_taxonomic_affiliation_reads", mode: 'copy', pattern: '*.krona.html'
......@@ -102,8 +94,21 @@ process KAIJU_TO_KRONA {
"""
}
process PLOT_KAIJU_STAT {
publishDir "${params.outdir}/01_clean_qc/01_3_taxonomic_affiliation_reads", mode: 'copy'
input:
path kaiju_results
output:
path "match_length_kaiju_distribution.html"
script:
"""
plot_kaiju_stat.py $kaiju_results -o 'match_length_kaiju_distribution.html' -v
"""
}
process MERGE_KAIJU {
publishDir "${params.outdir}/01_clean_qc/01_3_taxonomic_affiliation_reads", mode: 'copy'
......@@ -145,6 +150,14 @@ workflow KAIJU_AND_MERGE {
database
)
PLOT_KAIJU_STAT (
KAIJU.out.kaiju_result.collect()
)
KAIJU_TO_KRONA (
KAIJU.out.krona_tab_file.collect()
)
MERGE_KAIJU (
KAIJU.out.k_species.collect(),
KAIJU.out.k_genus.collect(),
......@@ -169,6 +182,10 @@ workflow KAIJU_AND_MERGE_FOR_HIFI {
database
)
PLOT_KAIJU_STAT (
KAIJU_HIFI.out.kaiju_result.collect()
)
KAIJU_TO_KRONA (
KAIJU_HIFI.out.krona_tab_file.collect()
)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment