#!/usr/bin/env python

"""--------------------------------------------------------------------
  Script Name: quantification_by_contig_lineage.py
  Description: make table where each line is a lineage and for each
               sample there are two columns: nb contigs and nb reads.
  Input files: List of merged files (idxstats+.percontig.csv).
  Created By:  Joanna Fourquet
  Date:        2021-01-19
-----------------------------------------------------------------------
"""

# Script metadata (the version string is reported by the --version option).
__author__ = 'Joanna Fourquet - GenPhySE - NED'
__email__ = 'support.bioinfo.genotoul@inra.fr'
__copyright__ = 'Copyright (C) 2021 INRAE'
__license__ = 'GNU General Public License'
__version__ = '0.1'
__status__ = 'dev'

# Status: dev.

# Modules importation.
try:
    import argparse
    import re
    import sys
    import pandas as pd
    from datetime import datetime
except ImportError as error:
    print(error)
    exit(1)

# Log the current date and time on standard output.
print(datetime.now())

# Command-line interface: an input list file, an output file and --version.
parser = argparse.ArgumentParser(
    description=(
        'Script which make '
        'table where each line is a lineage and for each '
        'sample there are two columns: nb contigs and nb reads.'
    )
)

parser.add_argument(
    '-i', '--list_of_input_files',
    required=True,
    help='List of input files (one for each sample).'
)

parser.add_argument(
    '-o', '--output_file',
    required=True,
    help=(
        'Name of output file containing counts of contigs and reads '
        'in each sample for each lineage.'
    )
)

parser.add_argument(
    '-v', '--version',
    action='version',
    version=__version__
)

# Parsed options: args.list_of_input_files and args.output_file.
args = parser.parse_args()

# Load the list file; its whitespace-separated tokens are the
# paths of the per-sample input files.
with open(args.list_of_input_files) as list_handle:
    list_content = list_handle.read()
sample_files = list_content.split()

# Merge results for all samples by lineage.

# Without any input file there is nothing to merge; stop with an explicit
# message instead of failing later on undefined variables.
if not sample_files:
    sys.exit('No input file found in ' + args.list_of_input_files)

# Columns used as join keys for every merge.
key_columns = ["tax_id_by_level", "lineage_by_level"]

merge = None
sample_name = None
for (sample_idx, sample_path) in enumerate(sample_files):
    print(sample_idx)
    sample_results = pd.read_csv(sample_path, delimiter='\t', dtype=str)
    if merge is None:
        # First sample: it is the starting point of the merged table.
        merge = sample_results
    else:
        # Outer join keeps the key pairs present in either table.
        # Overlapping non-key columns already in 'merge' belong to the
        # previous sample and receive its name as suffix; those of the
        # current sample stay unsuffixed until the next merge (or the
        # final renaming below).
        merge = pd.merge(merge, sample_results, on=key_columns,
                         how='outer', suffixes=('_' + sample_name, ''))
    # The path of the input file is used as sample name.
    sample_name = sample_path
    if 'consensus_tax_id' in merge.columns:
        merge.drop('consensus_tax_id', inplace=True, axis=1)

# Rename columns corresponding to the last sample file
# (they never went through a merge that would have suffixed them).
merge.rename(columns={'name_contigs': 'name_contigs_' + sample_name,
                      'nb_contigs': 'nb_contigs_' + sample_name,
                      'nb_reads': 'nb_reads_' + sample_name},
             inplace=True)

# Fill NaN values with 0.
# (Missing values are replaced in place, in every column of the table.)
merge.fillna(0, inplace=True)

# Write merge data frame in output file (tab-separated, without index).
merge.to_csv(args.output_file, sep="\t", index=False)