merge_abundance_and_functional_annotations.py 4.22 KB
Newer Older
1
2
3
4
5
6
7
#!/usr/bin/env python

"""--------------------------------------------------------------------
  Script Name: merge_abundance_and_functional_annotations.py
  Description: join quantification table by gene and tables by samples
               with functional annotations.
  Input files:
8
9
10
11
12
13
               1st input file: Table containing counts
               for each global gene id in each sample.
               2nd input file: List of files storing functional
               annotation for each gene per sample.
               3rd input file: List of files storing diamond results
               with best bitscore for each gene per sample.
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
  Created By:  Joanna Fourquet
  Date:        2021-01-12
-----------------------------------------------------------------------
"""

# Metadata.
__author__ = 'Joanna Fourquet \
- GenPhySE - NED'
__copyright__ = 'Copyright (C) 2021 INRAE'
__license__ = 'GNU General Public License'
__version__ = '0.1'
__email__ = 'support.bioinfo.genotoul@inra.fr'
__status__ = 'dev'

# Status: dev.

# Modules importation.
try:
    import argparse
    import re
    import sys
    import pandas as pd
    from datetime import datetime
except ImportError as error:
    print(error)
    exit(1)

# Print time.
print(str(datetime.now()))

# Manage parameters.
parser = argparse.ArgumentParser(description = 'Script which join \
quantification table by gene and tables by samples \
with functional annotations')

parser.add_argument('-t', '--table_of_abundances', required = True, \
help = 'Table containing counts \
for each global gene id in each sample.')

parser.add_argument('-f', '--list_of_file_annotations', required = True, \
help = 'List of files storing functional annotation for each gene per sample.')

56
57
58
59
parser.add_argument('-d', '--list_of_file_diamond', required = True, \
help = 'List of files storing diamond results with best bitscore \
for each gene per sample.')

60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
parser.add_argument('-o', '--output_file', required = True, \
help = 'Name of output file containing counts \
for each global gene id and its functional annotation.')

parser.add_argument('-v', '--version', action = 'version', \
version = __version__)

args = parser.parse_args()

counts_file = pd.read_csv(args.table_of_abundances, header=0, sep='\t')

# Recovery of the list of annotations files.
with open(args.list_of_file_annotations) as fannot_list:
    files_of_annotations = fannot_list.read().split()

75
76
77
78
79
# Recovery of the list of diamond best hits files.
with open(args.list_of_file_diamond) as fdiamond_list:
    diamond_files = fdiamond_list.read().split()

# Creates a new empty dataframe for annotations.
80
81
82
83
84
85
86
concat_eggnog_mapper_files = pd.DataFrame()

# Concatenate annotation files.
for (annotations_idx,annotations_path) in enumerate(files_of_annotations):
    eggnog_mapper_file = pd.read_csv(annotations_path, delimiter='\t', decimal='.',skiprows=4)
    concat_eggnog_mapper_files = pd.concat([concat_eggnog_mapper_files, eggnog_mapper_file])

87
88
89
90
91
# Creates a new empty dataframe for diamond results.
concat_diamond_files = pd.DataFrame()

# Concatenate diamond files.
for (diamond_idx,diamond_path) in enumerate(diamond_files):
92
93
94
95
96
97
98
99
    diamond_columns = ["qseqid","sseqid","pident","length","mismatch","gapopen","qstart","qend","sstart","send","evalue","bitscore","qlen","slen","stitle"]
    diamond_file = pd.read_csv(diamond_path, delimiter='\t', decimal='.', header=None, names=diamond_columns)
    diamond_file.loc[:,"sseqid"] = 'https://www.ncbi.nlm.nih.gov/protein/' + diamond_file.loc[:,"sseqid"]
    group_diamond_file = diamond_file.groupby("qseqid")\
        .agg({"stitle" : ';'.join, "sseqid" : ','.join})\
        .reset_index()\
        .reindex(columns=diamond_file.columns)
    res_diamond_file = group_diamond_file.loc[:,["qseqid","sseqid","stitle"]]
100
101
102
    concat_diamond_files = pd.concat([concat_diamond_files, res_diamond_file])

# Merge counts, annotation and diamond results.
103
merge_annot = pd.merge(counts_file,concat_eggnog_mapper_files,left_on="seed_cluster",right_on='#query_name', how='left')
104
merge = pd.merge(merge_annot,concat_diamond_files,left_on="seed_cluster",right_on="qseqid", how='left')
105
merge.drop('#query_name', inplace=True, axis=1)
106
merge.drop("qseqid", inplace=True, axis=1)
107

108
# Write merge data frame in output file.
MARTIN Pierre's avatar
MARTIN Pierre committed
109
merge.to_csv(args.output_file, sep="\t", index=False)