#!/usr/bin/env python """-------------------------------------------------------------------- Script Name: merge_abundance_and_functional_annotations.py Description: join quantification table by gene and tables by samples with functional annotations. Input files: 1st input file: Table containing counts for each global gene id in each sample. 2nd input file: List of files storing functional annotation for each gene per sample. 3rd input file: List of files storing diamond results with best bitscore for each gene per sample. Created By: Joanna Fourquet Date: 2021-01-12 ----------------------------------------------------------------------- """ # Metadata. __author__ = 'Joanna Fourquet \ - GenPhySE - NED' __copyright__ = 'Copyright (C) 2021 INRAE' __license__ = 'GNU General Public License' __version__ = '0.1' __email__ = 'support.bioinfo.genotoul@inra.fr' __status__ = 'dev' # Status: dev. # Modules importation. try: import argparse import re import sys import pandas as pd from datetime import datetime except ImportError as error: print(error) exit(1) # Print time. print(str(datetime.now())) # Manage parameters. parser = argparse.ArgumentParser(description = 'Script which join \ quantification table by gene and tables by samples \ with functional annotations') parser.add_argument('-t', '--table_of_abundances', required = True, \ help = 'Table containing counts \ for each global gene id in each sample.') parser.add_argument('-f', '--list_of_file_annotations', required = True, \ help = 'List of files storing functional annotation for each gene per sample.') parser.add_argument('-d', '--list_of_file_diamond', required = True, \ help = 'List of files storing diamond results with best bitscore \ for each gene per sample.') parser.add_argument('-o', '--output_file', required = True, \ help = 'Name of output file containing counts \ for each global gene id and its functional annotation.') parser.add_argument('-v', '--version', action = 'version', \ version = __version__) args = parser.parse_args() counts_file = pd.read_csv(args.table_of_abundances, header=0, sep='\t') # Recovery of the list of annotations files. with open(args.list_of_file_annotations) as fannot_list: files_of_annotations = fannot_list.read().split() # Recovery of the list of diamond best hits files. with open(args.list_of_file_diamond) as fdiamond_list: diamond_files = fdiamond_list.read().split() # Creates a new empty dataframe for annotations. concat_eggnog_mapper_files = pd.DataFrame() # Concatenate annotation files. for (annotations_idx,annotations_path) in enumerate(files_of_annotations): eggnog_mapper_file = pd.read_csv(annotations_path, delimiter='\t', decimal='.',skiprows=4) concat_eggnog_mapper_files = pd.concat([concat_eggnog_mapper_files, eggnog_mapper_file]) # Creates a new empty dataframe for diamond results. concat_diamond_files = pd.DataFrame() # Concatenate diamond files. for (diamond_idx,diamond_path) in enumerate(diamond_files): diamond_columns = ["qseqid","sseqid","pident","length","mismatch","gapopen","qstart","qend","sstart","send","evalue","bitscore","qlen","slen","stitle"] diamond_file = pd.read_csv(diamond_path, delimiter='\t', decimal='.', header=None, names=diamond_columns) diamond_file.loc[:,"sseqid"] = 'https://www.ncbi.nlm.nih.gov/protein/' + diamond_file.loc[:,"sseqid"] group_diamond_file = diamond_file.groupby("qseqid")\ .agg({"stitle" : ';'.join, "sseqid" : ','.join})\ .reset_index()\ .reindex(columns=diamond_file.columns) res_diamond_file = group_diamond_file.loc[:,["qseqid","sseqid","stitle"]] concat_diamond_files = pd.concat([concat_diamond_files, res_diamond_file]) # Merge counts, annotation and diamond results. merge_annot = pd.merge(counts_file,concat_eggnog_mapper_files,left_on="seed_cluster",right_on='#query_name', how='left') merge = pd.merge(merge_annot,concat_diamond_files,left_on="seed_cluster",right_on="qseqid", how='left') merge.drop('#query_name', inplace=True, axis=1) merge.drop("qseqid", inplace=True, axis=1) # Write merge data frame in output file. merge.to_csv(args.output_file, sep="\t", index=False)