Commit 773fb63b authored by Celine Noirot's avatar Celine Noirot
Browse files

Script for merging

parent 5a468bea
#
# Copyright (C) 2009 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
__author__ = 'Plateforme bioinformatique Midi Pyrenees'
__copyright__ = 'Copyright (C) 2009 INRA'
__license__ = 'GNU General Public License'
__version__ = '1.0'
__email__ = 'support.genopole@toulouse.inra.fr'
__status__ = 'beta'
from optparse import *
import sys,re,datetime
from os.path import join, splitext, basename, exists, split
def read_file (file,libname,dict_col, col_number=1):
    """
    Extract counts from a count file (output of sam2count) into dict_col.

    Lines starting with '#', lines with too few columns, and lines whose
    first field is '*' are skipped.

    @param file : the input count file (output of sam2count)
    @param libname : the library name, used as the per-contig column key
    @param dict_col : dict of dicts, filled in place as
                      dict_col[contig_name][libname] = count (kept as string)
    @param col_number : column to extract in each file [first column = 0]
    """
    col = int(col_number)  # optparse may hand the column index over as a string
    # 'with' guarantees the handle is closed (original `fin.close` lacked parens
    # and never closed the file)
    with open(file, "r") as fin:
        for line in fin:
            tab = line.rstrip().split("\t")  # split line into fields
            # NOTE: `<=` (not `<`) so a line with exactly `col` fields is
            # skipped instead of raising IndexError on tab[col] below
            if line.startswith("#") or len(tab) <= col or tab[0] == "*":
                continue
            # setdefault replaces the Python-2-only dict.has_key() checks and
            # the redundant intermediate `= {}` assignment
            dict_col.setdefault(tab[0], {})[libname] = tab[col]
def version_string ():
    """
    Build and return the merge_count version string.
    """
    return "merge_count.py %s" % __version__
if __name__ == '__main__':
    # Merge several sam2count outputs on their first column into one matrix:
    # one line per contig, one column per input library.
    parser = OptionParser(usage="Usage: %prog", description = "Merge files columns on first column, skip lines starting with #")
    igroup = OptionGroup(parser, "Input options","")
    igroup.add_option("-f", "--files", dest="files", help="tabulated files separated by ',' ")
    igroup.add_option("-n", "--names", dest="names", help="matrix columns names in the same order as files ',' ")
    # type="int": the original left this a string for CLI input, so the
    # `< 1` sanity check below compared a str against an int and never worked
    igroup.add_option("-c", "--column", dest="col_number", type="int", help="column of each files to extract, first column is 0, default 1", default=1, metavar="INT")
    parser.add_option_group(igroup)
    ogroup = OptionGroup(parser, "Output files options","")
    ogroup.add_option("-o", "--output", dest="output", help="the output count file")
    parser.add_option_group(ogroup)
    (options, args) = parser.parse_args()

    # --- argument validation ---
    if options.files is None:
        sys.stderr.write("Need files to merge\n")
        parser.print_help()
        sys.exit(1)
    if options.col_number < 1:
        sys.stderr.write("Column must be greater than 0 \n")
        parser.print_help()
        sys.exit(1)
    if options.output is None:
        sys.stderr.write("Output file is missing\n")
        parser.print_help()
        sys.exit(1)

    dict_col = {}
    files = options.files.split(",")
    libs_name = []
    get_lib_name = True
    if options.names:
        libs_name = options.names.split(",")
        get_lib_name = False
        # only meaningful when -n was given; the original ran this check
        # unconditionally, which aborted every run that relied on the
        # filename-derived library names below
        if len(libs_name) != len(files):
            sys.stderr.write("You must provide same number of files and names\n")
            parser.print_help()
            sys.exit(1)

    # Collect counts per library; library name defaults to the file basename.
    current_libname = ""
    for idx, raw_name in enumerate(files):
        fname = raw_name.rstrip()
        if not exists(fname):
            sys.stderr.write("File : " + fname + " doesn't exist")
            sys.exit(1)
        if get_lib_name:
            current_libname = splitext(basename(fname))[0]
            libs_name.append(current_libname)
        else:
            current_libname = libs_name[idx]
        read_file(fname, current_libname, dict_col, options.col_number)

    # Write the merged matrix: header line, then one sorted contig per line.
    fout = open(options.output, "w")
    try:
        fout.write('#contig_name\t' + '\t'.join(libs_name) + "\n")
        for contig_name in sorted(dict_col.keys()):
            line = [contig_name]
            for lib in libs_name:
                # a contig absent from one library used to raise KeyError;
                # emit an empty cell instead
                line.append(str(dict_col[contig_name].get(lib, "")))
            fout.write('\t'.join(line) + "\n")
    finally:
        fout.close()  # original `fout.close` was missing the call parens
    sys.exit(0)
\ No newline at end of file
#!/usr/local/bioinfo/bin/python2.5
# -*- coding: utf-8 -*-
#
# Counting
# Copyright (C) 2009 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
__author__ = 'Plateforme bioinformatique Midi Pyrenees'
__copyright__ = 'Copyright (C) 2009 INRA'
__license__ = 'GNU General Public License'
__version__ = '1.0'
__email__ = 'support.genopole@toulouse.inra.fr'
__status__ = 'beta'
if __name__ == "__main__":
    import sys, time

    # Merge several tabulated files on their first column.  Each input file
    # contributes its non-key columns; keys missing from a file are padded
    # with blank cells.  Keys are kept in order of first appearance and the
    # result is written to OUTPUT.
    COLUMN = 0                     # key column (first column)
    SEPARATOR = "\t"
    OUTPUT = "files_merged.csv"
    VERBOSE = True
    NUM_ELEMENTS = 0               # expected width, learned from the first data line

    options = sys.argv[1:]
    if not options:
        # the original called parser.print_help() on a parser whose creation
        # was commented out, raising NameError instead of exiting cleanly
        sys.stderr.write("Input required\n")
        sys.stderr.write("Usage: %s file1 [file2 ...]\n" % sys.argv[0])
        sys.exit(1)
    if VERBOSE:
        init_time = time.time()
        sys.stdout.write("Initializing...\n")
        sys.stdout.flush()
    files = options

    # First pass: collect the union of keys across all files, in first-seen
    # order (no single file is assumed to hold the exhaustive key list).
    header_order = []
    datas = {}  # key -> accumulated non-key fields from every file
    for fname in files:
        seen_in_file = set()
        for line in open(fname, 'r'):
            header = line.strip().split(SEPARATOR)[COLUMN]
            # duplicate key inside one file makes the merge ambiguous; the
            # original never populated its `visited` dict, so this check
            # could never fire
            if header in seen_in_file:
                sys.stderr.write("File "+fname+" is wrong: this entry is present at least 2 times: "+header+"\n")
                sys.exit(1)
            seen_in_file.add(header)
            if header not in datas:
                header_order.append(header)
                datas[header] = []
            # else: key already known thanks to another file
    if VERBOSE:
        parse_time = time.time()
        sys.stdout.write("Done in"+str(parse_time-init_time)+"\nNow parsing...\n")
        sys.stdout.flush()

    # Second pass: append each file's non-key columns, padding absent keys.
    for fname in files:
        if VERBOSE:
            sys.stdout.write("\t file "+fname+"\n")
            sys.stdout.flush()
        visited = set()  # set, not list: O(1) membership in the padding loop
        for line in open(fname, 'r'):
            elements = line.strip().split(SEPARATOR)
            # crude integrity check: every data line must share one width
            if NUM_ELEMENTS == 0:
                NUM_ELEMENTS = len(elements)
            elif NUM_ELEMENTS != len(elements) and len(elements) != 0:
                # incoherence and not a blank line after header
                sys.stderr.write("Le fichier "+fname+" est en erreur")
            header = elements[COLUMN]
            datas[header] += elements[:COLUMN] + elements[COLUMN+1:]
            visited.add(header)
        # add blanks for keys with no related information in this file
        for header in header_order:
            if header not in visited:
                datas[header] += ['' for _ in range(NUM_ELEMENTS-1)]

    # 'with' closes the handle (the original left it open on exit)
    with open(OUTPUT, 'w') as outhandle:
        for header in header_order:
            outhandle.write(header+"\t"+"\t".join(datas[header])+"\n")
    if VERBOSE:
        end_time = time.time()
        sys.stdout.write("Done in"+str(end_time-parse_time)+"\nSee output file: "+OUTPUT+"\n")
        sys.stdout.flush()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment