Commit 3bde3a69 authored by Jerome Mariette's avatar Jerome Mariette
Browse files

Add some functionality

parent f924579c
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
from jflow.xopen import xopen
from jflow.seqio import FormatError
from jflow.seqio import UnknownFileType
def boolify(s):
    """Map the exact strings 'True'/'False' to booleans; raise KeyError for anything else."""
    return {'True': True, 'False': False}[s]

def autocast(s):
    """
    Best-effort conversion of s to bool, int or float.

    @param s : value to convert (typically a string field from a tab file)
    @return  : the converted value, or s unchanged when no conversion applies
    """
    for fn in (boolify, int, float):
        try:
            return fn(s)
        # boolify raises KeyError, int()/float() raise ValueError/TypeError;
        # the original bare 'except:' also swallowed exceptions such as
        # KeyboardInterrupt and SystemExit
        except (KeyError, ValueError, TypeError):
            pass
    return s
class Entry(object):
    """
    Generic record with dict-backed attributes.

    Every keyword argument passed to the constructor is stored in self.attrib
    and exposed both as an attribute (entry.chrom) and as an item (entry['chrom']).
    """
    def __init__(self, **kwargs):
        self.attrib = kwargs

    def addattr(self, k, v):
        """Add (or overwrite) attribute k with value v."""
        self.attrib[k] = v

    def __getattr__(self, key):
        # __getattr__ must raise AttributeError (not KeyError) for missing
        # attributes, otherwise hasattr() and getattr(obj, key, default)
        # do not work; the original leaked the KeyError
        try:
            return self.attrib[key]
        except KeyError:
            raise AttributeError(key)

    def __str__(self):
        return str(self.attrib)

    def __getitem__(self, key):
        # item access keeps the dict contract: missing key -> KeyError
        return self.attrib[key]
class _AbstractFeatureReader(object):
'''
Abstract file reader
'''
def __init__(self, file, wholefile=False):
"""
@param file : filename or a file-like object.
@param wholefile: If True, then it is ok to read the entire file into memory. This is faster when there are
many newlines in the file, but may obviously need a lot of memory.
"""
if isinstance(file, basestring):
file = xopen(file, "r")
self.fp = file
self.wholefile = wholefile
def __iter__(self):
return self._wholefile_iter() if self.wholefile else self._streaming_iter()
def _streaming_iter(self):
raise NotImplementedError('not implemented')
def _wholefile_iter(self):
raise NotImplementedError('not implemented')
def __enter__(self):
if self.fp is None:
raise ValueError("I/O operation on closed {0}".format(self.__class__.__name__) )
return self
def __exit__(self, *args):
self.fp.close()
class GFF3Reader(_AbstractFeatureReader):
    '''
    Reader for GFF3 files.

    Yields one Entry per feature line with fields: seqid, source, type,
    start (int), end (int), score, strand, phase and attributes (dict).
    '''
    def _process_line(self, line):
        """
        Parse one tab-separated GFF3 feature line into an Entry.
        @raise FormatError: if the line does not have exactly 9 columns.
        """
        row = line.rstrip().split('\t')
        if len(row) != 9:
            raise FormatError('Invalid number of columns in your GFF3 file {0}'.format(len(row)))
        attributes = {}
        if row[8] and row[8] != '.':
            # split each "key=value" on the first '=' only, so values that
            # themselves contain '=' are kept intact
            attributes = {p.split('=', 1)[0]: p.split('=', 1)[1] for p in row[8].split(';')}
        return Entry(**{'seqid': row[0], 'source': row[1], 'type': row[2],
                        'start': int(row[3]), 'end': int(row[4]), 'score': row[5],
                        'strand': row[6], 'phase': row[7], 'attributes': attributes})

    def _streaming_iter(self):
        """
        Read next entry from the file (single entry at a time).
        """
        for line in self.fp:
            # skip comment/pragma lines and blank lines
            if line.startswith('#') or not line.strip():
                continue
            yield self._process_line(line)

    def _wholefile_iter(self):
        """
        This reads in the entire file at once, but is faster than the above code when there are lots of newlines.
        The idea comes from the TAMO package (http://fraenkel.mit.edu/TAMO/), module TAMO.seq.Fasta (author is
        David Benjamin Gordon).
        """
        wholefile = self.fp.read()
        assert '\r' not in wholefile, "Sorry, currently don't know how to deal with files that contain \\r linebreaks"
        # the original asserted len(wholefile) == 0, which rejected every
        # NON-empty file; the file must be non-empty
        assert len(wholefile) != 0, "Empty gff3 file"
        for line in wholefile.split('\n'):
            # skip comments and blank lines (split('\n') always yields a
            # trailing empty string for a newline-terminated file)
            if line.startswith('#') or not line.strip():
                continue
            yield self._process_line(line)
class BEDReader(_AbstractFeatureReader):
    '''
    Reader for BED files.

    Yields one Entry per line with fields chrom, chromStart and chromEnd
    (extra BED columns are accepted but ignored).
    '''
    def _process_line(self, line):
        """
        Parse one tab-separated BED line.
        @raise FormatError: if the line has fewer than 3 or more than 12 columns.
        """
        row = line.rstrip().split('\t')
        # BED allows 3 to 12 columns (equivalent to the original range(3, 13) test)
        if not 3 <= len(row) <= 12:
            raise FormatError('Invalid number of columns in your BED file {0}'.format(len(row)))
        return Entry(**{'chrom': row[0], 'chromStart': row[1], 'chromEnd': row[2]})

    def _streaming_iter(self):
        for line in self.fp:
            # skip comment lines and blank lines
            if line.startswith('#') or not line.strip():
                continue
            yield self._process_line(line)

    def _wholefile_iter(self):
        wholefile = self.fp.read()
        assert '\r' not in wholefile, "Sorry, currently don't know how to deal with files that contain \\r linebreaks"
        # the original asserted len(wholefile) == 0, rejecting every non-empty file
        assert len(wholefile) != 0, "Empty BED file"
        for line in wholefile.split('\n'):
            if line.startswith('#') or not line.strip():
                continue
            yield self._process_line(line)
class WEGOReader(_AbstractFeatureReader):
    '''
    Reader for WEGO files.

    Yields (name, ids) tuples: the first column and the (possibly empty)
    list of ids in the following columns.
    '''
    def _process_line(self, line):
        """Split one tab-separated line into (name, [ids])."""
        row = line.rstrip().split('\t')
        name, ids = row[0], []
        if len(row) > 1:
            ids = row[1:]
        return (name, ids)

    def _streaming_iter(self):
        # first line must be the WEGO header (starts with !WGOP)
        if not self.fp.readline().startswith('!WGOP'):
            raise FormatError('WEGO header not found (!WEGOP_), invalid WEGO file ')
        # the original iterated over the undefined name 'fp' (NameError);
        # iterate over self.fp instead
        for line in self.fp:
            # skip any remaining header/comment lines and blank lines
            if line.startswith('!') or not line.strip():
                continue
            yield self._process_line(line)

    def _wholefile_iter(self):
        wholefile = self.fp.read()
        assert '\r' not in wholefile, "Sorry, currently don't know how to deal with files that contain \\r linebreaks"
        # the original asserted len(wholefile) == 0, rejecting every non-empty file
        assert len(wholefile) != 0, "Empty WEGO file"
        if not wholefile.startswith('!WGOP'):
            raise FormatError('WEGO header not found (!WEGOP_), invalid WEGO file ')
        for line in wholefile.split('\n'):
            # skip all '!'-prefixed header lines: the original only skipped
            # '!WEGO' while the validated header starts with '!WGOP', so the
            # header line itself was yielded as data; also skip blank lines
            if line.startswith('!') or not line.strip():
                continue
            yield self._process_line(line)
class VCFReader(_AbstractFeatureReader):
    '''
    Reader for VCF files
    Read a vcf file and yield an entry object. Each line will be yield as an Entry object. To access samples for
    variation, use entry.samples, which will be an array of Entry
    Fields for a variation entry :
        entry.chrom, entry.pos, entry.id, entry.ref, entry.alt, entry.qual, entry.filter, entry.info, entry.format
        entry.is_indel
    special case :
        * entry.alt : array of string
        * entry.info : dictionary
        * entry.samples : array of entries
    Fields of a sample entry :
        entry.name, entry.path
        all other fields depends on the FORMAT column
    '''
    def __init__(self, file, wholefile=False):
        """
        @param file : filename or a file-like object.
        @param wholefile: If True, then it is ok to read the entire file into memory. This is faster when there are
            many newlines in the file, but may obviously need a lot of memory.
        """
        # duck-type instead of isinstance(file, basestring): basestring does
        # not exist on Python 3, and any object with read() works as a file
        if not hasattr(file, 'read'):
            file = xopen(file, "r")
        self.fp = file
        self.wholefile = wholefile
        # (original_name, basename) tuples collected from the #CHROM header line
        self.samples_name = []

    def _process_line(self, line):
        """
        Parse one tab-separated VCF data line into a variation Entry
        (see the class docstring for the available fields).
        """
        row = line.rstrip().split('\t')
        variation = Entry(**{
            'chrom': row[0],
            'pos': int(row[1]),
            'id': row[2],
            'ref': row[3],
            'alt': row[4].split(','),
            'qual': autocast(row[5]),
            'filter': row[6],
            'info': {},
            # NOTE(review): the VCF spec separates FORMAT keys (and sample
            # fields below) with ':', not ';' -- confirm against the files
            # actually read here before changing
            'format': row[8].split(';'),
            'samples': [],
            'is_indel': False
        })
        # NOTE(review): several ALT alleles means the site is multi-allelic,
        # which is not the same thing as an indel -- behaviour kept as-is
        if len(variation.alt) > 1:
            variation.addattr('is_indel', True)
        if row[7] != '.':
            info = {}
            for p in row[7].split(';'):
                # INFO flag entries have no '=' (the original crashed with
                # IndexError on them); store flags as True
                if '=' in p:
                    k, v = p.split('=', 1)
                    info[k] = autocast(v)
                else:
                    info[p] = True
            variation.addattr('info', info)
        format = row[8].split(';')
        for i in range(9, len(row)):
            if row[i] != '.':
                sformat = row[i].split(';')
                # use a distinct index name so the comprehension does not
                # shadow the sample-column index i
                variation.samples.append(Entry(**{format[j]: autocast(sformat[j]) for j in range(0, len(format))}))
            else:
                variation.samples.append(Entry(**{format[j]: None for j in range(0, len(format))}))
        return variation

    def _streaming_iter(self):
        for line in self.fp:
            if line.startswith('#'):
                # the #CHROM header line carries the sample names in columns 10+
                if line.startswith('#CHROM'):
                    row = line.rstrip().split('\t')
                    if len(row) <= 9:
                        raise FormatError('Invalid number of columns in your vcf header file {0}'.format(len(row)))
                    for i in range(9, len(row)):
                        self.samples_name.append((row[i], os.path.basename(row[i])))
                continue
            yield self._process_line(line)

    def _wholefile_iter(self):
        wholefile = self.fp.read()
        assert '\r' not in wholefile, "Sorry, currently don't know how to deal with files that contain \\r linebreaks"
        # the original asserted len(wholefile) == 0, rejecting every non-empty file
        assert len(wholefile) != 0, "Empty VCF file"
        for line in wholefile.split('\n'):
            # skip blank lines (split('\n') yields a trailing empty string)
            if not line.strip():
                continue
            if line.startswith('#'):
                if line.startswith('#CHROM'):
                    row = line.rstrip().split('\t')
                    if len(row) <= 9:
                        raise FormatError('Invalid number of columns in your vcf header file {0}'.format(len(row)))
                    for i in range(9, len(row)):
                        self.samples_name.append((row[i], os.path.basename(row[i])))
                continue
            yield self._process_line(line)
class OboReader(_AbstractFeatureReader):
    """
    Reader for OBO files.

    Yields one Entry per [Term] stanza with fields: id, name, namespace
    (upper-cased first letter of the namespace tag, e.g. 'P' for
    biological_process) and parents (is_a plus relationship/part_of targets).
    """
    def __init__(self, file, wholefile=False):
        """
        @param file : filename or a file-like object. If file is a filename, then .gz files are supported.
        @param wholefile: If True, then it is ok to read the entire file into memory. This is faster when there
            are many newlines in the file, but may obviously need a lot of memory.
        """
        # duck-type instead of isinstance(file, basestring): basestring does
        # not exist on Python 3, and any object with read() works as a file
        if not hasattr(file, 'read'):
            file = xopen(file, "r")
        self.fp = file
        self.wholefile = wholefile

    def _streaming_iter(self):
        """
        Read next entry from the file (single entry at a time).
        """
        id = None
        name = ""
        namespace = ""
        parents = []
        for line in self.fp:
            # strip() should also take care of DOS line breaks
            line = line.strip()
            if line == "":
                # blank line ends a stanza: emit the term collected so far
                if id is not None:
                    yield Entry(**{'id': id, 'name': name, 'namespace': namespace, "parents": parents})
                # reset ALL per-term state (the original kept the previous
                # name/namespace, leaking them into terms lacking those tags)
                id = None
                name = ""
                namespace = ""
                parents = []
            else:
                if line.startswith("id: "):
                    id = line.split(": ")[1]
                elif line.startswith("name: "):
                    name = line.split(": ")[1]
                elif line.startswith("namespace: "):
                    # keep only the upper-cased first letter of the namespace
                    namespace = line.split(" ")[1][0].upper()
                elif line.startswith("is_a: "):
                    parents.append(line.split(" ")[1])
                elif line.startswith("relationship: part_of: "):
                    # NOTE(review): token [3] only exists when something
                    # follows the target id on the line -- confirm against
                    # the OBO files actually read here
                    parents.append(line.split(" ")[3])
        # emit the last term when the file does not end with a blank line
        if id is not None:
            yield Entry(**{'id': id, 'name': name, 'namespace': namespace, "parents": parents})

    def _wholefile_iter(self):
        """
        This reads in the entire file at once, but is faster than the above code when there are lots of newlines.
        The idea comes from the TAMO package (http://fraenkel.mit.edu/TAMO/), module TAMO.seq.Fasta (author is
        David Benjamin Gordon).
        """
        wholefile = self.fp.read()
        assert '\r' not in wholefile, "Sorry, currently don't know how to deal with files that contain \\r linebreaks"
        parts = wholefile.split('\n[Term]')
        for part in parts:
            # per-term state is reset for every stanza (the original carried
            # name/namespace/id over between stanzas)
            id = None
            name = ""
            namespace = ""
            parents = []
            for line in part.split('\n'):
                # the original called line.strip() without using the result
                line = line.strip()
                if line.startswith("id: "):
                    id = line.split(": ")[1]
                elif line.startswith("name: "):
                    name = line.split(": ")[1]
                elif line.startswith("namespace: "):
                    namespace = line.split(" ")[1][0].upper()
                elif line.startswith("is_a: "):
                    parents.append(line.split(" ")[1])
                elif line.startswith("relationship: part_of: "):
                    parents.append(line.split(" ")[3])
            # skip parts with no id (e.g. the file header before the first [Term])
            if id is not None:
                yield Entry(**{'id': id, 'name': name, 'namespace': namespace, "parents": parents})

    def __enter__(self):
        if self.fp is None:
            # the original message said 'FastaReader' (copy-paste); report the real class
            raise ValueError("I/O operation on closed {0}".format(self.__class__.__name__))
        return self

    def __exit__(self, *args):
        self.fp.close()
\ No newline at end of file
......@@ -37,6 +37,8 @@ class Parameter(object):
if type == "date":
self.type = date
elif isinstance(type, types.FunctionType):
self.type = type
else:
try: self.type = eval(type)
except: self.type = types.StringType
......
......@@ -89,7 +89,7 @@ class Workflow(threading.Thread):
raise IOError(self.__class__.__name__ + " workflow property file not found or invalid.")
self.id = id
self.args = self._extend_and_format_args(args)
self.args = self._extend_and_format_args(self.parameters, args)
self.metadata = self.args["metadata"]
if self.id is not None:
self.directory = self.manager.get_workflow_directory(self.name, self.id)
......@@ -203,10 +203,10 @@ class Workflow(threading.Thread):
self.end_time = time.time()
self._serialize()
def _extend_and_format_args(self, args):
def _extend_and_format_args(self, parameters, args):
extended_args = {}
for param in self.parameters:
for param in parameters:
try: args[param.name] = args[param.name].encode('ascii','ignore')
except: pass
# if this parameter has been modified by the user
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment