Commit ae522ca0 authored by Jerome Mariette
Browse files

update jflow

parent 2e150236
......@@ -14,390 +14,11 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
from jflow.xopen import xopen
from jflow.seqio import FormatError
from jflow.seqio import UnknownFileType
import re
def boolify(s):
    """Convert the exact strings 'True'/'False' to the corresponding bool; any other key raises KeyError."""
    mapping = {'True': True, 'False': False}
    return mapping[s]
def autocast(s):
    """
    Best-effort conversion of the string s: try bool (via boolify), then int,
    then float; return s unchanged when no conversion applies.
    @param s : the value to convert.
    @return : the converted value, or s itself.
    """
    for fn in (boolify, int, float):
        try:
            return fn(s)
        # Only catch the conversion failures these callables raise; the
        # original bare `except` also swallowed KeyboardInterrupt/SystemExit.
        except (KeyError, ValueError, TypeError):
            pass
    return s
class Entry(object):
    """
    Generic feature record: a thin wrapper around a dict of attributes that
    exposes them both as object attributes and as mapping items.
    """
    def __init__(self, **kwargs):
        # Every named value is kept in this single dict.
        self.attrib = kwargs
    def addattr(self, k, v):
        """Add (or overwrite) attribute k with value v."""
        self.attrib[k] = v
    def __getattr__(self, key):
        # Called only when normal attribute lookup fails. Note: raises
        # KeyError (not AttributeError) for unknown names, as callers expect.
        return self.attrib[key]
    def __str__(self):
        return str(self.attrib)
    def __getitem__(self, key):
        return self.attrib[key]
    def has(self, attr):
        """Return True when the attribute attr is present."""
        # `in` replaces dict.has_key(), which was removed in Python 3.
        return attr in self.attrib
class _AbstractFeatureReader(object):
    '''
    Base class for the feature-file readers: handles opening the input,
    iteration dispatch and context-manager support.
    '''
    def __init__(self, file, wholefile=False):
        """
        @param file : filename or a file-like object.
        @param wholefile: If True, then it is ok to read the entire file into
                          memory; faster on files with many newlines, but may
                          obviously need a lot of memory.
        """
        self.fp = xopen(file, "r") if isinstance(file, basestring) else file
        self.wholefile = wholefile

    def __iter__(self):
        if self.wholefile:
            return self._wholefile_iter()
        return self._streaming_iter()

    def _streaming_iter(self):
        # Subclasses provide the lazy, line-by-line iterator.
        raise NotImplementedError('not implemented')

    def _wholefile_iter(self):
        # Subclasses provide the slurp-everything iterator.
        raise NotImplementedError('not implemented')

    def __enter__(self):
        if self.fp is None:
            raise ValueError("I/O operation on closed {0}".format(self.__class__.__name__))
        return self

    def __exit__(self, *args):
        self.fp.close()
class GFF3Reader(_AbstractFeatureReader):
    '''
    Reader for GFF3 files.
    '''
    def _process_line(self, line):
        """
        Parse one GFF3 line (9 tab-separated columns) into an Entry.
        @param line : the raw line.
        @return : Entry with seqid, source, type, start, end, score, strand,
                  phase and attributes (a tag -> value dict).
        @raise FormatError : when the column count is not 9.
        """
        row = line.rstrip().split('\t')
        if len(row) != 9 : raise FormatError('Invalid number of columns in your GFF3 file {0}'.format( len(row)))
        attributes = {}
        if row[8] and row[8] != '.' :
            for p in row[8].split(';') :
                if p != "" :
                    # str.partition always returns a 3-tuple, so the original
                    # `len(p.partition('=')) == 3` test was always true and the
                    # warning branch was dead code; test the separator instead.
                    tag, sep, value = p.partition('=')
                    if sep :
                        attributes[tag] = value
                    else :
                        # print() call form works on both Python 2 and 3.
                        print("Warn : Attribute "+ p +" gff3 for "+ "\t".join(row) + "\n")
        return Entry(**{'seqid' : row[0], 'source' : row[1], 'type' : row[2], 'start' : int(row[3]), 'end' : int(row[4]),
                        'score' : row[5], 'strand' : row[6], 'phase' : row[7], 'attributes' : attributes })
    def _streaming_iter(self):
        """
        Yield one Entry per non-comment line, reading lazily.
        """
        for line in self.fp:
            if line.startswith('#') :
                continue
            yield self._process_line(line)
    def _wholefile_iter(self):
        """
        Read the entire file at once, then yield one Entry per non-comment line.
        Faster than the streaming version when there are lots of newlines.
        """
        wholefile = self.fp.read()
        assert '\r' not in wholefile, "Sorry, currently don't know how to deal with files that contain \\r linebreaks"
        # The original asserted `== 0`, which rejected every non-empty file.
        assert len(wholefile) != 0 , "Empty gff3 file"
        for line in wholefile.split('\n') :
            # Skip comments and the empty string left by a trailing newline.
            if line == "" or line.startswith('#') :
                continue
            yield self._process_line(line)
class BEDReader(_AbstractFeatureReader):
    '''
    Reader for BED files.
    '''
    def _process_line(self, line):
        """
        Parse one BED line into an Entry with chrom, chromStart and chromEnd.
        @raise FormatError : when the column count is outside the 3..12 range
                             allowed by the BED format.
        """
        row = line.rstrip().split('\t')
        if not 3 <= len(row) <= 12 : raise FormatError('Invalid number of columns in your BED file {0}'.format( len(row)))
        return Entry(**{ 'chrom' : row[0], 'chromStart' : row[1], 'chromEnd' : row[2] })
    def _streaming_iter(self):
        """Yield one Entry per non-comment line, reading lazily."""
        for line in self.fp :
            if line.startswith('#') :
                continue
            yield self._process_line(line)
    def _wholefile_iter(self):
        """Yield one Entry per non-comment line after reading the whole file."""
        wholefile = self.fp.read()
        assert '\r' not in wholefile, "Sorry, currently don't know how to deal with files that contain \\r linebreaks"
        # The original asserted `== 0`, which rejected every non-empty file.
        assert len(wholefile) != 0 , "Empty BED file"
        for line in wholefile.split('\n') :
            # Skip comments and the empty string left by a trailing newline.
            if line == "" or line.startswith('#') :
                continue
            yield self._process_line(line)
class WEGOReader(_AbstractFeatureReader):
    '''
    Reader for WEGO files.
    '''
    def _process_line(self, line):
        """
        Parse one WEGO line.
        @return : (name, ids) where ids is the possibly-empty list of GO ids.
        """
        row = line.rstrip().split('\t')
        name, ids = row[0], []
        if len(row) > 1 :
            ids = row[1:]
        return (name, ids)
    def _streaming_iter(self):
        # The first line must be the !WGOP header.
        if not self.fp.readline().startswith('!WGOP'):
            raise FormatError('WEGO header not found (!WEGOP_), invalid WEGO file ')
        # Was `for line in fp`: a NameError — the file object is self.fp.
        for line in self.fp :
            if line.startswith('!WEGO') :
                continue
            yield self._process_line(line)
    def _wholefile_iter(self):
        wholefile = self.fp.read()
        assert '\r' not in wholefile, "Sorry, currently don't know how to deal with files that contain \\r linebreaks"
        # The original asserted `== 0`, which rejected every non-empty file.
        assert len(wholefile) != 0 , "Empty WEGO file"
        if not wholefile.startswith('!WGOP') :
            raise FormatError('WEGO header not found (!WEGOP_), invalid WEGO file ')
        for line in wholefile.split('\n') :
            # Skip header lines and the empty string left by a trailing newline.
            if line == "" or line.startswith('!WEGO') :
                continue
            yield self._process_line(line)
class VCFReader(_AbstractFeatureReader):
    '''
    Reader for VCF files
    Read a vcf file and yield an entry object. Each line will be yield as an Entry object. To access samples for
    variation, use entry.samples, which will be an array of Entry
    Fields for a variation entry :
        entry.chrom, entry.pos, entry.id, entry.ref, entry.alt, entry.qual, entry.filter, entry.info, entry.format
        entry.is_indel
    special case :
        * entry.alt : array of string
        * entry.info : dictionary
        * entry.samples : array of entries
    Fields of a sample entry :
        entry.name, entry.path
        all other fields depends on the FORMAT column
    '''
    def __init__(self, file, wholefile=False):
        """
        @param file : filename or a file-like object.
        @param wholefile: If True, then it is ok to read the entire file into memory. This is faster when there are
                          many newlines in the file, but may obviously need a lot of memory.
        """
        if isinstance(file, basestring):
            file = xopen(file, "r")
        self.fp = file
        self.wholefile = wholefile
        self.samples_name = []
        self._init_sample_names()
    def _init_sample_names(self):
        """
        Fill self.samples_name with one (column, basename-without-extension)
        tuple per sample column of the #CHROM header line, then rewind the file.
        @raise FormatError : when the header is missing or carries no sample.
        """
        for line in self.fp :
            if line.startswith('#') :
                if line.startswith('#CHROM') :
                    row = line.rstrip().split('\t')
                    if len(row) <= 9 :
                        raise FormatError( 'Invalid number of columns in your vcf header file {0}'.format(len(row)) )
                    for i in range(9, len(row)) :
                        self.samples_name.append( ( row[i] , os.path.splitext(os.path.basename(row[i]))[0] ) )
                    break
            else :
                raise FormatError( 'The vcf file {0} must start with header lines (#) !!!'.format(self.fp.name) )
        self.fp.seek(0, 0)
        # len() is never negative, so the original `< 0` test could never fire;
        # an empty list means no #CHROM header line was found.
        if len(self.samples_name) == 0 :
            raise FormatError( "Invalid VCF file {0}. Could not retrieve the sample names headers".format(self.fp.name) )
    def _process_line(self, line):
        """
        Parse one variation line into an Entry (see the class docstring for
        the available fields).
        """
        row = line.rstrip().split('\t')
        variation = Entry(**{
            'chrom' : row[0],
            'pos' : int(row[1]),
            'id' : row[2],
            'ref' : row[3],
            'alt' : row[4].split(',') ,
            'qual' : autocast(row[5]),
            'filter' : row[6],
            'info' : {},
            # FORMAT is colon-separated in VCF; the original split on ';'
            # disagreed with the per-sample parsing below.
            'format' : row[8].split(':'),
            'samples' : [],
            'is_indel': False
        })
        if len(variation.alt) > 1 :
            variation.addattr( 'is_indel', True)
        # Raw string: matches "." or "./."-style empty genotype columns. A
        # plain literal relied on Python preserving the unknown escape "\.".
        regexp_none = re.compile(r"\.(\/\.)*")
        #if row[7] != '.' :
        #    variation.addattr( 'info', { p.split('=')[0] : autocast(p.split('=')[1]) for p in row[7].split(';') })
        format = row[8].split(':')
        for lib_infos in range(9, len(row)) :
            if not regexp_none.match(row[lib_infos]):
                sformat = row[lib_infos].split(':')
                variation.samples.append( Entry(**{ autocast(format[i]) : autocast(sformat[i]) if sformat[i] != '.' else None for i in range(0, len(format)) } ) )
            else :
                variation.samples.append( Entry(**{ autocast(format[i]) : None for i in range(0, len(format)) }) )
        return variation
    def _streaming_iter(self):
        """Yield one Entry per non-header line, reading lazily."""
        for line in self.fp :
            if line.startswith('#') :
                continue
            yield self._process_line(line)
    def _wholefile_iter(self):
        """Yield one Entry per non-header line after reading the whole file."""
        wholefile = self.fp.read()
        assert '\r' not in wholefile, "Sorry, currently don't know how to deal with files that contain \\r linebreaks"
        # The original asserted `== 0`, which rejected every non-empty file.
        assert len(wholefile) != 0 , "Empty VCF file"
        for line in wholefile.split('\n') :
            # Skip headers and the empty string left by a trailing newline.
            if line == "" or line.startswith('#') :
                continue
            yield self._process_line(line)
class MpileupReader(_AbstractFeatureReader):
    """
    Reader for Mpileup files.
    """
    def _streaming_iter(self):
        """
        Yield one Entry per mpileup line, with chrom, pos, ref and the list of
        per-library depths in 'libs'.
        """
        for line in self.fp :
            if line == None or line == "" : continue
            row = line.rstrip().split('\t')
            if len(row) < 4 : continue
            libs_count = []
            # One depth column every 3 columns, starting at column 3.
            i = 3
            while i < len(row) :
                libs_count.append(int(row[i]))
                i += 3
            yield Entry(**{
                'chrom' : row [0],
                'pos' : int(row[1]),
                'ref' : row[2],
                'libs' : libs_count ,
            })
    def _wholefile_iter(self):
        """Same as _streaming_iter after reading the whole file at once."""
        wholefile = self.fp.read()
        assert '\r' not in wholefile, "Sorry, currently don't know how to deal with files that contain \\r linebreaks"
        # The original asserted `== 0` (rejecting every non-empty file) and
        # labelled the message "VCF" although this reader handles mpileup.
        assert len(wholefile) != 0 , "Empty Mpileup file"
        for line in wholefile.split('\n') :
            row = line.rstrip().split('\t')
            # Guard short/blank lines, as the streaming iterator does.
            if len(row) < 4 : continue
            libs_count = []
            i = 3
            while i < len(row) :
                libs_count.append(int(row[i]))
                i += 3
            # Key renamed from 'lib' to 'libs' to match _streaming_iter.
            yield Entry(**{
                'chrom' : row [0],
                'pos' : int(row[1]),
                'ref' : row[2],
                'libs' : libs_count ,
            })
class OboReader(_AbstractFeatureReader):
    """
    Reader for OBO files.
    """
    def _streaming_iter(self):
        """
        Yield one Entry per stanza (id, name, namespace, parents), reading one
        line at a time. A blank line terminates the current stanza.
        """
        term_id = None   # renamed from `id`, which shadowed the builtin
        name = ""
        namespace = ""
        parents = []
        for line in self.fp:
            # strip() should also take care of DOS line breaks
            line = line.strip()
            if line == "" :
                if term_id is not None:
                    yield Entry(**{ 'id' : term_id, 'name' : name, 'namespace' : namespace, "parents" : parents })
                term_id = None
                parents = []
            else:
                if line.startswith("id: "):
                    term_id = line.split(": ")[1]
                elif line.startswith("name: "):
                    name = line.split(": ")[1]
                elif line.startswith("namespace: "):
                    # Only the first letter, upper-cased (e.g. 'P' for biological_process).
                    namespace = line.split()[1][0].upper()
                elif line.startswith("is_a: "):
                    parents.append(line.split()[1])
                elif line.startswith("relationship: part_of: ") :
                    # NOTE(review): split()[3] assumes a 4-token relationship
                    # line — verify against the input OBO flavour.
                    parents.append(line.split()[3])
        if term_id is not None:
            yield Entry(**{ 'id' : term_id, 'name' : name, 'namespace' : namespace, "parents" : parents })
    def _wholefile_iter(self):
        """
        Read the whole file at once and yield one Entry per '[Term]' part.
        Faster than the streaming version when there are lots of newlines.
        """
        wholefile = self.fp.read()
        assert '\r' not in wholefile, "Sorry, currently don't know how to deal with files that contain \\r linebreaks"
        parts = wholefile.split('\n[Term]')
        term_id = None
        name = ""
        namespace = ""
        parents = []
        for part in parts:
            for line in part.split('\n'):
                # The original called line.strip() and discarded the result;
                # bind it so the prefix tests below actually see clean text.
                line = line.strip()
                if line.startswith("id: "):
                    term_id = line.split(": ")[1]
                elif line.startswith("name: "):
                    name = line.split(": ")[1]
                elif line.startswith("namespace: "):
                    namespace = line.split()[1][0].upper()
                elif line.startswith("is_a: "):
                    parents.append(line.split()[1])
                elif line.startswith("relationship: part_of: ") :
                    parents.append(line.split()[3])
            # NOTE(review): as in the original, id/name/namespace carry over
            # to a part that does not redefine them — confirm this is intended.
            yield Entry(**{ 'id' : term_id, 'name' : name, 'namespace' : namespace, "parents" : parents })
            parents = []
from featureiolib.bed import BEDReader
from featureiolib.biom import Biom, BiomIO
from featureiolib.gff3 import GFF3Record, GFF3IO
from featureiolib.mpileup import MpileupReader
from featureiolib.obo import OboReader
from featureiolib.vcf import VCFReader
from featureiolib.wego import WEGOReader
\ No newline at end of file
#
# Copyright (C) 2014 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
\ No newline at end of file
import os
import re
from jflow.xopen import xopen
from jflow.seqio import FormatError
from jflow.seqio import UnknownFileType
def boolify(s):
    """Translate the literal text 'True' or 'False' into the matching bool.

    Any other value raises KeyError, which autocast() catches to fall through
    to the next conversion.
    """
    literals = {'True': True, 'False': False}
    return literals[s]
def autocast(s):
    """
    Convert the string s to the most specific type available: bool (via
    boolify), then int, then float; return s unchanged when nothing applies.
    @param s : the value to convert.
    @return : the converted value, or s itself.
    """
    for fn in (boolify, int, float):
        try:
            return fn(s)
        # Narrowed from a bare `except`, which also swallowed
        # KeyboardInterrupt/SystemExit.
        except (KeyError, ValueError, TypeError):
            pass
    return s
class Entry(object):
    """
    Generic feature record backed by a single attributes dict; values are
    reachable as attributes (entry.foo), items (entry['foo']) or via has().
    """
    def __init__(self, **kwargs):
        self.attrib = kwargs
    def addattr(self, k, v):
        """Add (or overwrite) attribute k with value v."""
        self.attrib[k] = v
    def __getattr__(self, key):
        # Only reached when regular attribute lookup fails; note it raises
        # KeyError for unknown names, matching the mapping access below.
        return self.attrib[key]
    def __str__(self):
        return str(self.attrib)
    def __getitem__(self, key):
        return self.attrib[key]
    def has(self, attr):
        """Return True when the attribute attr is present."""
        # dict.has_key() was removed in Python 3; `in` works everywhere.
        return attr in self.attrib
class _AbstractFeatureReader(object):
    '''
    Common machinery for the feature readers: file opening, choice between
    streaming and whole-file iteration, and `with`-statement support.
    '''
    def __init__(self, file, wholefile=False):
        """
        @param file : filename or a file-like object.
        @param wholefile: If True, then it is ok to read the entire file into
                          memory; faster on files with many newlines, but may
                          obviously need a lot of memory.
        """
        if isinstance(file, basestring):
            file = xopen(file, "r")
        self.fp = file
        self.wholefile = wholefile

    def __iter__(self):
        chosen = self._wholefile_iter if self.wholefile else self._streaming_iter
        return chosen()

    def _streaming_iter(self):
        # To be provided by subclasses: lazy line-by-line iteration.
        raise NotImplementedError('not implemented')

    def _wholefile_iter(self):
        # To be provided by subclasses: read-everything iteration.
        raise NotImplementedError('not implemented')

    def __enter__(self):
        if self.fp is None:
            raise ValueError("I/O operation on closed {0}".format(self.__class__.__name__))
        return self

    def __exit__(self, *args):
        self.fp.close()
\ No newline at end of file
import os
import re
from jflow.xopen import xopen
from jflow.seqio import FormatError
from jflow.seqio import UnknownFileType
from abstractfeaturereader import _AbstractFeatureReader, Entry, boolify, autocast
class BEDReader(_AbstractFeatureReader):
    '''
    Reader for BED files.
    '''
    def _process_line(self, line):
        """
        Parse one BED line into an Entry with chrom, chromStart and chromEnd.
        @raise FormatError : when the column count is outside 3..12.
        """
        row = line.rstrip().split('\t')
        if not 3 <= len(row) <= 12 : raise FormatError('Invalid number of columns in your BED file {0}'.format( len(row)))
        return Entry(**{ 'chrom' : row[0], 'chromStart' : row[1], 'chromEnd' : row[2] })
    def _streaming_iter(self):
        """Yield one Entry per non-comment line, reading lazily."""
        for line in self.fp :
            if line.startswith('#') :
                continue
            yield self._process_line(line)
    def _wholefile_iter(self):
        """Yield one Entry per non-comment line after reading the whole file."""
        wholefile = self.fp.read()
        assert '\r' not in wholefile, "Sorry, currently don't know how to deal with files that contain \\r linebreaks"
        # The original asserted `== 0`, which rejected every non-empty file.
        assert len(wholefile) != 0 , "Empty BED file"
        for line in wholefile.split('\n') :
            # Skip comments and the empty string left by a trailing newline.
            if line == "" or line.startswith('#') :
                continue
            yield self._process_line(line)
\ No newline at end of file
This diff is collapsed.
import sys, re
class GFF3Record:
"""
@summary : Record for GFF3.
"""
def __init__( self ):
self.seq_id = None
self.source = None
self.type = None
self.start = None
self.end = None
self.score = None
self.strand = None
self.phase = None
self.attributes = None
def setAttribute( self, tag, value ):
    """
    @summary : Create or replace an attribute tag.
    @param tag : tag of the attribute.
    @param value : value of the attribute tag.
    @raise ValueError : when the record's attributes dict is not initialized.
    """
    cleaned_tag = GFF3Record._getCleanedAttribute(tag)
    cleaned_value = GFF3Record._getCleanedAttribute(value)
    if self.attributes is not None :
        self.attributes[cleaned_tag] = cleaned_value
    else:
        # `raise "..."` (a string exception) is itself a TypeError on any
        # modern Python; raise a real exception instead.
        raise ValueError("The attribute 'Attributes' is not initialized.")
def addToAttribute( self, tag, value ):
    """
    @summary : Add one value on an existing tag (values are '%2C'-joined).
    @param tag : tag of the attribute.
    @param value : value to add to the tag.
    @raise ValueError : when the record's attributes dict is not initialized.
    """
    cleaned_tag = GFF3Record._getCleanedAttribute(tag)
    cleaned_value = GFF3Record._getCleanedAttribute(value)
    if self.attributes is not None :
        # `in` replaces dict.has_key(), which was removed in Python 3.
        if cleaned_tag in self.attributes:
            self.attributes[cleaned_tag] = self.attributes[cleaned_tag] + "%2C" + cleaned_value
        else:
            self.attributes[cleaned_tag] = cleaned_value
    else:
        # `raise "..."` (a string exception) is itself a TypeError on any
        # modern Python; raise a real exception instead.
        raise ValueError("The attribute 'Attributes' is not initialized.")
def _attributesToGff( self ):
"""
@summary : Returns a string in GFF3 format attributes field from the GFF3Record.attributes.
@return : [str] the attributes in GFF3 format.
"""
gff_string = ""
for tag in self.attributes:
gff_string = gff_string + tag + "=" + str(self.attributes[tag]) + ";"
return gff_string[:-1]
def toGff( self ):
"""
@summary : Returns a string in GFF3 format from the GFF3Record object.
@return : [str] the line in GFF3 format.
"""
gff_record = "\t".join( [self.seq_id, self.source, self.type, str(self.start), str(self.end), str(self.score), self.strand, str(self.phase), self._attributesToGff()] )
return gff_record
def attributesToStr( self, tag ):
"""
@summary : Returns the attribute value in human readable format.
@param tag : [str] the attribute tag.
@return : [str] the human readable value.
@see : RFC 3986 Percent-Encoding
"""
cleaned_tag = GFF3Record._getCleanedAttribute(tag)
if self.attributes.has_key( cleaned_tag ):
readable_value = self.attributes[cleaned_tag].replace('%3B', ';')
readable_value = readable_value.replace('%2C', ',')
redable_value = readable_value.replace('%3D', '=')