pacbiolib.py 2.51 KB
Newer Older
Audrey Gibert's avatar
Audrey Gibert committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#
# Copyright (C) 2012 INRA
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

from .lightpbcoreio import BasH5Reader
from jflow.seqio import Sequence

import numpy as np
import os

class PacbioH5Reader(object):
    """
    Reader for pacbio movie file
    """
    def __init__(self, file):
        """
            @param file: either the .bas.h5 or  one of the three .bax.h5 files
        """
        parts = []
        analysisresults_dir = os.path.dirname(file)
        celldir = os.path.dirname(analysisresults_dir)
        
        if file.endswith(".bax.h5"):
            for each in os.listdir(analysisresults_dir) :
                fullpath = os.path.join(analysisresults_dir, each)
                if each.endswith('.bax.h5') :
                    parts.append(fullpath)
                elif each.endswith('.bas.h5') :
                    parts = [fullpath]
                    break
        elif file.endswith(".bas.h5"):
            parts = [file]
        
        self.bash5 = BasH5Reader(*parts)
        self.metadata = None
        for meta in  os.listdir(celldir) :
            metafile = os.path.join(celldir, meta)
            if os.path.isfile(metafile) and meta.endswith(".metadata.xml"):
                self.metadata = metafile
                break
    
    def qvsFromAscii(self,s):
        return (np.fromstring(s, dtype=np.uint8) - 33)

    def asciiFromQvs(self,a):
        return (np.clip(a, 0, 93).astype(np.uint8) + 33).tostring()  

    def __iter__(self):
        for zmwRead in self.bash5.subreads():
            yield Sequence(zmwRead.readName, zmwRead.readName, zmwRead.basecalls(), self.asciiFromQvs(zmwRead.QualityValue()))

def h5file(ifile):
    
    try :
        reader = PacbioH5Reader(ifile)
        nb_seq = 0
        for id, desc, seq, qualities in reader:
            nb_seq += 1
            if nb_seq == 10: break
    except :
        raise jflow.InvalidFormatError("The provided file '" + ifile + "' is not a valid pacbio h5 file!")