Ivan Vilata i Balaguer (on 2007-11-30 at 19:19:38 +0100) said::

> Well, one thing you could do is dump your data into a PyTables_
> ``CArray`` dataset, which you may afterwards access as if it was a
> NumPy array to get slices which are actually NumPy arrays. PyTables
> datasets have no problem in working with datasets exceeding memory size.
>[...]
I've put together the simple script I've attached, which dumps a binary
file into a PyTables ``CArray`` or loads it back to measure the time taken
to load each frame.  I've run it on my laptop, which has a not very fast
4200 RPM hard disk, and I've reached average times of 16 ms per frame,
after dropping caches with::

    # sync && echo 1 > /proc/sys/vm/drop_caches

This I did with the default chunkshape and no compression.  Your data may
lend itself very well to bigger chunkshapes and compression, which should
lower access times even further.  Since (as David pointed out) 200 Hz may
be a little exaggerated for the human eye, loading individual frames from
disk may prove more than enough for your problem.

HTH,

::

    Ivan Vilata i Balaguer   >qo<   http://www.carabos.com/
           Cárabos Coop. V.  V  V   Enjoy Data
                              ""
from __future__ import with_statement

from time import time
from contextlib import nested

import numpy as np
from tables import openFile, UInt8Atom, Filters


width, height = 640, 480  # 300 KiB per (greyscale) frame

def dump_frames_1(npfname, h5fname, nframes):
    """Dump `nframes` frames to a ``CArray`` dataset."""
    with nested(file(npfname, 'rb'), openFile(h5fname, 'w')) as (npf, h5f):
        frames = h5f.createCArray(
            '/', 'frames', atom=UInt8Atom(),
            shape=(nframes, height, width),
            chunkshape=(1, height/2, width),
            # filters=Filters(complib='lzo'),
        )
        framesize = width * height * 1
        for framei in xrange(nframes):
            frame = np.fromfile(npf, np.uint8, count=framesize)
            frame.shape = (height, width)
            frames[framei] = frame

def dump_frames_2(npfname, h5fname, nframes):
    """Dump `nframes` frames to an ``EArray`` dataset."""
    with nested(file(npfname, 'rb'), openFile(h5fname, 'w')) as (npf, h5f):
        frames = h5f.createEArray(
            '/', 'frames', atom=UInt8Atom(),
            shape=(0, height, width), expectedrows=nframes,
            # chunkshape=(1, height/2, width),
            # filters=Filters(complib='lzo'),
        )
        framesize = width * height * 1
        for framei in xrange(nframes):
            frame = np.fromfile(npf, np.uint8, count=framesize)
            frame.shape = (1, height, width)
            frames.append(frame)

def load_frames(h5fname):
    with openFile(h5fname, 'r') as h5f:
        frames = h5f.root.frames
        nframes = len(frames)
        times = np.zeros(nframes, float)
        for framei in xrange(nframes):
            t0 = time()
            frame = frames[framei]
            t1 = time()
            times[framei] = t1 - t0
        print ( "Load times for %d frames: min=%.4f avg=%.4f max=%.4f"
                % (nframes, np.min(times), np.average(times), np.max(times)) )

if __name__ == '__main__':
    import sys
    if sys.argv[1] == 'dump':
        npfname, h5fname, nframes = sys.argv[2:]
        nframes = int(nframes, 10)
        dump_frames_1(npfname, h5fname, nframes)
    elif sys.argv[1] == 'load':
        load_frames(sys.argv[2])
    else:
        print >> sys.stderr, """\
Usage: <script> dump NP_FILE H5_FILE NFRAMES
   or: <script> load H5_FILE"""
        sys.exit(1)
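[Editor's note: the script above uses the Python 2 era camelCase PyTables API
(``openFile``, ``createCArray``).  As a minimal sketch of the same
whole-frame-chunk plus compression idea with the modern snake_case API
(``tables.open_file``, ``create_carray``), and assuming zlib rather than LZO
since LZO support may not be compiled in; the file path and frame count here
are purely illustrative::]

```python
import os
import tempfile

import numpy as np
import tables  # PyTables; modern API uses snake_case names

width, height = 640, 480
fname = os.path.join(tempfile.mkdtemp(), 'frames.h5')  # illustrative path

# Write one frame into a chunked, compressed CArray (one chunk per frame).
with tables.open_file(fname, 'w') as h5f:
    frames = h5f.create_carray(
        '/', 'frames', atom=tables.UInt8Atom(),
        shape=(10, height, width),
        chunkshape=(1, height, width),  # whole-frame chunks
        filters=tables.Filters(complevel=1, complib='zlib'),
    )
    frames[0] = np.arange(height * width, dtype=np.uint8).reshape(height, width)

# Read it back: indexing the dataset yields a plain NumPy array,
# so a single frame is decompressed and loaded on demand.
with tables.open_file(fname, 'r') as h5f:
    frame = h5f.root.frames[0]
```

Because each chunk holds exactly one frame, loading ``frames[i]`` touches a
single contiguous compressed block on disk, which is the access pattern the
timing figures above measure.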
_______________________________________________
Numpy-discussion mailing list
Numpy-discussion@scipy.org
http://projects.scipy.org/mailman/listinfo/numpy-discussion