David, your screen captures were too large and your message bounced.  
I'm copying your message here.  Your scripts are also attached.  See my 
comments interspersed in your message.

On Monday 13 December 2010 10:56:04 david.bri...@ubs.com wrote:
> ;o) here are the screen captures
> 
> The file I corrupted was opened for append, but I had only read data from it!!!

Uh, that's really ugly.  Anyway, if you are not going to update the 
file, it is safer to open it in 'r'ead-only mode.
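For example, something along these lines (just a minimal sketch, using 
the path from your scripts):

"""
import tables as pytables

# 'r' mode cannot modify the file, so an interrupted run cannot corrupt it
ptFile = pytables.openFile("/tmp/test.h5", mode="r")
try:
    print ptFile.root.TableNameSeed.read()
finally:
    ptFile.close()
"""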

> Good to know it's a well-known limitation of HDF5 though.

Yup.  Hope they fix this sooner rather than later.

> ---
> 
> Ok I can now replicate without my app.
> 
> Script1.py builds a large db.
> 
> Script2.py opens the db and summarises it.
> 
> Also enclosed are some screen shots of the task manager as script2 is
> running.
> 
> My conclusions are:
>         1) pytables is not designed to safely manage memory,
>         2) I should keep any summaries in a separate table if I am to
>         open the data base quickly (and without causing a
>         MemoryError).

Well, your scripts were putting all the nodes of the object tree on a 
list, and that is the reason for the 'leak'.  You don't need to put all 
the nodes on a list (in fact, this is strongly discouraged, for the 
reasons that you have seen) in order to iterate over some selected 
nodes; a generator is much better suited for this.  The next patch 
converts the function that builds the list into a generator:

"""
--- script2.py  2010-12-13 19:13:44.000000000 +0100
+++ script2-modif.py    2010-12-13 19:17:23.000000000 +0100
@@ -22,8 +22,7 @@
                 if node._v_attrs.__getattr__(items[0]) <> items[1]:
                     matches = False
                     break
-            if matches: answer.append(node)
-    return answer
+            if matches: yield node

 def openDB(ptFilename):
"""

After applying this, script2 consumes 80 MB instead of 3.2 GB.  And 
times are also similar (13.2 s for the patched version versus 14.2 s).
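
For reference, here is the whole getTables() once the patch is applied 
(the now-unused `answer` list can go away too):

"""
def getTables(ptFile, attributes={}, where="/"):
    for node in ptFile.walkNodes(where):
        if isinstance(node, pytables.Leaf):
            matches = True
            for items in attributes.items():
                if not node._v_attrs.__contains__(items[0]):
                    matches = False
                    break
                if node._v_attrs.__getattr__(items[0]) <> items[1]:
                    matches = False
                    break
            if matches: yield node
"""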
 
> My intention is not to criticise but to understand where the limits
> are.

No offense taken ;-)

> If the above is a fair evaluation (?) then my application should use
> pytables to manage the data on disk but not in memory. My only
> concern is how to stop pytables consuming all the memory if I need
> to access many tables.
> 
> Is it possible to drop the data structures that access a given table
> from memory? Do I need to close the file occasionally or is there a
> way to say drop table xyz from cache? (I'm wondering how using
> node._f_close() affects performance?)
> 
> Many thx
> 

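About your last question: yes, you can close individual nodes.  
Node._f_close() detaches the node from the object tree, and PyTables 
should load it again from disk the next time you access its path.  A 
minimal sketch (assuming the /data layout from your scripts):

"""
import tables as pytables

ptFile = pytables.openFile("/tmp/test.h5", mode="r")
# keep only the path names around -- cheap strings, not node objects
paths = [n._v_pathname for n in ptFile.walkNodes("/data", classname="Leaf")]
for path in paths:
    table = ptFile.getNode(path)
    nrows = table.nrows       # ... work with the table here ...
    table._f_close()          # evict this node from the object tree
ptFile.close()
"""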
Hope this helps

> David

-- 
Francesc Alted

----- script1.py -----

import tables as pytables
import numpy as np


class DtOHLCV_Schema(pytables.IsDescription):
    Date=pytables.Float64Col(pos=0)
    Open=pytables.Float32Col(pos=1)
    High=pytables.Float32Col(pos=2)
    Low=pytables.Float32Col(pos=3)
    Close=pytables.Float32Col(pos=4)
    Volume=pytables.Int64Col(pos=5)
# NumPy dtype matching DtOHLCV_Schema (Volume is Int64Col, so int64 here)
DtOHLCV_dtype = np.dtype([("Date", np.float64), ("Open", np.float32),
                          ("High", np.float32), ("Low", np.float32),
                          ("Close", np.float32), ("Volume", np.int64)])

# create a DB
#filename = "c:\\temp\\test.h5"
filename = "/tmp/test.h5"
ptFile = pytables.openFile(filename, mode="w", title="FredFred")
ptFile.createGroup("/", "data")
class Schema(pytables.IsDescription):
    seed = pytables.Int16Col(pos=0)
table = ptFile.createTable("/", "TableNameSeed", Schema)
row = table.row
row['seed'] = 1
row.append()
table.flush()

summariesByIndex = {}

class TSCError(Exception):
    # raised when a table unexpectedly already contains data
    pass

class Summary:
    # lightweight in-memory record of one table's metadata
    pass

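# build 2000 tables, each with between 100,000 and ~500,000 rows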
for i in range(2000):
    size = int(np.random.rand() * 400000) + 100000
    #size = 500000

    attributes = dict(fred1="fred", fred2="a%s" % i, fred3=int(np.random.rand()*10)+1, fred4="D", fred5="fredfred")

    # get the new table name
    seedTable = ptFile.getNode("/TableNameSeed")
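    # read(start, stop, step, field) -> fetch the single current seed value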
    tableNameSeed = seedTable.read(0, 1, 1, "seed")[0]
    tableNameSeed += 1
    seedTable.modifyColumn(0, 1, 1, tableNameSeed, "seed")
    seedTable.flush()
    tableName = "_%d" % tableNameSeed

    print "creating table %s (%s)" % (tableName, size)

    # create the new table
    table = ptFile.createTable("/data", tableName, DtOHLCV_Schema)
    for name, value in attributes.items():
        setattr(table._v_attrs, name, value)     # maybe use table._f_setAttr(name, value) instead
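    # build a completely sorted index (CSI) on the Date column to speed up queries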
    table.cols.Date.createCSIndex()
    ptFile.flush()

    summary = Summary()
    summary.attributes = attributes
    summary.dtype = DtOHLCV_Schema
    summary.pathname = table._v_pathname
    summariesByIndex[i] = summary

    # create a large np array (would be loaded from csv)
    data = np.zeros(size, DtOHLCV_dtype).view(np.recarray)
    data["Date"] = np.arange(size)

    summary._data = data

    # commit data to disk
    if table.nrows > 0:
        raise TSCError("There is already data in the database - unable to add data in this version")
    table.append(summary._data)
    table.flush()
    summary._persistentSize = len(summary._data)
    if summary._persistentSize > 0:
        summary._firstPersistentTS = summary._data[0]["Date"]
        summary._lastPersistentTS = summary._data[summary._persistentSize-1]["Date"]

    # drop data from memory
    summary._data = None


print "Done"

----- script2.py -----

import tables as pytables

class TableSummary:
    def __init__(self, attributes, dtype, id, filename, tablename):
        self.attributes = attributes
        self.dtype = dtype
        self.id = id
        self.filename = filename
        self.tablename = tablename


def getTables(ptFile, attributes={}, where="/"):
    answer = []
    for node in ptFile.walkNodes(where):
        if isinstance(node, pytables.Leaf):
            matches = True
            for items in attributes.items():
                if not node._v_attrs.__contains__(items[0]):
                    matches = False
                    break
                #if ("%s" % node._v_attrs.__getattr__(items[0])) <> ("%s" % items[1]):
                if node._v_attrs.__getattr__(items[0]) <> items[1]:
                    matches = False
                    break
            if matches: answer.append(node)
    return answer


def openDB(ptFilename):

    ptFile = pytables.openFile(ptFilename, mode="r")
    TSIDSeed = 0
    newTSByTSID = {}

    for table in getTables(ptFile, where="/data"):
        attributes = {}
        for name in table._v_attrs._f_list():
            attributes[name] = table._v_attrs[name]  # I'm not sure if the _v_attrs can be added in one go
        TSIDSeed += 1
        ts = TableSummary(attributes, table.dtype, TSIDSeed, ptFilename, table._v_pathname)
        ts.size = table.nrows
        if ts.size > 0:
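            # read only the first and last Date values, not the whole column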
            ts.firstDate = table.read(start=0, field="Date")[0]
            ts.lastDate = table.read(start=table.nrows-1, field="Date")[0]
        newTSByTSID[TSIDSeed] = ts
        print "%s (%s)" % (ts.tablename, ts.size)
    return newTSByTSID


def test():
    #print len(openDB("C:\\temp\\test.h5"))
    print len(openDB("/tmp/test.h5"))


test()