David, your screen captures were too large and your message bounced.
I'm copying your message here. Your scripts are also attached. See my
comments interspersed in your message.
On Monday 13 December 2010 10:56:04 david.bri...@ubs.com wrote:
> ;o) here's the screen captures
>
> The file I corrupted was opened for append, but I had only read data from it!!!
Uh, that's really ugly. Anyway, if you are not going to update the
file, it is safer to open it in 'r'ead-only mode.
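For example, a minimal sketch against the test file that your script1
creates (only the mode="r" flag matters here; the rest is just the same
openFile()/close() calls your scripts already use):
"""
import tables as pytables

# read-only mode: an interrupted or crashed run cannot corrupt the file
ptFile = pytables.openFile("/tmp/test.h5", mode="r")
try:
    print ptFile.root.TableNameSeed.read(field="seed")
finally:
    ptFile.close()
"""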
> Good to know it's a well-known limitation of HDF5 though.
Yup. Hope they fix this sooner rather than later.
> ---
>
> Ok I can now replicate without my app.
>
> Script1.py builds a large db.
>
> Script2.py opens the db and summarises it.
>
> Also enclosed are some screen shots of the task manager as script2 is
> running.
>
> My conclusions are:
> 1) pytables is not designed to safely manage memory,
> 2) I should keep any summaries in a separate table if I am to
> open the database quickly (and without causing a
> MemoryError).
Well, your scripts were putting all the nodes in the object tree on a
list, and that is the reason for the 'leak'. You don't need to put all
the nodes on a list (in fact, this is strongly discouraged, for the
reasons that you have seen) in order to iterate through some
selected nodes; for this a generator is way better. The next patch
converts the function generating the list into a generator:
"""
--- script2.py 2010-12-13 19:13:44.000000000 +0100
+++ script2-modif.py 2010-12-13 19:17:23.000000000 +0100
@@ -22,8 +22,7 @@
                 if node._v_attrs.__getattr__(items[0]) <> items[1]:
                     matches = False
                     break
-            if matches: answer.append(node)
-    return answer
+            if matches: yield node
 def openDB(ptFilename):
"""
After applying this, script2 consumes 80 MB instead of 3.2 GB. And
times are also similar (13.2 s for the patched version versus 14.2 s).
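For reference, the whole getTables() then reads essentially like this
(the same filtering logic as in your script2, just yielding nodes
instead of collecting them; the now-unused answer list can be dropped
as well):
"""
import tables as pytables  # as in script2

def getTables(ptFile, attributes={}, where="/"):
    # walk the tree and hand matching leaves back one at a time,
    # so no list of node objects is ever built up
    for node in ptFile.walkNodes(where):
        if isinstance(node, pytables.Leaf):
            matches = True
            for items in attributes.items():
                if not node._v_attrs.__contains__(items[0]):
                    matches = False
                    break
                if node._v_attrs.__getattr__(items[0]) <> items[1]:
                    matches = False
                    break
            if matches: yield node
"""
Your openDB() loop over getTables() does not need any change; iterating
over a generator works exactly the same as iterating over a list.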
> My intention is not to criticise but to understand where the limits
> are.
No offense taken ;-)
> If the above is a fair evaluation (?) then my application should use
> pytables to manage the data on disk but not in memory. My only
> concern is how to stop pytables consuming all the memory if I need
> to access many tables.
>
> Is it possible to drop the data structures that access a given table
> from memory? Do I need to close the file occasionally or is there a
> way to say drop table xyz from cache? (I'm wondering how using
> node._f_close() affects performance?)
>
> Many thx
>
Hope this helps
> David
--
Francesc Alted
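
# script1.py -- builds a large db (see message above)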
import tables as pytables
import numpy as np

class DtOHLCV_Schema(pytables.IsDescription):
    Date = pytables.Float64Col(pos=0)
    Open = pytables.Float32Col(pos=1)
    High = pytables.Float32Col(pos=2)
    Low = pytables.Float32Col(pos=3)
    Close = pytables.Float32Col(pos=4)
    Volume = pytables.Int64Col(pos=5)

DtOHLCV_dtype = np.dtype([("Date", np.float64), ("Open", np.float32),
                          ("High", np.float32), ("Low", np.float32),
                          ("Close", np.float32), ("Volume", np.uint64)])

# create a DB
#filename = "c:\\temp\\test.h5"
filename = "/tmp/test.h5"
ptFile = pytables.openFile(filename, mode="w", title="FredFred")
ptFile.createGroup("/", "data")

class Schema(pytables.IsDescription):
    seed = pytables.Int16Col(pos=0)

table = ptFile.createTable("/", "TableNameSeed", Schema)
row = table.row
row['seed'] = 1
row.append()
table.flush()

summariesByIndex = {}

class Summary: pass

class TSCError(Exception):
    pass  # minimal stand-in so this standalone script is self-contained

for i in range(2000):
    size = int(np.random.rand() * 400000) + 100000
    #size = 500000
    attributes = dict(fred1="fred", fred2="a%s" % i,
                      fred3=int(np.random.rand() * 10) + 1,
                      fred4="D", fred5="fredfred")
    # get the new table name
    seedTable = ptFile.getNode("/TableNameSeed")
    tableNameSeed = seedTable.read(0, 1, 1, "seed")[0]
    tableNameSeed += 1
    seedTable.modifyColumn(0, 1, 1, tableNameSeed, "seed")
    seedTable.flush()
    tableName = "_%d" % tableNameSeed
    print "creating table %s (%s)" % (tableName, size)
    # create the new table
    table = ptFile.createTable("/data", tableName, DtOHLCV_Schema)
    for kv in attributes.items():
        setattr(table._v_attrs, kv[0], kv[1])  # maybe use table._f_setAttr(name, value) instead
    table.cols.Date.createCSIndex()
    ptFile.flush()
    summary = Summary()
    summary.attributes = attributes
    summary.dtype = DtOHLCV_Schema
    summary.pathname = table._v_pathname
    summariesByIndex[i] = summary
    # create a large np array (would be loaded from csv)
    data = np.zeros(size, DtOHLCV_dtype).view(np.recarray)
    data["Date"] = np.arange(size)
    summary._data = data
    # commit data to disk
    if table.nrows > 0:
        raise TSCError("There is already data in the database - unable to add data in this version")
    table.append(summary._data)
    table.flush()
    summary._persistentSize = len(summary._data)
    if summary._persistentSize > 0:
        summary._firstPersistentTS = summary._data[0]["Date"]
        summary._lastPersistentTS = summary._data[summary._persistentSize - 1]["Date"]
    # drop data from memory
    summary._data = None

ptFile.close()  # make sure everything is flushed and the file is closed cleanly
print "Done"
import tables as pytables

class TableSummary:
    def __init__(self, attributes, dtype, id, filename, tablename):
        self.attributes = attributes
        self.dtype = dtype
        self.id = id
        self.filename = filename
        self.tablename = tablename

def getTables(ptFile, attributes={}, where="/"):
    answer = []
    for node in ptFile.walkNodes(where):
        if isinstance(node, pytables.Leaf):
            matches = True
            for items in attributes.items():
                if not node._v_attrs.__contains__(items[0]):
                    matches = False
                    break
                #if ("%s" % node._v_attrs.__getattr__(items[0])) <> ("%s" % items[1]):
                if node._v_attrs.__getattr__(items[0]) <> items[1]:
                    matches = False
                    break
            if matches: answer.append(node)
    return answer

def openDB(ptFilename):
    ptFile = pytables.openFile(ptFilename, mode="r")
    TSIDSeed = 0
    newTSByTSID = {}
    for table in getTables(ptFile, where="/data"):
        attributes = {}
        for name in table._v_attrs._f_list():
            attributes[name] = table._v_attrs[name]  # I'm not sure if the _v_attrs can be added in one go
        TSIDSeed += 1
        ts = TableSummary(attributes, table.dtype, TSIDSeed, ptFilename, table._v_pathname)
        ts.size = table.nrows
        if ts.size > 0:
            ts.firstDate = table.read(start=0, field="Date")[0]
            ts.lastDate = table.read(start=table.nrows - 1, field="Date")[0]
        newTSByTSID[TSIDSeed] = ts
        print "%s (%s)" % (ts.tablename, ts.size)
    return newTSByTSID

def test():
    #print len(openDB("C:\\temp\\test.h5"))
    print len(openDB("/tmp/test.h5"))

test()