I spent some more time and came up with a completely standalone test
application (attached; requires SA 0.5.2 and some database backend).
I have tested it with Postgres and with SQLite, and both exhibit the
behavior I was describing: memory keeps increasing and stays topped
out even though all handles to the results have been cleared in Python.

I am sure I must be doing something very wrong, but I can't figure out
what. Can anyone point out my error?

-Allen
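
For what it's worth, here is the kind of check I have been using to try
to find what still refers to the row objects after a query. This is just
a rough sketch, not part of the attached script; the RowProxy import path
is what I believe 0.5.2 uses, and the filtering is crude:

import gc
from sqlalchemy.engine.base import RowProxy

def findRowReferrers():
    """ Print a sample of what still holds references to RowProxy rows. """
    gc.collect(2)
    for obj in gc.get_objects():
        if isinstance(obj, RowProxy):
            # gc.get_referrers() returns every container holding a
            # reference, including this frame's own locals
            for ref in gc.get_referrers(obj):
                print type(ref), repr(ref)[:80]
            break    # one surviving row is enough to see the pattern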
On Sat, Feb 21, 2009 at 7:52 AM, Allen Bierbaum <[email protected]> wrote:
> Hello all:
>
> We are running into a very strange memory leak issue. Here are the details:
>
> - The code executes select statements directly on a table using SA
> 0.5.2, Postgres 8.3, Psycopg2, all running on Linux (Ubuntu 8.04)
> - Querying a table with approximately 70000 records
> - When we query the table, convert the results to a list, and then
> free both the list and the results proxy, memory seems to stay used
> - If we query the table with ever-increasing size limits (LIMIT
> clause), the amount of memory consumed increases
> - If we query with a limit of say 50000 and then perform smaller
> queries, the smaller queries do not increase the amount of memory
> consumed overall
> - In other words, memory use caps out at a level that appears linear
> in the maximum number of table rows ever returned
>
> It looks like something in SA, Psycopg2, or PG is holding references
> to the returned rows but we have not been able to track it down. I am
> working on a completely standalone application to demonstrate the
> issue. Before I went much further though I wanted to post this
> question and code sample (not standalone, but shows the commands we
> are using) in case anyone sees something fishy.
>
> Any ideas?
>
> Thanks,
> Allen
>
> ----------------------------------------------------------------------------------
> import os, sys, gc
> import psycopg2
> import sqlalchemy as sa
> import sqlalchemy.exc
>
> class DataProvider(object):
>     def __init__(self, dbUri, tableName):
>         self._dbUri = dbUri
>         self._dbEngine = None
>         self._metadata = None
>         self._table = None
>         self._tableName = tableName
>
>     def initialize(self):
>         self._dbEngine = sa.create_engine(self._dbUri)
>         self._metadata = sa.MetaData()
>         self._metadata.bind = self._dbEngine
>
>         # Build up the table metadata reflectively
>         name_parts = self._tableName.split('.')
>
>         if len(name_parts) > 1:
>             self._table = sa.Table(name_parts[1], self._metadata, autoload = True,
>                                    schema = name_parts[0])
>         else:
>             self._table = sa.Table(name_parts[0], self._metadata, autoload = True)
>
>     def query(self, limit = None):
>         s = self._table.select()
>
>         if limit is not None:
>             s = s.limit(limit)
>
>         print "objs before: ", len(gc.get_objects())
>         print "garbage: ", len(gc.garbage)
>         result_proxy = s.execute()
>         result_objects = [x for x in result_proxy]
>         #print "num results: ", len(result_objects)
>         result_proxy.close()
>         # drop all references to the fetched rows before measuring again
>         result_objects = []
>         results = []
>         gc.collect(2)
>         print "objs after: ", len(gc.get_objects())
>         print "garbage: ", len(gc.garbage)
>         return results
>
> db_uri = sys.argv[1]
> collection_name = sys.argv[2]
>
> prov = DataProvider(db_uri, collection_name)
> prov.initialize()
>
> limits = [10000, 20000, 30000, 40000, 50000, 60000,
>           10000, 20000, 30000, 40000, 50000, 60000]
>
> # Query with varying limits, checking memory before and after each run
> for x in range(10):
>     limit = limits[x]
>     print "----[ %s ] limit: %s ----" % (x, limit)
>     gc.collect(2)
>     printMemoryInformation()
>     features = prov.query(limit = limit)
>     features = None
>     gc.collect(2)
>     printMemoryInformation()
>
> ----------------------------------------------------------------------------------
>
----------------------------------------------------------------------------------
The attached standalone test script:
----------------------------------------------------------------------------------

import os, sys, gc

import psycopg2
import sqlalchemy as sa
import sqlalchemy.exc


def getMemoryUsage(VmKey):
    """ Return the given Vm* entry from /proc/<pid>/status, in bytes. """
    proc_status = '/proc/%d/status' % os.getpid()
    scale = {'kB': 1024.0, 'mB': 1024.0*1024.0,
             'KB': 1024.0, 'MB': 1024.0*1024.0}
    v = open(proc_status).read()
    # get VmKey line e.g. 'VmRSS: 9999 kB\n ...'
    i = v.index(VmKey)
    v = v[i:].split(None, 3)        # whitespace
    if len(v) < 3:
        return 0.0                  # invalid format?
    # convert Vm value to bytes
    return float(v[1]) * scale[v[2]]

def memory():
    return getMemoryUsage('VmSize:') / (1024*1024)

def resident():
    return getMemoryUsage('VmRSS:') / (1024*1024)

def printMemoryUsage():
    print "mem: ", memory(), " res: ", resident()


def createTestData(dbUri, tableName):
    """ Called to create temporary data if needed. """
    engine = sa.create_engine(dbUri)
    metadata = sa.MetaData()
    metadata.bind = engine
    test_table = sa.Table(tableName, metadata,
                          sa.Column('id', sa.Integer, primary_key = True),
                          sa.Column('text1', sa.String),
                          sa.Column('text2', sa.String))
    metadata.create_all(engine)

    # only populate the table if it is empty
    s = test_table.select().limit(1)
    have_some = len(s.execute().fetchall()) > 0
    if not have_some:
        print "Creating data"
        num_rows = 70000
        for x in xrange(num_rows):
            if (x % 1000) == 0:
                print "row: ", x
            ins = test_table.insert()
            ins.execute(text1 = 'a'*1000, text2 = 'b'*1000)


class DataProvider(object):
    def __init__(self, dbUri, tableName):
        self._dbEngine = sa.create_engine(dbUri)
        self._metadata = sa.MetaData()
        self._metadata.bind = self._dbEngine
        self._table = sa.Table(tableName, self._metadata, autoload = True)

    def query(self, limit = None):
        s = self._table.select()
        if limit is not None:
            s = s.limit(limit)

        print "objs before: ", len(gc.get_objects())
        print "garbage: ", len(gc.garbage)
        result_proxy = s.execute()
        result_objects = [x for x in result_proxy]
        #print "num results: ", len(result_objects)
        result_proxy.close()
        # drop all references to the fetched rows before measuring again
        result_objects = []
        results = []
        gc.collect(2)
        print "objs after: ", len(gc.get_objects())
        print "garbage: ", len(gc.garbage)
        return results


#db_uri = "postgres://localhost/test_db"
db_uri = "sqlite:///test_db.sqlite"
tablename = "test_table"

createTestData(db_uri, tablename)
prov = DataProvider(db_uri, tablename)
#prov = PgDataProvider(dsn, host, database, username, collection_name)

limits = [10000, 20000, 30000, 40000, 50000, 60000,
          10000, 20000, 30000, 40000, 50000, 60000]

# Query with varying limits, checking memory before and after each run
for x in range(10):
    limit = limits[x]
    print "----[ %s ] limit: %s ----" % (x, limit)
    gc.collect(2)
    printMemoryUsage()
    features = prov.query(limit = limit)
    features = None
    gc.collect(2)
    printMemoryUsage()
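
One more thing I intend to try, to separate the driver from SA: as far
as I understand, psycopg2 buffers the entire result set client-side
unless you use a named (server-side) cursor, so a plain-DBAPI version
of the postgres test might show whether the buffering happens below SA.
A minimal sketch, assuming the same test_db/test_table as above and
default local connection settings:

import psycopg2

conn = psycopg2.connect("dbname=test_db")

# a named cursor makes psycopg2 use a server-side cursor, so rows are
# streamed from the server in batches instead of being buffered
# client-side all at once
cur = conn.cursor(name='leak_test')
cur.execute("SELECT * FROM test_table LIMIT 50000")
while True:
    rows = cur.fetchmany(1000)
    if not rows:
        break
cur.close()
conn.close()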