GK wrote:
> Michael,
>
> Thank you for your response - it was very helpful for me.
>
> It turns out my main problem was that I was importing an order of
> magnitude or so more data than I realized, but you were also right
> about using flush().
>
> You were also right about the overhead of creating extra indexes. In
> the spirit of putting some data in a public space...
>
> Starting with an empty database, with a test dataset of 1200 values
> (about 1150 unique insertions) and flushing after every insertion I
> have the following timings:
>
> No extra indexes: 2:00
> Three extra indexes: 2:15
>
> This is using SQLite with a flat file on a 1.8GHz laptop. The records
> are each nearly 1 KB. There's an overhead of about 5 seconds for
> reading the data, so most of the above time is loading the database.
>
> I haven't yet had time to judge how the performance varies with larger
> datasets.
2:00 seems very high - is that 2 minutes? Below are two similar bulk
table loads. The first uses the same insert-or-update methodology but
only the relational layer (no ORM); it clocks in at 1.25 seconds on my
laptop. The second is an ORM implementation with a different
duplicate-detection methodology; it clocks in at 2.0 seconds.
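
If that 2:00 really is minutes, my first suspect (a guess - I haven't
seen your import loop) would be committing, not just flushing, after
every insertion: on SQLite each commit waits on a disk sync. Both
scripts below run all 1200 rows inside a single transaction. The
effect is easy to reproduce with the stdlib sqlite3 module alone
(table name and row count here are made up for illustration):

##
## Per-insert commits vs. one transaction (stdlib sqlite3 only)
##
import os
import sqlite3
import time

db = 'committest.db'
if os.path.exists(db):
    os.unlink(db)
conn = sqlite3.connect(db)
conn.execute('CREATE TABLE t (v TEXT)')

# Commit after every insert: each commit is its own transaction
# and waits on a disk sync.
start = time.time()
for i in xrange(1200):
    conn.execute('INSERT INTO t (v) VALUES (?)', (str(i),))
    conn.commit()
print 'per-insert commits: %.2fs' % (time.time() - start)

# Same 1200 inserts, one commit at the end.
start = time.time()
for i in xrange(1200):
    conn.execute('INSERT INTO t (v) VALUES (?)', (str(i),))
conn.commit()
print 'one commit: %.2fs' % (time.time() - start)
conn.close()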
---
##
## Relational version
##
import os
import time
import random
from sqlalchemy import *
from sqlalchemy.exceptions import IntegrityError

data_cols = (
    'sendadr', 'fromadr', 'toadr', 'sendnam', 'fromnam',
    'tonam', 'subject', 'received', 'spam', 'folderid' )

# Each column gets ~100 hex chars, so rows land near the 1 KB mark.
chunk = lambda: '%x' % random.getrandbits(400)
dataset = [dict((col, chunk()) for col in data_cols)
           for _ in xrange(1200)]
dupes = random.sample(dataset, 50)

db = '1krows.db'
if os.path.exists(db):
    os.unlink(db)

engine = create_engine('sqlite:///%s' % db)
metadata = MetaData(engine)
table = Table('t', metadata,
              Column('id', Integer, primary_key=True),
              Column('occurs', Integer, default=1),
              *(Column(col, Text) for col in data_cols))
table.append_constraint(UniqueConstraint(*data_cols))
metadata.create_all()

# Pre-load 50 rows so the timed run has duplicates to collide with.
table.insert().execute(dupes)
assert table.select().count().scalar() == 50

start = time.time()
insert = table.insert()
update = (table.update().
          where(and_(*((table.c[col] == bindparam(col))
                       for col in data_cols))).
          values({'occurs': table.c.occurs + 1}))

conn = engine.connect()
tx = conn.begin()
for row in dataset:
    try:
        conn.execute(insert, row)
    except IntegrityError:
        # Unique constraint hit: bump the occurrence count instead.
        conn.execute(update, row)
tx.commit()
end = time.time()

assert table.select().count().scalar() == 1200
assert select([func.count(table.c.id)],
              table.c.occurs==2).scalar() == 50
print "elapsed: %.4f" % (end - start)
##
## ORM version
##
import hashlib
import os
import time
import random
from sqlalchemy import *
from sqlalchemy.orm import *

data_cols = (
    'sendadr', 'fromadr', 'toadr', 'sendnam', 'fromnam',
    'tonam', 'subject', 'received', 'spam', 'folderid' )

chunk = lambda: '%x' % random.getrandbits(400)
dataset = [dict((col, chunk()) for col in data_cols)
           for _ in xrange(1200)]

def hashrow(row):
    # A SHA-1 over all data columns stands in for the 10-column
    # unique constraint; duplicate rows share a hash.
    return hashlib.sha1(
        ','.join(row[c] for c in data_cols)).hexdigest()

dupes = []
for row in random.sample(dataset, 50):
    dupe = row.copy()
    dupe['hash'] = hashrow(dupe)
    dupes.append(dupe)

db = '1krows.db'
if os.path.exists(db):
    os.unlink(db)

engine = create_engine('sqlite:///%s' % db)
metadata = MetaData(engine)
table = Table('t', metadata,
              Column('id', Integer, primary_key=True),
              Column('occurs', Integer, default=1),
              Column('hash', String(40), unique=True),
              *(Column(col, Text) for col in data_cols))
metadata.create_all()

# Pre-load 50 rows so the timed run has duplicates to collide with.
table.insert().execute(dupes)
assert table.select().count().scalar() == 50

class Email(object):
    def __init__(self, **kw):
        for key, value in kw.items():
            setattr(self, key, value)
    def hashval(self):
        return hashrow(dict((col, getattr(self, col))
                            for col in data_cols))

mapper(Email, table)

start = time.time()
session = create_session()
session.begin()
data = [Email(**row) for row in dataset]

# Work through the rows 100 at a time: one indexed query per chunk
# finds the duplicates, which are updated in place; the rest are
# saved as new rows.
chunk, remaining = [], [(e.hashval(), e) for e in data]
while remaining:
    chunk, remaining = remaining[:100], remaining[100:]
    by_hash = dict(chunk)
    dupes = (session.query(Email).
             filter(Email.hash.in_(by_hash.keys()))).all()
    for dupe in dupes:
        dupe.occurs += 1
        by_hash.pop(dupe.hash)
    for hashval, email in by_hash.items():
        email.hash = hashval
        session.save(email)
    session.flush()
session.commit()
end = time.time()

assert table.select().count().scalar() == 1200
assert select([func.count(table.c.id)],
              table.c.occurs==2).scalar() == 50
print "elapsed: %.4f" % (end - start)