Now attached as text files (to be renamed to .py) to prevent the mailer software from dropping them...
Bye, Erik.

On Sat, Feb 5, 2011 at 7:05 PM, Erik Huelsmann <ehu...@gmail.com> wrote:
> Yesterday on IRC, Bert, Philip and I were chatting about our SQLite
> perf issues and how Philip's findings in the past suggested that
> SQLite wasn't using its indices to optimize our queries.
>
> After searching and discussing its documentation, Philip suggested the
> -too obvious- "maybe we have the wrong indices".
>
> So, I went to work with his "fake database generator script" (attached
> as "test.py").
>
>
> The type of query we're seeing problematic performance with looks like
> the one below. The essential part is the WHERE clause.
>
> SELECT * FROM nodes WHERE wc_id = 1 AND (local_relpath = 'foo' OR
>   local_relpath like 'foo%');
>
>
> We discussed 3 ways to achieve the effect of this query:
>
>  1. The query itself
>  2. The query stated as a UNION of two queries
>  3. Running the two parts of the UNION manually ourselves
>
> Ad (1)
> This query doesn't perform as well as we had hoped to get from using a
> database.
>
> Ad (2)
> In the past, UNIONs were explicitly removed because they created
> temporary tables (on disk!). However, since then we have changed our
> SQLite setup to create temporary tables in memory, so the option
> should really be re-evaluated.
>
> Ad (3)
> I'd hate to have to use two queries in all the places in our source
> where we want to run queries like these. As a result, I think this
> scenario should be avoided if we can.
>
>
> So, I've created 'perf.py' to evaluate each of these scenarios,
> researching the effect of adding different indices on each of them.
>
> These are my findings:
>
> Scenario (1) [an AND combined with a complex OR] doesn't perform well
> under any circumstance.
>
> Scenario (2) performs differently, depending on the available indices.
>
> Scenario (3) performs roughly equal to scenario (2).
>
>
> Scenario (2) takes ~0.27 seconds to evaluate in the unmodified
> database. Adding an index on (wc_id, local_relpath) makes the
> execution time drop to ~0.000156 seconds!
>
>
> Seems Philip was right :-)  We need to carefully review the indices we
> have in our database to support good performance.
>
>
> Bye,
>
>
> Erik.
>
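For reference, this is roughly what the combination discussed above looks like in code: the (wc_id, local_relpath) index plus the UNION form of the query (strategy 2). This sketch is not part of the attached scripts; it assumes wcx.db was generated by test.py, and the index name i_wc_id_rp is only borrowed from the commented-out lines in perf.py.

#!/usr/bin/python
# Sketch only: add the (wc_id, local_relpath) index discussed above and run
# the UNION form of the query (strategy 2).  Assumes wcx.db was generated by
# test.py; the index name i_wc_id_rp is borrowed from perf.py's comments.
import sqlite3

c = sqlite3.connect('wcx.db')
c.execute("""create index if not exists i_wc_id_rp
               on nodes (wc_id, local_relpath)""")
rows = c.execute("""select * from nodes
                     where wc_id = 1 AND local_relpath like 'foo/%'
                    union
                    select * from nodes
                     where wc_id = 1 AND local_relpath = 'foo'""").fetchall()
print "rows returned:", len(rows)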
#!/usr/bin/python
import os, sqlite3, time

c = sqlite3.connect('wcx.db')
c.execute("""pragma case_sensitive_like=1""")
c.execute("""pragma foreign_keys=on""")
c.execute("""pragma synchronous=off""")
c.execute("""pragma temp_store=memory""")

start = time.clock()   # cpu clock as float in secs

#c.execute("""drop index i_wc_id_rp;""")
#c.execute("""create index i_wc_id_rp on nodes (wc_id, local_relpath);""")

# ".indices" is an sqlite3 shell command and does not work through the
# Python API; query sqlite_master instead to list the available indices
print c.execute("""select name from sqlite_master
                    where type = 'index'""").fetchall()

# strategy 1
c.execute("""select * from nodes where wc_id = 1
               AND (local_relpath like 'foo/%' OR local_relpath = 'foo');""")

# strategy 2
#c.execute("""select * from nodes where wc_id = 1 AND local_relpath like 'foo/%'
#              union
#             select * from nodes where wc_id = 1 AND local_relpath = 'foo';""")

# strategy 3
#c.execute("""select * from nodes where wc_id = 1 AND local_relpath like 'foo/%';""")
#c.execute("""select * from nodes where wc_id = 1 AND local_relpath = 'foo';""")

end = time.clock()
print "timing: %5f\n" % (end - start)
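perf.py only times the statements; to see whether SQLite actually chooses an index for each strategy, one can also ask for the query plan. A minimal sketch of that check (not part of the attached scripts; assumes wcx.db was generated by test.py, and note the EXPLAIN QUERY PLAN output format varies between SQLite versions):

#!/usr/bin/python
# Sketch: print SQLite's query plan for strategies (1) and (2) so the effect
# of adding or dropping indices can be inspected directly.  Assumes wcx.db
# was generated by test.py.
import sqlite3

c = sqlite3.connect('wcx.db')

queries = {
    "strategy 1": """select * from nodes where wc_id = 1
                       AND (local_relpath like 'foo/%' OR local_relpath = 'foo')""",
    "strategy 2": """select * from nodes where wc_id = 1 AND local_relpath like 'foo/%'
                      union
                     select * from nodes where wc_id = 1 AND local_relpath = 'foo'""",
}

for name, sql in queries.items():
    print name
    for row in c.execute("explain query plan " + sql):
        print "   ", row
    print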
#!/usr/bin/python
import os, sqlite3

try:
    os.remove('wcx.db')
except:
    pass

c = sqlite3.connect('wcx.db')
c.execute("""pragma case_sensitive_like=1""")
c.execute("""pragma foreign_keys=on""")
c.execute("""pragma synchronous=off""")
c.execute("""pragma temp_store=memory""")

c.execute("""create table repository (
               id integer primary key autoincrement,
               root text unique not null,
               uuid text not null)""")
c.execute("""create index i_uuid on repository (uuid)""")
c.execute("""create index i_root on repository (root)""")
c.execute("""create table wcroot (
               id integer primary key autoincrement,
               local_abspath text unique)""")
c.execute("""create unique index i_local_abspath on wcroot (local_abspath)""")
c.execute("""create table nodes (
               wc_id integer not null references wcroot (id),
               local_relpath text not null,
               op_depth integer not null,
               parent_relpath text,
               repos_id integer references repository (id),
               repos_path text,
               revision integer,
               presence text not null,
               depth text,
               moved_here integer,
               moved_to text,
               kind text not null,
               changed_revision integer,
               changed_date integer,
               changed_author text,
               checksum text,
               properties blob,
               translated_size integer,
               last_mod_time integer,
               dav_cache blob,
               symlink_target text,
               file_external text,
               primary key(wc_id, local_relpath, op_depth))""")
c.execute("""create index i_parent on nodes (wc_id, parent_relpath,
               local_relpath, op_depth)""")
c.execute("""create table lock (
               repos_id integer not null references repository (id),
               repos_relpath text not null,
               lock_token text not null,
               lock_owner text,
               lock_comment text,
               lock_date integer,
               primary key (repos_id, repos_relpath))""")

c.execute("""insert into repository (root, uuid) values (
               "http://example.com/repo",
               "f738be9e-409d-481f-b246-1fb6a969aba2")""")
c.execute("""insert into wcroot(local_abspath) values ("/wc")""")
c.execute("""insert into nodes (
               wc_id, local_relpath, op_depth, repos_id, repos_path,
               parent_relpath, presence, kind) values (
               1, "", 0, 1, "trunk", NULL, "normal", "dir")""")

for i in range(100):
    c.execute("""insert into nodes (
                   wc_id, local_relpath, op_depth, repos_id, repos_path,
                   parent_relpath, presence, kind) values (
                   1, "foo"""+str(i)+"""", 0, 1,
                   "trunk/foo"""+str(i)+"""", "", "normal", "file")""")
    if i >= 60:
        continue
    c.execute("""insert into nodes (
                   wc_id, local_relpath, op_depth, repos_id, repos_path,
                   parent_relpath, presence, kind) values (
                   1, "zag"""+str(i)+"""", 0, 1,
                   "trunk/zag"""+str(i)+"""", "", "normal", "dir")""")
    c.execute("""insert into nodes (
                   wc_id, local_relpath, op_depth, repos_id, repos_path,
                   parent_relpath, presence, kind) values (
                   1, "zig"""+str(i)+"""", 0, 1,
                   "trunk/zig"""+str(i)+"""", "", "normal", "dir")""")
    for j in range(100):
        c.execute("""insert into nodes (
                       wc_id, local_relpath, op_depth, repos_id, repos_path,
                       parent_relpath, presence, kind) values (
                       1, "zag"""+str(i)+"/foo"+str(j)+"""", 0, 1,
                       "trunk/zag"""+str(i)+"/foo"+str(j)+"""",
                       "zag"""+str(i)+"""", "normal", "file")""")
        if j % 10 == 1:
            c.execute("""insert into nodes (
                           wc_id, local_relpath, op_depth, repos_id, repos_path,
                           parent_relpath, presence, kind) values (
                           1, "zag"""+str(i)+"/foo"+str(j)+"""", 3, 1,
                           "trunk/zag"""+str(i)+"/foo"+str(j)+"""",
                           "zag"""+str(i)+"""", "base-delete", "file")""")
            c.execute("""insert into nodes (
                           wc_id, local_relpath, op_depth, repos_id, repos_path,
                           parent_relpath, presence, kind) values (
                           1, "zag"""+str(i)+"/bar"+str(j)+"""", 3, null, null,
                           "zag"""+str(i)+"""", "normal", "file")""")
        c.execute("""insert into nodes (
                       wc_id, local_relpath, op_depth, repos_id, repos_path,
                       parent_relpath, presence, kind) values (
                       1, "zig"""+str(i)+"/foo"+str(j)+"""", 0, 1,
                       "trunk/zig"""+str(i)+"/foo"+str(j)+"""",
                       "zig"""+str(i)+"""", "normal", "file")""")
        if j >= 60:
            continue
        c.execute("""insert into nodes (
                       wc_id, local_relpath, op_depth, repos_id, repos_path,
                       parent_relpath, presence, kind) values (
                       1, "zig"""+str(i)+"/zag"+str(j)+"""", 0, 1,
                       "trunk/zig"""+str(i)+"/zag"+str(j)+"""",
                       "zig"""+str(i)+"""", "normal", "dir")""")
        for k in range(100):
            c.execute("""insert into nodes (
                           wc_id, local_relpath, op_depth, repos_id, repos_path,
                           parent_relpath, presence, kind) values (
                           1, "zig"""+str(i)+"/zag"+str(j)+"/foo"+str(k)+"""", 0, 1,
                           "trunk/zig"""+str(i)+"/zag"+str(j)+"/foo"+str(k)+"""",
                           "zig"""+str(i)+"/zag"+str(j)+"""", "normal", "file")""")

c.commit()