Your query has to visit every row of table d and execute the correlated subquery once per row, so its cost grows with the size of d.
You need to devise a way to do this only once for each d.m and then join that table back into your query. >sqlite < demo.sql .eqp on .timer on CREATE TABLE d ( m INT NOT NULL, t INT NOT NULL, v REAL ); --EQP-- 0,0,0,SEARCH TABLE sqlite_master USING INTEGER PRIMARY KEY (rowid=?) Run Time: real 0.007 user 0.000000 sys 0.000000 CREATE UNIQUE INDEX d_ind_1 ON d (m, t); Run Time: real 0.002 user 0.000000 sys 0.000000 .timer on INSERT INTO d (m, t, v) WITH RECURSIVE cnt(x) AS ( VALUES(0) UNION ALL SELECT x+1 FROM cnt WHERE x<200000 ) SELECT (x % 10) + 1 AS m, x+1 AS t, 1.0 AS v FROM cnt; --EQP-- 3,0,0,SCAN TABLE cnt --EQP-- 1,0,0,COMPOUND SUBQUERIES 0 AND 0 (UNION ALL) --EQP-- 0,0,0,SCAN SUBQUERY 1 Run Time: real 1.064 user 1.046875 sys 0.000000 -- arrange for one rare value of m, which should not appear in results DELETE FROM d WHERE m = 5; --EQP-- 0,0,0,SEARCH TABLE d USING COVERING INDEX d_ind_1 (m=?) Run Time: real 0.060 user 0.046875 sys 0.000000 INSERT INTO d (m, t, v) WITH RECURSIVE cnt(x) AS ( VALUES(0) UNION ALL SELECT x+1 FROM cnt WHERE x<10) SELECT 5 AS m, x + 1230000 AS t, 1.0 AS v FROM cnt; --EQP-- 3,0,0,SCAN TABLE cnt --EQP-- 1,0,0,COMPOUND SUBQUERIES 0 AND 0 (UNION ALL) --EQP-- 0,0,0,SCAN SUBQUERY 1 Run Time: real 0.013 user 0.000000 sys 0.000000 -- now, for each value of m, look for exactly N rows, ordered by t CREATE TEMP TABLE tt (m INTEGER NOT NULL UNIQUE, t integer not null); --EQP-- 0,0,0,SEARCH TABLE sqlite_temp_master USING INTEGER PRIMARY KEY (rowid=?) 
Run Time: real 0.011 user 0.000000 sys 0.000000 INSERT INTO tt (m, t) SELECT DISTINCT m, 0 from d; --EQP-- 0,0,0,SCAN TABLE d USING COVERING INDEX d_ind_1 --EQP-- 0,0,0,USE TEMP B-TREE FOR DISTINCT Run Time: real 0.076 user 0.046875 sys 0.000000 update tt set t = (SELECT MAX(d3.t) FROM (SELECT d2.t FROM d AS d2 WHERE tt.m = d2.m ORDER BY d2.t LIMIT 20 ) AS d3 ); --EQP-- 0,0,0,SCAN TABLE tt --EQP-- 0,0,0,EXECUTE CORRELATED SCALAR SUBQUERY 0 --EQP-- 1,0,0,SEARCH TABLE d AS d2 USING COVERING INDEX d_ind_1 (m=?) --EQP-- 0,0,0,SEARCH SUBQUERY 1 AS d3 Run Time: real 0.058 user 0.000000 sys 0.000000 select * from tt; --EQP-- 0,0,0,SCAN TABLE tt 1|191 2|192 3|193 4|194 5|1230010 6|196 7|197 8|198 9|199 10|200 Run Time: real 0.053 user 0.000000 sys 0.000000 ANALYZE; --EQP-- 0,0,0,SEARCH TABLE sqlite_master USING INTEGER PRIMARY KEY (rowid=?) --EQP-- 0,0,0,SEARCH TABLE sqlite_master USING INTEGER PRIMARY KEY (rowid=?) Run Time: real 0.173 user 0.156250 sys 0.000000 SELECT m, n_points, t_csv, v_csv, max_rowid FROM (SELECT d.m AS "m", count(v) AS "n_points", substr(group_concat(d.t), 1, 16) || '...' AS "t_csv", substr(group_concat(round(v, 2)), 1, 16) || '...' AS "v_csv", max(d.rowid) AS "max_rowid" FROM d, tt WHERE d.t <= tt.t and d.m = tt.m GROUP BY d.m ORDER BY d.m, d.t ) WHERE n_points = 20; --EQP-- 1,0,0,SCAN TABLE d USING INDEX d_ind_1 --EQP-- 1,1,1,SEARCH TABLE tt USING INDEX sqlite_autoindex_tt_1 (m=?) 
--EQP-- 1,0,0,USE TEMP B-TREE FOR ORDER BY --EQP-- 0,0,0,SCAN SUBQUERY 1 1|20|1,11,21,31,41,51...|1.0,1.0,1.0,1.0,...|191 2|20|2,12,22,32,42,52...|1.0,1.0,1.0,1.0,...|192 3|20|3,13,23,33,43,53...|1.0,1.0,1.0,1.0,...|193 4|20|4,14,24,34,44,54...|1.0,1.0,1.0,1.0,...|194 6|20|6,16,26,36,46,56...|1.0,1.0,1.0,1.0,...|196 7|20|7,17,27,37,47,57...|1.0,1.0,1.0,1.0,...|197 8|20|8,18,28,38,48,58...|1.0,1.0,1.0,1.0,...|198 9|20|9,19,29,39,49,59...|1.0,1.0,1.0,1.0,...|199 10|20|10,20,30,40,50,6...|1.0,1.0,1.0,1.0,...|200 Run Time: real 0.308 user 0.203125 sys 0.000000 SELECT m, n_points, t_csv, v_csv, max_rowid FROM (SELECT d.m AS "m", count(v) AS "n_points", substr(group_concat(t), 1, 16) || '...' AS "t_csv", substr(group_concat(round(v, 2)), 1, 16) || '...' AS "v_csv", max(d.rowid) AS "max_rowid" FROM d WHERE d.t <= (SELECT MAX(d3.t) FROM (SELECT d2.t FROM d AS d2 WHERE d.m = d2.m ORDER BY d2.t LIMIT 20 ) AS d3 ) GROUP BY d.m ORDER BY d.m, d.t ) WHERE n_points = 20; --EQP-- 1,0,0,SCAN TABLE d USING INDEX d_ind_1 --EQP-- 1,0,0,EXECUTE CORRELATED SCALAR SUBQUERY 2 --EQP-- 3,0,0,SEARCH TABLE d AS d2 USING COVERING INDEX d_ind_1 (m=?) 
--EQP-- 2,0,0,SEARCH SUBQUERY 3 AS d3 --EQP-- 1,0,0,USE TEMP B-TREE FOR ORDER BY --EQP-- 0,0,0,SCAN SUBQUERY 1 1|20|1,11,21,31,41,51...|1.0,1.0,1.0,1.0,...|191 2|20|2,12,22,32,42,52...|1.0,1.0,1.0,1.0,...|192 3|20|3,13,23,33,43,53...|1.0,1.0,1.0,1.0,...|193 4|20|4,14,24,34,44,54...|1.0,1.0,1.0,1.0,...|194 6|20|6,16,26,36,46,56...|1.0,1.0,1.0,1.0,...|196 7|20|7,17,27,37,47,57...|1.0,1.0,1.0,1.0,...|197 8|20|8,18,28,38,48,58...|1.0,1.0,1.0,1.0,...|198 9|20|9,19,29,39,49,59...|1.0,1.0,1.0,1.0,...|199 10|20|10,20,30,40,50,6...|1.0,1.0,1.0,1.0,...|200 Run Time: real 2.421 user 2.281250 sys 0.000000 2015-04-24 21:17:08 [D:\Temp] > In case you missed it the answer is: CREATE TEMP TABLE tt (m INTEGER NOT NULL UNIQUE, t integer not null); INSERT INTO tt (m, t) SELECT DISTINCT m, 0 from d; update tt set t = (SELECT MAX(d3.t) FROM (SELECT d2.t FROM d AS d2 WHERE tt.m = d2.m ORDER BY d2.t LIMIT 20 ) AS d3 ); SELECT m, n_points, t_csv, v_csv, max_rowid FROM (SELECT d.m AS "m", count(v) AS "n_points", substr(group_concat(d.t), 1, 16) || '...' AS "t_csv", substr(group_concat(round(v, 2)), 1, 16) || '...' AS "v_csv", max(d.rowid) AS "max_rowid" FROM d, tt WHERE d.t <= tt.t and d.m = tt.m GROUP BY d.m ORDER BY d.m, d.t ) WHERE n_points = 20; This is now linear on the number of distinct m rather than the number of rows in table d. > -----Original Message----- > From: sqlite-users-bounces at mailinglists.sqlite.org [mailto:sqlite-users- > bounces at mailinglists.sqlite.org] On Behalf Of John Pitney > Sent: Friday, 24 April, 2015 20:13 > To: sqlite-users > Subject: [sqlite] First-N query time scaling with table size > > In my application, values v associated with metrics m and times t are > inserted into one big table d. Another process opens the same database > and looks in d for metrics with at least N entries, sorted by t. 
I'm > finding that the time to complete the query grows linearly with the > row count M of table d, even though, in the example here, the query > results are identical for any M >= 200. > > I've made a self-contained example, where N = 20 and M = 200. Is > there a way to write the final select statement so that its completion > time does not grow with M? > > -- demo.sql > -- create big table of data with M = 200 > CREATE TABLE d (m INT NOT NULL, t INT NOT NULL, v REAL); > CREATE UNIQUE INDEX d_ind_1 ON d (m, t); > .timer on > INSERT INTO d (m, t, v) > WITH RECURSIVE > cnt(x) AS (VALUES(0) UNION ALL SELECT x+1 FROM cnt WHERE x<200) > SELECT (x % 10) + 1 AS m, x+1 AS t, 1.0 AS v > FROM cnt; > .timer off > > -- arrange for one rare value of m, which should not appear in results > DELETE FROM d WHERE m = 5; > INSERT INTO d (m, t, v) > WITH RECURSIVE > cnt(x) AS (VALUES(0) UNION ALL SELECT x+1 FROM cnt WHERE x<10) > SELECT 5 AS m, x + 1230000 AS t, 1.0 AS v > FROM cnt; > > -- now, for each value of m, look for exactly N rows, ordered by t > CREATE TEMP TABLE tt (m INT NOT NULL); > INSERT INTO tt (m) SELECT DISTINCT m from d; > .timer on > SELECT m, n_points, t_csv, v_csv, max_rowid FROM > (SELECT tt.m AS "m", > count(v) AS "n_points", > substr(group_concat(t), 1, 16) || '...' AS "t_csv", > substr(group_concat(round(v, 2)), 1, 16) || '...' 
AS "v_csv", > max(d.rowid) AS "max_rowid" > FROM tt, d > WHERE tt.m = d.m > AND d.t <= (SELECT MAX(d3.t) FROM (SELECT d2.t FROM d AS d2 > WHERE d.m = d2.m ORDER BY d2.t LIMIT 20) AS d3) > GROUP BY tt.m ORDER BY tt.m, d.t) > WHERE n_points = 20; > -- end of demo.sql > > The results are the following, on a Windows 7 64-bit platform: > > $ sqlite3 -version > 3.8.4.3 2014-04-03 16:53:12 a611fa96c4a848614efe899130359c9f6fb889c3 > > $ sqlite3 < demo.sql > Run Time: real 0.003 user 0.000000 sys 0.000000 > 1|20|1,11,21,31,41,51...|1.0,1.0,1.0,1.0,...|191 > 2|20|2,12,22,32,42,52...|1.0,1.0,1.0,1.0,...|192 > 3|20|3,13,23,33,43,53...|1.0,1.0,1.0,1.0,...|193 > 4|20|4,14,24,34,44,54...|1.0,1.0,1.0,1.0,...|194 > 6|20|6,16,26,36,46,56...|1.0,1.0,1.0,1.0,...|196 > 7|20|7,17,27,37,47,57...|1.0,1.0,1.0,1.0,...|197 > 8|20|8,18,28,38,48,58...|1.0,1.0,1.0,1.0,...|198 > 9|20|9,19,29,39,49,59...|1.0,1.0,1.0,1.0,...|199 > 10|20|10,20,30,40,50,6...|1.0,1.0,1.0,1.0,...|200 > Run Time: real 0.012 user 0.000000 sys 0.000000 > > $ sed -e 's/200/200000/' < demo.sql | sqlite3 > Run Time: real 0.608 user 0.592804 sys 0.000000 > 1|20|1,11,21,31,41,51...|1.0,1.0,1.0,1.0,...|191 > 2|20|2,12,22,32,42,52...|1.0,1.0,1.0,1.0,...|192 > 3|20|3,13,23,33,43,53...|1.0,1.0,1.0,1.0,...|193 > 4|20|4,14,24,34,44,54...|1.0,1.0,1.0,1.0,...|194 > 6|20|6,16,26,36,46,56...|1.0,1.0,1.0,1.0,...|196 > 7|20|7,17,27,37,47,57...|1.0,1.0,1.0,1.0,...|197 > 8|20|8,18,28,38,48,58...|1.0,1.0,1.0,1.0,...|198 > 9|20|9,19,29,39,49,59...|1.0,1.0,1.0,1.0,...|199 > 10|20|10,20,30,40,50,6...|1.0,1.0,1.0,1.0,...|200 > Run Time: real 1.063 user 1.045207 sys 0.000000 > > $ sed -e 's/200/400000/' < demo.sql | sqlite3 > Run Time: real 1.255 user 1.248008 sys 0.000000 > 1|20|1,11,21,31,41,51...|1.0,1.0,1.0,1.0,...|191 > 2|20|2,12,22,32,42,52...|1.0,1.0,1.0,1.0,...|192 > 3|20|3,13,23,33,43,53...|1.0,1.0,1.0,1.0,...|193 > 4|20|4,14,24,34,44,54...|1.0,1.0,1.0,1.0,...|194 > 6|20|6,16,26,36,46,56...|1.0,1.0,1.0,1.0,...|196 > 
7|20|7,17,27,37,47,57...|1.0,1.0,1.0,1.0,...|197 > 8|20|8,18,28,38,48,58...|1.0,1.0,1.0,1.0,...|198 > 9|20|9,19,29,39,49,59...|1.0,1.0,1.0,1.0,...|199 > 10|20|10,20,30,40,50,6...|1.0,1.0,1.0,1.0,...|200 > Run Time: real 2.090 user 2.074813 sys 0.000000 > > -- > John Pitney > john at pitney.org > _______________________________________________ > sqlite-users mailing list > sqlite-users at mailinglists.sqlite.org > http://mailinglists.sqlite.org/cgi-bin/mailman/listinfo/sqlite-users