On Fri, Apr 24, 2015 at 8:22 PM, Keith Medcalf <kmedcalf at dessus.com> wrote:
>
> Your query has to visit every row of table d and execute the correlated
> subquery multiple times.
>
> You need to devise a way to do this only once for each d.m and then join that
> table back into your query.
>
>>sqlite < demo.sql
> .eqp on
> .timer on
>
> CREATE TABLE d
> (
>     m INT NOT NULL,
>     t INT NOT NULL,
>     v REAL
> );
> --EQP-- 0,0,0,SEARCH TABLE sqlite_master USING INTEGER PRIMARY KEY (rowid=?)
> Run Time: real 0.007 user 0.000000 sys 0.000000
> CREATE UNIQUE INDEX d_ind_1 ON d (m, t);
> Run Time: real 0.002 user 0.000000 sys 0.000000
>
> .timer on
> INSERT INTO d (m, t, v)
> WITH RECURSIVE
>     cnt(x) AS ( VALUES(0)
>                 UNION ALL
>                 SELECT x+1
>                   FROM cnt
>                  WHERE x<200000
>               )
> SELECT (x % 10) + 1 AS m,
>        x+1 AS t,
>        1.0 AS v
>   FROM cnt;
> --EQP-- 3,0,0,SCAN TABLE cnt
> --EQP-- 1,0,0,COMPOUND SUBQUERIES 0 AND 0 (UNION ALL)
> --EQP-- 0,0,0,SCAN SUBQUERY 1
> Run Time: real 1.064 user 1.046875 sys 0.000000
>
> -- arrange for one rare value of m, which should not appear in results
> DELETE FROM d
>  WHERE m = 5;
> --EQP-- 0,0,0,SEARCH TABLE d USING COVERING INDEX d_ind_1 (m=?)
> Run Time: real 0.060 user 0.046875 sys 0.000000
> INSERT INTO d (m, t, v)
> WITH RECURSIVE
>     cnt(x) AS ( VALUES(0)
>                 UNION ALL
>                 SELECT x+1
>                   FROM cnt
>                  WHERE x<10)
> SELECT 5 AS m,
>        x + 1230000 AS t,
>        1.0 AS v
>   FROM cnt;
> --EQP-- 3,0,0,SCAN TABLE cnt
> --EQP-- 1,0,0,COMPOUND SUBQUERIES 0 AND 0 (UNION ALL)
> --EQP-- 0,0,0,SCAN SUBQUERY 1
> Run Time: real 0.013 user 0.000000 sys 0.000000
>
> -- now, for each value of m, look for exactly N rows, ordered by t
> CREATE TEMP TABLE tt (m INTEGER NOT NULL UNIQUE, t integer not null);
> --EQP-- 0,0,0,SEARCH TABLE sqlite_temp_master USING INTEGER PRIMARY KEY (rowid=?)
> Run Time: real 0.011 user 0.000000 sys 0.000000
> INSERT INTO tt (m, t) SELECT DISTINCT m, 0 from d;
> --EQP-- 0,0,0,SCAN TABLE d USING COVERING INDEX d_ind_1
> --EQP-- 0,0,0,USE TEMP B-TREE FOR DISTINCT
> Run Time: real 0.076 user 0.046875 sys 0.000000
> update tt
>    set t = (SELECT MAX(d3.t)
>               FROM (SELECT d2.t
>                       FROM d AS d2
>                      WHERE tt.m = d2.m
>                      ORDER BY d2.t
>                      LIMIT 20
>                    ) AS d3
>            );
> --EQP-- 0,0,0,SCAN TABLE tt
> --EQP-- 0,0,0,EXECUTE CORRELATED SCALAR SUBQUERY 0
> --EQP-- 1,0,0,SEARCH TABLE d AS d2 USING COVERING INDEX d_ind_1 (m=?)
> --EQP-- 0,0,0,SEARCH SUBQUERY 1 AS d3
> Run Time: real 0.058 user 0.000000 sys 0.000000
> select * from tt;
> --EQP-- 0,0,0,SCAN TABLE tt
> 1|191
> 2|192
> 3|193
> 4|194
> 5|1230010
> 6|196
> 7|197
> 8|198
> 9|199
> 10|200
> Run Time: real 0.053 user 0.000000 sys 0.000000
>
> ANALYZE;
> --EQP-- 0,0,0,SEARCH TABLE sqlite_master USING INTEGER PRIMARY KEY (rowid=?)
> --EQP-- 0,0,0,SEARCH TABLE sqlite_master USING INTEGER PRIMARY KEY (rowid=?)
> Run Time: real 0.173 user 0.156250 sys 0.000000
>
>
> SELECT m, n_points, t_csv, v_csv, max_rowid
>   FROM (SELECT d.m AS "m",
>                count(v) AS "n_points",
>                substr(group_concat(d.t), 1, 16) || '...' AS "t_csv",
>                substr(group_concat(round(v, 2)), 1, 16) || '...' AS "v_csv",
>                max(d.rowid) AS "max_rowid"
>           FROM d, tt
>          WHERE d.t <= tt.t
>            and d.m = tt.m
>          GROUP BY d.m
>          ORDER BY d.m, d.t
>        )
>  WHERE n_points = 20;
> --EQP-- 1,0,0,SCAN TABLE d USING INDEX d_ind_1
> --EQP-- 1,1,1,SEARCH TABLE tt USING INDEX sqlite_autoindex_tt_1 (m=?)
> --EQP-- 1,0,0,USE TEMP B-TREE FOR ORDER BY
> --EQP-- 0,0,0,SCAN SUBQUERY 1
> 1|20|1,11,21,31,41,51...|1.0,1.0,1.0,1.0,...|191
> 2|20|2,12,22,32,42,52...|1.0,1.0,1.0,1.0,...|192
> 3|20|3,13,23,33,43,53...|1.0,1.0,1.0,1.0,...|193
> 4|20|4,14,24,34,44,54...|1.0,1.0,1.0,1.0,...|194
> 6|20|6,16,26,36,46,56...|1.0,1.0,1.0,1.0,...|196
> 7|20|7,17,27,37,47,57...|1.0,1.0,1.0,1.0,...|197
> 8|20|8,18,28,38,48,58...|1.0,1.0,1.0,1.0,...|198
> 9|20|9,19,29,39,49,59...|1.0,1.0,1.0,1.0,...|199
> 10|20|10,20,30,40,50,6...|1.0,1.0,1.0,1.0,...|200
> Run Time: real 0.308 user 0.203125 sys 0.000000
>
> SELECT m, n_points, t_csv, v_csv, max_rowid
>   FROM (SELECT d.m AS "m",
>                count(v) AS "n_points",
>                substr(group_concat(t), 1, 16) || '...' AS "t_csv",
>                substr(group_concat(round(v, 2)), 1, 16) || '...' AS "v_csv",
>                max(d.rowid) AS "max_rowid"
>           FROM d
>          WHERE d.t <= (SELECT MAX(d3.t)
>                          FROM (SELECT d2.t
>                                  FROM d AS d2
>                                 WHERE d.m = d2.m
>                                 ORDER BY d2.t
>                                 LIMIT 20
>                               ) AS d3
>                       )
>          GROUP BY d.m
>          ORDER BY d.m, d.t
>        )
>  WHERE n_points = 20;
> --EQP-- 1,0,0,SCAN TABLE d USING INDEX d_ind_1
> --EQP-- 1,0,0,EXECUTE CORRELATED SCALAR SUBQUERY 2
> --EQP-- 3,0,0,SEARCH TABLE d AS d2 USING COVERING INDEX d_ind_1 (m=?)
> --EQP-- 2,0,0,SEARCH SUBQUERY 3 AS d3
> --EQP-- 1,0,0,USE TEMP B-TREE FOR ORDER BY
> --EQP-- 0,0,0,SCAN SUBQUERY 1
> 1|20|1,11,21,31,41,51...|1.0,1.0,1.0,1.0,...|191
> 2|20|2,12,22,32,42,52...|1.0,1.0,1.0,1.0,...|192
> 3|20|3,13,23,33,43,53...|1.0,1.0,1.0,1.0,...|193
> 4|20|4,14,24,34,44,54...|1.0,1.0,1.0,1.0,...|194
> 6|20|6,16,26,36,46,56...|1.0,1.0,1.0,1.0,...|196
> 7|20|7,17,27,37,47,57...|1.0,1.0,1.0,1.0,...|197
> 8|20|8,18,28,38,48,58...|1.0,1.0,1.0,1.0,...|198
> 9|20|9,19,29,39,49,59...|1.0,1.0,1.0,1.0,...|199
> 10|20|10,20,30,40,50,6...|1.0,1.0,1.0,1.0,...|200
> Run Time: real 2.421 user 2.281250 sys 0.000000
>
>
> 2015-04-24 21:17:08 [D:\Temp]
>>
>
> In case you missed it the answer is:
>
> CREATE TEMP TABLE tt (m INTEGER NOT NULL UNIQUE, t integer not null);
>
> INSERT INTO tt (m, t) SELECT DISTINCT m, 0 from d;
>
> update tt
>    set t = (SELECT MAX(d3.t)
>               FROM (SELECT d2.t
>                       FROM d AS d2
>                      WHERE tt.m = d2.m
>                      ORDER BY d2.t
>                      LIMIT 20
>                    ) AS d3
>            );
>
> SELECT m, n_points, t_csv, v_csv, max_rowid
>   FROM (SELECT d.m AS "m",
>                count(v) AS "n_points",
>                substr(group_concat(d.t), 1, 16) || '...' AS "t_csv",
>                substr(group_concat(round(v, 2)), 1, 16) || '...' AS "v_csv",
>                max(d.rowid) AS "max_rowid"
>           FROM d, tt
>          WHERE d.t <= tt.t
>            and d.m = tt.m
>          GROUP BY d.m
>          ORDER BY d.m, d.t
>        )
>  WHERE n_points = 20;
>
> This is now linear on the number of distinct m rather than the number of rows
> in table d.
This approach gives a massive reduction in the query time. Surprisingly, the
query time still seems to increase linearly with M (the number of distinct
values of m). The following query was even faster, but still O(M). It does not
get d.rowid, but I had only included that as a diagnostic and can do without
it.

SELECT m, n_points, t_csv, v_csv, max_rowid
  FROM (SELECT d2.m AS "m",
               count(v) AS "n_points",
               substr(group_concat(d2.t), 1, 16) || '...' AS "t_csv",
               substr(group_concat(round(v, 2)), 1, 16) || '...' AS "v_csv",
               max(d2.rowid) AS "max_rowid"
          FROM tt,
               (SELECT * FROM d WHERE d.t <= (SELECT MAX(tt.t) FROM tt)) AS d2
         WHERE d2.t <= tt.t
           AND tt.m = d2.m
         GROUP BY d2.m
         ORDER BY d2.m, d2.t)
 WHERE n_points = 20;
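
For anyone who wants to check the O(M) behaviour themselves, here is a rough
sketch (an illustration only, reusing the generator and temp-table statements
from Keith's script above; the 50 is just an arbitrary larger count of distinct
m values to test with):

-- Regenerate d with 50 distinct values of m instead of 10, keeping the same
-- total row count, then rebuild tt and re-time the final SELECT.
DELETE FROM d;
INSERT INTO d (m, t, v)
WITH RECURSIVE
    cnt(x) AS ( VALUES(0)
                UNION ALL
                SELECT x+1 FROM cnt WHERE x<200000 )
SELECT (x % 50) + 1 AS m,
       x+1 AS t,
       1.0 AS v
  FROM cnt;

DELETE FROM tt;
INSERT INTO tt (m, t) SELECT DISTINCT m, 0 FROM d;
update tt
   set t = (SELECT MAX(d3.t)
              FROM (SELECT d2.t
                      FROM d AS d2
                     WHERE tt.m = d2.m
                     ORDER BY d2.t
                     LIMIT 20
                   ) AS d3
           );

.timer on
-- ...then re-run the SELECT above and compare the reported real time against
-- the 10-distinct-m run to see how the cost scales with the number of m values.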