diff options
Diffstat (limited to '')
-rw-r--r-- | test/whereN.test | 103 |
1 files changed, 103 insertions, 0 deletions
diff --git a/test/whereN.test b/test/whereN.test new file mode 100644 index 0000000..b9b889f --- /dev/null +++ b/test/whereN.test @@ -0,0 +1,103 @@ +# 2024-04-02 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# Tests for the whereInterstageHeuristic() routine in the query planner. +# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +set testprefix whereN + +# The following is a simplified and "sanitized" version of the original +# real-world query that brought the problem to light. +# +# The issue is a slow query. The answer is correct, but it was taking too +# much time, because it was doing a full table scan rather than an indexed +# lookup. +# +# The problem was that the query planner was overestimating the number of +# output rows. The estimated number of output rows is accurate if the +# DSNAME parameter is "ds-one". In that case, a large fraction of the rows +# in "violation" end up being output. The query planner correctly deduces +# that it is faster to do a full table scan of the large "violation" table +# to avoid the after-query sort that implements the ORDER BY clause. However, +# if the DSNAME is "ds-two", then only a few rows (about 6) are generated, +# and it is much much faster to do an indexed lookup of "violation" followed +# by a sort operation to implement ORDER BY +# +# The problem, of course, is that the query planner has no way of knowing +# in advance how many rows will be generated. The query planner tries to +# estimate a worst case, which is a large number of output rows, and it picks +# the best plan for that case. However, the plan choosen is very inefficient +# when the number of output rows is small. +# +# The whereInterstageHeuristic() routine in the query planner attempts to +# correct this by adjusting the query plan such that it avoids the very bad +# query plan for a small number of rows, at the expense of a slightly less +# efficient plan for a large number of rows. The large number of rows case +# is perhaps 5% slower with the revised plan, but the small number of +# rows case is around 100 times faster. That seems like a good tradeoff. +# +do_execsql_test 1.0 { + CREATE TABLE datasource(dsid INT, name TEXT); + INSERT INTO datasource VALUES(1,'ds-one'),(2,'ds-two'),(3,'ds-three'); + CREATE INDEX ds1 ON datasource(name, dsid); + + CREATE TABLE rule(rid INT, team_id INT, dsid INT); + WITH RECURSIVE c(n) AS (VALUES(1) UNION ALL SELECT n+1 FROM c WHERE n<9) + INSERT INTO rule(rid,team_id,dsid) SELECT n, 1, 1 FROM c; + WITH RECURSIVE c(n) AS (VALUES(10) UNION ALL SELECT n+1 FROM c WHERE n<24) + INSERT INTO rule(rid,team_id,dsid) SELECT n, 2, 2 FROM c; + CREATE INDEX rule2 ON rule(dsid, rid); + + CREATE TABLE violation(vid INT, rid INT, vx BLOB); + /*** Uncomment to insert actual data + WITH src(rid, cnt) AS (VALUES(1,3586),(2,1343),(3,6505),(5,76230), + (6,740),(7,287794),(8,457),(12,1), + (14,1),(16,1),(17,1),(18,1),(19,1)) + INSERT INTO violation(vid, rid, vx) + SELECT rid*1000000+value, rid, randomblob(15) + FROM src, generate_series(1,cnt); + ***/ + CREATE INDEX v1 ON violation(rid, vid); + CREATE INDEX v2 ON violation(vid); + ANALYZE; + DELETE FROM sqlite_stat1; + DROP TABLE IF EXISTS sqlite_stat4; + INSERT INTO sqlite_stat1 VALUES + ('violation','v2','376661 1'), + ('violation','v1','376661 28974 1'), + ('rule','rule2','24 12 1'), + ('datasource','ds1','3 1 1'); + ANALYZE sqlite_schema; +} +set DSNAME ds-two ;# Only a few rows. Change to "ds-one" for many rows. +do_eqp_test 1.1 { + SELECT count(*), length(group_concat(vx)) FROM ( + SELECT V.* + FROM datasource DS, rule R, violation V + WHERE V.rid=R.rid + AND R.dsid=DS.dsid + AND DS.name=$DSNAME + ORDER BY V.vid desc + ); +} { + QUERY PLAN + |--CO-ROUTINE (subquery-xxxxxx) + | |--SEARCH DS USING COVERING INDEX ds1 (name=?) + | |--SEARCH R USING COVERING INDEX rule2 (dsid=?) + | |--SEARCH V USING INDEX v1 (rid=?) + | `--USE TEMP B-TREE FOR ORDER BY + `--SCAN (subquery-xxxxxx) +} +# ^^^^---- We want to see three SEARCH terms. No SCAN terms. +# The ORDER BY is implemented by a separate sorter pass. + +finish_test |