1 files changed, 103 insertions, 0 deletions
diff --git a/test/whereN.test b/test/whereN.test
new file mode 100644
index 0000000..b9b889f
--- /dev/null
+++ b/test/whereN.test
@@ -0,0 +1,103 @@
+# 2024-04-02
+#
+# The author disclaims copyright to this source code.  In place of
+# a legal notice, here is a blessing:
+#
+#    May you do good and not evil.
+#    May you find forgiveness for yourself and forgive others.
+#    May you share freely, never taking more than you give.
+#
+#***********************************************************************
+# Tests for the whereInterstageHeuristic() routine in the query planner.
+#
+
+set testdir [file dirname $argv0]
+source $testdir/tester.tcl
+set testprefix whereN
+
+# The following is a simplified and "sanitized" version of the original
+# real-world query that brought the problem to light.
+#
+# The issue is a slow query.  The answer is correct, but it was taking too
+# much time, because it was doing a full table scan rather than an indexed
+# lookup.
+#
+# The problem was that the query planner was overestimating the number of
+# output rows.  The estimated number of output rows is accurate if the
+# DSNAME parameter is "ds-one".  In that case, a large fraction of the rows
+# in "violation" end up being output.  The query planner correctly deduces
+# that it is faster to do a full table scan of the large "violation" table
+# to avoid the after-query sort that implements the ORDER BY clause. However,
+# if the DSNAME is "ds-two", then only a few rows (about 6) are generated,
+# and it is much much faster to do an indexed lookup of "violation" followed
+# by a sort operation to implement ORDER BY
+#
+# The problem, of course, is that the query planner has no way of knowing
+# in advance how many rows will be generated.  The query planner tries to
+# estimate a worst case, which is a large number of output rows, and it picks
+# the best plan for that case.  However, the plan choosen is very inefficient
+# when the number of output rows is small.
+#
+# The whereInterstageHeuristic() routine in the query planner attempts to
+# correct this by adjusting the query plan such that it avoids the very bad
+# query plan for a small number of rows, at the expense of a slightly less
+# efficient plan for a large number of rows.  The large number of rows case
+# is perhaps 5% slower with the revised plan, but the small number of
+# rows case is around 100 times faster.  That seems like a good tradeoff.
+#
+do_execsql_test 1.0 {
+  CREATE TABLE datasource(dsid INT, name TEXT);
+  INSERT INTO datasource VALUES(1,'ds-one'),(2,'ds-two'),(3,'ds-three');
+  CREATE INDEX ds1 ON datasource(name, dsid);
+
+  CREATE TABLE rule(rid INT, team_id INT, dsid INT);
+  WITH RECURSIVE c(n) AS (VALUES(1) UNION ALL SELECT n+1 FROM c WHERE n<9)
+    INSERT INTO rule(rid,team_id,dsid) SELECT n, 1, 1 FROM c;
+  WITH RECURSIVE c(n) AS (VALUES(10) UNION ALL SELECT n+1 FROM c WHERE n<24)
+    INSERT INTO rule(rid,team_id,dsid) SELECT n, 2, 2 FROM c;
+  CREATE INDEX rule2 ON rule(dsid, rid);
+
+  CREATE TABLE violation(vid INT, rid INT, vx BLOB);
+  /***  Uncomment to insert actual data
+  WITH src(rid, cnt) AS (VALUES(1,3586),(2,1343),(3,6505),(5,76230),
+                               (6,740),(7,287794),(8,457),(12,1),
+                               (14,1),(16,1),(17,1),(18,1),(19,1))
+    INSERT INTO violation(vid, rid, vx)
+      SELECT rid*1000000+value, rid, randomblob(15)
+        FROM src, generate_series(1,cnt);
+  ***/
+  CREATE INDEX v1 ON violation(rid, vid);
+  CREATE INDEX v2 ON violation(vid);
+  ANALYZE;
+  DELETE FROM sqlite_stat1;
+  DROP TABLE IF EXISTS sqlite_stat4;
+  INSERT INTO sqlite_stat1 VALUES
+    ('violation','v2','376661 1'),
+    ('violation','v1','376661 28974 1'),
+    ('rule','rule2','24 12 1'),
+    ('datasource','ds1','3 1 1');
+  ANALYZE sqlite_schema;
+}
+set DSNAME ds-two   ;#  Only a few rows.  Change to "ds-one" for many rows.
+do_eqp_test 1.1 {
+  SELECT count(*), length(group_concat(vx)) FROM (
+    SELECT V.*
+      FROM datasource DS, rule R, violation V
+     WHERE V.rid=R.rid
+       AND R.dsid=DS.dsid
+       AND DS.name=$DSNAME
+     ORDER BY V.vid desc
+  );
+} {
+  QUERY PLAN
+  |--CO-ROUTINE (subquery-xxxxxx)
+  |  |--SEARCH DS USING COVERING INDEX ds1 (name=?)
+  |  |--SEARCH R USING COVERING INDEX rule2 (dsid=?)
+  |  |--SEARCH V USING INDEX v1 (rid=?)
+  |  `--USE TEMP B-TREE FOR ORDER BY
+  `--SCAN (subquery-xxxxxx)
+}
+#       ^^^^---- We want to see three SEARCH terms.  No SCAN terms.
+#                The ORDER BY is implemented by a separate sorter pass.
+
+finish_test