1 files changed, 205 insertions, 0 deletions
diff --git a/doc/src/sgml/html/sql-createstatistics.html b/doc/src/sgml/html/sql-createstatistics.html
new file mode 100644
index 0000000..b020a3a
--- /dev/null
+++ b/doc/src/sgml/html/sql-createstatistics.html
@@ -0,0 +1,205 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>CREATE STATISTICS</title><link rel="stylesheet" type="text/css" href="stylesheet.css" /><link rev="made" href="pgsql-docs@lists.postgresql.org" /><meta name="generator" content="DocBook XSL Stylesheets Vsnapshot" /><link rel="prev" href="sql-createserver.html" title="CREATE SERVER" /><link rel="next" href="sql-createsubscription.html" title="CREATE SUBSCRIPTION" /></head><body id="docContent" class="container-fluid col-10"><div xmlns="http://www.w3.org/TR/xhtml1/transitional" class="navheader"><table width="100%" summary="Navigation header"><tr><th colspan="5" align="center">CREATE STATISTICS</th></tr><tr><td width="10%" align="left"><a accesskey="p" href="sql-createserver.html" title="CREATE SERVER">Prev</a> </td><td width="10%" align="left"><a accesskey="u" href="sql-commands.html" title="SQL Commands">Up</a></td><th width="60%" align="center">SQL Commands</th><td width="10%" align="right"><a accesskey="h" href="index.html" title="PostgreSQL 14.5 Documentation">Home</a></td><td width="10%" align="right"> <a accesskey="n" href="sql-createsubscription.html" title="CREATE SUBSCRIPTION">Next</a></td></tr></table><hr></hr></div><div class="refentry" id="SQL-CREATESTATISTICS"><div class="titlepage"></div><a id="id-1.9.3.83.1" class="indexterm"></a><div class="refnamediv"><h2><span class="refentrytitle">CREATE STATISTICS</span></h2><p>CREATE STATISTICS — define extended statistics</p></div><div class="refsynopsisdiv"><h2>Synopsis</h2><pre class="synopsis">
+CREATE STATISTICS [ IF NOT EXISTS ] <em class="replaceable"><code>statistics_name</code></em>
+    ON ( <em class="replaceable"><code>expression</code></em> )
+    FROM <em class="replaceable"><code>table_name</code></em>
+
+CREATE STATISTICS [ IF NOT EXISTS ] <em class="replaceable"><code>statistics_name</code></em>
+    [ ( <em class="replaceable"><code>statistics_kind</code></em> [, ... ] ) ]
+    ON { <em class="replaceable"><code>column_name</code></em> | ( <em class="replaceable"><code>expression</code></em> ) }, { <em class="replaceable"><code>column_name</code></em> | ( <em class="replaceable"><code>expression</code></em> ) } [, ...]
+    FROM <em class="replaceable"><code>table_name</code></em>
+</pre></div><div class="refsect1" id="SQL-CREATESTATISTICS-DESCRIPTION"><h2>Description</h2><p>
+   <code class="command">CREATE STATISTICS</code> will create a new extended statistics
+   object tracking data about the specified table, foreign table or
+   materialized view.  The statistics object will be created in the current
+   database and will be owned by the user issuing the command.
+  </p><p>
+   The <code class="command">CREATE STATISTICS</code> command has two basic forms. The
+   first form allows univariate statistics for a single expression to be
+   collected, providing benefits similar to an expression index without the
+   overhead of index maintenance.  This form does not allow the statistics
+   kind to be specified, since the various statistics kinds refer only to
+   multivariate statistics.  The second form of the command allows
+   multivariate statistics on multiple columns and/or expressions to be
+   collected, optionally specifying which statistics kinds to include.  This
+   form will also automatically cause univariate statistics to be collected on
+   any expressions included in the list.
+  </p><p>
+   If a schema name is given (for example, <code class="literal">CREATE STATISTICS
+   myschema.mystat ...</code>) then the statistics object is created in the
+   specified schema.  Otherwise it is created in the current schema.
+   The name of the statistics object must be distinct from the name of any
+   other statistics object in the same schema.
+  </p></div><div class="refsect1" id="id-1.9.3.83.6"><h2>Parameters</h2><div class="variablelist"><dl class="variablelist"><dt><span class="term"><code class="literal">IF NOT EXISTS</code></span></dt><dd><p>
+      Do not throw an error if a statistics object with the same name already
+      exists.  A notice is issued in this case.  Note that only the name of
+      the statistics object is considered here, not the details of its
+      definition.
+     </p></dd><dt><span class="term"><em class="replaceable"><code>statistics_name</code></em></span></dt><dd><p>
+      The name (optionally schema-qualified) of the statistics object to be
+      created.
+     </p></dd><dt><span class="term"><em class="replaceable"><code>statistics_kind</code></em></span></dt><dd><p>
+      A multivariate statistics kind to be computed in this statistics object.
+      Currently supported kinds are
+      <code class="literal">ndistinct</code>, which enables n-distinct statistics,
+      <code class="literal">dependencies</code>, which enables functional
+      dependency statistics, and <code class="literal">mcv</code> which enables
+      most-common values lists.
+      If this clause is omitted, all supported statistics kinds are
+      included in the statistics object. Univariate expression statistics are
+      built automatically if the statistics definition includes any complex
+      expressions rather than just simple column references.
+      For more information, see <a class="xref" href="planner-stats.html#PLANNER-STATS-EXTENDED" title="14.2.2. Extended Statistics">Section 14.2.2</a>
+      and <a class="xref" href="multivariate-statistics-examples.html" title="72.2. Multivariate Statistics Examples">Section 72.2</a>.
+     </p></dd><dt><span class="term"><em class="replaceable"><code>column_name</code></em></span></dt><dd><p>
+      The name of a table column to be covered by the computed statistics.
+      This is only allowed when building multivariate statistics.  At least
+      two column names or expressions must be specified, and their order is
+      not significant.
+     </p></dd><dt><span class="term"><em class="replaceable"><code>expression</code></em></span></dt><dd><p>
+      An expression to be covered by the computed statistics.  This may be
+      used to build univariate statistics on a single expression, or as part
+      of a list of multiple column names and/or expressions to build
+      multivariate statistics.  In the latter case, separate univariate
+      statistics are built automatically for each expression in the list.
+     </p></dd><dt><span class="term"><em class="replaceable"><code>table_name</code></em></span></dt><dd><p>
+      The name (optionally schema-qualified) of the table containing the
+      column(s) the statistics are computed on.
+     </p></dd></dl></div></div><div class="refsect1" id="id-1.9.3.83.7"><h2>Notes</h2><p>
+   You must be the owner of a table to create a statistics object
+   reading it.  Once created, however, the ownership of the statistics
+   object is independent of the underlying table(s).
+  </p><p>
+   Expression statistics are per-expression and are similar to creating an
+   index on the expression, except that they avoid the overhead of index
+   maintenance. Expression statistics are built automatically for each
+   expression in the statistics object definition.
+  </p></div><div class="refsect1" id="SQL-CREATESTATISTICS-EXAMPLES"><h2>Examples</h2><p>
+   Create table <code class="structname">t1</code> with two functionally dependent columns, i.e.,
+   knowledge of a value in the first column is sufficient for determining the
+   value in the other column. Then functional dependency statistics are built
+   on those columns:
+
+</p><pre class="programlisting">
+CREATE TABLE t1 (
+    a   int,
+    b   int
+);
+
+INSERT INTO t1 SELECT i/100, i/500
+                 FROM generate_series(1,1000000) s(i);
+
+ANALYZE t1;
+
+-- the number of matching rows will be drastically underestimated:
+EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 0);
+
+CREATE STATISTICS s1 (dependencies) ON a, b FROM t1;
+
+ANALYZE t1;
+
+-- now the row count estimate is more accurate:
+EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 0);
+</pre><p>
+
+   Without functional-dependency statistics, the planner would assume
+   that the two <code class="literal">WHERE</code> conditions are independent, and would
+   multiply their selectivities together to arrive at a much-too-small
+   row count estimate.
+   With such statistics, the planner recognizes that the <code class="literal">WHERE</code>
+   conditions are redundant and does not underestimate the row count.
+  </p><p>
+   Create table <code class="structname">t2</code> with two perfectly correlated columns
+   (containing identical data), and an MCV list on those columns:
+
+</p><pre class="programlisting">
+CREATE TABLE t2 (
+    a   int,
+    b   int
+);
+
+INSERT INTO t2 SELECT mod(i,100), mod(i,100)
+                 FROM generate_series(1,1000000) s(i);
+
+CREATE STATISTICS s2 (mcv) ON a, b FROM t2;
+
+ANALYZE t2;
+
+-- valid combination (found in MCV)
+EXPLAIN ANALYZE SELECT * FROM t2 WHERE (a = 1) AND (b = 1);
+
+-- invalid combination (not found in MCV)
+EXPLAIN ANALYZE SELECT * FROM t2 WHERE (a = 1) AND (b = 2);
+</pre><p>
+
+   The MCV list gives the planner more detailed information about the
+   specific values that commonly appear in the table, as well as an upper
+   bound on the selectivities of combinations of values that do not appear
+   in the table, allowing it to generate better estimates in both cases.
+  </p><p>
+   Create table <code class="structname">t3</code> with a single timestamp column,
+   and run queries using expressions on that column.  Without extended
+   statistics, the planner has no information about the data distribution for
+   the expressions, and uses default estimates.  The planner also does not
+   realize that the value of the date truncated to the month is fully
+   determined by the value of the date truncated to the day. Then expression
+   and ndistinct statistics are built on those two expressions:
+
+</p><pre class="programlisting">
+CREATE TABLE t3 (
+    a   timestamp
+);
+
+INSERT INTO t3 SELECT i FROM generate_series('2020-01-01'::timestamp,
+                                             '2020-12-31'::timestamp,
+                                             '1 minute'::interval) s(i);
+
+ANALYZE t3;
+
+-- the number of matching rows will be drastically underestimated:
+EXPLAIN ANALYZE SELECT * FROM t3
+  WHERE date_trunc('month', a) = '2020-01-01'::timestamp;
+
+EXPLAIN ANALYZE SELECT * FROM t3
+  WHERE date_trunc('day', a) BETWEEN '2020-01-01'::timestamp
+                                 AND '2020-06-30'::timestamp;
+
+EXPLAIN ANALYZE SELECT date_trunc('month', a), date_trunc('day', a)
+   FROM t3 GROUP BY 1, 2;
+
+-- build ndistinct statistics on the pair of expressions (per-expression
+-- statistics are built automatically)
+CREATE STATISTICS s3 (ndistinct) ON date_trunc('month', a), date_trunc('day', a) FROM t3;
+
+ANALYZE t3;
+
+-- now the row count estimates are more accurate:
+EXPLAIN ANALYZE SELECT * FROM t3
+  WHERE date_trunc('month', a) = '2020-01-01'::timestamp;
+
+EXPLAIN ANALYZE SELECT * FROM t3
+  WHERE date_trunc('day', a) BETWEEN '2020-01-01'::timestamp
+                                 AND '2020-06-30'::timestamp;
+
+EXPLAIN ANALYZE SELECT date_trunc('month', a), date_trunc('day', a)
+   FROM t3 GROUP BY 1, 2;
+</pre><p>
+
+   Without expression and ndistinct statistics, the planner has no information
+   about the number of distinct values for the expressions, and has to rely
+   on default estimates. The equality and range conditions are assumed to have
+   0.5% selectivity, and the number of distinct values in the expression is
+   assumed to be the same as for the column (i.e. unique). This results in a
+   significant underestimate of the row count in the first two queries. Moreover,
+   the planner has no information about the relationship between the expressions,
+   so it assumes the two <code class="literal">WHERE</code> and <code class="literal">GROUP BY</code>
+   conditions are independent, and multiplies their selectivities together to
+   arrive at a severe overestimate of the group count in the aggregate query.
+   This is further exacerbated by the lack of accurate statistics for the
+   expressions, forcing the planner to use a default ndistinct estimate for the
+   expression derived from ndistinct for the column. With such statistics, the
+   planner recognizes that the conditions are correlated, and arrives at much
+   more accurate estimates.
+  </p></div><div class="refsect1" id="id-1.9.3.83.9"><h2>Compatibility</h2><p>
+   There is no <code class="command">CREATE STATISTICS</code> command in the SQL standard.
+  </p></div><div class="refsect1" id="id-1.9.3.83.10"><h2>See Also</h2><span class="simplelist"><a class="xref" href="sql-alterstatistics.html" title="ALTER STATISTICS"><span class="refentrytitle">ALTER STATISTICS</span></a>, <a class="xref" href="sql-dropstatistics.html" title="DROP STATISTICS"><span class="refentrytitle">DROP STATISTICS</span></a></span></div></div><div xmlns="http://www.w3.org/TR/xhtml1/transitional" class="navfooter"><hr></hr><table width="100%" summary="Navigation footer"><tr><td width="40%" align="left"><a accesskey="p" href="sql-createserver.html" title="CREATE SERVER">Prev</a> </td><td width="20%" align="center"><a accesskey="u" href="sql-commands.html" title="SQL Commands">Up</a></td><td width="40%" align="right"> <a accesskey="n" href="sql-createsubscription.html" title="CREATE SUBSCRIPTION">Next</a></td></tr><tr><td width="40%" align="left" valign="top">CREATE SERVER </td><td width="20%" align="center"><a accesskey="h" href="index.html" title="PostgreSQL 14.5 Documentation">Home</a></td><td width="40%" align="right" valign="top"> CREATE SUBSCRIPTION</td></tr></table></div></body></html>
+\ No newline at end of file