Skip to content

Commit

Permalink
Merge pull request #140142 from mgartner/backport24.1-139766
Browse files Browse the repository at this point in the history
release-24.1: sql: do not collect histograms for non-indexed JSON columns
  • Loading branch information
mgartner authored Jan 31, 2025
2 parents 6ef388c + 781476a commit bfc34da
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 3 deletions.
1 change: 1 addition & 0 deletions docs/generated/settings/settings-for-tenants.txt
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,7 @@ sql.stats.histogram_collection.enabled boolean true histogram collection mode ap
sql.stats.histogram_samples.count integer 10000 number of rows sampled for histogram construction during table statistics collection application
sql.stats.multi_column_collection.enabled boolean true multi-column statistics collection mode application
sql.stats.non_default_columns.min_retention_period duration 24h0m0s minimum retention period for table statistics collected on non-default columns application
sql.stats.non_indexed_json_histograms.enabled boolean true set to true to collect table statistics histograms on non-indexed JSON columns application
sql.stats.persisted_rows.max integer 1000000 maximum number of rows of statement and transaction statistics that will be persisted in the system tables before compaction begins application
sql.stats.post_events.enabled boolean false if set, an event is logged for every CREATE STATISTICS job application
sql.stats.response.max integer 20000 the maximum number of statements and transaction stats returned in a CombinedStatements request application
Expand Down
1 change: 1 addition & 0 deletions docs/generated/settings/settings.html
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,7 @@
<tr><td><div id="setting-sql-stats-histogram-samples-count" class="anchored"><code>sql.stats.histogram_samples.count</code></div></td><td>integer</td><td><code>10000</code></td><td>number of rows sampled for histogram construction during table statistics collection</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-sql-stats-multi-column-collection-enabled" class="anchored"><code>sql.stats.multi_column_collection.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>multi-column statistics collection mode</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-sql-stats-non-default-columns-min-retention-period" class="anchored"><code>sql.stats.non_default_columns.min_retention_period</code></div></td><td>duration</td><td><code>24h0m0s</code></td><td>minimum retention period for table statistics collected on non-default columns</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-sql-stats-non-indexed-json-histograms-enabled" class="anchored"><code>sql.stats.non_indexed_json_histograms.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>set to true to collect table statistics histograms on non-indexed JSON columns</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-sql-stats-persisted-rows-max" class="anchored"><code>sql.stats.persisted_rows.max</code></div></td><td>integer</td><td><code>1000000</code></td><td>maximum number of rows of statement and transaction statistics that will be persisted in the system tables before compaction begins</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-sql-stats-post-events-enabled" class="anchored"><code>sql.stats.post_events.enabled</code></div></td><td>boolean</td><td><code>false</code></td><td>if set, an event is logged for every CREATE STATISTICS job</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-sql-stats-response-max" class="anchored"><code>sql.stats.response.max</code></div></td><td>integer</td><td><code>20000</code></td><td>the maximum number of statements and transaction stats returned in a CombinedStatements request</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
Expand Down
28 changes: 25 additions & 3 deletions pkg/sql/create_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,18 @@ var statsOnVirtualCols = settings.RegisterBoolSetting(
true,
settings.WithPublic)

// nonIndexJSONHistograms controls whether table statistics histograms are
// collected on non-indexed JSON columns.
//
// Collecting histograms on non-indexed JSON columns can require a lot of memory
// when the JSON values are large. This is true even when only two histogram
// buckets are generated because we still sample many JSON values which exist in
// memory for the duration of the stats collection job. The setting defaults to
// true, so histograms are collected for non-indexed JSON columns unless it is
// explicitly set to false.
var nonIndexJSONHistograms = settings.RegisterBoolSetting(
settings.ApplicationLevel,
"sql.stats.non_indexed_json_histograms.enabled",
"set to true to collect table statistics histograms on non-indexed JSON columns",
true,
settings.WithPublic)

// nonIndexColHistogramBuckets is the histogram bucket count passed to
// createStatsDefaultColumns when generating stub table statistics.
// NOTE(review): presumably also the bucket cap applied to histograms on
// non-indexed columns in general -- confirm against the other call sites.
const nonIndexColHistogramBuckets = 2

// StubTableStats generates "stub" statistics for a table which are missing
Expand All @@ -72,6 +84,7 @@ func StubTableStats(
) ([]*stats.TableStatisticProto, error) {
colStats, err := createStatsDefaultColumns(
context.Background(), desc, false /* virtColEnabled */, false, /* multiColEnabled */
false, /* nonIndexJSONHistograms */
nonIndexColHistogramBuckets, nil, /* evalCtx */
)
if err != nil {
Expand Down Expand Up @@ -243,7 +256,9 @@ func (n *createStatsNode) makeJobRecord(ctx context.Context) (*jobs.Record, erro
}
defaultHistogramBuckets := stats.GetDefaultHistogramBuckets(n.p.ExecCfg().SV(), tableDesc)
if colStats, err = createStatsDefaultColumns(
ctx, tableDesc, virtColEnabled, multiColEnabled, defaultHistogramBuckets, n.p.EvalContext(),
ctx, tableDesc, virtColEnabled, multiColEnabled,
nonIndexJSONHistograms.Get(n.p.ExecCfg().SV()),
defaultHistogramBuckets, n.p.EvalContext(),
); err != nil {
return nil, err
}
Expand Down Expand Up @@ -355,13 +370,16 @@ const maxNonIndexCols = 100
// predicate expressions are also likely to appear in query filters, so stats
// are collected for those columns as well.
//
// If nonIndexJSONHistograms is true, 2-bucket histograms are collected for
// non-indexed JSON columns.
//
// In addition to the index columns, we collect stats on up to maxNonIndexCols
// other columns from the table. We only collect histograms for index columns,
// plus any other boolean or enum columns (where the "histogram" is tiny).
func createStatsDefaultColumns(
ctx context.Context,
desc catalog.TableDescriptor,
virtColEnabled, multiColEnabled bool,
virtColEnabled, multiColEnabled, nonIndexJSONHistograms bool,
defaultHistogramBuckets uint32,
evalCtx *eval.Context,
) ([]jobspb.CreateStatsDetails_ColStat, error) {
Expand Down Expand Up @@ -618,9 +636,13 @@ func createStatsDefaultColumns(
if col.GetType().Family() == types.BoolFamily || col.GetType().Family() == types.EnumFamily {
maxHistBuckets = defaultHistogramBuckets
}
hasHistogram := !colinfo.ColumnTypeIsOnlyInvertedIndexable(col.GetType())
if col.GetType().Family() == types.JsonFamily {
hasHistogram = nonIndexJSONHistograms
}
colStats = append(colStats, jobspb.CreateStatsDetails_ColStat{
ColumnIDs: colIDs,
HasHistogram: !colinfo.ColumnTypeIsOnlyInvertedIndexable(col.GetType()),
HasHistogram: hasHistogram,
HistogramMaxBuckets: maxHistBuckets,
})
nonIdxCols++
Expand Down
45 changes: 45 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/stats
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,48 @@ CREATE TABLE t122312 (s STRING, g greeting AS (s::greeting) STORED);

statement ok
ANALYZE t122312;

# Regression test related to #139381. Do not collect histograms on non-indexed
# JSON columns when sql.stats.non_indexed_json_histograms.enabled is false.
statement ok
CREATE TABLE t139381 (
k INT PRIMARY KEY,
j JSON,
v STRING AS (j->>'name') VIRTUAL,
INDEX (v)
)

statement ok
SET CLUSTER SETTING sql.stats.non_indexed_json_histograms.enabled = false

statement ok
INSERT INTO t139381
SELECT i, ('{"name": "name_' || i || '", "data": "abcdefghij"}')::JSONB
FROM (VALUES (1), (2)) v(i)

statement ok
ANALYZE t139381

query TT rowsort
SELECT column_names, IF(histogram_id IS NOT NULL, 'histogram_collected', 'no_histogram_collected')
FROM [SHOW STATISTICS FOR TABLE t139381]
----
{k} histogram_collected
{j} no_histogram_collected
{v} histogram_collected

# Histograms are collected on non-indexed JSON columns when the cluster setting
# is enabled.
statement ok
SET CLUSTER SETTING sql.stats.non_indexed_json_histograms.enabled = true

statement ok
ANALYZE t139381

query TT rowsort
SELECT column_names, IF(histogram_id IS NOT NULL, 'histogram_collected', 'no_histogram_collected')
FROM [SHOW STATISTICS FOR TABLE t139381]
----
{k} histogram_collected
{j} histogram_collected
{v} histogram_collected

0 comments on commit bfc34da

Please sign in to comment.