diff --git a/pkg/statistics/BUILD.bazel b/pkg/statistics/BUILD.bazel
index 6018f61cb8328..c67cdb90b630d 100644
--- a/pkg/statistics/BUILD.bazel
+++ b/pkg/statistics/BUILD.bazel
@@ -67,6 +67,7 @@ go_test(
         "bench_daily_test.go",
         "builder_test.go",
         "cmsketch_test.go",
+        "estimate_test.go",
         "fmsketch_test.go",
         "histogram_bench_test.go",
         "histogram_test.go",
@@ -80,7 +81,7 @@ go_test(
     data = glob(["testdata/**"]),
     embed = [":statistics"],
     flaky = True,
-    shard_count = 43,
+    shard_count = 44,
     deps = [
         "//pkg/config",
         "//pkg/meta/model",
diff --git a/pkg/statistics/estimate.go b/pkg/statistics/estimate.go
index f6055cfda309d..c4f1648a69fb9 100644
--- a/pkg/statistics/estimate.go
+++ b/pkg/statistics/estimate.go
@@ -65,3 +65,139 @@ func EstimateNDVByGEE(sampleNDV, singletonItems, sampleSize, rowCount uint64) ui
 	}
 	return ndv
 }
+
+// EstimateGlobalSingletonBySketches estimates the global singleton count using NDV and singleton sketches.
+// For each node i, we ask: how many of node i's local singletons
+// never appeared in any other node? Those are the values that are
+// truly unique across the entire dataset, contributed by node i.
+//
+// We compute this by merging all *other* nodes' NDV sketches (their full
+// distinct-value sets), then checking how much node i's local singletons
+// grow that union. The growth approximates the number of node i's local
+// singletons that no other node has seen.
+//
+// Summing these per-node contributions gives the global singleton estimate.
+//
+// ndvSketches[i] and singletonSketches[i] must describe the same node. The
+// input sketches are only used as merge sources; every merge destination is
+// a copy. Empty input or slices of different lengths trip the intest
+// assertions; past those, the function returns 0.
+//
+// The implementation splits the nodes into two halves, precomputes one NDV
+// union per half, and then rebuilds only the suffix within each half while
+// keeping a rolling in-half prefix. That keeps the O(k²) time complexity
+// but cuts repeated merge work to roughly one quarter of the naive
+// rebuild-from-scratch loop while preserving O(1) extra sketches. A full
+// prefix-suffix cache could reduce the runtime to O(k), but it would require
+// O(k) extra sketches (~80KB each), which risks significant memory pressure
+// for tables with many nodes.
+//
+// Example with three nodes:
+//
+//	Node 0  all distinct values: {a, b, c}  local singletons: {a, b, c}
+//	Node 1  all distinct values: {b, c, d}  local singletons: {b, d}
+//	Node 2  all distinct values: {c, e, f}  local singletons: {e, f}
+//
+//	True global frequencies: a×1, b×2, c×3, d×1, e×1, f×1
+//	True singletons = 4 (the values {a, d, e, f} appear exactly once globally)
+//
+//	Node 0: others' NDV = {b,c,d,e,f} (size 5)
+//	        + node 0 singletons {a,b,c} = {a,b,c,d,e,f} (size 6)
+//	        contribution = 1 (only `a` is new)
+//
+//	Node 1: others' NDV = {a,b,c,e,f} (size 5)
+//	        + node 1 singletons {b,d} = {a,b,c,d,e,f} (size 6)
+//	        contribution = 1 (only `d` is new)
+//
+//	Node 2: others' NDV = {a,b,c,d} (size 4)
+//	        + node 2 singletons {e,f} = {a,b,c,d,e,f} (size 6)
+//	        contribution = 2 (`e` and `f` are new)
+//
+//	Estimated singletons = 1 + 1 + 2 = 4
+func EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches []*FMSketch) uint64 {
+	// Defensive checks: assertions first, then a plain guard that returns 0 for empty or mismatched input.
+	intest.Assert(len(ndvSketches) > 0, "ndvSketches shouldn't be empty")
+	intest.Assert(len(ndvSketches) == len(singletonSketches), "ndvSketches and singletonSketches should have the same length")
+	intest.AssertFunc(func() bool {
+		for _, ndvSketch := range ndvSketches {
+			if ndvSketch == nil {
+				return false
+			}
+		}
+		return true
+	}, "ndvSketches must not contain nil entries")
+	intest.AssertFunc(func() bool {
+		for _, singletonSketch := range singletonSketches {
+			if singletonSketch == nil {
+				return false
+			}
+		}
+		return true
+	}, "singletonSketches must not contain nil entries")
+	if len(ndvSketches) == 0 || len(ndvSketches) != len(singletonSketches) {
+		return 0
+	}
+
+	mid := len(ndvSketches) - len(ndvSketches)/2
+	var leftHalfNDV *FMSketch
+	for _, sketch := range ndvSketches[:mid] {
+		leftHalfNDV = mergeCopiedFMSketch(leftHalfNDV, sketch)
+	}
+	var rightHalfNDV *FMSketch
+	for _, sketch := range ndvSketches[mid:] {
+		rightHalfNDV = mergeCopiedFMSketch(rightHalfNDV, sketch)
+	}
+
+	// NOTE: For each node, we still merge every other node's NDV sketch: in-half ones directly, the other half via its precomputed union.
+	globalSingleton := estimateGlobalSingletonInRange(ndvSketches[:mid], singletonSketches[:mid], rightHalfNDV)
+	globalSingleton += estimateGlobalSingletonInRange(ndvSketches[mid:], singletonSketches[mid:], leftHalfNDV)
+	// SAFETY: Each per-node contribution is clamped to >= 0 before accumulation, so the sum is never negative (0 is a valid result).
+	intest.Assert(globalSingleton >= 0, "globalSingleton must be non-negative")
+	return uint64(globalSingleton)
+}
+
+// estimateGlobalSingletonInRange sums the clamped per-node contributions of the
+// nodes in ndvSketches/singletonSketches (index-aligned slices of one range).
+// outOfRangeNDVSketch is the NDV union of all nodes outside that range; it may
+// be nil, in which case it contributes nothing to the "other nodes" union.
+func estimateGlobalSingletonInRange(ndvSketches, singletonSketches []*FMSketch, outOfRangeNDVSketch *FMSketch) int64 {
+	var globalSingleton int64
+	// prefixNDVSketch accumulates ndvSketches[0..i-1] as i advances, so
+	// each iteration only rebuilds the suffix (ndvSketches[i+1..]) from
+	// scratch instead of the full "all-except-i" set.
+	var prefixNDVSketch *FMSketch
+	for i := range ndvSketches {
+		other := mergeCopiedFMSketch(nil, prefixNDVSketch)
+		for _, sketch := range ndvSketches[i+1:] {
+			other = mergeCopiedFMSketch(other, sketch)
+		}
+		other = mergeCopiedFMSketch(other, outOfRangeNDVSketch)
+
+		// NDV of the union of all other nodes before merging this node's singletons.
+		ndvOther := other.NDV()
+		other = mergeCopiedFMSketch(other, singletonSketches[i])
+
+		// NDV of the union after merging this node's singleton sketch.
+		ndvUnion := other.NDV()
+		// FM sketch NDV estimates are not monotone under merge, so the estimated
+		// union can be smaller than ndvOther. Clamp the per-node contribution to 0.
+		// In practice, this appears to be fairly rare.
+		globalSingleton += max(0, ndvUnion-ndvOther)
+		prefixNDVSketch = mergeCopiedFMSketch(prefixNDVSketch, ndvSketches[i])
+	}
+	return globalSingleton
+}
+
+// mergeCopiedFMSketch merges src into dst and returns the merged sketch. A nil
+// src returns dst unchanged; a nil dst returns src.Copy() instead of src, so
+// merges into the returned sketch never use src as their destination.
+func mergeCopiedFMSketch(dst, src *FMSketch) *FMSketch {
+	if src == nil {
+		return dst
+	}
+	if dst == nil {
+		return src.Copy()
+	}
+	dst.MergeFMSketch(src)
+	return dst
+}
diff --git a/pkg/statistics/estimate_test.go b/pkg/statistics/estimate_test.go
new file mode 100644
index 0000000000000..568cea568953e
--- /dev/null
+++ b/pkg/statistics/estimate_test.go
@@ -0,0 +1,182 @@
+// Copyright 2026 PingCAP, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package statistics
+
+import (
+	"testing"
+
+	"github.com/pingcap/tidb/pkg/types"
+	"github.com/pingcap/tidb/pkg/util/mock"
+	"github.com/stretchr/testify/require"
+)
+
+// newFMSketchFromHashValues builds an FM sketch (maxSize 1000) by directly inserting hash values.
+// With a large maxSize and few values, the sketch gives exact NDV (mask stays 0).
+func newFMSketchFromHashValues(vals ...uint64) *FMSketch {
+	s := NewFMSketch(1000)
+	for _, v := range vals {
+		s.insertHashValue(v)
+	}
+	return s
+}
+
+// newFMSketchesFromSamples builds a pair of sketches from the same sample rows:
+// the NDV sketch sees every sample, while the singleton sketch keeps only
+// values that occur exactly once in that sample set. Returns (ndv, singleton).
+func newFMSketchesFromSamples(t *testing.T, maxSize int, samples ...int64) (*FMSketch, *FMSketch) {
+	t.Helper()
+	ctx := mock.NewContext()
+	ndvSketch := NewFMSketch(maxSize)
+	singletonSketch := NewFMSketch(maxSize)
+	counts := make(map[int64]int, len(samples))
+	for _, v := range samples {
+		counts[v]++
+		err := ndvSketch.InsertValue(ctx.GetSessionVars().StmtCtx, types.NewIntDatum(v))
+		require.NoError(t, err)
+	}
+	for _, v := range samples {
+		if counts[v] != 1 {
+			continue
+		}
+		err := singletonSketch.InsertValue(ctx.GetSessionVars().StmtCtx, types.NewIntDatum(v))
+		require.NoError(t, err)
+		delete(counts, v) // v occurs once in samples, so its count is never read again
+	}
+	return ndvSketch, singletonSketch
+}
+
+func TestEstimateGlobalSingletonBySketches(t *testing.T) {
+	// Use distinct hash values to represent distinct data values.
+	// With maxSize=1000 and few insertions, mask stays 0 so NDV = len(hashset) (exact).
+	const (
+		a = uint64(100)
+		b = uint64(200)
+		c = uint64(300)
+		d = uint64(400)
+		e = uint64(500)
+		f = uint64(600)
+	)
+
+	t.Run("DocCommentExample", func(t *testing.T) {
+		// Node 0: all distinct = {a, b, c}, local singletons = {a, b, c}
+		// Node 1: all distinct = {b, c, d}, local singletons = {b, d}
+		// Node 2: all distinct = {c, e, f}, local singletons = {e, f}
+		// Global singletons = {a, d, e, f}, so the expected estimate is 4.
+		ndvSketches := []*FMSketch{
+			newFMSketchFromHashValues(a, b, c),
+			newFMSketchFromHashValues(b, c, d),
+			newFMSketchFromHashValues(c, e, f),
+		}
+		singletonSketches := []*FMSketch{
+			newFMSketchFromHashValues(a, b, c),
+			newFMSketchFromHashValues(b, d),
+			newFMSketchFromHashValues(e, f),
+		}
+		got := EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches)
+		require.Equal(t, uint64(4), got)
+	})
+
+	t.Run("SingleNode", func(t *testing.T) {
+		// With one node there are no other nodes, so all local singletons are global singletons.
+		ndvSketches := []*FMSketch{
+			newFMSketchFromHashValues(a, b, c),
+		}
+		singletonSketches := []*FMSketch{
+			newFMSketchFromHashValues(a, b, c),
+		}
+		got := EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches)
+		require.Equal(t, uint64(3), got)
+	})
+
+	t.Run("NoOverlap", func(t *testing.T) {
+		// Nodes have disjoint values. All local singletons are global singletons.
+		ndvSketches := []*FMSketch{
+			newFMSketchFromHashValues(a, b),
+			newFMSketchFromHashValues(c, d),
+			newFMSketchFromHashValues(e, f),
+		}
+		singletonSketches := []*FMSketch{
+			newFMSketchFromHashValues(a, b),
+			newFMSketchFromHashValues(c, d),
+			newFMSketchFromHashValues(e, f),
+		}
+		got := EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches)
+		require.Equal(t, uint64(6), got)
+	})
+
+	t.Run("FullOverlap", func(t *testing.T) {
+		// Every local singleton also appears in another node's NDV.
+		// Node 0: all = {a, b}, singletons = {a, b}
+		// Node 1: all = {a, b}, singletons = {a, b}
+		// No value is unique to a single node → 0 global singletons.
+		ndvSketches := []*FMSketch{
+			newFMSketchFromHashValues(a, b),
+			newFMSketchFromHashValues(a, b),
+		}
+		singletonSketches := []*FMSketch{
+			newFMSketchFromHashValues(a, b),
+			newFMSketchFromHashValues(a, b),
+		}
+		got := EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches)
+		require.Equal(t, uint64(0), got)
+	})
+
+	t.Run("NegativeContributionIsClamped", func(t *testing.T) {
+		// Both sketches of a node are built from that node's local samples (maxSize 3).
+		//
+		// Node 0 samples: [0]
+		// Node 1 samples: [0, 0, 0, 1, 1, 4, 7]
+		//
+		// The true global singleton set is {4, 7}, so the result should be 2.
+		// Without the per-node clamp, node 0's contribution could become negative
+		// due to FM sketch merge behavior, making the final estimate incorrect.
+		ndv0, singleton0 := newFMSketchesFromSamples(t, 3, 0)
+		ndv1, singleton1 := newFMSketchesFromSamples(t, 3, 0, 0, 0, 1, 1, 4, 7)
+		ndvSketches := []*FMSketch{ndv0, ndv1}
+		singletonSketches := []*FMSketch{singleton0, singleton1}
+
+		got := EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches)
+		require.Equal(t, uint64(2), got)
+	})
+
+	t.Run("NilEntry", func(t *testing.T) { // a nil entry in either slice must trip its own assertion
+		require.PanicsWithValue(t, "assert failed, ndvSketches must not contain nil entries", func() {
+			EstimateGlobalSingletonBySketches(
+				[]*FMSketch{nil, newFMSketchFromHashValues(c, d)},
+				[]*FMSketch{newFMSketchFromHashValues(a, b), newFMSketchFromHashValues(c, d)},
+			)
+		})
+		require.PanicsWithValue(t, "assert failed, singletonSketches must not contain nil entries", func() {
+			EstimateGlobalSingletonBySketches(
+				[]*FMSketch{newFMSketchFromHashValues(a, b), newFMSketchFromHashValues(c, d)},
+				[]*FMSketch{nil, newFMSketchFromHashValues(c, d)},
+			)
+		})
+	})
+
+	t.Run("EmptyInput", func(t *testing.T) { // the emptiness assertion is the first check, so it is the one that fires
+		require.PanicsWithValue(t, "assert failed, ndvSketches shouldn't be empty", func() {
+			EstimateGlobalSingletonBySketches(nil, nil)
+		})
+	})
+
+	t.Run("MismatchedLengths", func(t *testing.T) { // one NDV sketch vs. two singleton sketches
+		ndvSketches := []*FMSketch{newFMSketchFromHashValues(a)}
+		singletonSketches := []*FMSketch{newFMSketchFromHashValues(a), newFMSketchFromHashValues(b)}
+		require.PanicsWithValue(t, "assert failed, ndvSketches and singletonSketches should have the same length", func() {
+			EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches)
+		})
+	})
+}