Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pkg/statistics/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ go_test(
"bench_daily_test.go",
"builder_test.go",
"cmsketch_test.go",
"estimate_test.go",
"fmsketch_test.go",
"histogram_bench_test.go",
"histogram_test.go",
Expand All @@ -80,7 +81,7 @@ go_test(
data = glob(["testdata/**"]),
embed = [":statistics"],
flaky = True,
shard_count = 43,
shard_count = 44,
deps = [
"//pkg/config",
"//pkg/meta/model",
Expand Down
124 changes: 124 additions & 0 deletions pkg/statistics/estimate.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,127 @@ func EstimateNDVByGEE(sampleNDV, singletonItems, sampleSize, rowCount uint64) ui
}
return ndv
}

// EstimateGlobalSingletonBySketches estimates the global singleton count using NDV and singleton sketches.
// For each node i, we ask: how many of node i's local singletons
// never appeared in any other node? Those are the values that are
// truly unique across the entire dataset, contributed by node i.
//
// We compute this by merging all *other* nodes' NDV sketches (their full
// distinct-value sets), then checking how much node i's local singletons
// grow that union. The growth approximates the number of node i's local
// singletons that no other node has seen.
//
// Summing these per-node contributions gives the global singleton estimate.
//
// The implementation splits the nodes into two halves, precomputes one NDV
// union per half, and then rebuilds only the suffix within each half while
// keeping a rolling in-half prefix. That keeps the O(k²) time complexity
// but cuts repeated merge work to roughly one quarter of the naive
// rebuild-from-scratch loop while preserving O(1) extra sketches. A full
// prefix-suffix cache could reduce the runtime to O(k), but it would require
// O(k) extra sketches (~80KB each), which risks significant memory pressure
// for tables with many nodes.
//
// Example with three nodes:
//
// Node 0 all distinct values: {a, b, c} local singletons: {a, b, c}
// Node 1 all distinct values: {b, c, d} local singletons: {b, d}
// Node 2 all distinct values: {c, e, f} local singletons: {e, f}
//
// True global frequencies: a×1, b×2, c×3, d×1, e×1, f×1
// True singletons = 4 (the values {a, d, e, f} appear exactly once globally)
//
// Node 0: others' NDV = {b,c,d,e,f} (size 5)
// + node 0 singletons {a,b,c} = {a,b,c,d,e,f} (size 6)
// contribution = 1 (only `a` is new)
//
// Node 1: others' NDV = {a,b,c,e,f} (size 5)
// + node 1 singletons {b,d} = {a,b,c,d,e,f} (size 6)
// contribution = 1 (only `d` is new)
//
// Node 2: others' NDV = {a,b,c,d} (size 4)
// + node 2 singletons {e,f} = {a,b,c,d,e,f} (size 6)
// contribution = 2 (`e` and `f` are new)
//
// Estimated singletons = 1 + 1 + 2 = 4
func EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches []*FMSketch) uint64 {
// Defensive checks.
intest.Assert(len(ndvSketches) > 0, "ndvSketches shouldn't be empty")
intest.Assert(len(ndvSketches) == len(singletonSketches), "ndvSketches and singletonSketches should have the same length")
Comment thread
0xPoe marked this conversation as resolved.
intest.AssertFunc(func() bool {
for _, ndvSketch := range ndvSketches {
if ndvSketch == nil {
return false
}
}
return true
}, "ndvSketches must not contain nil entries")
intest.AssertFunc(func() bool {
for _, singletonSketch := range singletonSketches {
if singletonSketch == nil {
return false
}
}
return true
}, "singletonSketches must not contain nil entries")
if len(ndvSketches) == 0 || len(ndvSketches) != len(singletonSketches) {
return 0
}

mid := len(ndvSketches) - len(ndvSketches)/2
var leftHalfNDV *FMSketch
for _, sketch := range ndvSketches[:mid] {
leftHalfNDV = mergeCopiedFMSketch(leftHalfNDV, sketch)
}
var rightHalfNDV *FMSketch
for _, sketch := range ndvSketches[mid:] {
rightHalfNDV = mergeCopiedFMSketch(rightHalfNDV, sketch)
}

// NOTE: For each node, we still merge every other node's NDV sketch.
globalSingleton := estimateGlobalSingletonInRange(ndvSketches[:mid], singletonSketches[:mid], rightHalfNDV)
globalSingleton += estimateGlobalSingletonInRange(ndvSketches[mid:], singletonSketches[mid:], leftHalfNDV)
// SAFETY: Each per-node contribution is clamped to >= 0 before accumulation.
intest.Assert(globalSingleton >= 0, "globalSingleton must be positive")
return uint64(globalSingleton)
}

func estimateGlobalSingletonInRange(ndvSketches, singletonSketches []*FMSketch, outOfRangeNDVSketch *FMSketch) int64 {
var globalSingleton int64
// prefixNDVSketch accumulates ndvSketches[0..i-1] as i advances, so
// each iteration only rebuilds the suffix (ndvSketches[i+1..]) from
// scratch instead of the full "all-except-i" set.
var prefixNDVSketch *FMSketch
for i := range ndvSketches {
other := mergeCopiedFMSketch(nil, prefixNDVSketch)
for _, sketch := range ndvSketches[i+1:] {
other = mergeCopiedFMSketch(other, sketch)
}
other = mergeCopiedFMSketch(other, outOfRangeNDVSketch)

// NDV of the union of all other nodes before merging this node's singletons.
ndvOther := other.NDV()
other = mergeCopiedFMSketch(other, singletonSketches[i])

// NDV of the union after merging this node's singleton sketch.
ndvUnion := other.NDV()
// FM sketch NDV estimates are not monotone under merge, so the estimated
// union can be smaller than ndvOther. Clamp the per-node contribution to 0.
// In practice, this appears to be fairly rare.
globalSingleton += max(0, ndvUnion-ndvOther)
Comment thread
mjonss marked this conversation as resolved.
prefixNDVSketch = mergeCopiedFMSketch(prefixNDVSketch, ndvSketches[i])
}
return globalSingleton
}

// mergeCopiedFMSketch folds src into the accumulator dst and returns the
// accumulator to keep using.
//
//   - src == nil: nothing to merge, dst is returned unchanged (possibly nil).
//   - dst == nil: a fresh copy of src becomes the accumulator, so src itself is
//     never handed back to the caller.
//   - otherwise: src is merged into dst in place and dst is returned.
func mergeCopiedFMSketch(dst, src *FMSketch) *FMSketch {
	switch {
	case src == nil:
		return dst
	case dst == nil:
		return src.Copy()
	default:
		dst.MergeFMSketch(src)
		return dst
	}
}
182 changes: 182 additions & 0 deletions pkg/statistics/estimate_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
// Copyright 2026 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package statistics

import (
"testing"

"github.com/pingcap/tidb/pkg/types"
"github.com/pingcap/tidb/pkg/util/mock"
"github.com/stretchr/testify/require"
)

// newFMSketchFromHashValues returns a sketch created with maxSize 1000 into
// which every given value has been inserted directly as a hash value.
// The tests rely on the sketch reporting an exact NDV as long as only a few
// values are inserted (the mask is expected to stay 0 at this maxSize).
func newFMSketchFromHashValues(vals ...uint64) *FMSketch {
	sketch := NewFMSketch(1000)
	for i := range vals {
		sketch.insertHashValue(vals[i])
	}
	return sketch
}

// newFMSketchesFromSamples feeds one node's sample rows into two sketches,
// both created with maxSize, and returns them as (ndvSketch, singletonSketch):
// the first receives every sample, the second only the values that occur
// exactly once among the samples. Any insertion error fails the test.
func newFMSketchesFromSamples(t *testing.T, maxSize int, samples ...int64) (*FMSketch, *FMSketch) {
	t.Helper()
	stmtCtx := mock.NewContext().GetSessionVars().StmtCtx
	allValues := NewFMSketch(maxSize)
	onlyOnce := NewFMSketch(maxSize)

	// First pass: count occurrences while recording every sample for NDV.
	occurrences := make(map[int64]int, len(samples))
	for _, sample := range samples {
		occurrences[sample]++
		require.NoError(t, allValues.InsertValue(stmtCtx, types.NewIntDatum(sample)))
	}

	// Second pass: keep only the values seen exactly once.
	for _, sample := range samples {
		if occurrences[sample] == 1 {
			require.NoError(t, onlyOnce.InsertValue(stmtCtx, types.NewIntDatum(sample)))
			// A value with count 1 cannot show up again in samples, so dropping
			// its entry does not affect the rest of the pass.
			delete(occurrences, sample)
		}
	}
	return allValues, onlyOnce
}

func TestEstimateGlobalSingletonBySketches(t *testing.T) {
// Use distinct hash values to represent distinct data values.
// With maxSize=1000 and few insertions, mask stays 0 so NDV = len(hashset) (exact).
const (
a = uint64(100)
b = uint64(200)
c = uint64(300)
d = uint64(400)
e = uint64(500)
f = uint64(600)
)

t.Run("DocCommentExample", func(t *testing.T) {
// Node 0: all distinct = {a, b, c}, local singletons = {a, b, c}
// Node 1: all distinct = {b, c, d}, local singletons = {b, d}
// Node 2: all distinct = {c, e, f}, local singletons = {e, f}
// Global singletons = {a, d, e, f} = 4
ndvSketches := []*FMSketch{
newFMSketchFromHashValues(a, b, c),
newFMSketchFromHashValues(b, c, d),
newFMSketchFromHashValues(c, e, f),
}
singletonSketches := []*FMSketch{
newFMSketchFromHashValues(a, b, c),
newFMSketchFromHashValues(b, d),
newFMSketchFromHashValues(e, f),
}
got := EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches)
require.Equal(t, uint64(4), got)
})

t.Run("SingleNode", func(t *testing.T) {
// With one node, all local singletons are global singletons.
ndvSketches := []*FMSketch{
newFMSketchFromHashValues(a, b, c),
}
singletonSketches := []*FMSketch{
newFMSketchFromHashValues(a, b, c),
}
got := EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches)
require.Equal(t, uint64(3), got)
})

t.Run("NoOverlap", func(t *testing.T) {
// Nodes have disjoint values. All local singletons are global singletons.
ndvSketches := []*FMSketch{
newFMSketchFromHashValues(a, b),
newFMSketchFromHashValues(c, d),
newFMSketchFromHashValues(e, f),
}
singletonSketches := []*FMSketch{
newFMSketchFromHashValues(a, b),
newFMSketchFromHashValues(c, d),
newFMSketchFromHashValues(e, f),
}
got := EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches)
require.Equal(t, uint64(6), got)
})

t.Run("FullOverlap", func(t *testing.T) {
// Every local singleton also appears in another node's NDV.
// Node 0: all = {a, b}, singletons = {a, b}
// Node 1: all = {a, b}, singletons = {a, b}
// No value is unique to a single node → 0 global singletons.
ndvSketches := []*FMSketch{
newFMSketchFromHashValues(a, b),
newFMSketchFromHashValues(a, b),
}
singletonSketches := []*FMSketch{
newFMSketchFromHashValues(a, b),
newFMSketchFromHashValues(a, b),
}
got := EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches)
require.Equal(t, uint64(0), got)
})

t.Run("NegativeContributionIsClamped", func(t *testing.T) {
// Both sketches are built from the same local samples.
//
// Node 0 samples: [0]
// Node 1 samples: [0, 0, 0, 1, 1, 4, 7]
//
// The true global singleton set is {4, 7}, so the result should be 2.
// Before the fix, node 0's contribution could become negative due to FM
// sketch merge behavior, making the final estimate incorrect.
ndv0, singleton0 := newFMSketchesFromSamples(t, 3, 0)
ndv1, singleton1 := newFMSketchesFromSamples(t, 3, 0, 0, 0, 1, 1, 4, 7)
ndvSketches := []*FMSketch{ndv0, ndv1}
singletonSketches := []*FMSketch{singleton0, singleton1}

got := EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches)
require.Equal(t, uint64(2), got)
})

t.Run("NilEntry", func(t *testing.T) {
require.PanicsWithValue(t, "assert failed, ndvSketches must not contain nil entries", func() {
EstimateGlobalSingletonBySketches(
[]*FMSketch{nil, newFMSketchFromHashValues(c, d)},
[]*FMSketch{newFMSketchFromHashValues(a, b), newFMSketchFromHashValues(c, d)},
)
})
require.PanicsWithValue(t, "assert failed, singletonSketches must not contain nil entries", func() {
EstimateGlobalSingletonBySketches(
[]*FMSketch{newFMSketchFromHashValues(a, b), newFMSketchFromHashValues(c, d)},
[]*FMSketch{nil, newFMSketchFromHashValues(c, d)},
)
})
})

t.Run("EmptyInput", func(t *testing.T) {
require.PanicsWithValue(t, "assert failed, ndvSketches shouldn't be empty", func() {
EstimateGlobalSingletonBySketches(nil, nil)
})
})

t.Run("MismatchedLengths", func(t *testing.T) {
ndvSketches := []*FMSketch{newFMSketchFromHashValues(a)}
singletonSketches := []*FMSketch{newFMSketchFromHashValues(a), newFMSketchFromHashValues(b)}
require.PanicsWithValue(t, "assert failed, ndvSketches and singletonSketches should have the same length", func() {
EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches)
})
})
}
Loading