From 96482e50bbc2561d41399805a3fbcd22b5eea096 Mon Sep 17 00:00:00 2001 From: 0xPoe Date: Tue, 7 Apr 2026 09:19:22 +0200 Subject: [PATCH 01/21] stats: add global singleton estimation Signed-off-by: 0xPoe --- pkg/statistics/estimate.go | 93 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/pkg/statistics/estimate.go b/pkg/statistics/estimate.go index f6055cfda309d..7439ae890ffe3 100644 --- a/pkg/statistics/estimate.go +++ b/pkg/statistics/estimate.go @@ -65,3 +65,96 @@ func EstimateNDVByGEE(sampleNDV, singletonItems, sampleSize, rowCount uint64) ui } return ndv } + +// EstimateGlobalSingletonBySketches estimates the global singleton count using NDV and singleton sketches. +// For each region i, we ask: how many of region i's local singletons +// never appeared in any other region? Those are the values that are +// truly unique across the entire dataset, contributed by region i. +// +// We compute this by merging all *other* regions' NDV sketches (their full +// distinct-value sets), then checking how much region i's local singletons +// grow that union. The growth is exactly the count of region i's singletons +// that no other region has seen. +// +// Summing these per-region contributions gives the global singleton estimate. +// +// Example with three regions: +// +// Region 0 all distinct values: {a, b, c} local singletons: {a, b, c} +// Region 1 all distinct values: {b, c, d} local singletons: {b, d} +// Region 2 all distinct values: {c, e, f} local singletons: {e, f} +// +// True global frequencies: a×1, b×2, c×3, d×1, e×1, f×1 +// True singletons = 4 (the values {a, d, e, f} appear exactly once globally) +// +// Region 0: others' NDV = {b,c,d,e,f} (size 5) +// + region 0 singletons {a,b,c} = {a,b,c,d,e,f} (size 6) +// contribution = 1 (only `a` is new) +// +// Region 1: others' NDV = {a,b,c,e,f} (size 5) +// + region 1 singletons {b,d} = {a,b,c,d,e,f} (size 6) +// contribution = 1 (only `d` is new) +// +// Region 2: others' NDV = {a,b,c,d} (size 4) +// + region 2 singletons {e,f} = {a,b,c,d,e,f} (size 6) +// contribution = 2 (`e` and `f` are new) +// +// Estimated singletons = 1 + 1 + 2 = 4 +func EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches []*FMSketch) uint64 { + intest.Assert(len(ndvSketches) > 0, "ndvSketches shouldn't be empty") + intest.Assert(len(ndvSketches) == len(singletonSketches), "ndvSketches and singletonSketches should have the same length") + // Defensive checks. + if len(ndvSketches) == 0 || len(ndvSketches) != len(singletonSketches) { + return 0 + } + + var globalSingleton int64 + for i := range ndvSketches { + // Merge NDV sketches from all regions except region i. + var other *FMSketch + for j, ns := range ndvSketches { + if j == i || ns == nil { + continue + } + + if other == nil { + other = ns.Copy() + continue + } + other.MergeFMSketch(ns) + } + + ndvOther := int64(0) + if other != nil { + ndvOther = other.NDV() + } + + // Merge the other-regions sketch with region i's singleton sketch. + // ndvUnion - ndvOther gives the count of region i's singletons + // that don't appear in any other region (i.e. globally unique values). + var union *FMSketch + if other != nil { + union = other.Copy() + if singletonSketches[i] != nil { + union.MergeFMSketch(singletonSketches[i]) + } + } else if singletonSketches[i] != nil { + union = singletonSketches[i].Copy() + } + + ndvUnion := int64(0) + if union != nil { + ndvUnion = union.NDV() + } + + globalSingleton += int64(ndvUnion) - int64(ndvOther) + + // Cleanup. + other = nil + union = nil + } + if globalSingleton < 0 { + return 0 + } + return uint64(globalSingleton) +} From 183bd2a2996e57a36633aeba600bd34cbf55146a Mon Sep 17 00:00:00 2001 From: 0xPoe Date: Tue, 7 Apr 2026 10:58:43 +0200 Subject: [PATCH 02/21] test: add cases for global signleton estimation Signed-off-by: 0xPoe --- pkg/statistics/BUILD.bazel | 3 +- pkg/statistics/estimate_test.go | 159 ++++++++++++++++++++++++++++++++ 2 files changed, 161 insertions(+), 1 deletion(-) create mode 100644 pkg/statistics/estimate_test.go diff --git a/pkg/statistics/BUILD.bazel b/pkg/statistics/BUILD.bazel index 6018f61cb8328..c67cdb90b630d 100644 --- a/pkg/statistics/BUILD.bazel +++ b/pkg/statistics/BUILD.bazel @@ -67,6 +67,7 @@ go_test( "bench_daily_test.go", "builder_test.go", "cmsketch_test.go", + "estimate_test.go", "fmsketch_test.go", "histogram_bench_test.go", "histogram_test.go", @@ -80,7 +81,7 @@ go_test( data = glob(["testdata/**"]), embed = [":statistics"], flaky = True, - shard_count = 43, + shard_count = 44, deps = [ "//pkg/config", "//pkg/meta/model", diff --git a/pkg/statistics/estimate_test.go b/pkg/statistics/estimate_test.go new file mode 100644 index 0000000000000..6c9aa79473665 --- /dev/null +++ b/pkg/statistics/estimate_test.go @@ -0,0 +1,159 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package statistics + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +// newFMSketchFromHashValues builds an FM sketch by directly inserting hash values. +// With a large maxSize and few values, the sketch gives exact NDV (mask stays 0). +func newFMSketchFromHashValues(vals ...uint64) *FMSketch { + s := NewFMSketch(1000) + for _, v := range vals { + s.insertHashValue(v) + } + return s +} + +func TestEstimateGlobalSingletonBySketches(t *testing.T) { + // Use distinct hash values to represent distinct data values. + // With maxSize=1000 and few insertions, mask stays 0 so NDV = len(hashset) (exact). + const ( + a = uint64(100) + b = uint64(200) + c = uint64(300) + d = uint64(400) + e = uint64(500) + f = uint64(600) + ) + + t.Run("DocCommentExample", func(t *testing.T) { + // Region 0: all distinct = {a, b, c}, local singletons = {a, b, c} + // Region 1: all distinct = {b, c, d}, local singletons = {b, d} + // Region 2: all distinct = {c, e, f}, local singletons = {e, f} + // Global singletons = {a, d, e, f} = 4 + ndvSketches := []*FMSketch{ + newFMSketchFromHashValues(a, b, c), + newFMSketchFromHashValues(b, c, d), + newFMSketchFromHashValues(c, e, f), + } + singletonSketches := []*FMSketch{ + newFMSketchFromHashValues(a, b, c), + newFMSketchFromHashValues(b, d), + newFMSketchFromHashValues(e, f), + } + got := EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches) + require.Equal(t, uint64(4), got) + }) + + t.Run("SingleRegion", func(t *testing.T) { + // With one region, all local singletons are global singletons. + ndvSketches := []*FMSketch{ + newFMSketchFromHashValues(a, b, c), + } + singletonSketches := []*FMSketch{ + newFMSketchFromHashValues(a, b, c), + } + got := EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches) + require.Equal(t, uint64(3), got) + }) + + t.Run("NoOverlap", func(t *testing.T) { + // Regions have disjoint values. All local singletons are global singletons. + ndvSketches := []*FMSketch{ + newFMSketchFromHashValues(a, b), + newFMSketchFromHashValues(c, d), + newFMSketchFromHashValues(e, f), + } + singletonSketches := []*FMSketch{ + newFMSketchFromHashValues(a, b), + newFMSketchFromHashValues(c, d), + newFMSketchFromHashValues(e, f), + } + got := EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches) + require.Equal(t, uint64(6), got) + }) + + t.Run("FullOverlap", func(t *testing.T) { + // Every local singleton also appears in another region's NDV. + // Region 0: all = {a, b}, singletons = {a, b} + // Region 1: all = {a, b}, singletons = {a, b} + // No value is unique to a single region → 0 global singletons. + ndvSketches := []*FMSketch{ + newFMSketchFromHashValues(a, b), + newFMSketchFromHashValues(a, b), + } + singletonSketches := []*FMSketch{ + newFMSketchFromHashValues(a, b), + newFMSketchFromHashValues(a, b), + } + got := EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches) + require.Equal(t, uint64(0), got) + }) + + t.Run("NilSingletonSketches", func(t *testing.T) { + // A region with nil singleton sketch contributes 0. + ndvSketches := []*FMSketch{ + newFMSketchFromHashValues(a, b), + newFMSketchFromHashValues(c, d), + } + singletonSketches := []*FMSketch{ + nil, + newFMSketchFromHashValues(c, d), + } + // Region 0 contributes 0 (nil singleton). + // Region 1: others' NDV = {a, b}, union with {c, d} = {a, b, c, d} → contribution = 2. + got := EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches) + require.Equal(t, uint64(2), got) + }) + + t.Run("NilNDVSketches", func(t *testing.T) { + // A region with nil NDV sketch is skipped when building "others". + ndvSketches := []*FMSketch{ + nil, + newFMSketchFromHashValues(c, d), + } + singletonSketches := []*FMSketch{ + newFMSketchFromHashValues(a, b), + newFMSketchFromHashValues(c, d), + } + // Region 0: others' NDV = {c, d}, union with {a, b} = {a, b, c, d} → contribution = 2. + // Region 1: others' NDV = {} (nil), union with {c, d} = {c, d} → contribution = 2. + got := EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches) + require.Equal(t, uint64(4), got) + }) + + t.Run("AllNilSketches", func(t *testing.T) { + ndvSketches := []*FMSketch{nil, nil} + singletonSketches := []*FMSketch{nil, nil} + got := EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches) + require.Equal(t, uint64(0), got) + }) + + t.Run("EmptyInput", func(t *testing.T) { + got := EstimateGlobalSingletonBySketches(nil, nil) + require.Equal(t, uint64(0), got) + }) + + t.Run("MismatchedLengths", func(t *testing.T) { + ndvSketches := []*FMSketch{newFMSketchFromHashValues(a)} + singletonSketches := []*FMSketch{newFMSketchFromHashValues(a), newFMSketchFromHashValues(b)} + got := EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches) + require.Equal(t, uint64(0), got) + }) +} From 488ea1491b903b64ca3663439fd45e5053181876 Mon Sep 17 00:00:00 2001 From: 0xPoe Date: Tue, 7 Apr 2026 11:17:09 +0200 Subject: [PATCH 03/21] docs: add more comments Signed-off-by: 0xPoe --- pkg/statistics/estimate.go | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/pkg/statistics/estimate.go b/pkg/statistics/estimate.go index 7439ae890ffe3..78cfbb00f6b5d 100644 --- a/pkg/statistics/estimate.go +++ b/pkg/statistics/estimate.go @@ -78,6 +78,13 @@ func EstimateNDVByGEE(sampleNDV, singletonItems, sampleSize, rowCount uint64) ui // // Summing these per-region contributions gives the global singleton estimate. // +// The current implementation rebuilds "all except i" from scratch for each +// region, which is O(k²) in the number of regions. A prefix-suffix approach +// could reduce this to O(k) time, but would require O(k) extra sketches +// (~160KB each), which risks significant memory pressure for tables with +// many TiKV regions. We keep the O(k²) approach +// for its O(1) memory overhead. +// // Example with three regions: // // Region 0 all distinct values: {a, b, c} local singletons: {a, b, c} @@ -132,26 +139,20 @@ func EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches []*FMSketc // Merge the other-regions sketch with region i's singleton sketch. // ndvUnion - ndvOther gives the count of region i's singletons // that don't appear in any other region (i.e. globally unique values). - var union *FMSketch + // We already captured ndvOther, so we can safely reuse other here. if other != nil { - union = other.Copy() if singletonSketches[i] != nil { - union.MergeFMSketch(singletonSketches[i]) + other.MergeFMSketch(singletonSketches[i]) } } else if singletonSketches[i] != nil { - union = singletonSketches[i].Copy() + other = singletonSketches[i].Copy() } ndvUnion := int64(0) - if union != nil { - ndvUnion = union.NDV() + if other != nil { + ndvUnion = other.NDV() } - - globalSingleton += int64(ndvUnion) - int64(ndvOther) - - // Cleanup. - other = nil - union = nil + globalSingleton += ndvUnion - ndvOther } if globalSingleton < 0 { return 0 From d3076a010cff4b870cc9bd3f593b3810da32aec9 Mon Sep 17 00:00:00 2001 From: 0xPoe Date: Tue, 7 Apr 2026 11:32:12 +0200 Subject: [PATCH 04/21] fix: check panics Signed-off-by: 0xPoe --- pkg/statistics/estimate_test.go | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pkg/statistics/estimate_test.go b/pkg/statistics/estimate_test.go index 6c9aa79473665..7c18c795b6818 100644 --- a/pkg/statistics/estimate_test.go +++ b/pkg/statistics/estimate_test.go @@ -146,14 +146,16 @@ func TestEstimateGlobalSingletonBySketches(t *testing.T) { }) t.Run("EmptyInput", func(t *testing.T) { - got := EstimateGlobalSingletonBySketches(nil, nil) - require.Equal(t, uint64(0), got) + require.PanicsWithValue(t, "assert failed, ndvSketches shouldn't be empty", func() { + EstimateGlobalSingletonBySketches(nil, nil) + }) }) t.Run("MismatchedLengths", func(t *testing.T) { ndvSketches := []*FMSketch{newFMSketchFromHashValues(a)} singletonSketches := []*FMSketch{newFMSketchFromHashValues(a), newFMSketchFromHashValues(b)} - got := EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches) - require.Equal(t, uint64(0), got) + require.PanicsWithValue(t, "assert failed, ndvSketches and singletonSketches should have the same length", func() { + EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches) + }) }) } From 012a6f560748ad7af9c022e5668fa860e4696f8f Mon Sep 17 00:00:00 2001 From: 0xPoe Date: Tue, 7 Apr 2026 18:06:13 +0200 Subject: [PATCH 05/21] docs: rename region to node in global singleton estimation The algorithm is not region-specific; use the more general term "node" to describe each partition that contributes local sketches. --- pkg/statistics/estimate.go | 54 ++++++++++++++++----------------- pkg/statistics/estimate_test.go | 32 +++++++++---------- 2 files changed, 43 insertions(+), 43 deletions(-) diff --git a/pkg/statistics/estimate.go b/pkg/statistics/estimate.go index 78cfbb00f6b5d..17f8f62f7e5b7 100644 --- a/pkg/statistics/estimate.go +++ b/pkg/statistics/estimate.go @@ -67,44 +67,44 @@ func EstimateNDVByGEE(sampleNDV, singletonItems, sampleSize, rowCount uint64) ui } // EstimateGlobalSingletonBySketches estimates the global singleton count using NDV and singleton sketches. -// For each region i, we ask: how many of region i's local singletons -// never appeared in any other region? Those are the values that are -// truly unique across the entire dataset, contributed by region i. +// For each node i, we ask: how many of node i's local singletons +// never appeared in any other node? Those are the values that are +// truly unique across the entire dataset, contributed by node i. // -// We compute this by merging all *other* regions' NDV sketches (their full -// distinct-value sets), then checking how much region i's local singletons -// grow that union. The growth is exactly the count of region i's singletons -// that no other region has seen. +// We compute this by merging all *other* nodes' NDV sketches (their full +// distinct-value sets), then checking how much node i's local singletons +// grow that union. The growth is exactly the count of node i's singletons +// that no other node has seen. // -// Summing these per-region contributions gives the global singleton estimate. +// Summing these per-node contributions gives the global singleton estimate. // // The current implementation rebuilds "all except i" from scratch for each -// region, which is O(k²) in the number of regions. A prefix-suffix approach +// node, which is O(k²) in the number of nodes. A prefix-suffix approach // could reduce this to O(k) time, but would require O(k) extra sketches // (~160KB each), which risks significant memory pressure for tables with -// many TiKV regions. We keep the O(k²) approach +// many nodes. We keep the O(k²) approach // for its O(1) memory overhead. // -// Example with three regions: +// Example with three nodes: // -// Region 0 all distinct values: {a, b, c} local singletons: {a, b, c} -// Region 1 all distinct values: {b, c, d} local singletons: {b, d} -// Region 2 all distinct values: {c, e, f} local singletons: {e, f} +// Node 0 all distinct values: {a, b, c} local singletons: {a, b, c} +// Node 1 all distinct values: {b, c, d} local singletons: {b, d} +// Node 2 all distinct values: {c, e, f} local singletons: {e, f} // // True global frequencies: a×1, b×2, c×3, d×1, e×1, f×1 // True singletons = 4 (the values {a, d, e, f} appear exactly once globally) // -// Region 0: others' NDV = {b,c,d,e,f} (size 5) -// + region 0 singletons {a,b,c} = {a,b,c,d,e,f} (size 6) -// contribution = 1 (only `a` is new) +// Node 0: others' NDV = {b,c,d,e,f} (size 5) +// + node 0 singletons {a,b,c} = {a,b,c,d,e,f} (size 6) +// contribution = 1 (only `a` is new) // -// Region 1: others' NDV = {a,b,c,e,f} (size 5) -// + region 1 singletons {b,d} = {a,b,c,d,e,f} (size 6) -// contribution = 1 (only `d` is new) +// Node 1: others' NDV = {a,b,c,e,f} (size 5) +// + node 1 singletons {b,d} = {a,b,c,d,e,f} (size 6) +// contribution = 1 (only `d` is new) // -// Region 2: others' NDV = {a,b,c,d} (size 4) -// + region 2 singletons {e,f} = {a,b,c,d,e,f} (size 6) -// contribution = 2 (`e` and `f` are new) +// Node 2: others' NDV = {a,b,c,d} (size 4) +// + node 2 singletons {e,f} = {a,b,c,d,e,f} (size 6) +// contribution = 2 (`e` and `f` are new) // // Estimated singletons = 1 + 1 + 2 = 4 func EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches []*FMSketch) uint64 { @@ -117,7 +117,7 @@ func EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches []*FMSketc var globalSingleton int64 for i := range ndvSketches { - // Merge NDV sketches from all regions except region i. + // Merge NDV sketches from all nodes except node i. var other *FMSketch for j, ns := range ndvSketches { if j == i || ns == nil { @@ -136,9 +136,9 @@ func EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches []*FMSketc ndvOther = other.NDV() } - // Merge the other-regions sketch with region i's singleton sketch. - // ndvUnion - ndvOther gives the count of region i's singletons - // that don't appear in any other region (i.e. globally unique values). + // Merge the other-nodes sketch with node i's singleton sketch. + // ndvUnion - ndvOther gives the count of node i's singletons + // that don't appear in any other node (i.e. globally unique values). // We already captured ndvOther, so we can safely reuse other here. if other != nil { if singletonSketches[i] != nil { diff --git a/pkg/statistics/estimate_test.go b/pkg/statistics/estimate_test.go index 7c18c795b6818..a6f03a871f00b 100644 --- a/pkg/statistics/estimate_test.go +++ b/pkg/statistics/estimate_test.go @@ -43,9 +43,9 @@ func TestEstimateGlobalSingletonBySketches(t *testing.T) { ) t.Run("DocCommentExample", func(t *testing.T) { - // Region 0: all distinct = {a, b, c}, local singletons = {a, b, c} - // Region 1: all distinct = {b, c, d}, local singletons = {b, d} - // Region 2: all distinct = {c, e, f}, local singletons = {e, f} + // Node 0: all distinct = {a, b, c}, local singletons = {a, b, c} + // Node 1: all distinct = {b, c, d}, local singletons = {b, d} + // Node 2: all distinct = {c, e, f}, local singletons = {e, f} // Global singletons = {a, d, e, f} = 4 ndvSketches := []*FMSketch{ newFMSketchFromHashValues(a, b, c), @@ -61,8 +61,8 @@ func TestEstimateGlobalSingletonBySketches(t *testing.T) { require.Equal(t, uint64(4), got) }) - t.Run("SingleRegion", func(t *testing.T) { - // With one region, all local singletons are global singletons. + t.Run("SingleNode", func(t *testing.T) { + // With one node, all local singletons are global singletons. ndvSketches := []*FMSketch{ newFMSketchFromHashValues(a, b, c), } @@ -74,7 +74,7 @@ func TestEstimateGlobalSingletonBySketches(t *testing.T) { }) t.Run("NoOverlap", func(t *testing.T) { - // Regions have disjoint values. All local singletons are global singletons. + // Nodes have disjoint values. All local singletons are global singletons. ndvSketches := []*FMSketch{ newFMSketchFromHashValues(a, b), newFMSketchFromHashValues(c, d), @@ -90,10 +90,10 @@ func TestEstimateGlobalSingletonBySketches(t *testing.T) { }) t.Run("FullOverlap", func(t *testing.T) { - // Every local singleton also appears in another region's NDV. - // Region 0: all = {a, b}, singletons = {a, b} - // Region 1: all = {a, b}, singletons = {a, b} - // No value is unique to a single region → 0 global singletons. + // Every local singleton also appears in another node's NDV. + // Node 0: all = {a, b}, singletons = {a, b} + // Node 1: all = {a, b}, singletons = {a, b} + // No value is unique to a single node → 0 global singletons. ndvSketches := []*FMSketch{ newFMSketchFromHashValues(a, b), newFMSketchFromHashValues(a, b), @@ -107,7 +107,7 @@ func TestEstimateGlobalSingletonBySketches(t *testing.T) { }) t.Run("NilSingletonSketches", func(t *testing.T) { - // A region with nil singleton sketch contributes 0. + // A node with nil singleton sketch contributes 0. ndvSketches := []*FMSketch{ newFMSketchFromHashValues(a, b), newFMSketchFromHashValues(c, d), @@ -116,14 +116,14 @@ func TestEstimateGlobalSingletonBySketches(t *testing.T) { nil, newFMSketchFromHashValues(c, d), } - // Region 0 contributes 0 (nil singleton). - // Region 1: others' NDV = {a, b}, union with {c, d} = {a, b, c, d} → contribution = 2. + // Node 0 contributes 0 (nil singleton). + // Node 1: others' NDV = {a, b}, union with {c, d} = {a, b, c, d} → contribution = 2. got := EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches) require.Equal(t, uint64(2), got) }) t.Run("NilNDVSketches", func(t *testing.T) { - // A region with nil NDV sketch is skipped when building "others". + // A node with nil NDV sketch is skipped when building "others". ndvSketches := []*FMSketch{ nil, newFMSketchFromHashValues(c, d), @@ -132,8 +132,8 @@ func TestEstimateGlobalSingletonBySketches(t *testing.T) { newFMSketchFromHashValues(a, b), newFMSketchFromHashValues(c, d), } - // Region 0: others' NDV = {c, d}, union with {a, b} = {a, b, c, d} → contribution = 2. - // Region 1: others' NDV = {} (nil), union with {c, d} = {c, d} → contribution = 2. + // Node 0: others' NDV = {c, d}, union with {a, b} = {a, b, c, d} → contribution = 2. + // Node 1: others' NDV = {} (nil), union with {c, d} = {c, d} → contribution = 2. got := EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches) require.Equal(t, uint64(4), got) }) From 7b1b93888657232fea6bf59656a53fe8ed8f6e18 Mon Sep 17 00:00:00 2001 From: 0xPoe Date: Tue, 7 Apr 2026 18:17:14 +0200 Subject: [PATCH 06/21] fix: do not allow nil Signed-off-by: 0xPoe --- pkg/statistics/estimate.go | 4 +++ pkg/statistics/estimate_test.go | 50 +++++++++------------------------ 2 files changed, 17 insertions(+), 37 deletions(-) diff --git a/pkg/statistics/estimate.go b/pkg/statistics/estimate.go index 17f8f62f7e5b7..08bdb89a2b64e 100644 --- a/pkg/statistics/estimate.go +++ b/pkg/statistics/estimate.go @@ -110,6 +110,10 @@ func EstimateNDVByGEE(sampleNDV, singletonItems, sampleSize, rowCount uint64) ui func EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches []*FMSketch) uint64 { intest.Assert(len(ndvSketches) > 0, "ndvSketches shouldn't be empty") intest.Assert(len(ndvSketches) == len(singletonSketches), "ndvSketches and singletonSketches should have the same length") + for i := range ndvSketches { + intest.Assert(ndvSketches[i] != nil, "ndvSketches must not contain nil entries") + intest.Assert(singletonSketches[i] != nil, "singletonSketches must not contain nil entries") + } // Defensive checks. if len(ndvSketches) == 0 || len(ndvSketches) != len(singletonSketches) { return 0 diff --git a/pkg/statistics/estimate_test.go b/pkg/statistics/estimate_test.go index a6f03a871f00b..3a860d8e69a2f 100644 --- a/pkg/statistics/estimate_test.go +++ b/pkg/statistics/estimate_test.go @@ -106,43 +106,19 @@ func TestEstimateGlobalSingletonBySketches(t *testing.T) { require.Equal(t, uint64(0), got) }) - t.Run("NilSingletonSketches", func(t *testing.T) { - // A node with nil singleton sketch contributes 0. - ndvSketches := []*FMSketch{ - newFMSketchFromHashValues(a, b), - newFMSketchFromHashValues(c, d), - } - singletonSketches := []*FMSketch{ - nil, - newFMSketchFromHashValues(c, d), - } - // Node 0 contributes 0 (nil singleton). - // Node 1: others' NDV = {a, b}, union with {c, d} = {a, b, c, d} → contribution = 2. - got := EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches) - require.Equal(t, uint64(2), got) - }) - - t.Run("NilNDVSketches", func(t *testing.T) { - // A node with nil NDV sketch is skipped when building "others". - ndvSketches := []*FMSketch{ - nil, - newFMSketchFromHashValues(c, d), - } - singletonSketches := []*FMSketch{ - newFMSketchFromHashValues(a, b), - newFMSketchFromHashValues(c, d), - } - // Node 0: others' NDV = {c, d}, union with {a, b} = {a, b, c, d} → contribution = 2. - // Node 1: others' NDV = {} (nil), union with {c, d} = {c, d} → contribution = 2. - got := EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches) - require.Equal(t, uint64(4), got) - }) - - t.Run("AllNilSketches", func(t *testing.T) { - ndvSketches := []*FMSketch{nil, nil} - singletonSketches := []*FMSketch{nil, nil} - got := EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches) - require.Equal(t, uint64(0), got) + t.Run("NilEntry", func(t *testing.T) { + require.PanicsWithValue(t, "assert failed, ndvSketches must not contain nil entries", func() { + EstimateGlobalSingletonBySketches( + []*FMSketch{nil, newFMSketchFromHashValues(c, d)}, + []*FMSketch{newFMSketchFromHashValues(a, b), newFMSketchFromHashValues(c, d)}, + ) + }) + require.PanicsWithValue(t, "assert failed, singletonSketches must not contain nil entries", func() { + EstimateGlobalSingletonBySketches( + []*FMSketch{newFMSketchFromHashValues(a, b), newFMSketchFromHashValues(c, d)}, + []*FMSketch{nil, newFMSketchFromHashValues(c, d)}, + ) + }) }) t.Run("EmptyInput", func(t *testing.T) { From e2654f6818f01d491896b586dd10ebec3d772451 Mon Sep 17 00:00:00 2001 From: 0xPoe Date: Wed, 8 Apr 2026 10:19:53 +0200 Subject: [PATCH 07/21] fix: move checks under comments Signed-off-by: 0xPoe --- pkg/statistics/estimate.go | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/pkg/statistics/estimate.go b/pkg/statistics/estimate.go index 08bdb89a2b64e..2520cab0e834a 100644 --- a/pkg/statistics/estimate.go +++ b/pkg/statistics/estimate.go @@ -108,13 +108,20 @@ func EstimateNDVByGEE(sampleNDV, singletonItems, sampleSize, rowCount uint64) ui // // Estimated singletons = 1 + 1 + 2 = 4 func EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches []*FMSketch) uint64 { + // Defensive checks. intest.Assert(len(ndvSketches) > 0, "ndvSketches shouldn't be empty") intest.Assert(len(ndvSketches) == len(singletonSketches), "ndvSketches and singletonSketches should have the same length") - for i := range ndvSketches { - intest.Assert(ndvSketches[i] != nil, "ndvSketches must not contain nil entries") - intest.Assert(singletonSketches[i] != nil, "singletonSketches must not contain nil entries") - } - // Defensive checks. + intest.AssertFunc(func() bool { + for i := range ndvSketches { + if ndvSketches[i] == nil { + return false + } + if singletonSketches[i] == nil { + return false + } + } + return true + }, "ndvSketches and singletonSketches should never contains nil values") if len(ndvSketches) == 0 || len(ndvSketches) != len(singletonSketches) { return 0 } @@ -158,8 +165,6 @@ func EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches []*FMSketc } globalSingleton += ndvUnion - ndvOther } - if globalSingleton < 0 { - return 0 - } + intest.Assert(globalSingleton >= 0, "globalSingleton must be positive") return uint64(globalSingleton) } From 9f1c109c6b7662a872bb1bb68a88aafab33bd499 Mon Sep 17 00:00:00 2001 From: 0xPoe Date: Wed, 8 Apr 2026 11:11:57 +0200 Subject: [PATCH 08/21] fix: handle the negative delta Signed-off-by: 0xPoe --- pkg/statistics/estimate.go | 5 +++- pkg/statistics/estimate_test.go | 45 +++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/pkg/statistics/estimate.go b/pkg/statistics/estimate.go index 2520cab0e834a..9c382dd80895c 100644 --- a/pkg/statistics/estimate.go +++ b/pkg/statistics/estimate.go @@ -163,7 +163,10 @@ func EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches []*FMSketc if other != nil { ndvUnion = other.NDV() } - globalSingleton += ndvUnion - ndvOther + // FM sketch NDV estimates are not monotone under merge, so the estimated + // union can be smaller than ndvOther. Clamp the per-node contribution to 0. + delta := max(0, ndvUnion-ndvOther) + globalSingleton += delta } intest.Assert(globalSingleton >= 0, "globalSingleton must be positive") return uint64(globalSingleton) diff --git a/pkg/statistics/estimate_test.go b/pkg/statistics/estimate_test.go index 3a860d8e69a2f..568cea568953e 100644 --- a/pkg/statistics/estimate_test.go +++ b/pkg/statistics/estimate_test.go @@ -17,6 +17,8 @@ package statistics import ( "testing" + "github.com/pingcap/tidb/pkg/types" + "github.com/pingcap/tidb/pkg/util/mock" "github.com/stretchr/testify/require" ) @@ -30,6 +32,31 @@ func newFMSketchFromHashValues(vals ...uint64) *FMSketch { return s } +// newFMSketchesFromSamples builds a pair of sketches from the same sample rows: +// the NDV sketch sees every sample, while the singleton sketch keeps only +// values that occur exactly once in that sample set. +func newFMSketchesFromSamples(t *testing.T, maxSize int, samples ...int64) (*FMSketch, *FMSketch) { + t.Helper() + ctx := mock.NewContext() + ndvSketch := NewFMSketch(maxSize) + singletonSketch := NewFMSketch(maxSize) + counts := make(map[int64]int, len(samples)) + for _, v := range samples { + counts[v]++ + err := ndvSketch.InsertValue(ctx.GetSessionVars().StmtCtx, types.NewIntDatum(v)) + require.NoError(t, err) + } + for _, v := range samples { + if counts[v] != 1 { + continue + } + err := singletonSketch.InsertValue(ctx.GetSessionVars().StmtCtx, types.NewIntDatum(v)) + require.NoError(t, err) + delete(counts, v) + } + return ndvSketch, singletonSketch +} + func TestEstimateGlobalSingletonBySketches(t *testing.T) { // Use distinct hash values to represent distinct data values. // With maxSize=1000 and few insertions, mask stays 0 so NDV = len(hashset) (exact). @@ -106,6 +133,24 @@ func TestEstimateGlobalSingletonBySketches(t *testing.T) { require.Equal(t, uint64(0), got) }) + t.Run("NegativeContributionIsClamped", func(t *testing.T) { + // Both sketches are built from the same local samples. + // + // Node 0 samples: [0] + // Node 1 samples: [0, 0, 0, 1, 1, 4, 7] + // + // The true global singleton set is {4, 7}, so the result should be 2. + // Before the fix, node 0's contribution could become negative due to FM + // sketch merge behavior, making the final estimate incorrect. + ndv0, singleton0 := newFMSketchesFromSamples(t, 3, 0) + ndv1, singleton1 := newFMSketchesFromSamples(t, 3, 0, 0, 0, 1, 1, 4, 7) + ndvSketches := []*FMSketch{ndv0, ndv1} + singletonSketches := []*FMSketch{singleton0, singleton1} + + got := EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches) + require.Equal(t, uint64(2), got) + }) + t.Run("NilEntry", func(t *testing.T) { require.PanicsWithValue(t, "assert failed, ndvSketches must not contain nil entries", func() { EstimateGlobalSingletonBySketches( From ec78c2fef1be3c71bb20c4090dedd454f9d5f960 Mon Sep 17 00:00:00 2001 From: 0xPoe Date: Wed, 8 Apr 2026 11:32:39 +0200 Subject: [PATCH 09/21] docs: more commnets Signed-off-by: 0xPoe --- pkg/statistics/estimate.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/statistics/estimate.go b/pkg/statistics/estimate.go index 9c382dd80895c..f633ed8900e83 100644 --- a/pkg/statistics/estimate.go +++ b/pkg/statistics/estimate.go @@ -165,6 +165,7 @@ func EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches []*FMSketc } // FM sketch NDV estimates are not monotone under merge, so the estimated // union can be smaller than ndvOther. Clamp the per-node contribution to 0. + // In practice, this appears to be fairly rare. delta := max(0, ndvUnion-ndvOther) globalSingleton += delta } From 352d92bdba57be0d87c821f0c5243ddd03044dee Mon Sep 17 00:00:00 2001 From: 0xPoe Date: Wed, 8 Apr 2026 12:05:23 +0200 Subject: [PATCH 10/21] feat: impove the perf Signed-off-by: 0xPoe --- pkg/statistics/estimate.go | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/pkg/statistics/estimate.go b/pkg/statistics/estimate.go index f633ed8900e83..88657660f4fcb 100644 --- a/pkg/statistics/estimate.go +++ b/pkg/statistics/estimate.go @@ -78,12 +78,12 @@ func EstimateNDVByGEE(sampleNDV, singletonItems, sampleSize, rowCount uint64) ui // // Summing these per-node contributions gives the global singleton estimate. // -// The current implementation rebuilds "all except i" from scratch for each -// node, which is O(k²) in the number of nodes. A prefix-suffix approach -// could reduce this to O(k) time, but would require O(k) extra sketches -// (~160KB each), which risks significant memory pressure for tables with -// many nodes. We keep the O(k²) approach -// for its O(1) memory overhead. +// The implementation keeps a rolling prefix union for nodes before i and +// rebuilds only the suffix union for nodes after i. That keeps the O(k²) +// time complexity but avoids roughly half of the repeated merge work while +// preserving O(1) extra sketches. A full prefix-suffix cache could reduce +// the runtime to O(k), but it would require O(k) extra sketches (~160KB +// each), which risks significant memory pressure for tables with many nodes. // // Example with three nodes: // @@ -127,20 +127,29 @@ func EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches []*FMSketc } var globalSingleton int64 + var prefixNDV *FMSketch for i := range ndvSketches { - // Merge NDV sketches from all nodes except node i. + // Rebuild the suffix union from nodes after i, then merge the rolling + // prefix union from nodes before i. var other *FMSketch - for j, ns := range ndvSketches { - if j == i || ns == nil { + for j := i + 1; j < len(ndvSketches); j++ { + ns := ndvSketches[j] + if ns == nil { continue } - if other == nil { other = ns.Copy() continue } other.MergeFMSketch(ns) } + if prefixNDV != nil { + if other == nil { + other = prefixNDV.Copy() + } else { + other.MergeFMSketch(prefixNDV) + } + } ndvOther := int64(0) if other != nil { @@ -168,6 +177,15 @@ func EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches []*FMSketc // In practice, this appears to be fairly rare. delta := max(0, ndvUnion-ndvOther) globalSingleton += delta + + if ndvSketches[i] == nil { + continue + } + if prefixNDV == nil { + prefixNDV = ndvSketches[i].Copy() + } else { + prefixNDV.MergeFMSketch(ndvSketches[i]) + } } intest.Assert(globalSingleton >= 0, "globalSingleton must be positive") return uint64(globalSingleton) From 03d8d0ce793cd86474a692ccbbf16d1c4bf4bf79 Mon Sep 17 00:00:00 2001 From: 0xPoe Date: Wed, 8 Apr 2026 12:05:54 +0200 Subject: [PATCH 11/21] fix: correct the size Signed-off-by: 0xPoe --- pkg/statistics/estimate.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/statistics/estimate.go b/pkg/statistics/estimate.go index 88657660f4fcb..b1faeae629cd0 100644 --- a/pkg/statistics/estimate.go +++ b/pkg/statistics/estimate.go @@ -82,7 +82,7 @@ func EstimateNDVByGEE(sampleNDV, singletonItems, sampleSize, rowCount uint64) ui // rebuilds only the suffix union for nodes after i. That keeps the O(k²) // time complexity but avoids roughly half of the repeated merge work while // preserving O(1) extra sketches. A full prefix-suffix cache could reduce -// the runtime to O(k), but it would require O(k) extra sketches (~160KB +// the runtime to O(k), but it would require O(k) extra sketches (~80KB // each), which risks significant memory pressure for tables with many nodes. // // Example with three nodes: From e11f2a33f114342d8621d2e59176039638a4169f Mon Sep 17 00:00:00 2001 From: 0xPoe Date: Wed, 8 Apr 2026 12:27:46 +0200 Subject: [PATCH 12/21] fix: better code Signed-off-by: 0xPoe --- pkg/statistics/estimate.go | 68 ++++++++++++++------------------------ 1 file changed, 24 insertions(+), 44 deletions(-) diff --git a/pkg/statistics/estimate.go b/pkg/statistics/estimate.go index b1faeae629cd0..22afb53bb3c69 100644 --- a/pkg/statistics/estimate.go +++ b/pkg/statistics/estimate.go @@ -112,16 +112,21 @@ func EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches []*FMSketc intest.Assert(len(ndvSketches) > 0, "ndvSketches shouldn't be empty") intest.Assert(len(ndvSketches) == len(singletonSketches), "ndvSketches and singletonSketches should have the same length") intest.AssertFunc(func() bool { - for i := range ndvSketches { - if ndvSketches[i] == nil { + for _, ndvSketch := range ndvSketches { + if ndvSketch == nil { return false } - if singletonSketches[i] == nil { + } + return true + }, "ndvSketches must not contain nil entries") + intest.AssertFunc(func() bool { + for _, singletonSketch := range singletonSketches { + if singletonSketch == nil { return false } } return true - }, "ndvSketches and singletonSketches should never contains nil values") + }, "singletonSketches must not contain nil entries") if len(ndvSketches) == 0 || len(ndvSketches) != len(singletonSketches) { return 0 } @@ -129,44 +134,16 @@ func EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches []*FMSketc var globalSingleton int64 var prefixNDV *FMSketch for i := range ndvSketches { - // Rebuild the suffix union from nodes after i, then merge the rolling - // prefix union from nodes before i. - var other *FMSketch + other := mergeCopiedFMSketch(nil, prefixNDV) for j := i + 1; j < len(ndvSketches); j++ { - ns := ndvSketches[j] - if ns == nil { - continue - } - if other == nil { - other = ns.Copy() - continue - } - other.MergeFMSketch(ns) - } - if prefixNDV != nil { - if other == nil { - other = prefixNDV.Copy() - } else { - other.MergeFMSketch(prefixNDV) - } + other = mergeCopiedFMSketch(other, ndvSketches[j]) } ndvOther := int64(0) if other != nil { ndvOther = other.NDV() } - - // Merge the other-nodes sketch with node i's singleton sketch. - // ndvUnion - ndvOther gives the count of node i's singletons - // that don't appear in any other node (i.e. globally unique values). - // We already captured ndvOther, so we can safely reuse other here. - if other != nil { - if singletonSketches[i] != nil { - other.MergeFMSketch(singletonSketches[i]) - } - } else if singletonSketches[i] != nil { - other = singletonSketches[i].Copy() - } + other = mergeCopiedFMSketch(other, singletonSketches[i]) ndvUnion := int64(0) if other != nil { @@ -177,16 +154,19 @@ func EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches []*FMSketc // In practice, this appears to be fairly rare. delta := max(0, ndvUnion-ndvOther) globalSingleton += delta - - if ndvSketches[i] == nil { - continue - } - if prefixNDV == nil { - prefixNDV = ndvSketches[i].Copy() - } else { - prefixNDV.MergeFMSketch(ndvSketches[i]) - } + prefixNDV = mergeCopiedFMSketch(prefixNDV, ndvSketches[i]) } intest.Assert(globalSingleton >= 0, "globalSingleton must be positive") return uint64(globalSingleton) } + +func mergeCopiedFMSketch(dst, src *FMSketch) *FMSketch { + if src == nil { + return dst + } + if dst == nil { + return src.Copy() + } + dst.MergeFMSketch(src) + return dst +} From beb4fc8f89701d9bac91b987769ce56a27f4bd07 Mon Sep 17 00:00:00 2001 From: 0xPoe Date: Wed, 8 Apr 2026 13:25:01 +0200 Subject: [PATCH 13/21] feat: improve perf futher Signed-off-by: 0xPoe --- pkg/statistics/estimate.go | 46 +++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/pkg/statistics/estimate.go b/pkg/statistics/estimate.go index 22afb53bb3c69..6eef9c7d8c22c 100644 --- a/pkg/statistics/estimate.go +++ b/pkg/statistics/estimate.go @@ -78,12 +78,14 @@ func EstimateNDVByGEE(sampleNDV, singletonItems, sampleSize, rowCount uint64) ui // // Summing these per-node contributions gives the global singleton estimate. // -// The implementation keeps a rolling prefix union for nodes before i and -// rebuilds only the suffix union for nodes after i. That keeps the O(k²) -// time complexity but avoids roughly half of the repeated merge work while -// preserving O(1) extra sketches. A full prefix-suffix cache could reduce -// the runtime to O(k), but it would require O(k) extra sketches (~80KB -// each), which risks significant memory pressure for tables with many nodes. +// The implementation splits the nodes into two halves, precomputes one NDV +// union per half, and then rebuilds only the suffix within each half while +// keeping a rolling in-half prefix. That keeps the O(k²) time complexity +// but cuts repeated merge work to roughly one quarter of the naive +// rebuild-from-scratch loop while preserving O(1) extra sketches. A full +// prefix-suffix cache could reduce the runtime to O(k), but it would require +// O(k) extra sketches (~80KB each), which risks significant memory pressure +// for tables with many nodes. // // Example with three nodes: // @@ -131,13 +133,33 @@ func EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches []*FMSketc return 0 } + mid := (len(ndvSketches) + 1) / 2 + var leftHalfNDV *FMSketch + for i := range mid { + leftHalfNDV = mergeCopiedFMSketch(leftHalfNDV, ndvSketches[i]) + } + var rightHalfNDV *FMSketch + for i := mid; i < len(ndvSketches); i++ { + rightHalfNDV = mergeCopiedFMSketch(rightHalfNDV, ndvSketches[i]) + } + + globalSingleton := estimateGlobalSingletonInRange(ndvSketches, singletonSketches, 0, mid, nil, rightHalfNDV) + globalSingleton += estimateGlobalSingletonInRange(ndvSketches, singletonSketches, mid, len(ndvSketches), leftHalfNDV, nil) + intest.Assert(globalSingleton >= 0, "globalSingleton must be positive") + // SAFETY: Each per-node contribution is clamped to >= 0 before accumulation. + return uint64(globalSingleton) +} + +func estimateGlobalSingletonInRange(ndvSketches, singletonSketches []*FMSketch, start, end int, beforeRangeNDV, afterRangeNDV *FMSketch) int64 { var globalSingleton int64 var prefixNDV *FMSketch - for i := range ndvSketches { - other := mergeCopiedFMSketch(nil, prefixNDV) - for j := i + 1; j < len(ndvSketches); j++ { + for i := start; i < end; i++ { + other := mergeCopiedFMSketch(nil, beforeRangeNDV) + other = mergeCopiedFMSketch(other, prefixNDV) + for j := i + 1; j < end; j++ { other = mergeCopiedFMSketch(other, ndvSketches[j]) } + other = mergeCopiedFMSketch(other, afterRangeNDV) ndvOther := int64(0) if other != nil { @@ -152,12 +174,10 @@ func EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches []*FMSketc // FM sketch NDV estimates are not monotone under merge, so the estimated // union can be smaller than ndvOther. Clamp the per-node contribution to 0. // In practice, this appears to be fairly rare. - delta := max(0, ndvUnion-ndvOther) - globalSingleton += delta + globalSingleton += max(0, ndvUnion-ndvOther) prefixNDV = mergeCopiedFMSketch(prefixNDV, ndvSketches[i]) } - intest.Assert(globalSingleton >= 0, "globalSingleton must be positive") - return uint64(globalSingleton) + return globalSingleton } func mergeCopiedFMSketch(dst, src *FMSketch) *FMSketch { From 8e0c541e1b3b2e19e911012e33fc1724aefd62ed Mon Sep 17 00:00:00 2001 From: 0xPoe Date: Wed, 8 Apr 2026 13:30:09 +0200 Subject: [PATCH 14/21] fix: avoid overflow Signed-off-by: 0xPoe --- pkg/statistics/estimate.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/statistics/estimate.go b/pkg/statistics/estimate.go index 6eef9c7d8c22c..3ec0076237593 100644 --- a/pkg/statistics/estimate.go +++ b/pkg/statistics/estimate.go @@ -133,7 +133,7 @@ func EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches []*FMSketc return 0 } - mid := (len(ndvSketches) + 1) / 2 + mid := len(ndvSketches) - len(ndvSketches)/2 var leftHalfNDV *FMSketch for i := range mid { leftHalfNDV = mergeCopiedFMSketch(leftHalfNDV, ndvSketches[i]) From 0ffa0d1f471db007843d860ab2117e7069c2e4ea Mon Sep 17 00:00:00 2001 From: 0xPoe Date: Wed, 8 Apr 2026 13:40:16 +0200 Subject: [PATCH 15/21] docs: add notes Signed-off-by: 0xPoe --- pkg/statistics/estimate.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/statistics/estimate.go b/pkg/statistics/estimate.go index 3ec0076237593..a74e54a78f579 100644 --- a/pkg/statistics/estimate.go +++ b/pkg/statistics/estimate.go @@ -143,10 +143,11 @@ func EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches []*FMSketc rightHalfNDV = mergeCopiedFMSketch(rightHalfNDV, ndvSketches[i]) } + // NOTE: For each node, we still merge every other node's NDV sketch. globalSingleton := estimateGlobalSingletonInRange(ndvSketches, singletonSketches, 0, mid, nil, rightHalfNDV) globalSingleton += estimateGlobalSingletonInRange(ndvSketches, singletonSketches, mid, len(ndvSketches), leftHalfNDV, nil) - intest.Assert(globalSingleton >= 0, "globalSingleton must be positive") // SAFETY: Each per-node contribution is clamped to >= 0 before accumulation. + intest.Assert(globalSingleton >= 0, "globalSingleton must be positive") return uint64(globalSingleton) } From 918e33da9e48f2a9816b246d1b9decbd848e5d11 Mon Sep 17 00:00:00 2001 From: Dongpo Liu Date: Wed, 8 Apr 2026 16:08:34 +0200 Subject: [PATCH 16/21] Update pkg/statistics/estimate.go Co-authored-by: Mattias Jonsson --- pkg/statistics/estimate.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/statistics/estimate.go b/pkg/statistics/estimate.go index a74e54a78f579..72942dfa08624 100644 --- a/pkg/statistics/estimate.go +++ b/pkg/statistics/estimate.go @@ -73,8 +73,8 @@ func EstimateNDVByGEE(sampleNDV, singletonItems, sampleSize, rowCount uint64) ui // // We compute this by merging all *other* nodes' NDV sketches (their full // distinct-value sets), then checking how much node i's local singletons -// grow that union. The growth is exactly the count of node i's singletons -// that no other node has seen. +// grow that union. The growth is approximately node i's singleton's +// FMSketch that no other node has seen. // // Summing these per-node contributions gives the global singleton estimate. // From 2a37e83a97b9cf2a926bb57fb2bccfb5255c4966 Mon Sep 17 00:00:00 2001 From: 0xPoe Date: Thu, 9 Apr 2026 09:37:58 +0200 Subject: [PATCH 17/21] refactor: reduce noisy Signed-off-by: 0xPoe --- pkg/statistics/estimate.go | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/pkg/statistics/estimate.go b/pkg/statistics/estimate.go index 72942dfa08624..116b64cbc339d 100644 --- a/pkg/statistics/estimate.go +++ b/pkg/statistics/estimate.go @@ -135,32 +135,31 @@ func EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches []*FMSketc mid := len(ndvSketches) - len(ndvSketches)/2 var leftHalfNDV *FMSketch - for i := range mid { - leftHalfNDV = mergeCopiedFMSketch(leftHalfNDV, ndvSketches[i]) + for _, sketch := range ndvSketches[:mid] { + leftHalfNDV = mergeCopiedFMSketch(leftHalfNDV, sketch) } var rightHalfNDV *FMSketch - for i := mid; i < len(ndvSketches); i++ { - rightHalfNDV = mergeCopiedFMSketch(rightHalfNDV, ndvSketches[i]) + for _, sketch := range ndvSketches[mid:] { + rightHalfNDV = mergeCopiedFMSketch(rightHalfNDV, sketch) } // NOTE: For each node, we still merge every other node's NDV sketch. - globalSingleton := estimateGlobalSingletonInRange(ndvSketches, singletonSketches, 0, mid, nil, rightHalfNDV) - globalSingleton += estimateGlobalSingletonInRange(ndvSketches, singletonSketches, mid, len(ndvSketches), leftHalfNDV, nil) + globalSingleton := estimateGlobalSingletonInRange(ndvSketches[:mid], singletonSketches[:mid], rightHalfNDV) + globalSingleton += estimateGlobalSingletonInRange(ndvSketches[mid:], singletonSketches[mid:], leftHalfNDV) // SAFETY: Each per-node contribution is clamped to >= 0 before accumulation. intest.Assert(globalSingleton >= 0, "globalSingleton must be positive") return uint64(globalSingleton) } -func estimateGlobalSingletonInRange(ndvSketches, singletonSketches []*FMSketch, start, end int, beforeRangeNDV, afterRangeNDV *FMSketch) int64 { +func estimateGlobalSingletonInRange(ndvSketches, singletonSketches []*FMSketch, outOfRangeNDVSketch *FMSketch) int64 { var globalSingleton int64 var prefixNDV *FMSketch - for i := start; i < end; i++ { - other := mergeCopiedFMSketch(nil, beforeRangeNDV) - other = mergeCopiedFMSketch(other, prefixNDV) - for j := i + 1; j < end; j++ { + for i := range ndvSketches { + other := mergeCopiedFMSketch(nil, prefixNDV) + for j := i + 1; j < len(ndvSketches); j++ { other = mergeCopiedFMSketch(other, ndvSketches[j]) } - other = mergeCopiedFMSketch(other, afterRangeNDV) + other = mergeCopiedFMSketch(other, outOfRangeNDVSketch) ndvOther := int64(0) if other != nil { From 479f7e75b3ff0ff0025226287351d09c0a4c11e2 Mon Sep 17 00:00:00 2001 From: 0xPoe Date: Thu, 9 Apr 2026 09:40:57 +0200 Subject: [PATCH 18/21] fix: remove useless nil checks Signed-off-by: 0xPoe --- pkg/statistics/estimate.go | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/pkg/statistics/estimate.go b/pkg/statistics/estimate.go index 116b64cbc339d..d6ec4af77eacb 100644 --- a/pkg/statistics/estimate.go +++ b/pkg/statistics/estimate.go @@ -153,29 +153,23 @@ func EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches []*FMSketc func estimateGlobalSingletonInRange(ndvSketches, singletonSketches []*FMSketch, outOfRangeNDVSketch *FMSketch) int64 { var globalSingleton int64 - var prefixNDV *FMSketch + var prefixNDVSketch *FMSketch for i := range ndvSketches { - other := mergeCopiedFMSketch(nil, prefixNDV) + other := mergeCopiedFMSketch(nil, prefixNDVSketch) for j := i + 1; j < len(ndvSketches); j++ { other = mergeCopiedFMSketch(other, ndvSketches[j]) } other = mergeCopiedFMSketch(other, outOfRangeNDVSketch) - ndvOther := int64(0) - if other != nil { - ndvOther = other.NDV() - } + ndvOther := other.NDV() other = mergeCopiedFMSketch(other, singletonSketches[i]) - ndvUnion := int64(0) - if other != nil { - ndvUnion = other.NDV() - } + ndvUnion := other.NDV() // FM sketch NDV estimates are not monotone under merge, so the estimated // union can be smaller than ndvOther. Clamp the per-node contribution to 0. // In practice, this appears to be fairly rare. globalSingleton += max(0, ndvUnion-ndvOther) - prefixNDV = mergeCopiedFMSketch(prefixNDV, ndvSketches[i]) + prefixNDVSketch = mergeCopiedFMSketch(prefixNDVSketch, ndvSketches[i]) } return globalSingleton } From 7b1de2a0e6802edac89dba435586a070711d3d3d Mon Sep 17 00:00:00 2001 From: 0xPoe Date: Thu, 9 Apr 2026 09:45:03 +0200 Subject: [PATCH 19/21] docs: add more comments Signed-off-by: 0xPoe --- pkg/statistics/estimate.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/statistics/estimate.go b/pkg/statistics/estimate.go index d6ec4af77eacb..bdc7129afd587 100644 --- a/pkg/statistics/estimate.go +++ b/pkg/statistics/estimate.go @@ -161,9 +161,11 @@ func estimateGlobalSingletonInRange(ndvSketches, singletonSketches []*FMSketch, } other = mergeCopiedFMSketch(other, outOfRangeNDVSketch) + // NDV of the union of all other nodes before merging this node's singletons. ndvOther := other.NDV() other = mergeCopiedFMSketch(other, singletonSketches[i]) + // NDV of the union after merging this node's singleton sketch. ndvUnion := other.NDV() // FM sketch NDV estimates are not monotone under merge, so the estimated // union can be smaller than ndvOther. Clamp the per-node contribution to 0. From cedb658a2532473359e658bdffdeeb1993bc739e Mon Sep 17 00:00:00 2001 From: 0xPoe Date: Thu, 9 Apr 2026 09:56:18 +0200 Subject: [PATCH 20/21] refactor: all use slice access Signed-off-by: 0xPoe --- pkg/statistics/estimate.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/statistics/estimate.go b/pkg/statistics/estimate.go index bdc7129afd587..79d8f1abbf99d 100644 --- a/pkg/statistics/estimate.go +++ b/pkg/statistics/estimate.go @@ -156,8 +156,8 @@ func estimateGlobalSingletonInRange(ndvSketches, singletonSketches []*FMSketch, var prefixNDVSketch *FMSketch for i := range ndvSketches { other := mergeCopiedFMSketch(nil, prefixNDVSketch) - for j := i + 1; j < len(ndvSketches); j++ { - other = mergeCopiedFMSketch(other, ndvSketches[j]) + for _, sketch := range ndvSketches[i+1:] { + other = mergeCopiedFMSketch(other, sketch) } other = mergeCopiedFMSketch(other, outOfRangeNDVSketch) From 35b6a87a746771b6eaee7209883f640baf7cfbbd Mon Sep 17 00:00:00 2001 From: 0xPoe Date: Thu, 9 Apr 2026 09:58:36 +0200 Subject: [PATCH 21/21] docs: add more comments Signed-off-by: 0xPoe --- pkg/statistics/estimate.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/statistics/estimate.go b/pkg/statistics/estimate.go index 79d8f1abbf99d..c4f1648a69fb9 100644 --- a/pkg/statistics/estimate.go +++ b/pkg/statistics/estimate.go @@ -153,6 +153,9 @@ func EstimateGlobalSingletonBySketches(ndvSketches, singletonSketches []*FMSketc func estimateGlobalSingletonInRange(ndvSketches, singletonSketches []*FMSketch, outOfRangeNDVSketch *FMSketch) int64 { var globalSingleton int64 + // prefixNDVSketch accumulates ndvSketches[0..i-1] as i advances, so + // each iteration only rebuilds the suffix (ndvSketches[i+1..]) from + // scratch instead of the full "all-except-i" set. var prefixNDVSketch *FMSketch for i := range ndvSketches { other := mergeCopiedFMSketch(nil, prefixNDVSketch)