From 4de1df92bca54b213949ca8e4d8f196eece7b4c9 Mon Sep 17 00:00:00 2001 From: stdpain Date: Fri, 8 May 2026 11:10:18 +0800 Subject: [PATCH 1/6] [BugFix] Resolve inconsistent global dictionary generation in flat JSON The heterogeneous JSON detection logic did not correctly handle cases where (1) a load batch did not generate a dictionary, or (2) a JSON path was extracted as a non-string type (such as int). As a result, later dictionary collection could incorrectly ignore these values, causing dictionary loss and inconsistency across loads. Signed-off-by: stdpain --- be/src/storage/meta_reader.cpp | 2 +- be/src/storage/rowset/column_iterator.h | 2 ++ .../rowset/default_value_column_iterator.h | 2 ++ test/sql/test_semi/R/test_flat_json_dict | 28 +++++++++---------- test/sql/test_semi/T/test_flat_json_dict | 2 +- 5 files changed, 20 insertions(+), 16 deletions(-) diff --git a/be/src/storage/meta_reader.cpp b/be/src/storage/meta_reader.cpp index 84ad2c9a43a930..453b7badb9ed21 100644 --- a/be/src/storage/meta_reader.cpp +++ b/be/src/storage/meta_reader.cpp @@ -428,7 +428,7 @@ Status SegmentMetaCollecter::_collect_dict_for_column(ColumnIterator* column_ite auto& tablet_column = _params->tablet_schema->column(cid); // For JSON data, the schema may be heterogeneous, meaning that some segments might not contain the dictionary column, // but a global dictionary could still be present and usable. 
- if (!tablet_column.is_extended()) { + if (!tablet_column.is_extended() || !column_iter->only_nulls()) { return Status::GlobalDictError("no global dict"); } else { return Status::OK(); diff --git a/be/src/storage/rowset/column_iterator.h b/be/src/storage/rowset/column_iterator.h index 0d1a61dde85b3e..76074ba703b89c 100644 --- a/be/src/storage/rowset/column_iterator.h +++ b/be/src/storage/rowset/column_iterator.h @@ -131,6 +131,8 @@ class ColumnIterator { virtual ordinal_t get_current_ordinal() const = 0; + virtual bool only_nulls() const { return false; } + virtual bool has_zone_map() const { return false; } /// Store the row ranges that satisfy the given predicates into |row_ranges|. diff --git a/be/src/storage/rowset/default_value_column_iterator.h b/be/src/storage/rowset/default_value_column_iterator.h index 4eeaa52e685ede..dd1ef4ad58d909 100644 --- a/be/src/storage/rowset/default_value_column_iterator.h +++ b/be/src/storage/rowset/default_value_column_iterator.h @@ -73,6 +73,8 @@ class DefaultValueColumnIterator final : public ColumnIterator { } } + bool only_nulls() const override { return _is_default_value_null; } + Status init(const ColumnIteratorOptions& opts) override; Status seek_to_first() override { diff --git a/test/sql/test_semi/R/test_flat_json_dict b/test/sql/test_semi/R/test_flat_json_dict index 5575258c6523a2..4a5b42424427c2 100644 --- a/test/sql/test_semi/R/test_flat_json_dict +++ b/test/sql/test_semi/R/test_flat_json_dict @@ -1,4 +1,4 @@ --- name: test_normal_flat_json_dict @sequential +-- name: test_normal_flat_json_dict update information_schema.be_configs set value = 'true' where name = 'enable_json_flat'; -- result: -- !result @@ -77,23 +77,23 @@ from (table(generate_series(1, 100))); -- !result select dict_merge(get_json_string(c1, 'f1'), 255) from js2 [_META_]; -- result: -{"2":{"lst":["str",10,"YTA","YTE","YTI","YTM","YTQ","YTU","YTY","YTc","YTg","YTk"]},"3":{"lst":["i32",10,1,2,3,4,5,6,7,8,9,10]}} +[REGEX].*no global dict.* -- !result 
select dict_merge(get_json_string(c1, 'f2'), 255) from js2 [_META_]; -- result: -{"2":{"lst":["str",100,"YTA","YTE","YTEw","YTEx","YTEy","YTEz","YTE0","YTE1","YTE2","YTE3","YTE4","YTE5","YTI","YTIw","YTIx","YTIy","YTIz","YTI0","YTI1","YTI2","YTI3","YTI4","YTI5","YTM","YTMw","YTMx","YTMy","YTMz","YTM0","YTM1","YTM2","YTM3","YTM4","YTM5","YTQ","YTQw","YTQx","YTQy","YTQz","YTQ0","YTQ1","YTQ2","YTQ3","YTQ4","YTQ5","YTU","YTUw","YTUx","YTUy","YTUz","YTU0","YTU1","YTU2","YTU3","YTU4","YTU5","YTY","YTYw","YTYx","YTYy","YTYz","YTY0","YTY1","YTY2","YTY3","YTY4","YTY5","YTc","YTcw","YTcx","YTcy","YTcz","YTc0","YTc1","YTc2","YTc3","YTc4","YTc5","YTg","YTgw","YTgx","YTgy","YTgz","YTg0","YTg1","YTg2","YTg3","YTg4","YTg5","YTk","YTkw","YTkx","YTky","YTkz","YTk0","YTk1","YTk2","YTk3","YTk4","YTk5"]},"3":{"lst":["i32",100,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100]}} +[REGEX].*no global dict.* -- !result select dict_merge(get_json_string(c1, 'f3'), 255) from js2 [_META_]; -- result: 
-{"2":{"lst":["str",200,"YTA","YTE","YTEw","YTEwMA","YTEwMQ","YTEwMg","YTEwMw","YTEwNA","YTEwNQ","YTEwNg","YTEwNw","YTEwOA","YTEwOQ","YTEx","YTExMA","YTExMQ","YTExMg","YTExMw","YTExNA","YTExNQ","YTExNg","YTExNw","YTExOA","YTExOQ","YTEy","YTEyMA","YTEyMQ","YTEyMg","YTEyMw","YTEyNA","YTEyNQ","YTEyNg","YTEyNw","YTEyOA","YTEyOQ","YTEz","YTEzMA","YTEzMQ","YTEzMg","YTEzMw","YTEzNA","YTEzNQ","YTEzNg","YTEzNw","YTEzOA","YTEzOQ","YTE0","YTE0MA","YTE0MQ","YTE0Mg","YTE0Mw","YTE0NA","YTE0NQ","YTE0Ng","YTE0Nw","YTE0OA","YTE0OQ","YTE1","YTE1MA","YTE1MQ","YTE1Mg","YTE1Mw","YTE1NA","YTE1NQ","YTE1Ng","YTE1Nw","YTE1OA","YTE1OQ","YTE2","YTE2MA","YTE2MQ","YTE2Mg","YTE2Mw","YTE2NA","YTE2NQ","YTE2Ng","YTE2Nw","YTE2OA","YTE2OQ","YTE3","YTE3MA","YTE3MQ","YTE3Mg","YTE3Mw","YTE3NA","YTE3NQ","YTE3Ng","YTE3Nw","YTE3OA","YTE3OQ","YTE4","YTE4MA","YTE4MQ","YTE4Mg","YTE4Mw","YTE4NA","YTE4NQ","YTE4Ng","YTE4Nw","YTE4OA","YTE4OQ","YTE5","YTE5MA","YTE5MQ","YTE5Mg","YTE5Mw","YTE5NA","YTE5NQ","YTE5Ng","YTE5Nw","YTE5OA","YTE5OQ","YTI","YTIw","YTIx","YTIy","YTIz","YTI0","YTI1","YTI2","YTI3","YTI4","YTI5","YTM","YTMw","YTMx","YTMy","YTMz","YTM0","YTM1","YTM2","YTM3","YTM4","YTM5","YTQ","YTQw","YTQx","YTQy","YTQz","YTQ0","YTQ1","YTQ2","YTQ3","YTQ4","YTQ5","YTU","YTUw","YTUx","YTUy","YTUz","YTU0","YTU1","YTU2","YTU3","YTU4","YTU5","YTY","YTYw","YTYx","YTYy","YTYz","YTY0","YTY1","YTY2","YTY3","YTY4","YTY5","YTc","YTcw","YTcx","YTcy","YTcz","YTc0","YTc1","YTc2","YTc3","YTc4","YTc5","YTg","YTgw","YTgx","YTgy","YTgz","YTg0","YTg1","YTg2","YTg3","YTg4","YTg5","YTk","YTkw","YTkx","YTky","YTkz","YTk0","YTk1","YTk2","YTk3","YTk4","YTk5"]},"3":{"lst":["i32",200,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,11
8,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200]}} +[REGEX].*no global dict.* -- !result -select dict_merge(get_json_string(c1, 'f4'), 255) from js2 [_META_]; +[REGEX].*no global dict.* -- result: -[REGEX].*global dict size:500 greater than low_cardinality_threshold:255: BE:.* +[REGEX].*no global dict.* -- !result select dict_merge(get_json_string(c1, 'f5'), 255) from js2 [_META_]; -- result: -None +[REGEX].*no global dict.* -- !result truncate table js2; -- result: @@ -264,7 +264,7 @@ from (table(generate_series(1, 1000))); -- !result select get_json_string(inspect_global_dict('js2', 'c1.f1'), 'dict'); -- result: -{"a0": 1, "a1": 2, "a2": 3, "a3": 4, "a4": 5, "a5": 6, "a6": 7, "a7": 8, "a8": 9, "a9": 10} +None -- !result select get_json_string(inspect_global_dict('js2', 'c1.f2'), 'dict'); -- result: @@ -287,7 +287,7 @@ truncate table js2; -- !result select get_json_string(inspect_global_dict('js2', 'c1.f1'), 'dict'); -- result: -{"a0": 1, "a1": 2, "a2": 3, "a3": 4, "a4": 5, "a5": 6, "a6": 7, "a7": 8, "a8": 9, "a9": 10} +None -- !result select get_json_string(inspect_global_dict('js2', 'c1.f2'), 'dict'); -- result: @@ -353,23 +353,23 @@ from (table(generate_series(1, 1000))); -- !result select get_json_string(inspect_global_dict('js2', 'c1.f1'), 'dict'); -- result: -{"a0": 1, "a1": 2, "a2": 3, "a3": 4, "a4": 5, "a5": 6, "a6": 7, "a7": 8, "a8": 9, "a9": 10} +None -- !result select get_json_string(inspect_global_dict('js2', 'c1.f2'), 'dict'); -- result: -{"a0": 1, "a1": 2, "a10": 3, "a11": 4, "a12": 5, "a13": 6, "a14": 7, "a15": 8, "a16": 9, "a17": 10, "a18": 11, "a19": 12, "a2": 13, "a3": 14, "a4": 15, "a5": 16, "a6": 17, "a7": 18, "a8": 19, "a9": 20} +None -- !result select 
get_json_string(inspect_global_dict('js2', 'c1.f3'), 'dict'); -- result: -{"a0": 1, "a1": 2, "a10": 3, "a11": 4, "a12": 5, "a13": 6, "a14": 7, "a15": 8, "a16": 9, "a17": 10, "a18": 11, "a19": 12, "a2": 13, "a20": 14, "a21": 15, "a22": 16, "a23": 17, "a24": 18, "a25": 19, "a26": 20, "a27": 21, "a28": 22, "a29": 23, "a3": 24, "a4": 25, "a5": 26, "a6": 27, "a7": 28, "a8": 29, "a9": 30} +None -- !result select get_json_string(inspect_global_dict('js2', 'c1.f4'), 'dict'); -- result: -{"a0": 1, "a1": 2, "a10": 3, "a11": 4, "a12": 5, "a13": 6, "a14": 7, "a15": 8, "a16": 9, "a17": 10, "a18": 11, "a19": 12, "a2": 13, "a20": 14, "a21": 15, "a22": 16, "a23": 17, "a24": 18, "a25": 19, "a26": 20, "a27": 21, "a28": 22, "a29": 23, "a3": 24, "a30": 25, "a31": 26, "a32": 27, "a33": 28, "a34": 29, "a35": 30, "a36": 31, "a37": 32, "a38": 33, "a39": 34, "a4": 35, "a5": 36, "a6": 37, "a7": 38, "a8": 39, "a9": 40} +None -- !result select get_json_string(inspect_global_dict('js2', 'c1.f5'), 'dict'); -- result: -{"a0": 1, "a1": 2, "a10": 3, "a11": 4, "a12": 5, "a13": 6, "a14": 7, "a15": 8, "a16": 9, "a17": 10, "a18": 11, "a19": 12, "a2": 13, "a20": 14, "a21": 15, "a22": 16, "a23": 17, "a24": 18, "a25": 19, "a26": 20, "a27": 21, "a28": 22, "a29": 23, "a3": 24, "a30": 25, "a31": 26, "a32": 27, "a33": 28, "a34": 29, "a35": 30, "a36": 31, "a37": 32, "a38": 33, "a39": 34, "a4": 35, "a40": 36, "a41": 37, "a42": 38, "a43": 39, "a44": 40, "a45": 41, "a46": 42, "a47": 43, "a48": 44, "a49": 45, "a5": 46, "a6": 47, "a7": 48, "a8": 49, "a9": 50} +None -- !result CREATE TABLE js3 ( v1 BIGINT NULL, diff --git a/test/sql/test_semi/T/test_flat_json_dict b/test/sql/test_semi/T/test_flat_json_dict index a14e48c1515947..5a51cdcc951e7f 100644 --- a/test/sql/test_semi/T/test_flat_json_dict +++ b/test/sql/test_semi/T/test_flat_json_dict @@ -1,4 +1,4 @@ --- name: test_normal_flat_json_dict @sequential +-- name: test_normal_flat_json_dict update information_schema.be_configs set value = 'true' where name = 
'enable_json_flat'; set enable_profile = true; From b2f5a1eb03a47b377acc9d8c03f1a2864185aa4e Mon Sep 17 00:00:00 2001 From: stdpain <34912776+stdpain@users.noreply.github.com> Date: Fri, 8 May 2026 11:52:15 +0800 Subject: [PATCH 2/6] Update test_flat_json_dict Signed-off-by: stdpain <34912776+stdpain@users.noreply.github.com> --- test/sql/test_semi/R/test_flat_json_dict | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/sql/test_semi/R/test_flat_json_dict b/test/sql/test_semi/R/test_flat_json_dict index 4a5b42424427c2..b94abbf12848bb 100644 --- a/test/sql/test_semi/R/test_flat_json_dict +++ b/test/sql/test_semi/R/test_flat_json_dict @@ -1,4 +1,4 @@ --- name: test_normal_flat_json_dict +-- name: test_normal_flat_json_dict @sequential update information_schema.be_configs set value = 'true' where name = 'enable_json_flat'; -- result: -- !result @@ -477,4 +477,4 @@ None select get_json_string(inspect_global_dict('js3', 'c1.deep_nested.level1.level2.leaf_num'), 'dict'); -- result: None --- !result \ No newline at end of file +-- !result From 6af1de5aa381db10676d7b74aac62dcbad2701da Mon Sep 17 00:00:00 2001 From: stdpain <34912776+stdpain@users.noreply.github.com> Date: Fri, 8 May 2026 11:52:33 +0800 Subject: [PATCH 3/6] Update test_flat_json_dict Signed-off-by: stdpain <34912776+stdpain@users.noreply.github.com> --- test/sql/test_semi/T/test_flat_json_dict | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/sql/test_semi/T/test_flat_json_dict b/test/sql/test_semi/T/test_flat_json_dict index 5a51cdcc951e7f..a14e48c1515947 100644 --- a/test/sql/test_semi/T/test_flat_json_dict +++ b/test/sql/test_semi/T/test_flat_json_dict @@ -1,4 +1,4 @@ --- name: test_normal_flat_json_dict +-- name: test_normal_flat_json_dict @sequential update information_schema.be_configs set value = 'true' where name = 'enable_json_flat'; set enable_profile = true; From bc7f38167b613ba6e6e628237e9a5a5d2a49adbc Mon Sep 17 00:00:00 2001 From: stdpain 
<34912776+stdpain@users.noreply.github.com> Date: Fri, 8 May 2026 16:13:57 +0800 Subject: [PATCH 4/6] Update SQL test for dict_merge with new field Signed-off-by: stdpain <34912776+stdpain@users.noreply.github.com> --- test/sql/test_semi/R/test_flat_json_dict | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/sql/test_semi/R/test_flat_json_dict b/test/sql/test_semi/R/test_flat_json_dict index b94abbf12848bb..614ebcadcf5cd3 100644 --- a/test/sql/test_semi/R/test_flat_json_dict +++ b/test/sql/test_semi/R/test_flat_json_dict @@ -87,7 +87,7 @@ select dict_merge(get_json_string(c1, 'f3'), 255) from js2 [_META_]; -- result: [REGEX].*no global dict.* -- !result -[REGEX].*no global dict.* +select dict_merge(get_json_string(c1, 'f4'), 255) from js2 [_META_]; -- result: [REGEX].*no global dict.* -- !result From 1ea8709aa3592d06a82ad85e7bd8849c504ef266 Mon Sep 17 00:00:00 2001 From: stdpain <34912776+stdpain@users.noreply.github.com> Date: Fri, 8 May 2026 17:26:03 +0800 Subject: [PATCH 5/6] Update test_flat_json_dict Signed-off-by: stdpain <34912776+stdpain@users.noreply.github.com> --- test/sql/test_semi/R/test_flat_json_dict | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/sql/test_semi/R/test_flat_json_dict b/test/sql/test_semi/R/test_flat_json_dict index 614ebcadcf5cd3..e83260074aa7e3 100644 --- a/test/sql/test_semi/R/test_flat_json_dict +++ b/test/sql/test_semi/R/test_flat_json_dict @@ -87,9 +87,8 @@ select dict_merge(get_json_string(c1, 'f3'), 255) from js2 [_META_]; -- result: [REGEX].*no global dict.* -- !result -select dict_merge(get_json_string(c1, 'f4'), 255) from js2 [_META_]; +[UC]select dict_merge(get_json_string(c1, 'f4'), 255) from js2 [_META_]; -- result: -[REGEX].*no global dict.* -- !result select dict_merge(get_json_string(c1, 'f5'), 255) from js2 [_META_]; -- result: From 6334633d2d3e8740164bfe5b8f5a27150b0e931e Mon Sep 17 00:00:00 2001 From: stdpain <34912776+stdpain@users.noreply.github.com> Date: Tue, 12 May 
2026 17:30:30 +0800 Subject: [PATCH 6/6] Update meta_reader.cpp Signed-off-by: stdpain <34912776+stdpain@users.noreply.github.com> --- be/src/storage/meta_reader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/be/src/storage/meta_reader.cpp b/be/src/storage/meta_reader.cpp index cb99553376db13..ab4e9e42687ee6 100644 --- a/be/src/storage/meta_reader.cpp +++ b/be/src/storage/meta_reader.cpp @@ -449,7 +449,7 @@ Status SegmentMetaCollecter::_collect_dict_for_column(ColumnIterator* column_ite ASSIGN_OR_RETURN(const TabletColumn* tablet_column, _get_tablet_column(cid)); // For JSON data, the schema may be heterogeneous, meaning that some segments might not contain the dictionary column, // but a global dictionary could still be present and usable. - if (!tablet_column.is_extended() || !column_iter->only_nulls()) { + if (!tablet_column->is_extended() || !column_iter->only_nulls()) { return Status::GlobalDictError("no global dict"); } else { return Status::OK();