Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions dbms/src/Storages/KVStore/Decode/RegionBlockReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ bool RegionBlockReader::readImpl(Block & block, const ReadList & data_list, bool
{
VersionColResolver<ReadList> version_col_resolver;
version_col_resolver.check(block, schema_snapshot->column_defines->size());
// The column_ids to read according to schema_snapshot, each elem is (column_id, block_pos)
const auto & read_column_ids = schema_snapshot->getColId2BlockPosMap();
const auto & pk_column_ids = schema_snapshot->pk_column_ids;
const auto & pk_pos_map = schema_snapshot->pk_pos_map;
Expand Down Expand Up @@ -269,6 +270,8 @@ bool RegionBlockReader::readImpl(Block & block, const ReadList & data_list, bool
else
{
// Parse column value from encoded value
// Decode the column_ids from `column_ids_iter` to `read_column_ids.end()`
// and insert into `block` at position starting from `next_column_pos`
if (!appendRowToBlock(
*value_ptr,
column_ids_iter,
Expand Down
141 changes: 141 additions & 0 deletions dbms/src/Storages/KVStore/tests/gtest_region_block_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -692,5 +692,146 @@ try
}
CATCH

// Verifies how RegionBlockReader handles rows whose encoded value does not contain column "c1"
// under four variants of the table schema. The same three committed rows are decoded with each
// schema; the first two rows carry a value for "c1" and the last row does not. For each schema the
// test checks whether `read` succeeds with force_decode=false / force_decode=true and, when it
// succeeds, the type and content of the decoded "c1" column.
TEST_F(RegionBlockReaderTest, ReadFromRegionDefaultValue)
try
{
// With this table_info, column "c1" is "NOT NULL" and has no origin default
// (it only has "default", no "origin_default" field)
TableInfo table_info_c1_not_null_no_origin_default(
R"({"cols":[{"id":1,"name":{"L":"c0","O":"c0"},"offset":0,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":0,"Flen":4,"Tp":1}},{"id":2,"name":{"L":"handle","O":"handle"},"offset":1,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":515,"Flen":11,"Tp":3}},{"default":"-56083770","id":7,"name":{"L":"c1","O":"c1"},"offset":2,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":1,"Flen":20,"Tp":8}},{"id":4,"name":{"L":"c2","O":"c2"},"offset":3,"origin_default":"0.07954397","state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":-1,"Flag":4097,"Flen":12,"Tp":4}},{"id":5,"name":{"L":"c5","O":"c5"},"offset":4,"origin_default":"0","state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":0,"Flen":10,"Tp":246}},{"default":"247262911","id":6,"name":{"L":"c4","O":"c4"},"offset":5,"origin_default":"247262911","state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":0,"Flen":10,"Tp":246}}],"id":636,"index_info":[],"is_common_handle":false,"keyspace_id":4294967295,"name":{"L":"t0","O":"t0"},"pk_is_handle":true,"state":5,"tiflash_replica":{"Available":true,"Count":1},"update_timestamp":463845180343844895})",
NullspaceID);

// With this table_info, column "c1" is "NOT NULL" and has the "no default value" flag
// (type flag 4097 instead of 1), and still no "origin_default" field
TableInfo table_info_c1_not_null_no_default_value(
R"({"cols":[{"id":1,"name":{"L":"c0","O":"c0"},"offset":0,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":0,"Flen":4,"Tp":1}},{"id":2,"name":{"L":"handle","O":"handle"},"offset":1,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":515,"Flen":11,"Tp":3}},{"default":"-56083770","id":7,"name":{"L":"c1","O":"c1"},"offset":2,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":4097,"Flen":20,"Tp":8}},{"id":4,"name":{"L":"c2","O":"c2"},"offset":3,"origin_default":"0.07954397","state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":-1,"Flag":4097,"Flen":12,"Tp":4}},{"id":5,"name":{"L":"c5","O":"c5"},"offset":4,"origin_default":"0","state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":0,"Flen":10,"Tp":246}},{"default":"247262911","id":6,"name":{"L":"c4","O":"c4"},"offset":5,"origin_default":"247262911","state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":0,"Flen":10,"Tp":246}}],"id":636,"index_info":[],"is_common_handle":false,"keyspace_id":4294967295,"name":{"L":"t0","O":"t0"},"pk_is_handle":true,"state":5,"tiflash_replica":{"Available":true,"Count":1},"update_timestamp":463845180343844895})",
NullspaceID);

// With this table_info, column "c1" has the "NOT NULL" flag and has origin default "-56083770"
TableInfo table_info_c1_not_null_with_origin_default(
R"({"cols":[{"id":1,"name":{"L":"c0","O":"c0"},"offset":0,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":0,"Flen":4,"Tp":1}},{"id":2,"name":{"L":"handle","O":"handle"},"offset":1,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":515,"Flen":11,"Tp":3}},{"origin_default":"-56083770","id":7,"name":{"L":"c1","O":"c1"},"offset":2,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":1,"Flen":20,"Tp":8}},{"id":4,"name":{"L":"c2","O":"c2"},"offset":3,"origin_default":"0.07954397","state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":-1,"Flag":4097,"Flen":12,"Tp":4}},{"id":5,"name":{"L":"c5","O":"c5"},"offset":4,"origin_default":"0","state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":0,"Flen":10,"Tp":246}},{"default":"247262911","id":6,"name":{"L":"c4","O":"c4"},"offset":5,"origin_default":"247262911","state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":0,"Flen":10,"Tp":246}}],"id":636,"index_info":[],"is_common_handle":false,"keyspace_id":4294967295,"name":{"L":"t0","O":"t0"},"pk_is_handle":true,"state":5,"tiflash_replica":{"Available":true,"Count":1},"update_timestamp":463845180343844895})",
NullspaceID);

// With this table_info, column "c1" does not have the "NOT NULL" flag
TableInfo table_info_c1_nullable(
R"({"cols":[{"id":1,"name":{"L":"c0","O":"c0"},"offset":0,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":0,"Flen":4,"Tp":1}},{"id":2,"name":{"L":"handle","O":"handle"},"offset":1,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":515,"Flen":11,"Tp":3}},{"id":7,"name":{"L":"c1","O":"c1"},"offset":2,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":0,"Flen":20,"Tp":8}},{"id":4,"name":{"L":"c2","O":"c2"},"offset":3,"origin_default":"0.07954397","state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":-1,"Flag":4097,"Flen":12,"Tp":4}},{"id":5,"name":{"L":"c5","O":"c5"},"offset":4,"origin_default":"0","state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":0,"Flen":10,"Tp":246}},{"default":"247262911","id":6,"name":{"L":"c4","O":"c4"},"offset":5,"origin_default":"247262911","state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Flag":0,"Flen":10,"Tp":246}}],"id":636,"index_info":[],"is_common_handle":false,"keyspace_id":4294967295,"name":{"L":"t0","O":"t0"},"pk_is_handle":true,"state":5,"tiflash_replica":{"Available":true,"Count":1},"update_timestamp":463844343842340870})",
NullspaceID);

RegionID region_id = 4;
// the start_key and end_key for table_id = 636 (encoded as 0x27C in the keys below;
// this matches "id":636 in the table_info JSON above)
String region_start_key(bytesFromHexString("7480000000000002FF7C5F720000000000FA"));
String region_end_key(bytesFromHexString("7480000000000002FF7D00000000000000F8"));
auto region = RegionBench::makeRegionForRange(region_id, region_start_key, region_end_key);
// the hex kv dump from RaftLog, each elem is (key, value) of the "write" column family
// NOTE(review): these keys appear to encode table id 0x2A9, which differs from the table id
// in the region range above -- TODO confirm this is intentional.
std::vector<std::tuple<std::string_view, std::string_view>> kvs = {
{
"7480000000000002FFA95F728000000000FF0000010000000000FAF9901806DEF7FFDA",
"50A380A08892FFF9B706762C8000040000000405060708000F0016001A00BFDC4011A00000000A0080000000000A008000003CA339"
"1ABC85",
},
{
"7480000000000002FFA95F728000000000FF0000010000000000FAF9901806DEF7FFD8",
"50A680A08892FFF9B706762C8000040000000405060708000F0016001A00BFDC4011A00000000A008033E04D600A008000003CA339"
"1ABC85",
},
{
"7480000000000002FFA95F728000000000FF0000020000000000FAF9901806DE33FFE8",
"509680B08E92FFF9B706762580000300000004050608000F001600BF720CDD400000000A0080000000010A00800000393C",
},
};
for (const auto & [k, v] : kvs)
{
region->insertDebug("write", TiKVKey(bytesFromHexString(k)), TiKVValue(bytesFromHexString(v)));
}

// Read the committed rows once; the same data list is decoded with every schema below.
auto data_list_read = ReadRegionCommitCache(region, true);
ASSERT_TRUE(data_list_read.has_value());

// Test with `table_info_c1_not_null_no_origin_default`
auto decoding_schema = getDecodingStorageSchemaSnapshot(table_info_c1_not_null_no_origin_default);
{
// force_decode=false can not decode because there is a
// missing value for a column with the not null flag.
auto reader = RegionBlockReader(decoding_schema);
Block res_block = createBlockSortByColumnID(decoding_schema);
ASSERT_FALSE(reader.read(res_block, *data_list_read, false));
}
{
// force_decode=true can decode the block, and fills the default value for c1
auto reader = RegionBlockReader(decoding_schema);
Block res_block = createBlockSortByColumnID(decoding_schema);
ASSERT_TRUE(reader.read(res_block, *data_list_read, true));
LOG_INFO(
Logger::get(),
"Decoded block:\n{}",
DB::tests::getColumnsContent(res_block.getColumnsWithTypeAndName()));
ASSERT_EQ(res_block.getByName("c1").type->getName(), "Int64");
// verify the default value is filled correctly: the last row is expected to be 0
ASSERT_COLUMN_EQ( //
res_block.getByName("c1"),
createColumn<Int64>({-2051270087, -2051270087, 0}));
}

// Test with `table_info_c1_not_null_no_default_value`
decoding_schema = getDecodingStorageSchemaSnapshot(table_info_c1_not_null_no_default_value);
{
// force_decode=false can not decode because there is a
// missing value for a column with the not null flag.
auto reader = RegionBlockReader(decoding_schema);
Block res_block = createBlockSortByColumnID(decoding_schema);
ASSERT_FALSE(reader.read(res_block, *data_list_read, false));
}
{
// force_decode=true can decode the block, and fills the default value for c1
auto reader = RegionBlockReader(decoding_schema);
Block res_block = createBlockSortByColumnID(decoding_schema);
ASSERT_TRUE(reader.read(res_block, *data_list_read, true));
LOG_INFO(
Logger::get(),
"Decoded block:\n{}",
DB::tests::getColumnsContent(res_block.getColumnsWithTypeAndName()));
ASSERT_EQ(res_block.getByName("c1").type->getName(), "Int64");
// verify the default value is filled correctly: the last row is expected to be 0
ASSERT_COLUMN_EQ( //
res_block.getByName("c1"),
createColumn<Int64>({-2051270087, -2051270087, 0}));
}

// Test with `table_info_c1_not_null_with_origin_default`
decoding_schema = getDecodingStorageSchemaSnapshot(table_info_c1_not_null_with_origin_default);
{
// force_decode=false can decode because origin_default exists and NoDefaultValue flag is not set
// so RegionBlockReader can use origin_default to fill the missing value
auto reader = RegionBlockReader(decoding_schema);
Block res_block = createBlockSortByColumnID(decoding_schema);
ASSERT_TRUE(reader.read(res_block, *data_list_read, false));
LOG_INFO(
Logger::get(),
"Decoded block:\n{}",
DB::tests::getColumnsContent(res_block.getColumnsWithTypeAndName()));
ASSERT_EQ(res_block.getByName("c1").type->getName(), "Int64");
// verify the origin default value (-56083770) is filled at the last row
ASSERT_COLUMN_EQ( //
res_block.getByName("c1"),
createColumn<Int64>({-2051270087, -2051270087, -56083770}));
}

// Test with `table_info_c1_nullable`
decoding_schema = getDecodingStorageSchemaSnapshot(table_info_c1_nullable);
{
// force_decode=false should be able to decode because c1 is nullable
auto reader = RegionBlockReader(decoding_schema);
Block res_block = createBlockSortByColumnID(decoding_schema);
ASSERT_TRUE(reader.read(res_block, *data_list_read, false));
LOG_INFO(
Logger::get(),
"Decoded block:\n{}",
DB::tests::getColumnsContent(res_block.getColumnsWithTypeAndName()));
ASSERT_EQ(res_block.getByName("c1").type->getName(), "Nullable(Int64)");
// verify the default value is filled with NULL correctly at the last row
ASSERT_COLUMN_EQ( //
res_block.getByName("c1"),
createNullableColumn<Int64>({-2051270087, -2051270087, 0}, {0, 0, 1}));
}
}
CATCH

} // namespace DB::tests
64 changes: 46 additions & 18 deletions dbms/src/TiDB/Decode/RowCodec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -388,40 +388,55 @@ bool appendRowToBlock(
}
}

// When a column is missing in the encoded row, decide whether we can append a value
// (NULL or origin default) to `block` or we must trigger schema sync.
//
// Background
// - TiDB encode semantics, see [tables.CanSkip](https://github.com/pingcap/tidb/blob/v8.5.5/pkg/table/tables/tables.go#L1463-L1489)
// - PK handle columns may be omitted from the value part and decoded from the key.
// - A NULL column may be omitted only when BOTH DefaultValue and OriginDefaultValue are empty.
// - For NOT NULL columns, TiDB must encode the value in the row unless the column did not
// exist in the writer schema (old data); in that case OriginDefaultValue is expected.
//
// Policy here:
// - If the omission is clearly valid under the current schema, we fill a value and return true.
// - Otherwise return false (force_decode == false) to let the caller sync schema and retry.
// - If force_decode == true, we fall back to best-effort filling.
inline bool addDefaultValueToColumnIfPossible(
Copy link
Copy Markdown
Contributor Author

@JaySon-Huang JaySon-Huang Jan 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Summary of the logical changes, by aspect:

| Aspect | Before PR (old logic) | After PR (latest logic) |
| --- | --- | --- |
| PK handle column (pk_is_handle/common handle) | If ignore_pk_if_absent → return true; else force_decode=false → false, force_decode=true → fill | Same behavior |
| NOT NULL + NoDefaultValueFlag | force_decode=false → false; force_decode=true → fill | Same behavior |
| NOT NULL + no origin default (but has default) | Filled with defaultValueToField() even when force_decode=false (often zero) | force_decode=false → return false unless hasOriginDefaultValue(); force_decode=true still fills |
| NOT NULL + origin default present | Filled with defaultValueToField() | Still filled, and now explicitly allowed without schema sync |
| NOT NULL + non‑public state | No special handling (would try to fill) | force_decode=false → return false if state not public |
| Nullable column missing | Filled by defaultValueToField() (NULL if no origin default) | Same behavior |
| Schema sync triggered for missing NOT NULL column | Only when NoDefaultValueFlag | Also when no origin default or non‑public state |
| Safety vs NOT NULL → NULLABLE mismatch | Could silently insert zero | Forces schema sync to avoid wrong zero fill |

const ColumnInfo & column_info,
Block & block,
size_t block_column_pos,
bool ignore_pk_if_absent,
bool force_decode)
{
// We consider a missing column could be safely filled with NULL, unless it has not default value and is NOT NULL.
// This could saves lots of unnecessary schema syncs for old data with a newer schema that has newly added columns.

// 1) Primary-key columns can be decoded from the key for pk_is_handle / common-handle tables.
// Skip value-part filling here if allowed.
if (column_info.hasPriKeyFlag())
{
// For clustered index or pk_is_handle, if the pk column does not exists, it can still be decoded from the key
if (ignore_pk_if_absent)
return true;

assert(!ignore_pk_if_absent);
// For non-clustered tables, a missing PK column implies schema mismatch.
if (!force_decode)
return false;
// Else non-clustered index, and not pk_is_handle, it could be a row encoded by older schema,
// we need to fill the column which has primary key flag with default value.
// fallthrough to fill default value when force_decode
// fallthrough for best-effort fill when force_decode == true
}

if (column_info.hasNoDefaultValueFlag() && column_info.hasNotNullFlag())
// 2) NOT NULL columns:
// - If the column has NO DEFAULT, TiDB should always encode a value. Missing datum implies mismatch.
// - If the column has NO origin default, missing datum may come from a newer schema where the column
// became NULLABLE and was skipped; require schema sync.
// - If origin default exists, missing datum can be from older rows before the column was added; safe to fill.
if (!force_decode && column_info.hasNotNullFlag())
{
if (!force_decode)
if (column_info.hasNoDefaultValueFlag() || !column_info.hasOriginDefaultValue())
return false;
// Else the row does not contain this "not null" / "no default value" column,
// it could be a row encoded by older schema.
// fallthrough to fill default value when force_decode
}
// not null or has no default value, tidb will fill with specific value.
auto * raw_column = const_cast<IColumn *>((block.getByPosition(block_column_pos)).column.get());

// 3) Fill using origin default or NULL.
// Note: defaultValueToField() uses origin_default_value/origin_default_bit_value,
// and falls back to NULL (nullable) or GenDefaultField (NOT NULL) when they are empty.
auto * raw_column = const_cast<IColumn *>(block.getByPosition(block_column_pos).column.get());
raw_column->insert(column_info.defaultValueToField());
return true;
}
Expand Down Expand Up @@ -460,7 +475,9 @@ bool appendRowV2ToBlockImpl(
num_not_null_columns,
value_offsets);
size_t values_start_pos = cursor;
// how many not null columns have been processed
size_t idx_not_null = 0;
// how many null columns have been processed
size_t idx_null = 0;
// Merge ordered not null/null columns to keep order.
while (idx_not_null < not_null_column_ids.size() || idx_null < null_column_ids.size())
Expand All @@ -481,20 +498,21 @@ bool appendRowV2ToBlockImpl(
const auto next_column_id = column_ids_iter->first;
if (next_column_id > next_datum_column_id)
{
// The next column id to read is bigger than the column id of next datum in encoded row.
// The next_column_id to read is bigger than the next_datum_column_id in encoded row.
// It means this is the datum of extra column. May happen when reading after dropping
// a column.
// For `force_decode == false`, we should return false to let upper layer trigger schema sync.
if (!force_decode)
return false;
// Ignore the extra column and continue to parse other datum
// For `force_decode == true`, we just skip this extra column and continue to parse other datum.
if (is_null)
idx_null++;
else
idx_not_null++;
}
else if (next_column_id < next_datum_column_id)
{
// The next column id to read is less than the column id of next datum in encoded row.
// The next_column_id to read is less than the next_datum_column_id in encoded row.
// It means this is the datum of missing column. May happen when reading after adding
// a column.
// Fill with default value and continue to read data for next column id.
Expand All @@ -505,7 +523,10 @@ bool appendRowV2ToBlockImpl(
block_column_pos,
ignore_pk_if_absent,
force_decode))
{
// If failed to fill default value, return false to let upper layer trigger schema sync.
return false;
}
column_ids_iter++;
block_column_pos++;
}
Expand Down Expand Up @@ -570,8 +591,12 @@ bool appendRowV2ToBlockImpl(
block_column_pos++;
}
}

// There are more columns to read other than the datum encoded in the row.
while (column_ids_iter != column_ids_iter_end)
{
// Skip if the column_id is the same as `pk_handle_id`. The value of column
// `pk_handle_id` will be filled in upper layer but not in this function.
if (column_ids_iter->first != pk_handle_id)
{
const auto & column_info = column_infos[column_ids_iter->second];
Expand All @@ -581,7 +606,10 @@ bool appendRowV2ToBlockImpl(
block_column_pos,
ignore_pk_if_absent,
force_decode))
{
// If failed to fill default value, return false to let upper layer trigger schema sync.
return false;
}
}
column_ids_iter++;
block_column_pos++;
Expand Down
Loading