Skip to content

Commit 3683cb8

Browse files
committed
feat: optimize null-eq join fixed keys
1 parent 73ee822 commit 3683cb8

File tree

6 files changed

+148
-49
lines changed

6 files changed

+148
-49
lines changed

dbms/src/Interpreters/Join.cpp

Lines changed: 12 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -51,17 +51,19 @@ extern const int TYPE_MISMATCH;
5151

5252
namespace
5353
{
54-
ColumnRawPtrs getKeyColumns(const Names & key_names, const Block & block)
54+
ColumnRawPtrs getKeyColumns(const Names & key_names, const Block & block, const std::vector<UInt8> & is_null_eq = {})
5555
{
5656
size_t keys_size = key_names.size();
57+
RUNTIME_CHECK(is_null_eq.empty() || is_null_eq.size() == keys_size);
5758
ColumnRawPtrs key_columns(keys_size);
5859

5960
for (size_t i = 0; i < keys_size; ++i)
6061
{
6162
key_columns[i] = block.getByName(key_names[i]).column.get();
6263

63-
/// We will join only keys, where all components are not NULL.
64-
if (key_columns[i]->isColumnNullable())
64+
/// Ordinary '=' keys join only nested values where all components are not NULL.
65+
/// NullEQ keys must keep their nullable wrapper so nullness can participate in key comparison.
66+
if (key_columns[i]->isColumnNullable() && (is_null_eq.empty() || is_null_eq[i] == 0))
6567
key_columns[i] = &static_cast<const ColumnNullable &>(*key_columns[i]).getNestedColumn();
6668
}
6769

@@ -437,12 +439,15 @@ void Join::initBuild(const Block & sample_block, size_t build_concurrency_)
437439
if (unlikely(initialized))
438440
throw Exception("Logical error: Join has been initialized", ErrorCodes::LOGICAL_ERROR);
439441
initialized = true;
440-
join_map_method = chooseJoinMapMethod(getKeyColumns(key_names_right, sample_block), key_sizes, collators);
442+
join_map_method = chooseJoinMapMethod(
443+
getKeyColumns(key_names_right, sample_block, is_null_eq),
444+
key_sizes,
445+
collators,
446+
is_null_eq);
441447
if (hasNullableNullEqKey(key_names_right, sample_block, is_null_eq))
442448
{
443-
if (join_map_method != JoinMapMethod::serialized)
444-
LOG_DEBUG(log, "Force serialized join map method because a nullable NullEQ key is present");
445-
join_map_method = JoinMapMethod::serialized;
449+
if (join_map_method == JoinMapMethod::serialized)
450+
LOG_DEBUG(log, "Use serialized join map method because nullable NullEQ keys do not fit packed fixed keys");
446451
}
447452
build_sample_block = sample_block;
448453
setBuildConcurrencyAndInitJoinPartition(build_concurrency_);

dbms/src/Interpreters/JoinHashMap.cpp

Lines changed: 46 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -14,10 +14,14 @@
1414

1515
#include <Columns/ColumnConst.h>
1616
#include <Columns/ColumnFixedString.h>
17+
#include <Columns/ColumnNullable.h>
1718
#include <Columns/ColumnString.h>
1819
#include <Common/typeid_cast.h>
20+
#include <Interpreters/AggregationCommon.h>
1921
#include <Interpreters/JoinHashMap.h>
2022

23+
#include <limits>
24+
2125
namespace DB
2226
{
2327
namespace
@@ -33,31 +37,66 @@ bool canAsColumnString(const IColumn * column)
3337
JoinMapMethod chooseJoinMapMethod(
3438
const ColumnRawPtrs & key_columns,
3539
Sizes & key_sizes,
36-
const TiDB::TiDBCollators & collators)
40+
const TiDB::TiDBCollators & collators,
41+
const std::vector<UInt8> & is_null_eq)
3742
{
3843
const size_t keys_size = key_columns.size();
44+
RUNTIME_CHECK(is_null_eq.empty() || is_null_eq.size() == keys_size);
3945

4046
if (keys_size == 0)
4147
return JoinMapMethod::CROSS;
4248

49+
ColumnRawPtrs nested_key_columns;
50+
nested_key_columns.reserve(keys_size);
51+
bool has_nullable_null_eq_key = false;
52+
for (size_t j = 0; j < keys_size; ++j)
53+
{
54+
const auto * key_column = key_columns[j];
55+
if (const auto * nullable_column = typeid_cast<const ColumnNullable *>(key_column))
56+
{
57+
nested_key_columns.push_back(&nullable_column->getNestedColumn());
58+
has_nullable_null_eq_key = has_nullable_null_eq_key || (!is_null_eq.empty() && is_null_eq[j] != 0);
59+
}
60+
else
61+
{
62+
nested_key_columns.push_back(key_column);
63+
}
64+
}
65+
4366
bool all_fixed = true;
4467
size_t keys_bytes = 0;
4568
key_sizes.resize(keys_size);
4669
for (size_t j = 0; j < keys_size; ++j)
4770
{
48-
if (!key_columns[j]->isFixedAndContiguous())
71+
if (!nested_key_columns[j]->isFixedAndContiguous())
4972
{
5073
all_fixed = false;
5174
break;
5275
}
53-
key_sizes[j] = key_columns[j]->sizeOfValueIfFixed();
76+
key_sizes[j] = nested_key_columns[j]->sizeOfValueIfFixed();
5477
keys_bytes += key_sizes[j];
5578
}
5679

80+
if (has_nullable_null_eq_key)
81+
{
82+
if (all_fixed)
83+
{
84+
if (keys_bytes > (std::numeric_limits<size_t>::max() - std::tuple_size<KeysNullMap<UInt128>>::value))
85+
throw Exception("Join: keys sizes overflow", ErrorCodes::LOGICAL_ERROR);
86+
87+
if (std::tuple_size<KeysNullMap<UInt128>>::value + keys_bytes <= sizeof(UInt128))
88+
return JoinMapMethod::nullable_keys128;
89+
if (std::tuple_size<KeysNullMap<UInt256>>::value + keys_bytes <= sizeof(UInt256))
90+
return JoinMapMethod::nullable_keys256;
91+
}
92+
93+
return JoinMapMethod::serialized;
94+
}
95+
5796
/// If there is one numeric key that fits in 64 bits
58-
if (keys_size == 1 && key_columns[0]->isNumeric())
97+
if (keys_size == 1 && nested_key_columns[0]->isNumeric())
5998
{
60-
size_t size_of_field = key_columns[0]->sizeOfValueIfFixed();
99+
size_t size_of_field = nested_key_columns[0]->sizeOfValueIfFixed();
61100
if (size_of_field == 1)
62101
return JoinMapMethod::key8;
63102
if (size_of_field == 2)
@@ -80,7 +119,7 @@ JoinMapMethod chooseJoinMapMethod(
80119
return JoinMapMethod::keys256;
81120

82121
/// If there is a single string key, use a hash table of its values.
83-
if (keys_size == 1 && canAsColumnString(key_columns[0]))
122+
if (keys_size == 1 && canAsColumnString(nested_key_columns[0]))
84123
{
85124
if (collators.empty() || !collators[0])
86125
return JoinMapMethod::key_strbin;
@@ -108,7 +147,7 @@ JoinMapMethod chooseJoinMapMethod(
108147
}
109148
}
110149

111-
if (keys_size == 1 && typeid_cast<const ColumnFixedString *>(key_columns[0]))
150+
if (keys_size == 1 && typeid_cast<const ColumnFixedString *>(nested_key_columns[0]))
112151
return JoinMapMethod::key_fixed_string;
113152

114153
/// Otherwise, use serialized values as the key.

dbms/src/Interpreters/JoinHashMap.h

Lines changed: 16 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -143,6 +143,8 @@ struct WithUsedFlag<false, Base> : Base
143143
M(key_fixed_string) \
144144
M(keys128) \
145145
M(keys256) \
146+
M(nullable_keys128) \
147+
M(nullable_keys256) \
146148
M(serialized)
147149

148150
enum class JoinMapMethod
@@ -171,6 +173,8 @@ struct ConcurrentMapsTemplate
171173
using key_fixed_stringType = ConcurrentHashMapWithSavedHash<StringRef, Mapped>;
172174
using keys128Type = ConcurrentHashMap<UInt128, Mapped, HashCRC32<UInt128>>;
173175
using keys256Type = ConcurrentHashMap<UInt256, Mapped, HashCRC32<UInt256>>;
176+
using nullable_keys128Type = ConcurrentHashMap<UInt128, Mapped, HashCRC32<UInt128>>;
177+
using nullable_keys256Type = ConcurrentHashMap<UInt256, Mapped, HashCRC32<UInt256>>;
174178
using serializedType = ConcurrentHashMapWithSavedHash<StringRef, Mapped>;
175179

176180
std::unique_ptr<key8Type> key8;
@@ -183,6 +187,8 @@ struct ConcurrentMapsTemplate
183187
std::unique_ptr<key_fixed_stringType> key_fixed_string;
184188
std::unique_ptr<keys128Type> keys128;
185189
std::unique_ptr<keys256Type> keys256;
190+
std::unique_ptr<nullable_keys128Type> nullable_keys128;
191+
std::unique_ptr<nullable_keys256Type> nullable_keys256;
186192
std::unique_ptr<serializedType> serialized;
187193
// TODO: add more cases like Aggregator
188194
};
@@ -201,6 +207,8 @@ struct MapsTemplate
201207
using key_fixed_stringType = HashMapWithSavedHash<StringRef, Mapped>;
202208
using keys128Type = HashMap<UInt128, Mapped, HashCRC32<UInt128>>;
203209
using keys256Type = HashMap<UInt256, Mapped, HashCRC32<UInt256>>;
210+
using nullable_keys128Type = HashMap<UInt128, Mapped, HashCRC32<UInt128>>;
211+
using nullable_keys256Type = HashMap<UInt256, Mapped, HashCRC32<UInt256>>;
204212
using serializedType = HashMapWithSavedHash<StringRef, Mapped>;
205213

206214
std::unique_ptr<key8Type> key8;
@@ -213,6 +221,8 @@ struct MapsTemplate
213221
std::unique_ptr<key_fixed_stringType> key_fixed_string;
214222
std::unique_ptr<keys128Type> keys128;
215223
std::unique_ptr<keys256Type> keys256;
224+
std::unique_ptr<nullable_keys128Type> nullable_keys128;
225+
std::unique_ptr<nullable_keys256Type> nullable_keys256;
216226
std::unique_ptr<serializedType> serialized;
217227
// TODO: add more cases like Aggregator
218228
};
@@ -230,6 +240,8 @@ struct MapsAny
230240
using key_fixed_stringType = HashSetWithSavedHash<StringRef>;
231241
using keys128Type = HashSet<UInt128, HashCRC32<UInt128>>;
232242
using keys256Type = HashSet<UInt256, HashCRC32<UInt256>>;
243+
using nullable_keys128Type = HashSet<UInt128, HashCRC32<UInt128>>;
244+
using nullable_keys256Type = HashSet<UInt256, HashCRC32<UInt256>>;
233245
using serializedType = HashSetWithSavedHash<StringRef>;
234246

235247
std::unique_ptr<key8Type> key8;
@@ -242,6 +254,8 @@ struct MapsAny
242254
std::unique_ptr<key_fixed_stringType> key_fixed_string;
243255
std::unique_ptr<keys128Type> keys128;
244256
std::unique_ptr<keys256Type> keys256;
257+
std::unique_ptr<nullable_keys128Type> nullable_keys128;
258+
std::unique_ptr<nullable_keys256Type> nullable_keys256;
245259
std::unique_ptr<serializedType> serialized;
246260
// TODO: add more cases like Aggregator
247261
};
@@ -257,5 +271,6 @@ using MapsAllFullWithRowFlag = MapsTemplate<RowRefListWithUsedFlag>; // With fla
257271
JoinMapMethod chooseJoinMapMethod(
258272
const ColumnRawPtrs & key_columns,
259273
Sizes & key_sizes,
260-
const TiDB::TiDBCollators & collators);
274+
const TiDB::TiDBCollators & collators,
275+
const std::vector<UInt8> & is_null_eq = {});
261276
} // namespace DB

dbms/src/Interpreters/JoinPartition.cpp

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -440,6 +440,16 @@ struct KeyGetterForTypeImpl<JoinMapMethod::keys256, Value, Mapped>
440440
using Type = ColumnsHashing::HashMethodKeysFixed<Value, UInt256, Mapped, false, false>;
441441
};
442442
template <typename Value, typename Mapped>
443+
struct KeyGetterForTypeImpl<JoinMapMethod::nullable_keys128, Value, Mapped>
444+
{
445+
using Type = ColumnsHashing::HashMethodKeysFixed<Value, UInt128, Mapped, true, false>;
446+
};
447+
template <typename Value, typename Mapped>
448+
struct KeyGetterForTypeImpl<JoinMapMethod::nullable_keys256, Value, Mapped>
449+
{
450+
using Type = ColumnsHashing::HashMethodKeysFixed<Value, UInt256, Mapped, true, false>;
451+
};
452+
template <typename Value, typename Mapped>
443453
struct KeyGetterForTypeImpl<JoinMapMethod::serialized, Value, Mapped>
444454
{
445455
using Type = ColumnsHashing::HashMethodSerialized<Value, Mapped>;

dbms/src/Interpreters/tests/gtest_join_null_eq.cpp

Lines changed: 37 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -199,9 +199,8 @@ JoinPtr makeSemiJoinTestJoin(ASTTableJoin::Kind kind)
199199
true);
200200
}
201201

202-
JoinPtr makeMixedKeyJoin(const std::vector<UInt8> & is_null_eq)
202+
JoinPtr makeMixedKeyJoin(const std::vector<UInt8> & is_null_eq, const DataTypePtr & key_type)
203203
{
204-
auto nullable_int_type = makeNullable(std::make_shared<DataTypeInt32>());
205204
auto int_type = std::make_shared<DataTypeInt32>();
206205
SpillConfig build_spill_config("/tmp", "join_null_eq_build", 0, 0, 0, nullptr);
207206
SpillConfig probe_spill_config("/tmp", "join_null_eq_probe", 0, 0, 0, nullptr);
@@ -217,11 +216,11 @@ JoinPtr makeMixedKeyJoin(const std::vector<UInt8> & is_null_eq)
217216
probe_spill_config,
218217
RestoreConfig{1, 0, 0},
219218
NamesAndTypes{
220-
{mixed_probe_key1_name, nullable_int_type},
221-
{mixed_probe_key2_name, nullable_int_type},
219+
{mixed_probe_key1_name, key_type},
220+
{mixed_probe_key2_name, key_type},
222221
{mixed_probe_value_name, int_type},
223-
{mixed_build_key1_name, nullable_int_type},
224-
{mixed_build_key2_name, nullable_int_type},
222+
{mixed_build_key1_name, key_type},
223+
{mixed_build_key2_name, key_type},
225224
{mixed_build_value_name, int_type},
226225
},
227226
RegisterOperatorSpillContext{},
@@ -236,6 +235,11 @@ JoinPtr makeMixedKeyJoin(const std::vector<UInt8> & is_null_eq)
236235
true);
237236
}
238237

238+
JoinPtr makeMixedKeyJoin(const std::vector<UInt8> & is_null_eq)
239+
{
240+
return makeMixedKeyJoin(is_null_eq, makeNullable(std::make_shared<DataTypeInt32>()));
241+
}
242+
239243
Block makeOuterProbeSampleBlock(bool include_filter = false)
240244
{
241245
auto nullable_int_type = makeNullable(std::make_shared<DataTypeInt32>());
@@ -268,28 +272,36 @@ Block makeOuterBuildSampleBlock(bool include_filter = false)
268272
return block;
269273
}
270274

271-
Block makeMixedProbeSampleBlock()
275+
Block makeMixedProbeSampleBlock(const DataTypePtr & key_type)
272276
{
273-
auto nullable_int_type = makeNullable(std::make_shared<DataTypeInt32>());
274277
auto int_type = std::make_shared<DataTypeInt32>();
275278
return Block{
276-
{nullable_int_type->createColumn(), nullable_int_type, mixed_probe_key1_name},
277-
{nullable_int_type->createColumn(), nullable_int_type, mixed_probe_key2_name},
279+
{key_type->createColumn(), key_type, mixed_probe_key1_name},
280+
{key_type->createColumn(), key_type, mixed_probe_key2_name},
278281
{int_type->createColumn(), int_type, mixed_probe_value_name},
279282
};
280283
}
281284

282-
Block makeMixedBuildSampleBlock()
285+
Block makeMixedProbeSampleBlock()
286+
{
287+
return makeMixedProbeSampleBlock(makeNullable(std::make_shared<DataTypeInt32>()));
288+
}
289+
290+
Block makeMixedBuildSampleBlock(const DataTypePtr & key_type)
283291
{
284-
auto nullable_int_type = makeNullable(std::make_shared<DataTypeInt32>());
285292
auto int_type = std::make_shared<DataTypeInt32>();
286293
return Block{
287-
{nullable_int_type->createColumn(), nullable_int_type, mixed_build_key1_name},
288-
{nullable_int_type->createColumn(), nullable_int_type, mixed_build_key2_name},
294+
{key_type->createColumn(), key_type, mixed_build_key1_name},
295+
{key_type->createColumn(), key_type, mixed_build_key2_name},
289296
{int_type->createColumn(), int_type, mixed_build_value_name},
290297
};
291298
}
292299

300+
Block makeMixedBuildSampleBlock()
301+
{
302+
return makeMixedBuildSampleBlock(makeNullable(std::make_shared<DataTypeInt32>()));
303+
}
304+
293305
ColumnPtr makeNullableInt32Column(std::initializer_list<std::optional<Int32>> values)
294306
{
295307
auto nested = ColumnInt32::create();
@@ -403,13 +415,22 @@ void prepareAndFinalizeMixedJoin(const JoinPtr & join)
403415
}
404416
} // namespace
405417

406-
TEST(JoinNullEqTest, NullableNullEqKeyForcesSerializedJoinMapMethod)
418+
TEST(JoinNullEqTest, NullableNullEqKeyUsesNullablePackedJoinMapMethod)
407419
{
408420
auto nullable_int_type = makeNullable(std::make_shared<DataTypeInt32>());
409421
auto join = makeTestJoin(nullable_int_type, {1});
410422
join->initBuild(makeSampleBlock(nullable_int_type), 1);
411423

412-
ASSERT_EQ(join->getJoinMapMethod(), JoinMapMethod::serialized);
424+
ASSERT_EQ(join->getJoinMapMethod(), JoinMapMethod::nullable_keys128);
425+
}
426+
427+
TEST(JoinNullEqTest, NullableMixedNullEqKeysCanUseNullableKeys256JoinMapMethod)
428+
{
429+
auto nullable_int64_type = makeNullable(std::make_shared<DataTypeInt64>());
430+
auto join = makeMixedKeyJoin({1, 1}, nullable_int64_type);
431+
join->initBuild(makeMixedBuildSampleBlock(nullable_int64_type), 1);
432+
433+
ASSERT_EQ(join->getJoinMapMethod(), JoinMapMethod::nullable_keys256);
413434
}
414435

415436
TEST(JoinNullEqTest, DefaultMethodSelectionRemainsForOtherCases)

0 commit comments

Comments
 (0)