-
Notifications
You must be signed in to change notification settings - Fork 414
Expand file tree
/
Copy pathCollator.h
More file actions
511 lines (415 loc) · 17.8 KB
/
Collator.h
File metadata and controls
511 lines (415 loc) · 17.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
// Copyright 2023 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <Common/Exception.h>
#include <Common/UTF8Helpers.h>
#include <TiDB/Collation/CollatorCompare.h>
#include <TiDB/Schema/TiDB_fwd.h>
#include <common/StringRef.h>
#include <memory>
namespace TiDB
{
/// Abstract interface implemented by every collator declared in this header.
/// An instance stores the TiDB-compatible collation id together with an internal
/// `CollatorType`, which the inline fast paths below use to avoid a virtual call
/// for the padding binary collations.
class ITiDBCollator
{
public:
    /// Collation ids, numerically compatible with TiDB (see `collator_id`).
    enum
    {
        LATIN1_SWEDISH_CI = 8,
        UTF8_GENERAL_CI = 33,
        UTF8MB4_GENERAL_CI = 45,
        UTF8_UNICODE_CI = 192,
        UTF8MB4_UNICODE_CI = 224,
        UTF8MB4_0900_AI_CI = 255,
        UTF8MB4_0900_BIN = 309,
        UTF8MB4_BIN = 46,
        LATIN1_BIN = 47,
        BINARY = 63,
        ASCII_BIN = 65,
        UTF8_BIN = 83,
    };

    /// Internal, densely numbered collator kinds that are convenient for `switch`.
    /// The four padding "bin" kinds occupy the values 0..3 (cf. isPaddingBinary()).
    enum class CollatorType : uint32_t
    {
        // bin
        UTF8MB4_BIN = 0,
        UTF8_BIN,
        LATIN1_BIN,
        ASCII_BIN,
        // binary
        BINARY,
        // ----
        UTF8_GENERAL_CI,
        UTF8MB4_GENERAL_CI,
        UTF8_UNICODE_CI,
        UTF8MB4_UNICODE_CI,
        LATIN1_SWEDISH_CI,
        UTF8MB4_0900_AI_CI,
        UTF8MB4_0900_BIN,
        // ----
        MAX_,
    };

    /// Look a collator up by its internal collation ID. The ID comes directly from tipb and must
    /// already be de-rewritten - the "New CI Collation" flips the sign of the collation ID.
    static TiDBCollatorPtr getCollator(int32_t id);

    /// Look a collator up by its collation name.
    static TiDBCollatorPtr getCollator(const std::string & name);

    /// A pattern that is compiled once and then matched against many strings.
    /// Instances are obtained through ITiDBCollator::pattern().
    class IPattern
    {
    public:
        virtual ~IPattern() = default;

        virtual void compile(const std::string & pattern, char escape) = 0;
        virtual void tryCompileAsciiCi(const std::string & pattern, char escape) = 0;
        virtual bool match(const char * s, size_t length) const = 0;

    protected:
        IPattern() = default;
    };

    virtual ~ITiDBCollator() = default;

    /// Three-way comparison of two byte ranges under this collation.
    virtual int compare(const char * s1, size_t length1, const char * s2, size_t length2) const = 0;

    /// Same contract as compare(), but the padding binary collations are dispatched straight to
    /// DB::BinCollatorCompare<true> instead of going through the virtual call.
    ALWAYS_INLINE inline int compareFastPath(const char * s1, size_t length1, const char * s2, size_t length2) const
    {
        return likely(isPaddingBinary()) ? DB::BinCollatorCompare<true>(s1, length1, s2, length2)
                                         : compare(s1, length1, s2, length2);
    }

    /// Convert a raw string to its collated form and report the length of each character via `lens`.
    virtual StringRef convert(const char * s, size_t length, std::string & container, std::vector<size_t> * lens) const
        = 0;

    /// Sort-key variants. The implementations in this header build `sortKey` with trimming enabled
    /// and `sortKeyNoTrim` with trimming disabled.
    virtual StringRef sortKeyNoTrim(const char * s, size_t length, std::string & container) const = 0;
    virtual StringRef sortKey(const char * s, size_t length, std::string & container) const = 0;

    /// For a sort key, n * length bytes are reserved to decode with the collator.
    /// This method returns n.
    virtual size_t sortKeyReservedSpaceMultipler() const = 0;

    virtual bool isTrivialCollator() const = 0;

    /// Create a fresh, not yet compiled pattern object for this collation.
    virtual std::unique_ptr<IPattern> pattern() const = 0;

    int32_t getCollatorId() const { return collator_id; }
    CollatorType getCollatorType() const { return collator_type; }

    bool isBinary() const;
    bool isCI() const;

    /// True exactly for UTF8MB4_BIN, UTF8_BIN, LATIN1_BIN and ASCII_BIN.
    ALWAYS_INLINE static inline bool isPaddingBinary(CollatorType collator_type)
    {
        switch (collator_type)
        {
        case CollatorType::UTF8MB4_BIN:
        case CollatorType::UTF8_BIN:
        case CollatorType::LATIN1_BIN:
        case CollatorType::ASCII_BIN:
            // i.e. collator_type < 4
            return true;
        default:
            return false;
        }
    }

    ALWAYS_INLINE inline bool isPaddingBinary() const { return isPaddingBinary(getCollatorType()); }

    /// Same contract as sortKey(), but the padding binary collations are dispatched straight to
    /// DB::BinCollatorSortKey<true>; `container` is not touched on that path.
    ALWAYS_INLINE inline StringRef sortKeyFastPath(const char * s, size_t length, std::string & container) const
    {
        return likely(isPaddingBinary()) ? DB::BinCollatorSortKey<true>(s, length) : sortKey(s, length, container);
    }

protected:
    explicit ITiDBCollator(int32_t collator_id_);

    // Collator id, kept compatible with TiDB.
    int32_t collator_id;
    // Collator type for internal usage; stays MAX_ unless assigned elsewhere
    // (the constructor is only declared in this header).
    CollatorType collator_type{CollatorType::MAX_};
};
/// These dummy_xxx objects are used as default values so that the legacy ClickHouse code
/// does not need many otherwise meaningless modifications.
/// They are only declared here; the definitions live outside this header.
extern TiDBCollators dummy_collators;
extern std::vector<std::string> dummy_sort_key_contaners;
extern std::string dummy_sort_key_contaner;
/// Returns the internal collator type for a collator passed as an untyped pointer.
/// NOTE(review): presumably `collator` points to an ITiDBCollator; the definition is not
/// visible in this header - confirm before relying on it.
ITiDBCollator::CollatorType GetTiDBCollatorType(const void * collator);
inline void fillLensForBinCollator(const char * start, const char * end, std::vector<size_t> * lens)
{
lens->resize(0);
const char * it = start;
while (it != end)
{
UInt8 len = DB::UTF8::seqLength(static_cast<UInt8>(*it));
lens->push_back(len);
if likely (end - it >= len)
it += len;
else
throw DB::Exception("Encounter invalid character");
}
}
/// Shared `convert` implementation of the binary collators.
/// The returned value is always DB::BinCollatorSortKey<false>(start, len). When `need_len` is
/// true, `*lens` is additionally filled with the per-character byte lengths; otherwise `lens`
/// is not accessed and may be nullptr.
template <bool need_len>
StringRef convertForBinCollator(const char * start, size_t len, std::vector<size_t> * lens)
{
    if constexpr (need_len)
    {
        const char * const finish = start + len;
        TiDB::fillLensForBinCollator(start, finish, lens);
    }
    return DB::BinCollatorSortKey<false>(start, len);
}
/// IPattern implementation parameterised by a collator type.
/// `Collator::CharType` is the element type of the compiled pattern weights. All member
/// functions are only declared here; their definitions are not part of this header.
template <typename Collator>
class Pattern : public ITiDBCollator::IPattern
{
public:
// Overrides of the ITiDBCollator::IPattern interface.
void compile(const std::string & pattern, char escape) override;
void tryCompileAsciiCi(const std::string & pattern, char escape) override;
bool match(const char * s, size_t length) const override;
private:
// NOTE(review): the meaning of the int result is defined by the out-of-line definition - confirm there.
int tryMatchAsciiCi(const char * s, size_t length) const;
// One entry per compiled pattern element, in the collator's character type.
std::vector<typename Collator::CharType> pattern_weights;
// State for the ASCII case-insensitive path; false until set elsewhere.
bool is_ascii_ci_pattern = false;
std::vector<char> ascii_ci_pattern;
// Kind of a compiled pattern element.
// NOTE(review): presumably Match = literal character, One = single-character wildcard,
// Any = multi-character wildcard - confirm against compile().
enum MatchType
{
Match,
One,
Any,
};
std::vector<MatchType> match_types;
};
/// Collator that works directly on the bytes of a string.
/// `padding` is forwarded as the template argument of DB::BinCollatorCompare and
/// DB::BinCollatorSortKey in compare() and sortKey(); `T` is the character/weight type that is
/// exposed to Pattern<BinCollator>.
template <typename T, bool padding = false>
class BinCollator final : public ITiDBCollator
{
public:
    explicit BinCollator(int32_t id)
        : ITiDBCollator(id)
    {}

    int compare(const char * s1, size_t length1, const char * s2, size_t length2) const override
    {
        return DB::BinCollatorCompare<padding>(s1, length1, s2, length2);
    }

    /// The container argument is unused: the result refers to the input bytes.
    StringRef sortKey(const char * s, size_t length, std::string &) const override
    {
        return DB::BinCollatorSortKey<padding>(s, length);
    }

    /// Always uses the non-padding sort key, regardless of `padding`.
    StringRef sortKeyNoTrim(const char * s, size_t length, std::string &) const override
    {
        return DB::BinCollatorSortKey<false>(s, length);
    }

    /// Non-padding sort key plus the byte length of every character in `*lens`.
    StringRef convert(const char * s, size_t length, std::string &, std::vector<size_t> * lens) const override
    {
        constexpr bool with_lens = true;
        return convertForBinCollator<with_lens>(s, length, lens);
    }

    std::unique_ptr<IPattern> pattern() const override;

    size_t sortKeyReservedSpaceMultipler() const override
    {
        // A BinCollator at most trims trailing spaces, so decoding never needs
        // more room than the input itself: the multiplier is 1.
        return 1;
    }

    /// Only the non-padding instantiation is trivial.
    bool isTrivialCollator() const override { return !padding; }

private:
    const std::string name = padding ? "BinaryPadding" : "Binary";

    using WeightType = T;
    using CharType = T;

    static inline CharType decodeChar(const char * s, size_t & offset);

    /// The weight of a character is the character itself.
    static inline WeightType weight(CharType c) { return c; }

    /// Two characters are regex-equal when their weights are equal.
    static inline bool regexEq(CharType a, CharType b)
    {
        const WeightType lhs = weight(a);
        const WeightType rhs = weight(b);
        return lhs == rhs;
    }

    friend class Pattern<BinCollator>;
};
/// Character value type used as `CharType` by the non-binary collators in this header.
using Rune = int32_t;

namespace UnicodeCI
{
/// A weight that is carried in two 64-bit words.
struct long_weight
{
    uint64_t first;
    uint64_t second;
};
} // namespace UnicodeCI
/// Policy class plugged into UCACICollator as its `T` parameter (see the UCACI_0400_PADDING alias).
/// All members are only declared here; the definitions are not part of this header.
class Unicode0400
{
public:
// Rune equality used by UCACICollator::regexEq.
static inline bool regexEq(Rune a, Rune b);
// NOTE(review): presumably stores the weight of `r` in `first`/`second`; the meaning of the
// bool result is not visible in this header - confirm against the definition.
static inline bool weight(uint64_t & first, uint64_t & second, Rune r);
private:
// Returns a two-word weight for `r`.
static inline const UnicodeCI::long_weight & weightLutLongMap(Rune r);
};
/// Policy class plugged into UCACICollator as its `T` parameter (see the UCACI_0900_NON_PADDING alias).
/// All members are only declared here; the definitions are not part of this header.
class Unicode0900
{
public:
// Rune equality used by UCACICollator::regexEq.
static inline bool regexEq(Rune a, Rune b);
// NOTE(review): presumably stores the weight of `r` in `first`/`second`; the meaning of the
// bool result is not visible in this header - confirm against the definition.
static inline bool weight(uint64_t & first, uint64_t & second, Rune r);
private:
// Returns a two-word weight for `r`.
static inline const UnicodeCI::long_weight & weightLutLongMap(Rune r);
};
/// "UnicodeCI" collator. Rune equality for patterns is delegated to the policy type `T`
/// (Unicode0400 or Unicode0900 in the aliases of this file).
/// NOTE(review): `padding` is not referenced by the code visible in this header; it is
/// presumably consumed by the out-of-line definitions - confirm.
template <typename T, bool padding>
class UCACICollator final : public ITiDBCollator
{
public:
    explicit UCACICollator(int32_t id)
        : ITiDBCollator(id)
    {}

    int compare(const char * s1, size_t length1, const char * s2, size_t length2) const override;

    /// Collated form with per-character lengths, without trimming.
    StringRef convert(const char * s, size_t length, std::string & container, std::vector<size_t> * lens) const override
    {
        constexpr bool with_lens = true;
        constexpr bool with_trim = false;
        return convertImpl<with_lens, with_trim>(s, length, container, lens);
    }

    /// Sort key with trimming, without per-character lengths.
    StringRef sortKey(const char * s, size_t length, std::string & container) const override
    {
        constexpr bool with_lens = false;
        constexpr bool with_trim = true;
        return convertImpl<with_lens, with_trim>(s, length, container, nullptr);
    }

    /// Sort key without trimming and without per-character lengths.
    StringRef sortKeyNoTrim(const char * s, size_t length, std::string & container) const override
    {
        constexpr bool with_lens = false;
        constexpr bool with_trim = false;
        return convertImpl<with_lens, with_trim>(s, length, container, nullptr);
    }

    std::unique_ptr<IPattern> pattern() const override { return std::make_unique<Pattern<UCACICollator>>(); }

    size_t sortKeyReservedSpaceMultipler() const override
    {
        // Every char expands to at most 8 uint16 units.
        constexpr size_t max_units_per_char = 8;
        return max_units_per_char * sizeof(uint16_t);
    }

    bool isTrivialCollator() const override { return false; }

private:
    const std::string name = "UnicodeCI";

    using CharType = Rune;

    template <bool need_len, bool need_trim>
    StringRef convertImpl(const char * s, size_t length, std::string & container, std::vector<size_t> * lens) const;

    static inline CharType decodeChar(const char * s, size_t & offset);

    /// Appends `w` to `container` at position `total_size` as a run of 16-bit units, lowest unit
    /// first, each unit written high byte then low byte, until no set bits remain.
    /// `w` is consumed (it is 0 afterwards) and `total_size` advances by two per unit.
    /// The container is indexed directly and never grown here, so it must already be large enough.
    static inline void writeResult(uint64_t & w, std::string & container, size_t & total_size)
    {
        for (; w != 0; w >>= 16)
        {
            container[total_size] = static_cast<char>(w >> 8);
            container[total_size + 1] = static_cast<char>(w);
            total_size += 2;
        }
    }

    static inline bool regexEq(CharType a, CharType b) { return T::regexEq(a, b); }

    static inline void weight(uint64_t & first, uint64_t & second, size_t & offset, size_t length, const char * s);

    static inline std::string_view preprocess(const char * s, size_t length);

    friend class Pattern<UCACICollator>;
};
// Weight lookup tables of the general_ci collations; only declared here.
namespace GeneralCI
{
// 16-bit weights.
using WeightType = uint16_t;
// 65536 entries; GeneralCICollator::weight indexes it with the low 16 bits of a rune.
extern const std::array<WeightType, 256 * 256> weight_lut;
} // namespace GeneralCI
// Weight lookup table of the latin1 CI collation; only declared here.
namespace Latin1CI
{
// 8-bit weights.
using WeightType = UInt8;
// 256 entries.
extern const std::array<WeightType, 256> swedish_ci_weight_lut;
} // namespace Latin1CI
/// Policy class used as the `T` parameter of Latin1CICollator (see the LATIN1_SWEDISH_CI_TYPE alias).
class Latin1Swedish
{
public:
// Reference to a 256-entry weight table; the binding is defined outside this header.
// NOTE(review): presumably bound to Latin1CI::swedish_ci_weight_lut - confirm against the definition.
static const std::array<Latin1CI::WeightType, 256> & weight_lut;
};
class GeneralCICollator final : public ITiDBCollator
{
public:
explicit GeneralCICollator(int32_t id)
: ITiDBCollator(id)
{}
int compare(const char * s1, size_t length1, const char * s2, size_t length2) const override;
StringRef convert(const char * s, size_t length, std::string & container, std::vector<size_t> * lens) const override
{
return convertImpl<true, false>(s, length, container, lens);
}
StringRef sortKey(const char * s, size_t length, std::string & container) const override
{
return convertImpl<false, true>(s, length, container, nullptr);
}
StringRef sortKeyNoTrim(const char * s, size_t length, std::string & container) const override
{
return convertImpl<false, false>(s, length, container, nullptr);
}
std::unique_ptr<IPattern> pattern() const override { return std::make_unique<Pattern<GeneralCICollator>>(); }
size_t sortKeyReservedSpaceMultipler() const override { return sizeof(WeightType); }
bool isTrivialCollator() const override { return false; }
private:
const std::string name = "GeneralCI";
private:
using WeightType = GeneralCI::WeightType;
using CharType = Rune;
template <bool need_len, bool need_trim>
StringRef convertImpl(const char * s, size_t length, std::string & container, std::vector<size_t> * lens) const;
static inline CharType decodeChar(const char * s, size_t & offset);
static inline WeightType weight(CharType c)
{
if (c > 0xFFFF)
return 0xFFFD;
return GeneralCI::weight_lut[c & 0xFFFF];
//return !!(c >> 16) * 0xFFFD + (1 - !!(c >> 16)) * GeneralCI::weight_lut_0400[c & 0xFFFF];
}
static inline bool regexEq(CharType a, CharType b) { return weight(a) == weight(b); }
friend class Pattern<GeneralCICollator>;
};
template <typename T>
class Latin1CICollator final : public ITiDBCollator
{
public:
explicit Latin1CICollator(int32_t id)
: ITiDBCollator(id)
{}
int compare(const char * s1, size_t length1, const char * s2, size_t length2) const override;
StringRef convert(const char * s, size_t length, std::string & container, std::vector<size_t> * lens) const override
{
return convertImpl<true, false>(s, length, container, lens);
}
StringRef sortKey(const char * s, size_t length, std::string & container) const override
{
return convertImpl<false, true>(s, length, container, nullptr);
}
StringRef sortKeyNoTrim(const char * s, size_t length, std::string & container) const override
{
return convertImpl<false, false>(s, length, container, nullptr);
}
std::unique_ptr<IPattern> pattern() const override { return std::make_unique<Pattern<Latin1CICollator>>(); }
size_t sortKeyReservedSpaceMultipler() const override { return sizeof(WeightType); }
bool isTrivialCollator() const override { return false; }
private:
using WeightType = Latin1CI::WeightType;
using CharType = Rune;
template <bool need_len, bool need_trim>
StringRef convertImpl(const char * s, size_t length, std::string & container, std::vector<size_t> * lens) const;
static CharType decodeChar(const char * s, size_t & offset);
static WeightType weight(CharType c);
static bool regexEq(CharType a, CharType b) { return weight(a) == weight(b); }
friend class Pattern<Latin1CICollator>;
};
// Concrete collator instantiations; these are the types named in APPLY_FOR_COLLATOR_TYPES_WITH_VARS.
// BinCollator over Rune characters, with padding (utf8mb4_bin / utf8_bin).
using UTF8MB4_BIN_TYPE = BinCollator<Rune, true>;
// BinCollator over Rune characters, without padding (utf8mb4_0900_bin).
using UTF8MB4_0900_BIN_TYPE = BinCollator<Rune, false>;
// UCACICollator with the Unicode0400 policy, padding = true (utf8_unicode_ci / utf8mb4_unicode_ci).
using UCACI_0400_PADDING = UCACICollator<Unicode0400, true>;
// UCACICollator with the Unicode0900 policy, padding = false (utf8mb4_0900_ai_ci).
using UCACI_0900_NON_PADDING = UCACICollator<Unicode0900, false>;
// Latin1CICollator with the Latin1Swedish policy (latin1_swedish_ci).
using LATIN1_SWEDISH_CI_TYPE = Latin1CICollator<Latin1Swedish>;
// BinCollator over char, with padding (latin1_bin / ascii_bin).
using BIN_COLLATOR_PADDING = BinCollator<char, true>;
// BinCollator over char, without padding (binary).
using BIN_COLLATOR_NON_PADDING = BinCollator<char, false>;
} // namespace TiDB
// X-macro over every collation handled by this header.
// For each collation, M is invoked as
//   M(VAR_PREFIX, <collation name token>, <collator C++ type>, <collation id enumerator>, <extra args>...)
// where the extra args are whatever the caller passes after M.
// `##__VA_ARGS__` is the GNU extension that drops the preceding comma when no extra args are given.
#define APPLY_FOR_COLLATOR_TYPES_WITH_VARS(VAR_PREFIX, M, ...) \
M(VAR_PREFIX, utf8_general_ci, TiDB::GeneralCICollator, TiDB::ITiDBCollator::UTF8_GENERAL_CI, ##__VA_ARGS__) \
M(VAR_PREFIX, utf8mb4_general_ci, TiDB::GeneralCICollator, TiDB::ITiDBCollator::UTF8MB4_GENERAL_CI, ##__VA_ARGS__) \
M(VAR_PREFIX, utf8_unicode_ci, TiDB::UCACI_0400_PADDING, TiDB::ITiDBCollator::UTF8_UNICODE_CI, ##__VA_ARGS__) \
M(VAR_PREFIX, \
utf8mb4_unicode_ci, \
TiDB::UCACI_0400_PADDING, \
TiDB::ITiDBCollator::UTF8MB4_UNICODE_CI, \
##__VA_ARGS__) \
M(VAR_PREFIX, \
utf8mb4_0900_ai_ci, \
TiDB::UCACI_0900_NON_PADDING, \
TiDB::ITiDBCollator::UTF8MB4_0900_AI_CI, \
##__VA_ARGS__) \
M(VAR_PREFIX, \
latin1_swedish_ci, \
TiDB::LATIN1_SWEDISH_CI_TYPE, \
TiDB::ITiDBCollator::LATIN1_SWEDISH_CI, \
##__VA_ARGS__) \
M(VAR_PREFIX, utf8mb4_0900_bin, TiDB::UTF8MB4_0900_BIN_TYPE, TiDB::ITiDBCollator::UTF8MB4_0900_BIN, ##__VA_ARGS__) \
M(VAR_PREFIX, utf8mb4_bin, TiDB::UTF8MB4_BIN_TYPE, TiDB::ITiDBCollator::UTF8MB4_BIN, ##__VA_ARGS__) \
M(VAR_PREFIX, latin1_bin, TiDB::BIN_COLLATOR_PADDING, TiDB::ITiDBCollator::LATIN1_BIN, ##__VA_ARGS__) \
M(VAR_PREFIX, binary, TiDB::BIN_COLLATOR_NON_PADDING, TiDB::ITiDBCollator::BINARY, ##__VA_ARGS__) \
M(VAR_PREFIX, ascii_bin, TiDB::BIN_COLLATOR_PADDING, TiDB::ITiDBCollator::ASCII_BIN, ##__VA_ARGS__) \
M(VAR_PREFIX, utf8_bin, TiDB::UTF8MB4_BIN_TYPE, TiDB::ITiDBCollator::UTF8_BIN, ##__VA_ARGS__)
// Convenience form of APPLY_FOR_COLLATOR_TYPES_WITH_VARS that fixes VAR_PREFIX to `tmp_`.
#define APPLY_FOR_COLLATOR_TYPES(M, ...) APPLY_FOR_COLLATOR_TYPES_WITH_VARS(tmp_, M, ##__VA_ARGS__)