Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dbms/src/Flash/tests/gtest_spill_sort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ try
TiDB::ITiDBCollator::UTF8MB4_BIN,
TiDB::ITiDBCollator::UTF8MB4_GENERAL_CI,
TiDB::ITiDBCollator::UTF8MB4_UNICODE_CI,
TiDB::ITiDBCollator::LATIN1_SWEDISH_CI,
TiDB::ITiDBCollator::UTF8MB4_0900_AI_CI,
TiDB::ITiDBCollator::UTF8MB4_0900_BIN};
for (const auto & collator_id : collators)
Expand Down
3 changes: 3 additions & 0 deletions dbms/src/Functions/FunctionsStringSearch.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ class IlikeLowerHelper
case TiDB::ITiDBCollator::CollatorType::UTF8MB4_0900_AI_CI:
collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8MB4_0900_BIN);
break;
case TiDB::ITiDBCollator::CollatorType::LATIN1_SWEDISH_CI:
collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::LATIN1_BIN);
break;
default:
break;
}
Expand Down
25 changes: 25 additions & 0 deletions dbms/src/Functions/tests/gtest_strings_search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -680,6 +680,31 @@ TEST_F(StringMatch, IlikeConstWithConst)
}
}

// Exercises LIKE / ILIKE under the latin1_swedish_ci collator.
// Expected results follow Latin1CI::swedish_ci_weight_lut: 'Ä' (0xC4), 'ä' (0xE4),
// 'Æ' (0xC6) and 'æ' (0xE6) all carry weight 0x5C, while 'Å' (0xC5) / 'å' (0xE5)
// carry weight 0x5B.
TEST_F(StringMatch, Latin1SwedishCI)
{
const auto * collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::LATIN1_SWEDISH_CI);

// 'Ä' and 'æ' share weight 0x5C, so the pattern's first character matches; "__" matches the rest.
ASSERT_COLUMN_EQ(
toConst(1),
executeFunction(func_like_name, {toConst("Äta"), toConst("æ__"), escape}, collator));

// Same equivalence with the haystack and pattern characters swapped.
ASSERT_COLUMN_EQ(
toConst(1),
executeFunction(func_like_name, {toConst("æta"), toConst("Ä__"), escape}, collator));

// 'å' (weight 0x5B) is not equal to 'Ä' (weight 0x5C), so there is no match.
ASSERT_COLUMN_EQ(
toConst(0),
executeFunction(func_like_name, {toConst("åka"), toConst("Ä__"), escape}, collator));

// 'Œ' and 'œ' encode to Windows-1252 bytes 0x8C and 0x9C, which the table leaves
// as distinct weights, so they are not folded together.
ASSERT_COLUMN_EQ(
toConst(0),
executeFunction(func_like_name, {toConst("Œ"), toConst("œ"), escape}, collator));

// NOTE(review): ILIKE presumably goes through IlikeLowerHelper, which replaces
// LATIN1_SWEDISH_CI with LATIN1_BIN, so the 'Ä'/'æ' folding above would no longer
// apply and the expected result is 0 — confirm.
ASSERT_COLUMN_EQ(
toConst(0),
executeFunction(func_ilike_name, {toConst("Äta"), toConst("æ__"), escape}, collator));
}

TEST_F(StringMatch, CheckEscape)
{
std::vector<TiDB::TiDBCollatorPtr> collators{
Expand Down
165 changes: 165 additions & 0 deletions dbms/src/TiDB/Collation/Collator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,26 @@ const std::array<char, 128> weight_ascii_ci
0x5F, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71,
0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F};

namespace Latin1CI
{
// Sort weights for latin1_swedish_ci, indexed by single-byte (Windows-1252) value.
// Laid out 16 entries per row so that the row comment plus the column position
// gives the byte being looked up.
//  - 'a'..'z' (0x61..0x7A) share the weights of 'A'..'Z' (0x41..0x5A).
//  - The accented letters in 0xC0..0xFF are folded onto base-letter weights, except
//    that Å/å -> 0x5B, Ä/ä/Æ/æ -> 0x5C and Ö/ö -> 0x5D sort after 'Z' (0x5A).
//  - Every other byte keeps its own value as its weight.
// NOTE(review): the values appear to mirror MySQL's latin1_swedish_ci sort order — confirm.
// clang-format off
const std::array<WeightType, 256> swedish_ci_weight_lut = {
    // 0x00 - 0x0F
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
    // 0x10 - 0x1F
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
    // 0x20 - 0x2F
    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
    // 0x30 - 0x3F
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
    // 0x40 - 0x4F
    0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
    // 0x50 - 0x5F
    0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
    // 0x60 - 0x6F (lower-case letters take the upper-case weights)
    0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
    // 0x70 - 0x7F
    0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
    // 0x80 - 0x8F
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
    // 0x90 - 0x9F
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
    // 0xA0 - 0xAF
    0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
    // 0xB0 - 0xBF
    0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
    // 0xC0 - 0xCF
    0x41, 0x41, 0x41, 0x41, 0x5C, 0x5B, 0x5C, 0x43, 0x45, 0x45, 0x45, 0x45, 0x49, 0x49, 0x49, 0x49,
    // 0xD0 - 0xDF
    0x44, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x5D, 0xD7, 0xD8, 0x55, 0x55, 0x55, 0x59, 0x59, 0xDE, 0xDF,
    // 0xE0 - 0xEF
    0x41, 0x41, 0x41, 0x41, 0x5C, 0x5B, 0x5C, 0x43, 0x45, 0x45, 0x45, 0x45, 0x49, 0x49, 0x49, 0x49,
    // 0xF0 - 0xFF
    0x44, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x5D, 0xF7, 0xD8, 0x55, 0x55, 0x55, 0x59, 0x59, 0xDE, 0xFF,
};
// clang-format on
} // namespace Latin1CI

TiDBCollators dummy_collators;
std::vector<std::string> dummy_sort_key_contaners;
std::string dummy_sort_key_contaner;
Expand All @@ -46,6 +66,74 @@ ALWAYS_INLINE std::string_view rtrim(const char * s, size_t length)
return DB::RightTrim(v);
}

// Weight returned by Latin1CICollator<T>::weight() for a code point that
// encodeWindows1252() cannot encode. 0x3F is also the table weight of byte 0x3F ('?'),
// so such code points compare equal to '?' and to each other.
constexpr Latin1CI::WeightType invalid_latin1_weight = 0x3F;

/// Translate a Unicode code point into its single-byte Windows-1252 value.
///
/// Code points up to 0x7F and in [0xA0, 0xFF] are returned unchanged. The code points
/// listed in the switch below map to a byte in the 0x80..0x9F block. Everything else
/// has no mapping here and yields -1.
ALWAYS_INLINE int encodeWindows1252(Rune r)
{
    const bool is_low_half = r <= 0x7F;
    const bool is_latin1_high = r >= 0xA0 && r <= 0xFF;
    if (is_low_half || is_latin1_high)
        return r;

    // Labels are spelled as numeric code points; the trailing comment shows the glyph.
    // clang-format off
    switch (r)
    {
    case 0x20AC: return 0x80; // € EURO SIGN
    case 0x201A: return 0x82; // ‚ SINGLE LOW-9 QUOTATION MARK
    case 0x0192: return 0x83; // ƒ LATIN SMALL LETTER F WITH HOOK
    case 0x201E: return 0x84; // „ DOUBLE LOW-9 QUOTATION MARK
    case 0x2026: return 0x85; // … HORIZONTAL ELLIPSIS
    case 0x2020: return 0x86; // † DAGGER
    case 0x2021: return 0x87; // ‡ DOUBLE DAGGER
    case 0x02C6: return 0x88; // ˆ MODIFIER LETTER CIRCUMFLEX ACCENT
    case 0x2030: return 0x89; // ‰ PER MILLE SIGN
    case 0x0160: return 0x8A; // Š LATIN CAPITAL LETTER S WITH CARON
    case 0x2039: return 0x8B; // ‹ SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    case 0x0152: return 0x8C; // Œ LATIN CAPITAL LIGATURE OE
    case 0x017D: return 0x8E; // Ž LATIN CAPITAL LETTER Z WITH CARON
    case 0x2018: return 0x91; // ‘ LEFT SINGLE QUOTATION MARK
    case 0x2019: return 0x92; // ’ RIGHT SINGLE QUOTATION MARK
    case 0x201C: return 0x93; // “ LEFT DOUBLE QUOTATION MARK
    case 0x201D: return 0x94; // ” RIGHT DOUBLE QUOTATION MARK
    case 0x2022: return 0x95; // • BULLET
    case 0x2013: return 0x96; // – EN DASH
    case 0x2014: return 0x97; // — EM DASH
    case 0x02DC: return 0x98; // ˜ SMALL TILDE
    case 0x2122: return 0x99; // ™ TRADE MARK SIGN
    case 0x0161: return 0x9A; // š LATIN SMALL LETTER S WITH CARON
    case 0x203A: return 0x9B; // › SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    case 0x0153: return 0x9C; // œ LATIN SMALL LIGATURE OE
    case 0x017E: return 0x9E; // ž LATIN SMALL LETTER Z WITH CARON
    case 0x0178: return 0x9F; // Ÿ LATIN CAPITAL LETTER Y WITH DIAERESIS
    default:
        return -1;
    }
    // clang-format on
}

using StringType = std::vector<Rune>;
constexpr uint8_t b2_mask = 0x1F;
constexpr uint8_t b3_mask = 0x0F;
Expand Down Expand Up @@ -459,6 +547,81 @@ inline GeneralCICollator::CharType GeneralCICollator::decodeChar(const char * s,
return decodeUtf8Char(s, offset);
}

// Binds the Latin1Swedish policy's weight table to the latin1_swedish_ci table
// defined in namespace Latin1CI; Latin1CICollator<T>::weight() reads it as T::weight_lut.
const std::array<Latin1CI::WeightType, 256> & Latin1Swedish::weight_lut = Latin1CI::swedish_ci_weight_lut;

/// Three-way comparison of two UTF-8 strings under this collation.
///
/// Both inputs are passed through rtrim() first. Characters are then decoded pairwise
/// and compared by weight; the sign of the first non-zero weight difference is returned.
/// If no difference is found before one side runs out, the side with remaining
/// characters orders after the other (result 1 or -1), and 0 is returned when both
/// are exhausted together.
template <typename T>
int Latin1CICollator<T>::compare(const char * s1, size_t length1, const char * s2, size_t length2) const
{
    const auto lhs = rtrim(s1, length1);
    const auto rhs = rtrim(s2, length2);

    size_t lhs_pos = 0;
    size_t rhs_pos = 0;
    for (;;)
    {
        const bool lhs_has_more = lhs_pos < lhs.length();
        const bool rhs_has_more = rhs_pos < rhs.length();
        if (!lhs_has_more || !rhs_has_more)
            return lhs_has_more - rhs_has_more; // bool - bool: -1, 0 or 1

        // The trimmed views are prefixes of the original buffers, so decoding from
        // s1 / s2 with offsets bounded by the trimmed lengths reads the same bytes.
        const auto lhs_char = decodeChar(s1, lhs_pos);
        const auto rhs_char = decodeChar(s2, rhs_pos);

        Int32 diff = static_cast<Int32>(weight(lhs_char)) - static_cast<Int32>(weight(rhs_char));
        if (diff != 0)
            return DB::signum(diff);
    }
}

/// Shared implementation behind convert(), sortKey() and sortKeyNoTrim().
///
/// Writes one weight byte per decoded UTF-8 character of `s` into `container` and
/// returns a StringRef over the written prefix.
///
/// @tparam need_len  when true, `lens` is reset and receives a `1` for every character written.
/// @tparam need_trim when true, the input is passed through rtrim() before conversion.
/// @param container  output buffer; grown (never shrunk) to at least
///                   `length * sortKeyReservedSpaceMultipler()` bytes.
/// @param lens       only dereferenced when need_len is true.
template <typename T>
template <bool need_len, bool need_trim>
StringRef Latin1CICollator<T>::convertImpl(
    const char * s,
    size_t length,
    std::string & container,
    std::vector<size_t> * lens) const
{
    std::string_view input(s, length);
    if constexpr (need_trim)
        input = rtrim(s, length);
    const size_t input_size = input.length();

    // Size the buffer from the untrimmed length.
    const size_t wanted_size = length * sortKeyReservedSpaceMultipler();
    if (container.size() < wanted_size)
        container.resize(wanted_size);

    if constexpr (need_len)
    {
        // reserve() is a no-op when capacity already suffices; clear() keeps capacity.
        lens->reserve(input_size);
        lens->clear();
    }

    size_t written = 0;
    for (size_t pos = 0; pos < input_size;)
    {
        // decodeChar advances `pos` past the character it consumed.
        const auto rune = decodeChar(s, pos);
        container[written] = static_cast<char>(weight(rune));
        ++written;
        if constexpr (need_len)
            lens->push_back(1);
    }

    return StringRef(container.data(), written);
}

/// Look up the collation weight of a decoded character.
///
/// The character is first mapped to a Windows-1252 byte; that byte indexes
/// `T::weight_lut`. Characters without such a mapping get `invalid_latin1_weight`.
template <typename T>
typename Latin1CICollator<T>::WeightType Latin1CICollator<T>::weight(CharType c)
{
    if (const auto byte_value = encodeWindows1252(c); byte_value >= 0)
        return T::weight_lut[byte_value];
    return invalid_latin1_weight;
}

/// Decode the next character of `s` starting at `offset` by delegating to
/// decodeUtf8Char, which receives `offset` by reference.
template <typename T>
typename Latin1CICollator<T>::CharType Latin1CICollator<T>::decodeChar(const char * s, size_t & offset)
{
    const CharType rune = decodeUtf8Char(s, offset);
    return rune;
}

namespace UnicodeCI
{
extern const std::array<uint64_t, 256 * 256 + 1> weight_lut_0400;
Expand Down Expand Up @@ -882,6 +1045,7 @@ struct TiDBCollatorTypeIDMap
{
TiDBCollatorTypeIDMap()
{
id_to_type[ITiDBCollator::LATIN1_SWEDISH_CI] = ITiDBCollator::CollatorType::LATIN1_SWEDISH_CI;
id_to_type[ITiDBCollator::UTF8_GENERAL_CI] = ITiDBCollator::CollatorType::UTF8_GENERAL_CI;
id_to_type[ITiDBCollator::UTF8MB4_GENERAL_CI] = ITiDBCollator::CollatorType::UTF8MB4_GENERAL_CI;
id_to_type[ITiDBCollator::UTF8_UNICODE_CI] = ITiDBCollator::CollatorType::UTF8_UNICODE_CI;
Expand Down Expand Up @@ -973,6 +1137,7 @@ bool ITiDBCollator::isCI() const
case CollatorType::UTF8_GENERAL_CI:
case CollatorType::UTF8MB4_UNICODE_CI:
case CollatorType::UTF8MB4_GENERAL_CI:
case CollatorType::LATIN1_SWEDISH_CI:
case CollatorType::UTF8MB4_0900_AI_CI:
return true;
default:
Expand Down
67 changes: 67 additions & 0 deletions dbms/src/TiDB/Collation/Collator.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ class ITiDBCollator
public:
enum
{
LATIN1_SWEDISH_CI = 8,
UTF8_GENERAL_CI = 33,
UTF8MB4_GENERAL_CI = 45,
UTF8_UNICODE_CI = 192,
Expand Down Expand Up @@ -58,6 +59,7 @@ class ITiDBCollator
UTF8MB4_GENERAL_CI,
UTF8_UNICODE_CI,
UTF8MB4_UNICODE_CI,
LATIN1_SWEDISH_CI,
UTF8MB4_0900_AI_CI,
UTF8MB4_0900_BIN,
// ----
Expand Down Expand Up @@ -357,6 +359,18 @@ using WeightType = uint16_t;
extern const std::array<WeightType, 256 * 256> weight_lut;
} // namespace GeneralCI

// Declarations shared by the single-byte latin1 case-insensitive collators.
namespace Latin1CI
{
// One weight per character; a single byte wide.
using WeightType = UInt8;
// Weight table for latin1_swedish_ci with one entry per single-byte value (256 entries);
// defined in Collator.cpp.
extern const std::array<WeightType, 256> swedish_ci_weight_lut;
} // namespace Latin1CI

// Policy type plugged into Latin1CICollator<T>: it only exposes the weight table
// the collator reads as T::weight_lut.
class Latin1Swedish
{
public:
// Reference to the 256-entry weight table used for this collation.
static const std::array<Latin1CI::WeightType, 256> & weight_lut;
};

class GeneralCICollator final : public ITiDBCollator
{
public:
Expand Down Expand Up @@ -412,10 +426,58 @@ class GeneralCICollator final : public ITiDBCollator
friend class Pattern<GeneralCICollator>;
};

// Collator for single-byte latin1 case-insensitive collations.
// T is a policy type that must provide a static `weight_lut` (see Latin1Swedish).
// Input is decoded character by character and each character is reduced to a
// single-byte weight; comparison and sort keys are built from those weights.
template <typename T>
class Latin1CICollator final : public ITiDBCollator
{
public:
// `id` is forwarded unchanged to the ITiDBCollator base.
explicit Latin1CICollator(int32_t id)
: ITiDBCollator(id)
{}

// Three-way comparison of two strings by per-character weight; defined in Collator.cpp.
int compare(const char * s1, size_t length1, const char * s2, size_t length2) const override;

// Converts without trimming and fills `lens` (need_len = true, need_trim = false).
StringRef convert(const char * s, size_t length, std::string & container, std::vector<size_t> * lens) const override
{
return convertImpl<true, false>(s, length, container, lens);
}

// Builds the sort key after trimming the input (need_len = false, need_trim = true).
StringRef sortKey(const char * s, size_t length, std::string & container) const override
{
return convertImpl<false, true>(s, length, container, nullptr);
}

// Builds the sort key from the input as given, with no trimming.
StringRef sortKeyNoTrim(const char * s, size_t length, std::string & container) const override
{
return convertImpl<false, false>(s, length, container, nullptr);
}

// Creates a Pattern specialised for this collator type.
std::unique_ptr<IPattern> pattern() const override { return std::make_unique<Pattern<Latin1CICollator>>(); }

// convertImpl sizes its output buffer as `length * this value`.
size_t sortKeyReservedSpaceMultipler() const override { return sizeof(WeightType); }

bool isTrivialCollator() const override { return false; }

private:
using WeightType = Latin1CI::WeightType;
using CharType = Rune;

// Shared body of convert / sortKey / sortKeyNoTrim.
// need_len: also record per-character lengths into `lens`; need_trim: trim the input first.
template <bool need_len, bool need_trim>
StringRef convertImpl(const char * s, size_t length, std::string & container, std::vector<size_t> * lens) const;

// Decodes the character at `offset`; `offset` is passed by reference to the decoder.
static CharType decodeChar(const char * s, size_t & offset);

// Collation weight of a decoded character.
static WeightType weight(CharType c);

// Two characters are equal for pattern matching when their weights are equal.
static bool regexEq(CharType a, CharType b) { return weight(a) == weight(b); }

// Pattern needs access to the private decodeChar / weight / regexEq helpers.
friend class Pattern<Latin1CICollator>;
};

using UTF8MB4_BIN_TYPE = BinCollator<Rune, true>;
using UTF8MB4_0900_BIN_TYPE = BinCollator<Rune, false>;
using UCACI_0400_PADDING = UCACICollator<Unicode0400, true>;
using UCACI_0900_NON_PADDING = UCACICollator<Unicode0900, false>;
using LATIN1_SWEDISH_CI_TYPE = Latin1CICollator<Latin1Swedish>;
using BIN_COLLATOR_PADDING = BinCollator<char, true>;
using BIN_COLLATOR_NON_PADDING = BinCollator<char, false>;
} // namespace TiDB
Expand All @@ -434,6 +496,11 @@ using BIN_COLLATOR_NON_PADDING = BinCollator<char, false>;
TiDB::UCACI_0900_NON_PADDING, \
TiDB::ITiDBCollator::UTF8MB4_0900_AI_CI, \
##__VA_ARGS__) \
M(VAR_PREFIX, \
latin1_swedish_ci, \
TiDB::LATIN1_SWEDISH_CI_TYPE, \
TiDB::ITiDBCollator::LATIN1_SWEDISH_CI, \
##__VA_ARGS__) \
M(VAR_PREFIX, utf8mb4_0900_bin, TiDB::UTF8MB4_0900_BIN_TYPE, TiDB::ITiDBCollator::UTF8MB4_0900_BIN, ##__VA_ARGS__) \
M(VAR_PREFIX, utf8mb4_bin, TiDB::UTF8MB4_BIN_TYPE, TiDB::ITiDBCollator::UTF8MB4_BIN, ##__VA_ARGS__) \
M(VAR_PREFIX, latin1_bin, TiDB::BIN_COLLATOR_PADDING, TiDB::ITiDBCollator::LATIN1_BIN, ##__VA_ARGS__) \
Expand Down
16 changes: 16 additions & 0 deletions dbms/src/TiDB/Schema/tests/gtest_table_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,22 @@ try
}
CATCH

// Parses a TableInfo whose single column is declared with charset "latin1" and
// collation "latin1_swedish_ci", then checks that both names are preserved on the
// column and that columnInfoToFieldType reports the LATIN1_SWEDISH_CI collation id.
TEST(TiDBTableInfoTest, Latin1SwedishCICollation)
try
{
TableInfo table_info(
R"json({"id":45,"name":{"O":"t","L":"t"},"charset":"latin1","collate":"latin1_swedish_ci","cols":[{"id":1,"name":{"O":"val","L":"val"},"offset":0,"origin_default":null,"origin_default_bit":null,"default":null,"default_bit":null,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":15,"Flag":0,"Flen":32,"Decimal":0,"Charset":"latin1","Collate":"latin1_swedish_ci","Elems":null},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2}],"index_info":null,"constraint_info":null,"fk_info":null,"state":5,"pk_is_handle":false,"is_common_handle":false,"comment":"","auto_inc_id":0,"auto_id_cache":0,"auto_rand_id":0,"max_col_id":1,"max_idx_id":0,"max_cst_id":0,"update_timestamp":418683341902184450,"ShardRowIDBits":0,"max_shard_row_id_bits":0,"auto_random_bits":0,"pre_split_regions":0,"partition":null,"compression":"","view":null,"sequence":null,"Lock":null,"version":3})json",
NullspaceID);

// The JSON above declares exactly one column.
ASSERT_EQ(table_info.columns.size(), 1U);
ASSERT_EQ(table_info.columns[0].charset.convert<String>(), "latin1");
ASSERT_EQ(table_info.columns[0].collate.convert<String>(), "latin1_swedish_ci");

// The collation name must resolve to the numeric collator id.
auto field_type = columnInfoToFieldType(table_info.columns[0]);
ASSERT_EQ(field_type.collate(), TiDB::ITiDBCollator::LATIN1_SWEDISH_CI);
}
CATCH

TEST(TiDBTableInfoTest, ParseVectorIndexJSON)
try
{
Expand Down
Loading