diff --git a/dbms/src/Flash/tests/gtest_spill_sort.cpp b/dbms/src/Flash/tests/gtest_spill_sort.cpp index d9a61043562..39586df8042 100644 --- a/dbms/src/Flash/tests/gtest_spill_sort.cpp +++ b/dbms/src/Flash/tests/gtest_spill_sort.cpp @@ -131,6 +131,7 @@ try TiDB::ITiDBCollator::UTF8MB4_BIN, TiDB::ITiDBCollator::UTF8MB4_GENERAL_CI, TiDB::ITiDBCollator::UTF8MB4_UNICODE_CI, + TiDB::ITiDBCollator::LATIN1_SWEDISH_CI, TiDB::ITiDBCollator::UTF8MB4_0900_AI_CI, TiDB::ITiDBCollator::UTF8MB4_0900_BIN}; for (const auto & collator_id : collators) diff --git a/dbms/src/Functions/FunctionsStringSearch.h b/dbms/src/Functions/FunctionsStringSearch.h index 3cb3a1da0b1..d73f5f1ca10 100644 --- a/dbms/src/Functions/FunctionsStringSearch.h +++ b/dbms/src/Functions/FunctionsStringSearch.h @@ -59,6 +59,9 @@ class IlikeLowerHelper case TiDB::ITiDBCollator::CollatorType::UTF8MB4_0900_AI_CI: collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8MB4_0900_BIN); break; + case TiDB::ITiDBCollator::CollatorType::LATIN1_SWEDISH_CI:/* latin1_swedish_ci is replaced by latin1_bin here, as the case above replaces UTF8MB4_0900_AI_CI with UTF8MB4_0900_BIN. */ + collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::LATIN1_BIN); + break; default: break; } diff --git a/dbms/src/Functions/tests/gtest_strings_search.cpp b/dbms/src/Functions/tests/gtest_strings_search.cpp index 58ebbbe63f1..154a66cc069 100644 --- a/dbms/src/Functions/tests/gtest_strings_search.cpp +++ b/dbms/src/Functions/tests/gtest_strings_search.cpp @@ -680,6 +680,31 @@ TEST_F(StringMatch, IlikeConstWithConst) } } +TEST_F(StringMatch, Latin1SwedishCI) +{/* Exercises LIKE and ILIKE with the latin1_swedish_ci collator. The first and last assertions use the same operands: LIKE is expected to match (1) while ILIKE is expected not to match (0). */ + const auto * collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::LATIN1_SWEDISH_CI); + + ASSERT_COLUMN_EQ( + toConst(1), + executeFunction(func_like_name, {toConst("Äta"), toConst("æ__"), escape}, collator)); + + ASSERT_COLUMN_EQ( + toConst(1), + executeFunction(func_like_name, {toConst("æta"), toConst("Ä__"), escape}, collator)); + + ASSERT_COLUMN_EQ( + toConst(0), + executeFunction(func_like_name, {toConst("åka"), toConst("Ä__"), escape}, collator)); + + 
ASSERT_COLUMN_EQ( + toConst(0), + executeFunction(func_like_name, {toConst("Œ"), toConst("œ"), escape}, collator)); + + ASSERT_COLUMN_EQ( + toConst(0), + executeFunction(func_ilike_name, {toConst("Äta"), toConst("æ__"), escape}, collator)); +} + TEST_F(StringMatch, CheckEscape) { std::vector collators{ diff --git a/dbms/src/TiDB/Collation/Collator.cpp b/dbms/src/TiDB/Collation/Collator.cpp index 93a123564a6..3c6f6f51b47 100644 --- a/dbms/src/TiDB/Collation/Collator.cpp +++ b/dbms/src/TiDB/Collation/Collator.cpp @@ -36,6 +36,26 @@ const std::array weight_ascii_ci 0x5F, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F}; +namespace Latin1CI +{/* Sort weights for latin1_swedish_ci. Latin1CICollator::weight() indexes this table with the Windows-1252 byte returned by encodeWindows1252(). Entries 0x61..0x7A hold 0x41..0x5A, so lowercase ASCII letters share the weight of their uppercase forms. */ +const std::array swedish_ci_weight_lut + = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, + 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23, + 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, + 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, + 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, + 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x7B, 0x7C, 0x7D, + 0x7E, 0x7F, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xA1, + 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3, + 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 
0xBE, 0xBF, 0x41, 0x41, 0x41, 0x41, 0x5C, 0x5B, + 0x5C, 0x43, 0x45, 0x45, 0x45, 0x45, 0x49, 0x49, 0x49, 0x49, 0x44, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x5D, 0xD7, + 0xD8, 0x55, 0x55, 0x55, 0x59, 0x59, 0xDE, 0xDF, 0x41, 0x41, 0x41, 0x41, 0x5C, 0x5B, 0x5C, 0x43, 0x45, 0x45, + 0x45, 0x45, 0x49, 0x49, 0x49, 0x49, 0x44, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x5D, 0xF7, 0xD8, 0x55, 0x55, 0x55, + 0x59, 0x59, 0xDE, 0xFF}; +} // namespace Latin1CI + TiDBCollators dummy_collators; std::vector dummy_sort_key_contaners; std::string dummy_sort_key_contaner; @@ -46,6 +66,74 @@ ALWAYS_INLINE std::string_view rtrim(const char * s, size_t length) return DB::RightTrim(v); } +constexpr Latin1CI::WeightType invalid_latin1_weight = 0x3F;/* Weight returned by Latin1CICollator::weight() for code points that encodeWindows1252() cannot map; 0x3F is ASCII '?'. */ +/* Converts a Unicode code point to its Windows-1252 byte value. Code points 0x00..0x7F and 0xA0..0xFF are returned unchanged, the code points listed in the switch map into 0x80..0x9F (0x81, 0x8D, 0x8F, 0x90 and 0x9D are never produced), and every other code point yields -1. */ +ALWAYS_INLINE int encodeWindows1252(Rune r) +{ + if (r <= 0x7F || (r >= 0xA0 && r <= 0xFF)) + return r; + + switch (r) + { + case U'€': + return 0x80; + case U'‚': + return 0x82; + case U'ƒ': + return 0x83; + case U'„': + return 0x84; + case U'…': + return 0x85; + case U'†': + return 0x86; + case U'‡': + return 0x87; + case U'ˆ': + return 0x88; + case U'‰': + return 0x89; + case U'Š': + return 0x8A; + case U'‹': + return 0x8B; + case U'Œ': + return 0x8C; + case U'Ž': + return 0x8E; + case U'‘': + return 0x91; + case U'’': + return 0x92; + case U'“': + return 0x93; + case U'”': + return 0x94; + case U'•': + return 0x95; + case U'–': + return 0x96; + case U'—': + return 0x97; + case U'˜': + return 0x98; + case U'™': + return 0x99; + case U'š': + return 0x9A; + case U'›': + return 0x9B; + case U'œ': + return 0x9C; + case U'ž': + return 0x9E; + case U'Ÿ': + return 0x9F; + default: + return -1; + } +} + using StringType = std::vector; constexpr uint8_t b2_mask = 0x1F; constexpr uint8_t b3_mask = 0x0F; @@ -459,6 +547,81 @@ inline GeneralCICollator::CharType GeneralCICollator::decodeChar(const char * s, return decodeUtf8Char(s, offset); } +const std::array & Latin1Swedish::weight_lut = Latin1CI::swedish_ci_weight_lut; +/* Right-trims both inputs with rtrim(), then walks them one decoded character at a time and returns the sign of the first weight difference. When no difference is found, the input that still has characters left compares greater; otherwise the result is 0. */ +template +int Latin1CICollator::compare(const 
char * s1, size_t length1, const char * s2, size_t length2) const +{ + auto v1 = rtrim(s1, length1); + auto v2 = rtrim(s2, length2); + + size_t offset1 = 0, offset2 = 0; + while (offset1 < v1.length() && offset2 < v2.length()) + { + auto c1 = decodeChar(s1, offset1); + auto c2 = decodeChar(s2, offset2); + auto cmp = static_cast(weight(c1)) - static_cast(weight(c2)); + if (cmp != 0) + return DB::signum(cmp); + } + + return (offset1 < v1.length()) - (offset2 < v2.length()); +} +/* Writes one weight byte per decoded character of s into container and returns a StringRef over those bytes. need_trim selects whether s is right-trimmed first. When need_len is set, lens is cleared and receives a 1 for every character written. container is only ever grown, to length * sortKeyReservedSpaceMultipler() bytes. */ +template +template +StringRef Latin1CICollator::convertImpl( + const char * s, + size_t length, + std::string & container, + std::vector * lens) const +{ + std::string_view v; + + if constexpr (need_trim) + v = rtrim(s, length); + else + v = std::string_view(s, length); + + const auto max_bytes_one_char = sortKeyReservedSpaceMultipler(); + if (length * max_bytes_one_char > container.size()) + container.resize(length * max_bytes_one_char); + size_t offset = 0; + size_t total_size = 0; + size_t v_length = v.length(); + + if constexpr (need_len) + { + if (lens->capacity() < v_length) + lens->reserve(v_length); + lens->resize(0); + } + + while (offset < v_length) + { + container[total_size++] = static_cast(weight(decodeChar(s, offset))); + if constexpr (need_len) + lens->push_back(1); + } + + return StringRef(container.data(), total_size); +} +/* Returns the weight of code point c: the T::weight_lut entry for its Windows-1252 byte, or invalid_latin1_weight when encodeWindows1252() reports no mapping. */ +template +typename Latin1CICollator::WeightType Latin1CICollator::weight(CharType c) +{ + auto cp1252 = encodeWindows1252(c); + if (cp1252 < 0) + return invalid_latin1_weight; + return T::weight_lut[cp1252]; +} + +template +typename Latin1CICollator::CharType Latin1CICollator::decodeChar(const char * s, size_t & offset) +{ + return decodeUtf8Char(s, offset); +} + namespace UnicodeCI { extern const std::array weight_lut_0400; @@ -882,6 +1045,7 @@ struct TiDBCollatorTypeIDMap { TiDBCollatorTypeIDMap() { + id_to_type[ITiDBCollator::LATIN1_SWEDISH_CI] = ITiDBCollator::CollatorType::LATIN1_SWEDISH_CI; id_to_type[ITiDBCollator::UTF8_GENERAL_CI] = 
ITiDBCollator::CollatorType::UTF8_GENERAL_CI; id_to_type[ITiDBCollator::UTF8MB4_GENERAL_CI] = ITiDBCollator::CollatorType::UTF8MB4_GENERAL_CI; id_to_type[ITiDBCollator::UTF8_UNICODE_CI] = ITiDBCollator::CollatorType::UTF8_UNICODE_CI; @@ -973,6 +1137,7 @@ bool ITiDBCollator::isCI() const case CollatorType::UTF8_GENERAL_CI: case CollatorType::UTF8MB4_UNICODE_CI: case CollatorType::UTF8MB4_GENERAL_CI: + case CollatorType::LATIN1_SWEDISH_CI: case CollatorType::UTF8MB4_0900_AI_CI: return true; default: diff --git a/dbms/src/TiDB/Collation/Collator.h b/dbms/src/TiDB/Collation/Collator.h index f183afbc597..3d8acfcbc94 100644 --- a/dbms/src/TiDB/Collation/Collator.h +++ b/dbms/src/TiDB/Collation/Collator.h @@ -30,6 +30,7 @@ class ITiDBCollator public: enum { + LATIN1_SWEDISH_CI = 8, UTF8_GENERAL_CI = 33, UTF8MB4_GENERAL_CI = 45, UTF8_UNICODE_CI = 192, @@ -58,6 +59,7 @@ class ITiDBCollator UTF8MB4_GENERAL_CI, UTF8_UNICODE_CI, UTF8MB4_UNICODE_CI, + LATIN1_SWEDISH_CI, UTF8MB4_0900_AI_CI, UTF8MB4_0900_BIN, // ---- @@ -357,6 +359,18 @@ using WeightType = uint16_t; extern const std::array weight_lut; } // namespace GeneralCI +namespace Latin1CI +{ +using WeightType = UInt8; +extern const std::array swedish_ci_weight_lut; +} // namespace Latin1CI + +class Latin1Swedish +{ +public: + static const std::array & weight_lut; +}; + class GeneralCICollator final : public ITiDBCollator { public: @@ -412,10 +426,58 @@ class GeneralCICollator final : public ITiDBCollator friend class Pattern; }; +template +class Latin1CICollator final : public ITiDBCollator +{/* Collator whose per-character weight is a single Latin1CI::WeightType (UInt8) value. The template parameter T is expected to expose a static weight_lut, as Latin1Swedish does. convert() and sortKey() delegate to convertImpl(), and regexEq() treats two characters as equal when their weights are equal. */ +public: + explicit Latin1CICollator(int32_t id) + : ITiDBCollator(id) + {} + + int compare(const char * s1, size_t length1, const char * s2, size_t length2) const override; + + StringRef convert(const char * s, size_t length, std::string & container, std::vector * lens) const override + { + return convertImpl(s, length, container, lens); + } + + StringRef sortKey(const char * s, size_t length, std::string & container) const override 
+ { + return convertImpl(s, length, container, nullptr); + } + + StringRef sortKeyNoTrim(const char * s, size_t length, std::string & container) const override + { + return convertImpl(s, length, container, nullptr); + } + + std::unique_ptr pattern() const override { return std::make_unique>(); } + + size_t sortKeyReservedSpaceMultipler() const override { return sizeof(WeightType); } + + bool isTrivialCollator() const override { return false; } + +private: + using WeightType = Latin1CI::WeightType; + using CharType = Rune; + + template + StringRef convertImpl(const char * s, size_t length, std::string & container, std::vector * lens) const; + + static CharType decodeChar(const char * s, size_t & offset); + + static WeightType weight(CharType c); + + static bool regexEq(CharType a, CharType b) { return weight(a) == weight(b); } + + friend class Pattern; +}; + using UTF8MB4_BIN_TYPE = BinCollator; using UTF8MB4_0900_BIN_TYPE = BinCollator; using UCACI_0400_PADDING = UCACICollator; using UCACI_0900_NON_PADDING = UCACICollator; +using LATIN1_SWEDISH_CI_TYPE = Latin1CICollator; using BIN_COLLATOR_PADDING = BinCollator; using BIN_COLLATOR_NON_PADDING = BinCollator; } // namespace TiDB @@ -434,6 +496,11 @@ using BIN_COLLATOR_NON_PADDING = BinCollator; TiDB::UCACI_0900_NON_PADDING, \ TiDB::ITiDBCollator::UTF8MB4_0900_AI_CI, \ ##__VA_ARGS__) \ + M(VAR_PREFIX, \ + latin1_swedish_ci, \ + TiDB::LATIN1_SWEDISH_CI_TYPE, \ + TiDB::ITiDBCollator::LATIN1_SWEDISH_CI, \ + ##__VA_ARGS__) \ M(VAR_PREFIX, utf8mb4_0900_bin, TiDB::UTF8MB4_0900_BIN_TYPE, TiDB::ITiDBCollator::UTF8MB4_0900_BIN, ##__VA_ARGS__) \ M(VAR_PREFIX, utf8mb4_bin, TiDB::UTF8MB4_BIN_TYPE, TiDB::ITiDBCollator::UTF8MB4_BIN, ##__VA_ARGS__) \ M(VAR_PREFIX, latin1_bin, TiDB::BIN_COLLATOR_PADDING, TiDB::ITiDBCollator::LATIN1_BIN, ##__VA_ARGS__) \ diff --git a/dbms/src/TiDB/Schema/tests/gtest_table_info.cpp b/dbms/src/TiDB/Schema/tests/gtest_table_info.cpp index f14c9c88cbc..e3728d46c04 100644 --- 
a/dbms/src/TiDB/Schema/tests/gtest_table_info.cpp +++ b/dbms/src/TiDB/Schema/tests/gtest_table_info.cpp @@ -181,6 +181,22 @@ try } CATCH +TEST(TiDBTableInfoTest, Latin1SwedishCICollation) +try +{/* Parses a table whose single column declares charset latin1 and collation latin1_swedish_ci, then checks that columnInfoToFieldType() reports the LATIN1_SWEDISH_CI collation id for that column. */ + TableInfo table_info( + R"json({"id":45,"name":{"O":"t","L":"t"},"charset":"latin1","collate":"latin1_swedish_ci","cols":[{"id":1,"name":{"O":"val","L":"val"},"offset":0,"origin_default":null,"origin_default_bit":null,"default":null,"default_bit":null,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":15,"Flag":0,"Flen":32,"Decimal":0,"Charset":"latin1","Collate":"latin1_swedish_ci","Elems":null},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2}],"index_info":null,"constraint_info":null,"fk_info":null,"state":5,"pk_is_handle":false,"is_common_handle":false,"comment":"","auto_inc_id":0,"auto_id_cache":0,"auto_rand_id":0,"max_col_id":1,"max_idx_id":0,"max_cst_id":0,"update_timestamp":418683341902184450,"ShardRowIDBits":0,"max_shard_row_id_bits":0,"auto_random_bits":0,"pre_split_regions":0,"partition":null,"compression":"","view":null,"sequence":null,"Lock":null,"version":3})json", + NullspaceID); + + ASSERT_EQ(table_info.columns.size(), 1U); + ASSERT_EQ(table_info.columns[0].charset.convert(), "latin1"); + ASSERT_EQ(table_info.columns[0].collate.convert(), "latin1_swedish_ci"); + + auto field_type = columnInfoToFieldType(table_info.columns[0]); + ASSERT_EQ(field_type.collate(), TiDB::ITiDBCollator::LATIN1_SWEDISH_CI); +} +CATCH + TEST(TiDBTableInfoTest, ParseVectorIndexJSON) try { diff --git a/dbms/src/TiDB/tests/gtest_tidb_collator.cpp b/dbms/src/TiDB/tests/gtest_tidb_collator.cpp index c1704f5c4c9..a8712d88420 100644 --- a/dbms/src/TiDB/tests/gtest_tidb_collator.cpp +++ b/dbms/src/TiDB/tests/gtest_tidb_collator.cpp @@ -18,6 +18,8 @@ #include #include +#include + namespace DB::tests { @@ -491,4 +493,41 @@ TEST(CollatorSuite, Utf8Mb40900BinCollator) testCollator(); } +TEST(CollatorSuite, 
Latin1SwedishCICollator) +{/* Covers latin1_swedish_ci end to end: lookup by id and by name return the same collator, isCI() is true, compare() results for selected letter pairs, sortKey() output including the '?' weight for characters encodeWindows1252() cannot map, and pattern matching. */ + const auto * collator_by_id = ITiDBCollator::getCollator(ITiDBCollator::LATIN1_SWEDISH_CI); + const auto * collator_by_name = ITiDBCollator::getCollator("latin1_swedish_ci"); + + ASSERT_NE(collator_by_id, nullptr); + ASSERT_EQ(collator_by_id, collator_by_name); + ASSERT_TRUE(collator_by_id->isCI()); + + ASSERT_EQ(collator_by_id->compare("a", 1, "A", 1), 0); + ASSERT_EQ(collator_by_id->compare("y", 1, "ü", strlen("ü")), 0); + ASSERT_EQ(collator_by_id->compare("ä", strlen("ä"), "æ", strlen("æ")), 0); + ASSERT_LT(collator_by_id->compare("z", 1, "å", strlen("å")), 0); + ASSERT_LT(collator_by_id->compare("å", strlen("å"), "ä", strlen("ä")), 0); + ASSERT_LT(collator_by_id->compare("æ", strlen("æ"), "ö", strlen("ö")), 0); + ASSERT_EQ(collator_by_id->compare("A ", 3, "a", 1), 0); + + std::string sort_key; + ASSERT_EQ(collator_by_id->sortKey("😀", strlen("😀"), sort_key).toString(), std::string("?")); + ASSERT_EQ(collator_by_id->sortKey("Ā", strlen("Ā"), sort_key).toString(), std::string("?")); + ASSERT_EQ( + collator_by_id->sortKey("€‚ƒ„…†‡ˆ‰Š‹ŒŽ", strlen("€‚ƒ„…†‡ˆ‰Š‹ŒŽ"), sort_key).toString(), + std::string("\x80\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8E", 13)); + ASSERT_EQ( + collator_by_id->sortKey("‘’“”•–—˜™š›œžŸ", strlen("‘’“”•–—˜™š›œžŸ"), sort_key).toString(), + std::string("\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9E\x9F", 14)); + ASSERT_EQ( + collator_by_id->sortKey("aüäæåöz", strlen("aüäæåöz"), sort_key).toString(), + std::string("\x41\x59\x5C\x5C\x5B\x5D\x5A", 7)); + + auto pattern = collator_by_id->pattern(); + pattern->compile("Ä__", ESCAPE); + ASSERT_TRUE(pattern->match("Äta", strlen("Äta"))); + ASSERT_TRUE(pattern->match("æta", strlen("æta"))); + ASSERT_FALSE(pattern->match("åka", strlen("åka"))); +} + } // namespace DB::tests