diff --git a/src/mysql-util/src/decoding.rs b/src/mysql-util/src/decoding.rs index f0ff20f3d270b..e3da5c3542f14 100644 --- a/src/mysql-util/src/decoding.rs +++ b/src/mysql-util/src/decoding.rs @@ -382,19 +382,103 @@ fn pack_val_as_datum( } fn check_char_length( - length: Option, + max_char_len: Option, val: &str, col_desc: &MySqlColumnDesc, ) -> Result<(), anyhow::Error> { - if let Some(length) = length { - if let Some(_) = val.char_indices().nth(usize::cast_from(length)) { + if let Some(max_char_len) = max_char_len { + let char_len = val.chars().count(); + if char_len > usize::cast_from(max_char_len) { Err(anyhow::anyhow!( - "received string value of length {} for column {} which has a max length of {}", - val.len(), - col_desc.name, - length + "received string value of length {char_len} for column {col_name} which has a max length of {max_char_len}", + col_name = col_desc.name, ))? } } Ok(()) } + +#[cfg(test)] +mod tests { + use super::*; + + fn test_col_desc() -> MySqlColumnDesc { + MySqlColumnDesc { + name: "test_col".to_string(), + column_type: None, + meta: None, + } + } + + #[mz_ore::test] + fn check_char_length_no_limit() { + let col = test_col_desc(); + // With no length set, any string is accepted. + assert!(check_char_length(None, "", &col).is_ok()); + assert!(check_char_length(None, "abcdefghij", &col).is_ok()); + assert!(check_char_length(None, "πŸ¦€πŸ¦€πŸ¦€πŸ¦€πŸ¦€", &col).is_ok()); + } + + #[mz_ore::test] + fn check_char_length_within_limit() { + let col = test_col_desc(); + assert!(check_char_length(Some(5), "", &col).is_ok()); + assert!(check_char_length(Some(5), "abc", &col).is_ok()); + // Exactly at the limit is allowed. + assert!(check_char_length(Some(5), "abcde", &col).is_ok()); + } + + #[mz_ore::test] + fn check_char_length_exceeds_limit() { + let col = test_col_desc(); + // πŸ¦€ is a 4-byte UTF-8 codepoint but counts as one character. Four + // crabs = 4 chars / 16 bytes, which exceeds a VARCHAR(3). 
+ let err = check_char_length(Some(3), "πŸ¦€πŸ¦€πŸ¦€πŸ¦€", &col).unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("test_col"), + "error should name the column: {msg}" + ); + assert!( + msg.contains("max length of 3"), + "error should report the configured max: {msg}" + ); + assert!( + msg.contains("value of length 4"), + "error should report the character count: {msg}" + ); + } + + #[mz_ore::test] + fn check_char_length_counts_characters_not_bytes() { + let col = test_col_desc(); + // Four crabs = 4 chars / 16 bytes, which fits in a VARCHAR(4). + assert!(check_char_length(Some(4), "πŸ¦€πŸ¦€πŸ¦€πŸ¦€", &col).is_ok()); + // Five crabs (5 chars / 20 bytes) does not. + assert!(check_char_length(Some(4), "πŸ¦€πŸ¦€πŸ¦€πŸ¦€πŸ¦€", &col).is_err()); + + // Mix of 2-, 3-, and 4-byte codepoints: "Γ©δΈ­πŸ¦€" is 3 characters / 9 bytes. + assert!(check_char_length(Some(3), "Γ©δΈ­πŸ¦€", &col).is_ok()); + assert!(check_char_length(Some(2), "Γ©δΈ­πŸ¦€", &col).is_err()); + } + + #[mz_ore::test] + fn check_char_length_counts_codepoints_not_graphemes() { + let col = test_col_desc(); + // The US flag πŸ‡ΊπŸ‡Έ is a single grapheme but two codepoints (Regional + // Indicator Symbols U+1F1FA + U+1F1F8). The check counts codepoints, + // matching MySQL's VARCHAR(N) character semantics, so three flags use + // six characters. + assert!(check_char_length(Some(6), "πŸ‡ΊπŸ‡ΈπŸ‡ΊπŸ‡ΈπŸ‡ΊπŸ‡Έ", &col).is_ok()); + assert!(check_char_length(Some(5), "πŸ‡ΊπŸ‡ΈπŸ‡ΊπŸ‡ΈπŸ‡ΊπŸ‡Έ", &col).is_err()); + } + + #[mz_ore::test] + fn check_char_length_zero_limit() { + let col = test_col_desc(); + // A zero limit accepts only the empty string. 
+ assert!(check_char_length(Some(0), "", &col).is_ok()); + assert!(check_char_length(Some(0), "a", &col).is_err()); + assert!(check_char_length(Some(0), "πŸ¦€", &col).is_err()); + } +} diff --git a/test/mysql-cdc/varchar-utf8mb4-boundary.td b/test/mysql-cdc/varchar-utf8mb4-boundary.td new file mode 100644 index 0000000000000..b2fbdadb0d4c8 --- /dev/null +++ b/test/mysql-cdc/varchar-utf8mb4-boundary.td @@ -0,0 +1,85 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +$ set-sql-timeout duration=1s + +# +# Ingest strings at the upstream VARCHAR character-length limit using +# multi-codepoint graphemes β€” flag emoji, which a human sees as a single +# grapheme but MySQL counts as 2 characters (each Regional Indicator Symbol +# is its own code point). Then widen the column by one character and try to +# ingest a value at the new limit. Exercises the character-vs-grapheme +# distinction end-to-end and asserts that widening a tracked VARCHAR column +# surfaces as an incompatible schema change. +# + +> CREATE SECRET mysqlpass AS '${arg.mysql-root-password}' +> CREATE CONNECTION mysql_conn TO MYSQL ( + HOST mysql, + USER root, + PASSWORD SECRET mysqlpass + ) + +$ mysql-connect name=mysql url=mysql://root@mysql password=${arg.mysql-root-password} + +$ mysql-execute name=mysql +# Disable strict mode so that over-length VARCHAR inserts are silently truncated +# rather than rejected with an error. Scoped to this session. 
+SET SESSION sql_mode = ''; +DROP DATABASE IF EXISTS public; +CREATE DATABASE public; +USE public; + +# A utf8mb4 VARCHAR(40) holds up to 40 characters regardless of how many bytes +# each character occupies (1–4 bytes per character in utf8mb4). +CREATE TABLE t1 (f1 VARCHAR(40) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci DEFAULT NULL, f2 VARCHAR(5) COLLATE utf8mb3_unicode_ci NOT NULL DEFAULT 'en') DEFAULT CHARSET = utf8mb3 COLLATE = utf8mb3_unicode_ci; + +# 40-character value composed of 20 US-flag graphemes. Each πŸ‡ΊπŸ‡Έ is a single +# visual grapheme but MySQL counts it as 2 characters β€” the flag is encoded +# as two Regional Indicator Symbol code points (U+1F1FA + U+1F1F8), each 4 +# bytes in UTF-8. +# Total: 40 characters, 160 bytes. Pre-snapshot row. +INSERT INTO t1 (f1) VALUES (REPEAT('πŸ‡ΊπŸ‡Έ', 20)); + +> CREATE SOURCE mz_source FROM MYSQL CONNECTION mysql_conn; +> CREATE TABLE t1 FROM SOURCE mz_source (REFERENCE public.t1); + +> SELECT pg_typeof(f1) FROM t1 LIMIT 1; +"character varying" + +> SELECT char_length(f1), octet_length(f1) FROM t1; +40 160 + +> SELECT f1 FROM t1; +πŸ‡ΊπŸ‡ΈπŸ‡ΊπŸ‡ΈπŸ‡ΊπŸ‡ΈπŸ‡ΊπŸ‡ΈπŸ‡ΊπŸ‡ΈπŸ‡ΊπŸ‡ΈπŸ‡ΊπŸ‡ΈπŸ‡ΊπŸ‡ΈπŸ‡ΊπŸ‡ΈπŸ‡ΊπŸ‡ΈπŸ‡ΊπŸ‡ΈπŸ‡ΊπŸ‡ΈπŸ‡ΊπŸ‡ΈπŸ‡ΊπŸ‡ΈπŸ‡ΊπŸ‡ΈπŸ‡ΊπŸ‡ΈπŸ‡ΊπŸ‡ΈπŸ‡ΊπŸ‡ΈπŸ‡ΊπŸ‡ΈπŸ‡ΊπŸ‡Έ + +# Insert the same 40-character value post-snapshot to exercise the binlog path. +$ mysql-execute name=mysql +INSERT INTO t1 VALUES (REPEAT('πŸ‡ΊπŸ‡Έ', 20), 'bc'); +UPDATE t1 SET f1 = REPEAT('πŸ‡ΊπŸ‡Έ', 19) WHERE f2 = 'en'; + +> SELECT char_length(f1), octet_length(f1), count(*) FROM t1 GROUP BY 1, 2; +40 160 1 +38 152 1 + +# Widen the upstream column to VARCHAR(41) and insert a 41-character value +# (20 Γ— πŸ‡ΊπŸ‡Έ + 'a' = 40 + 1 = 41 chars, 161 bytes). An odd character count +# can't be made up of only 2-character graphemes, so one ASCII char pads +# the end. 
+$ mysql-execute name=mysql +ALTER TABLE t1 MODIFY f1 VARCHAR(41) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin; +INSERT INTO t1 VALUES (CONCAT(REPEAT('πŸ‡ΊπŸ‡Έ', 20), 'a'), 'cd'); + +# Widening the tracked VARCHAR's max_length is what makes the schemas +# incompatible β€” the source's cached descriptor pins max_length at 40, and +# schema verification rejects any value whose declared width differs. +! SELECT * FROM t1; +contains:incompatible schema change + +> DROP SOURCE mz_source CASCADE;