From 337ef67ae05afe758bdd75668db57383b9449c1d Mon Sep 17 00:00:00 2001 From: 770120041 <770120041@qq.com> Date: Sun, 8 Mar 2026 17:58:07 -0400 Subject: [PATCH 1/2] [Coral-Hive] Fix backslash escaping in Hive string literals (#305) In Hive SQL, backslash is an escape character in string literals: '\\d' represents the string \d (single backslash + d). Previously, Coral only unescaped \' and \" but not \\, causing double backslashes to be preserved in the internal representation. When outputting to Trino SQL (where backslash has no special meaning), the extra backslash produced incorrect regex patterns in REGEXP_LIKE calls. Renamed removeBackslashBeforeQuotes to unescapeHiveStringLiteral and extended it to also handle \\ -> \ escape sequences. Co-Authored-By: Claude Opus 4.6 --- .../hive2rel/parsetree/ParseTreeBuilder.java | 19 ++++++++++--------- .../rel2trino/HiveToTrinoConverterTest.java | 15 +++++++++++++++ 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/parsetree/ParseTreeBuilder.java b/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/parsetree/ParseTreeBuilder.java index 99b33ecb2..3e4bac6b1 100644 --- a/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/parsetree/ParseTreeBuilder.java +++ b/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/parsetree/ParseTreeBuilder.java @@ -692,10 +692,10 @@ protected SqlNode visitIdentifier(ASTNode node, ParseContext ctx) { return new SqlIdentifier(node.getText(), ZERO); } - /** See {@link #removeBackslashBeforeQuotes} - * We use removeBackslashBeforeQuotes to remove the backslash before quotes, - * so that we maintain patterns like {@code I'm} or {@code abc"xyz} as is in the java object in memory, - * the escaped literal string representation will be generated when the SqlNode is written to string + /** See {@link #unescapeHiveStringLiteral} + * We use unescapeHiveStringLiteral to interpret Hive backslash escape sequences, + * so 
that patterns like {@code \\d} are correctly stored as {@code \d} in the java object in memory. + * The escaped literal string representation will be generated when the SqlNode is written to string * by the SqlWriter, which can be controlled by the SqlDialect to decide the choice of escaping mechanism. * */ @Override @@ -703,17 +703,18 @@ protected SqlNode visitStringLiteral(ASTNode node, ParseContext ctx) { // TODO: Add charset here. UTF-8 is not supported by calcite String text = node.getText(); checkState(text.length() >= 2); - return SqlLiteral.createCharString(removeBackslashBeforeQuotes(text.substring(1, text.length() - 1)), ZERO); + return SqlLiteral.createCharString(unescapeHiveStringLiteral(text.substring(1, text.length() - 1)), ZERO); } - private String removeBackslashBeforeQuotes(String input) { - // matches a \' or \" literal pattern - Pattern pattern = Pattern.compile("\\\\['\"]"); + private String unescapeHiveStringLiteral(String input) { + // Handle Hive backslash escape sequences: \\ -> \, \' -> ', \" -> " + Pattern pattern = Pattern.compile("\\\\[\\\\'\"]"); Matcher matcher = pattern.matcher(input); StringBuffer res = new StringBuffer(); while (matcher.find()) { - String replacement = matcher.group().substring(1); + String matched = matcher.group(); + String replacement = Matcher.quoteReplacement(matched.substring(1)); matcher.appendReplacement(res, replacement); } matcher.appendTail(res); diff --git a/coral-trino/src/test/java/com/linkedin/coral/trino/rel2trino/HiveToTrinoConverterTest.java b/coral-trino/src/test/java/com/linkedin/coral/trino/rel2trino/HiveToTrinoConverterTest.java index ceb1fb85e..8dfe01e6e 100644 --- a/coral-trino/src/test/java/com/linkedin/coral/trino/rel2trino/HiveToTrinoConverterTest.java +++ b/coral-trino/src/test/java/com/linkedin/coral/trino/rel2trino/HiveToTrinoConverterTest.java @@ -966,6 +966,21 @@ public void testRegexpTransformation() { assertEquals(expandedSql, targetSql); } + @Test + public void 
testRlikeBackslashEscaping() { + RelToTrinoConverter relToTrinoConverter = TestUtils.getRelToTrinoConverter(); + + // In Hive SQL, '\\d' means the string \d (backslash is escape char). + // In Trino SQL, '\d' means the string \d (no backslash escaping). + // So Hive's '\\d{4}' should become Trino's '\d{4}'. + RelNode relNode = + TestUtils.getHiveToRelConverter().convertSql("SELECT '2022-01-01' RLIKE '^\\\\d{4}-\\\\d{2}-\\\\d{2}$'"); + String targetSql = + "SELECT \"REGEXP_LIKE\"('2022-01-01', '^\\d{4}-\\d{2}-\\d{2}$')\n" + "FROM (VALUES (0)) AS \"t\" (\"ZERO\")"; + String expandedSql = relToTrinoConverter.convert(relNode); + assertEquals(expandedSql, targetSql); + } + @Test public void testSqlSelectAliasAppenderTransformer() { // test.tableA(a int, b struct From 6b058e2aad0ff00af05b2a3fd4a88c75f3e086ef Mon Sep 17 00:00:00 2001 From: 770120041 <770120041@qq.com> Date: Sun, 8 Mar 2026 18:58:06 -0400 Subject: [PATCH 2/2] Add more test cases for backslash escaping - testRlikeBackslashEscapingWithColumn: column reference with regex - testRegexpBackslashEscaping: REGEXP synonym with \w pattern - testStringLiteralWithEscapedBackslash: general string literal escaping --- .../rel2trino/HiveToTrinoConverterTest.java | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/coral-trino/src/test/java/com/linkedin/coral/trino/rel2trino/HiveToTrinoConverterTest.java b/coral-trino/src/test/java/com/linkedin/coral/trino/rel2trino/HiveToTrinoConverterTest.java index 8dfe01e6e..41668c133 100644 --- a/coral-trino/src/test/java/com/linkedin/coral/trino/rel2trino/HiveToTrinoConverterTest.java +++ b/coral-trino/src/test/java/com/linkedin/coral/trino/rel2trino/HiveToTrinoConverterTest.java @@ -27,6 +27,8 @@ import static com.linkedin.coral.trino.rel2trino.CoralTrinoConfigKeys.*; import static org.apache.calcite.sql.type.OperandTypes.*; import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; 
public class HiveToTrinoConverterTest { @@ -981,6 +983,40 @@ public void testRlikeBackslashEscaping() { assertEquals(expandedSql, targetSql); } + @Test + public void testRlikeBackslashEscapingWithColumn() { + RelToTrinoConverter relToTrinoConverter = TestUtils.getRelToTrinoConverter(); + + // Test backslash escaping with a column reference instead of literal + RelNode relNode = + TestUtils.getHiveToRelConverter().convertSql("SELECT * FROM test.tableA WHERE a RLIKE '^\\\\d+$'"); + String expandedSql = relToTrinoConverter.convert(relNode); + assertTrue(expandedSql.contains("\"REGEXP_LIKE\"")); + assertTrue(expandedSql.contains("'^\\d+$'")); + assertFalse(expandedSql.contains("'^\\\\d+$'")); + } + + @Test + public void testRegexpBackslashEscaping() { + RelToTrinoConverter relToTrinoConverter = TestUtils.getRelToTrinoConverter(); + + // Test that REGEXP (synonym for RLIKE) also handles backslash escaping + RelNode relNode = TestUtils.getHiveToRelConverter().convertSql("SELECT 'hello' REGEXP '^\\\\w+$'"); + String targetSql = "SELECT \"REGEXP_LIKE\"('hello', '^\\w+$')\n" + "FROM (VALUES (0)) AS \"t\" (\"ZERO\")"; + String expandedSql = relToTrinoConverter.convert(relNode); + assertEquals(expandedSql, targetSql); + } + + @Test + public void testStringLiteralWithEscapedBackslash() { + RelToTrinoConverter relToTrinoConverter = TestUtils.getRelToTrinoConverter(); + + // Test that an escaped backslash is emitted as a single backslash: \\\\ in this Java source is \\ in the Hive SQL, which is one backslash in the string value and in the Trino output + RelNode relNode = TestUtils.getHiveToRelConverter().convertSql("SELECT 'path\\\\to\\\\file'"); + String expandedSql = relToTrinoConverter.convert(relNode); + assertTrue(expandedSql.contains("'path\\to\\file'")); + } + @Test public void testSqlSelectAliasAppenderTransformer() { // test.tableA(a int, b struct