diff --git a/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/parsetree/ParseTreeBuilder.java b/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/parsetree/ParseTreeBuilder.java index 99b33ecb2..3e4bac6b1 100644 --- a/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/parsetree/ParseTreeBuilder.java +++ b/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/parsetree/ParseTreeBuilder.java @@ -692,10 +692,10 @@ protected SqlNode visitIdentifier(ASTNode node, ParseContext ctx) { return new SqlIdentifier(node.getText(), ZERO); } - /** See {@link #removeBackslashBeforeQuotes} - * We use removeBackslashBeforeQuotes to remove the backslash before quotes, - * so that we maintain patterns like {@code I'm} or {@code abc"xyz} as is in the java object in memory, - * the escaped literal string representation will be generated when the SqlNode is written to string + /** See {@link #unescapeHiveStringLiteral} + * We use unescapeHiveStringLiteral to interpret the Hive backslash escape sequences {@code \\}, {@code \'} and {@code \"}, + * so that patterns like {@code \\d} are correctly stored as {@code \d} in the java object in memory. + * The escaped literal string representation will be generated when the SqlNode is written to string * by the SqlWriter, which can be controlled by the SqlDialect to decide the choice of escaping mechanism. * */ @Override @@ -703,17 +703,18 @@ protected SqlNode visitStringLiteral(ASTNode node, ParseContext ctx) { // TODO: Add charset here. 
UTF-8 is not supported by calcite String text = node.getText(); checkState(text.length() >= 2); - return SqlLiteral.createCharString(removeBackslashBeforeQuotes(text.substring(1, text.length() - 1)), ZERO); + return SqlLiteral.createCharString(unescapeHiveStringLiteral(text.substring(1, text.length() - 1)), ZERO); } - private String removeBackslashBeforeQuotes(String input) { - // matches a \' or \" literal pattern - Pattern pattern = Pattern.compile("\\\\['\"]"); + private String unescapeHiveStringLiteral(String input) { + // Handle Hive backslash escape sequences: \\ -> \, \' -> ', \" -> "; any other backslash sequence is left unchanged + Pattern pattern = Pattern.compile("\\\\[\\\\'\"]"); Matcher matcher = pattern.matcher(input); StringBuffer res = new StringBuffer(); while (matcher.find()) { - String replacement = matcher.group().substring(1); + String matched = matcher.group(); + String replacement = Matcher.quoteReplacement(matched.substring(1)); matcher.appendReplacement(res, replacement); } matcher.appendTail(res); diff --git a/coral-trino/src/test/java/com/linkedin/coral/trino/rel2trino/HiveToTrinoConverterTest.java b/coral-trino/src/test/java/com/linkedin/coral/trino/rel2trino/HiveToTrinoConverterTest.java index ceb1fb85e..41668c133 100644 --- a/coral-trino/src/test/java/com/linkedin/coral/trino/rel2trino/HiveToTrinoConverterTest.java +++ b/coral-trino/src/test/java/com/linkedin/coral/trino/rel2trino/HiveToTrinoConverterTest.java @@ -27,6 +27,8 @@ import static com.linkedin.coral.trino.rel2trino.CoralTrinoConfigKeys.*; import static org.apache.calcite.sql.type.OperandTypes.*; import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; public class HiveToTrinoConverterTest { @@ -966,6 +968,55 @@ public void testRegexpTransformation() { assertEquals(expandedSql, targetSql); } + @Test + public void testRlikeBackslashEscaping() { + RelToTrinoConverter relToTrinoConverter = TestUtils.getRelToTrinoConverter(); + + // In Hive SQL, 
'\\d' means the string \d (backslash is escape char). + // In Trino SQL, '\d' means the string \d (no backslash escaping). + // So Hive's '\\d{4}' should become Trino's '\d{4}'. + RelNode relNode = + TestUtils.getHiveToRelConverter().convertSql("SELECT '2022-01-01' RLIKE '^\\\\d{4}-\\\\d{2}-\\\\d{2}$'"); + String targetSql = + "SELECT \"REGEXP_LIKE\"('2022-01-01', '^\\d{4}-\\d{2}-\\d{2}$')\n" + "FROM (VALUES (0)) AS \"t\" (\"ZERO\")"; + String expandedSql = relToTrinoConverter.convert(relNode); + assertEquals(expandedSql, targetSql); + } + + @Test + public void testRlikeBackslashEscapingWithColumn() { + RelToTrinoConverter relToTrinoConverter = TestUtils.getRelToTrinoConverter(); + + // Test backslash escaping with a column reference instead of literal + RelNode relNode = + TestUtils.getHiveToRelConverter().convertSql("SELECT * FROM test.tableA WHERE a RLIKE '^\\\\d+$'"); + String expandedSql = relToTrinoConverter.convert(relNode); + assertTrue(expandedSql.contains("\"REGEXP_LIKE\"")); + assertTrue(expandedSql.contains("'^\\d+$'")); + assertFalse(expandedSql.contains("'^\\\\d+$'")); + } + + @Test + public void testRegexpBackslashEscaping() { + RelToTrinoConverter relToTrinoConverter = TestUtils.getRelToTrinoConverter(); + + // Test that REGEXP (synonym for RLIKE) also handles backslash escaping + RelNode relNode = TestUtils.getHiveToRelConverter().convertSql("SELECT 'hello' REGEXP '^\\\\w+$'"); + String targetSql = "SELECT \"REGEXP_LIKE\"('hello', '^\\w+$')\n" + "FROM (VALUES (0)) AS \"t\" (\"ZERO\")"; + String expandedSql = relToTrinoConverter.convert(relNode); + assertEquals(expandedSql, targetSql); + } + + @Test + public void testStringLiteralWithEscapedBackslash() { + RelToTrinoConverter relToTrinoConverter = TestUtils.getRelToTrinoConverter(); + + // Test that an escaped backslash (\\\\ in this Java source = \\ in the Hive SQL) becomes a single backslash in the Trino SQL + RelNode relNode = TestUtils.getHiveToRelConverter().convertSql("SELECT 'path\\\\to\\\\file'"); + String expandedSql = 
relToTrinoConverter.convert(relNode); + assertTrue(expandedSql.contains("'path\\to\\file'")); + } + @Test public void testSqlSelectAliasAppenderTransformer() { // test.tableA(a int, b struct