diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index 609d0e39a9..3904cc9987 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -26,6 +26,7 @@ * Fixed `SELECT *` output being corrupted when joined tables share column names. Duplicate column names are now disambiguated by appending a numeric suffix (e.g. `NAME`, `NAME_2`). * Fixed `snow connection generate-jwt` and `snow connection generate-workload-identity-token` failing with `Connection None is not configured` when used with `--temporary-connection`. * The internal connection cache now remembers failed connect attempts and re-raises the original exception on subsequent accesses within the same process, instead of re-dialing Snowflake every time a command accesses the shared connection. This fixes, among other cases, the customer-visible duplicate `LOGIN_HISTORY` events (and `OVERFLOW_FAILURE_EVENTS_ELIDED`) previously emitted when a `snow` invocation was rejected by an authentication policy. +* `snow sql -f <file>` and the `!source <file>` include directive now honor a UTF-8 / UTF-16 / UTF-32 byte-order mark at the start of a SQL file and decode it using the matching encoding. This fixes `snow sql` on Windows when the input was produced by a PowerShell `>` redirect (which writes UTF-16 LE with a BOM) or by an editor that prepends a UTF-8 BOM; previously these files crashed with `UnicodeDecodeError` or leaked U+FEFF into the first statement. # v3.17.0 diff --git a/src/snowflake/cli/_plugins/sql/statement_reader.py b/src/snowflake/cli/_plugins/sql/statement_reader.py index 2a72d98b46..561f8e9d3a 100644 --- a/src/snowflake/cli/_plugins/sql/statement_reader.py +++ b/src/snowflake/cli/_plugins/sql/statement_reader.py @@ -1,3 +1,4 @@ +import codecs import enum import io import re @@ -24,6 +25,35 @@ ASYNC_SUFFIX = ";>" +# Byte-order marks checked in descending length so that the 4-byte UTF-32 LE +# mark is matched before the 2-byte UTF-16 LE mark it starts with. +_BOM_ENCODINGS: tuple[tuple[bytes, str], ...] 
_BOM_ENCODINGS: tuple[tuple[bytes, str], ...] = (
    # Order matters: the UTF-32 LE mark (FF FE 00 00) begins with the UTF-16 LE
    # mark (FF FE), so the 4-byte marks must be tested before the 2-byte ones.
    (codecs.BOM_UTF32_LE, "utf-32"),
    (codecs.BOM_UTF32_BE, "utf-32"),
    (codecs.BOM_UTF8, "utf-8-sig"),
    (codecs.BOM_UTF16_LE, "utf-16"),
    (codecs.BOM_UTF16_BE, "utf-16"),
)


def _decode_sql_bytes(raw: bytes) -> str:
    """Decode the raw bytes of a SQL file into text.

    The codec is picked from a leading byte-order mark when one is present
    and defaults to UTF-8 otherwise. The ``utf-16``/``utf-32`` codecs consume
    the BOM themselves and ``utf-8-sig`` strips the UTF-8 BOM, so the full
    payload is handed to the decoder in every case.

    Raises:
        UnicodeDecodeError: if the payload is not valid in the chosen codec.
    """
    encoding = next(
        (codec for bom, codec in _BOM_ENCODINGS if raw.startswith(bom)),
        "utf-8",
    )
    text = raw.decode(encoding)
    # ``bytes.decode`` does not perform the universal-newline translation that
    # a text-mode read does. Files produced on Windows (e.g. by PowerShell's
    # ``>`` redirect) use CRLF, so fold CRLF / lone CR to LF here to keep stray
    # carriage returns out of the statements.
    return text.replace("\r\n", "\n").replace("\r", "\n")


def _read_sql_file(path: SecurePath) -> str:
    """Read a SQL text file, honoring a UTF-8/16/32 BOM if present.

    PowerShell's ``>`` redirect and many Windows editors write files with a
    byte-order mark. Without BOM handling those files either crash with a
    UnicodeDecodeError (UTF-16) or leak a stray U+FEFF into the first
    statement (UTF-8 BOM). When no BOM is present we fall back to UTF-8,
    matching how the CLI writes files elsewhere.

    The file is read in binary mode so the encoding can be sniffed; line
    endings are normalized to ``\\n`` after decoding.

    Args:
        path: location of the SQL file; read without a size limit.

    Returns:
        The decoded file content with the BOM removed and ``\\n`` newlines.

    Raises:
        UnicodeDecodeError: if the content is not valid in the detected
            encoding (or in UTF-8 when no BOM is present).
    """
    with path.open("rb", read_file_limit_mb=UNLIMITED) as f:
        return _decode_sql_bytes(f.read())


# Regex that recognises SQL tokens whose contents must not be scanned for
# comment syntax.
Alternatives are tried left-to-right: @@ -214,7 +244,7 @@ def from_file(cls, path_part: str, raw_source: str) -> "ParsedStatement": path = SecurePath(stripped_comments_path_part) if path.is_file(): - payload = path.read_text(file_size_limit_mb=UNLIMITED) + payload = _read_sql_file(path) return cls(payload, StatementType.FILE, path.as_posix()) error_msg = f"Could not read: {path_part}" @@ -334,18 +364,17 @@ def files_reader( Returns a generator with statements.""" for path in paths: - with path.open(read_file_limit_mb=UNLIMITED) as f: - content = f.read() - if pre_render: - content = pre_render(content) - stmts = split_statements(io.StringIO(content), remove_comments) - yield from recursive_statement_reader( - stmts, - [path.as_posix()], - operators, - remove_comments, - pre_render, - ) + content = _read_sql_file(path) + if pre_render: + content = pre_render(content) + stmts = split_statements(io.StringIO(content), remove_comments) + yield from recursive_statement_reader( + stmts, + [path.as_posix()], + operators, + remove_comments, + pre_render, + ) def query_reader( diff --git a/tests/sql/test_statement_reader.py b/tests/sql/test_statement_reader.py index 1ffd413634..26b055ee3b 100644 --- a/tests/sql/test_statement_reader.py +++ b/tests/sql/test_statement_reader.py @@ -1,3 +1,4 @@ +import codecs from functools import partial import pytest @@ -9,6 +10,7 @@ ParsedStatement, StatementType, _protect_sql_comments, + _read_sql_file, compile_statements, files_reader, parse_statement, @@ -541,3 +543,94 @@ def test_protect_comments_roundtrip_through_jinja(): ) restored = saved.restore(rendered) assert restored == "-- {{ not_a_var }}\nSELECT /* {{ also_not }} */ 1 WHERE x = 42;" + + +# --------------------------------------------------------------------------- +# BOM / encoding tests — regression coverage for SNOW-1528909 +# +# PowerShell's ``>`` redirect writes UTF-16 LE by default, and many Windows +# editors add a UTF-8 BOM. 
# Without BOM handling the CLI either crashes with a UnicodeDecodeError on
# UTF-16 or leaks a U+FEFF into the first statement.
# ---------------------------------------------------------------------------


@pytest.mark.parametrize(
    ("encoding", "bom"),
    [
        # BOM-emitting codec names — Python prepends the BOM itself on encode.
        ("utf-16", b""),
        ("utf-8-sig", b""),
        ("utf-32", b""),
        # No-BOM codec names — we prepend the BOM manually to mimic what
        # PowerShell / Notepad actually write on Windows.
        ("utf-16-le", codecs.BOM_UTF16_LE),
        ("utf-16-be", codecs.BOM_UTF16_BE),
        ("utf-32-le", codecs.BOM_UTF32_LE),
        ("utf-32-be", codecs.BOM_UTF32_BE),
    ],
)
def test_read_sql_file_decodes_bom_prefixed_file(
    tmp_path_factory: pytest.TempPathFactory, encoding, bom
):
    """Every supported BOM flavour decodes back to the original text."""
    f = tmp_path_factory.mktemp("a") / "f.sql"
    # Non-ASCII content ensures a naive UTF-8 decode of a UTF-16/UTF-32 file
    # would raise UnicodeDecodeError — before the fix this was exactly the
    # failure mode reported in the issue for PowerShell-redirected files.
    text = "-- コメント\nselect 1;"
    f.write_bytes(bom + text.encode(encoding))

    assert _read_sql_file(SecurePath(f)) == text


def test_read_sql_file_utf8_without_bom(tmp_path_factory: pytest.TempPathFactory):
    """BOM-less files are decoded as UTF-8."""
    # Regression guard for the common case. The content is deliberately
    # non-ASCII: ASCII-only text decodes identically under most codecs and
    # would not notice a wrong fallback encoding.
    f = tmp_path_factory.mktemp("a") / "f.sql"
    text = "-- コメント\nselect 3;"
    f.write_bytes(text.encode("utf-8"))

    assert _read_sql_file(SecurePath(f)) == text


def test_read_sql_file_strips_utf8_bom(tmp_path_factory: pytest.TempPathFactory):
    """A UTF-8 BOM is removed rather than surfacing as U+FEFF."""
    # If the UTF-8 BOM leaked through, the first statement would start with
    # U+FEFF and Snowflake would reject it as a syntax error.
    f = tmp_path_factory.mktemp("a") / "f.sql"
    f.write_bytes(b"\xef\xbb\xbfselect 1;")

    result = _read_sql_file(SecurePath(f))

    assert result == "select 1;"
    # Spell the BOM as an escape: a literal U+FEFF is invisible and easily
    # stripped by tools, and an empty literal would make this check
    # unsatisfiable because "" is contained in every string.
    assert "\ufeff" not in result


@pytest.mark.parametrize("encoding", ["utf-16", "utf-8-sig"])
def test_files_reader_decodes_bom_prefixed_sql(
    tmp_path_factory: pytest.TempPathFactory, encoding
):
    """``files_reader`` yields clean statements from BOM-prefixed files."""
    f = tmp_path_factory.mktemp("a") / "f.sql"
    f.write_bytes("select 1;".encode(encoding))

    errors, cnt, compiled = compile_statements(
        files_reader((SecurePath(f),), WORKING_OPERATOR_FUNCS),
    )

    assert not errors, errors
    assert cnt == 1
    assert compiled == [CompiledStatement(statement="select 1;")]


@pytest.mark.parametrize("encoding", ["utf-16", "utf-8-sig"])
def test_source_directive_decodes_bom_prefixed_file(
    tmp_path_factory: pytest.TempPathFactory, encoding
):
    """``!source`` includes decode BOM-prefixed files as well."""
    # ``!source <file>`` goes through ParsedStatement.from_file, a different
    # code path from files_reader. Both must honor BOMs.
    included = tmp_path_factory.mktemp("a") / "included.sql"
    included.write_bytes("select 2;\n".encode(encoding))

    query = f"!source {included.as_posix()};"
    source = parse_statement(query, WORKING_OPERATOR_FUNCS)

    assert source.error is None
    assert source.statement_type == StatementType.FILE
    assert source.statement.read() == "select 2;\n"