1 change: 1 addition & 0 deletions RELEASE-NOTES.md
@@ -26,6 +26,7 @@
* Fixed `SELECT *` output being corrupted when joined tables share column names. Duplicate column names are now disambiguated by appending a numeric suffix (e.g. `NAME`, `NAME_2`).
* Fixed `snow connection generate-jwt` and `snow connection generate-workload-identity-token` failing with `Connection None is not configured` when used with `--temporary-connection`.
* The internal connection cache now remembers failed connect attempts and re-raises the original exception on subsequent accesses within the same process, instead of re-dialing Snowflake every time a command accesses the shared connection. This fixes, among other cases, the customer-visible duplicate `LOGIN_HISTORY` events (and `OVERFLOW_FAILURE_EVENTS_ELIDED`) previously emitted when a `snow` invocation was rejected by an authentication policy.
* `snow sql -f <file>` and the `!source <file>` include directive now honor a UTF-8 / UTF-16 / UTF-32 byte-order mark at the start of a SQL file and decode it using the matching encoding. This fixes `snow sql` on Windows when the input was produced by a PowerShell `>` redirect (which writes UTF-16 LE with a BOM) or by an editor that prepends a UTF-8 BOM; previously these files crashed with `UnicodeDecodeError` or leaked U+FEFF into the first statement.


# v3.17.0
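As context for the BOM entry in the release-notes diff above, here is a minimal, hypothetical repro sketch (not CLI code) of the failure mode it describes: PowerShell's `>` redirect writes UTF-16 with a BOM, which a naive UTF-8 read cannot decode, while the matching codec consumes the BOM itself.

# Hypothetical repro of the pre-fix failure mode; assumes nothing from the CLI.
payload = "select 1;".encode("utf-16")  # BOM + native-order UTF-16 bytes

try:
    payload.decode("utf-8")  # pre-fix behavior: every SQL file was read as UTF-8
except UnicodeDecodeError as exc:
    print(exc)  # 0xff/0xfe are never valid UTF-8 bytes, so the decode fails

print(payload.decode("utf-16"))  # 'select 1;', with the BOM consumed by the codec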
55 changes: 42 additions & 13 deletions src/snowflake/cli/_plugins/sql/statement_reader.py
@@ -1,3 +1,4 @@
import codecs
import enum
import io
import re
@@ -24,6 +25,35 @@

ASYNC_SUFFIX = ";>"

# Byte-order marks checked longest-first so that the 4-byte UTF-32 LE mark
# (b"\xff\xfe\x00\x00") is matched before the 2-byte UTF-16 LE mark
# (b"\xff\xfe") that prefixes it.
_BOM_ENCODINGS: tuple[tuple[bytes, str], ...] = (
(codecs.BOM_UTF32_LE, "utf-32"),
(codecs.BOM_UTF32_BE, "utf-32"),
(codecs.BOM_UTF8, "utf-8-sig"),
(codecs.BOM_UTF16_LE, "utf-16"),
(codecs.BOM_UTF16_BE, "utf-16"),
)


def _read_sql_file(path: SecurePath) -> str:
"""Read a SQL text file, honoring a UTF-8/16/32 BOM if present.

PowerShell's ``>`` redirect and many Windows editors write files with a
byte-order mark. Without BOM handling those files either crash with a
UnicodeDecodeError (UTF-16) or leak a stray U+FEFF into the first
statement (UTF-8 BOM). When no BOM is present we fall back to UTF-8,
matching how the CLI writes files elsewhere.
"""
with path.open("rb", read_file_limit_mb=UNLIMITED) as f:
raw = f.read()
for bom, encoding in _BOM_ENCODINGS:
if raw.startswith(bom):
# ``utf-16``/``utf-32`` consume the BOM themselves; ``utf-8-sig``
# strips the UTF-8 BOM. Decode the full payload in each case.
return raw.decode(encoding)
return raw.decode("utf-8")


# Regex that recognises SQL tokens whose contents must not be scanned for
# comment syntax. Alternatives are tried left-to-right:
@@ -214,7 +244,7 @@ def from_file(cls, path_part: str, raw_source: str) -> "ParsedStatement":
path = SecurePath(stripped_comments_path_part)

if path.is_file():
payload = path.read_text(file_size_limit_mb=UNLIMITED)
payload = _read_sql_file(path)
return cls(payload, StatementType.FILE, path.as_posix())

error_msg = f"Could not read: {path_part}"
@@ -334,18 +364,17 @@ def files_reader(

Returns a generator with statements."""
for path in paths:
with path.open(read_file_limit_mb=UNLIMITED) as f:
content = f.read()
if pre_render:
content = pre_render(content)
stmts = split_statements(io.StringIO(content), remove_comments)
yield from recursive_statement_reader(
stmts,
[path.as_posix()],
operators,
remove_comments,
pre_render,
)
content = _read_sql_file(path)
if pre_render:
content = pre_render(content)
stmts = split_statements(io.StringIO(content), remove_comments)
yield from recursive_statement_reader(
stmts,
[path.as_posix()],
operators,
remove_comments,
pre_render,
)


def query_reader(
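A side note on the ordering of `_BOM_ENCODINGS` above, sketched assuming a little-endian platform: the UTF-32 LE mark begins with the UTF-16 LE mark, so probing the 2-byte mark first would silently mis-decode UTF-32 files rather than raise.

import codecs

# BOM_UTF32_LE (b"\xff\xfe\x00\x00") starts with BOM_UTF16_LE (b"\xff\xfe"),
# which is why the 4-byte marks must be probed first.
assert codecs.BOM_UTF32_LE.startswith(codecs.BOM_UTF16_LE)

raw = "select 1;".encode("utf-32")  # BOM + UTF-32 payload
print(repr(raw.decode("utf-16")))   # NUL-interleaved garbage; the wrong codec "succeeds"
print(raw.decode("utf-32"))         # 'select 1;', the intended decode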
93 changes: 93 additions & 0 deletions tests/sql/test_statement_reader.py
@@ -1,3 +1,4 @@
import codecs
from functools import partial

import pytest
@@ -9,6 +10,7 @@
ParsedStatement,
StatementType,
_protect_sql_comments,
_read_sql_file,
compile_statements,
files_reader,
parse_statement,
@@ -541,3 +543,94 @@ def test_protect_comments_roundtrip_through_jinja():
)
restored = saved.restore(rendered)
assert restored == "-- {{ not_a_var }}\nSELECT /* {{ also_not }} */ 1 WHERE x = 42;"


# ---------------------------------------------------------------------------
# BOM / encoding tests — regression coverage for SNOW-1528909
#
# PowerShell's ``>`` redirect writes UTF-16 LE by default, and many Windows
# editors add a UTF-8 BOM. Without BOM handling the CLI either crashes with a
# UnicodeDecodeError on UTF-16 or leaks a U+FEFF into the first statement.
# ---------------------------------------------------------------------------


@pytest.mark.parametrize(
("encoding", "bom"),
[
# BOM-emitting codec names — Python appends the BOM itself.
("utf-16", b""),
("utf-8-sig", b""),
("utf-32", b""),
# No-BOM codec names — we prepend the BOM manually to mimic what
# PowerShell / Notepad actually write on Windows.
("utf-16-le", codecs.BOM_UTF16_LE),
("utf-16-be", codecs.BOM_UTF16_BE),
("utf-32-le", codecs.BOM_UTF32_LE),
("utf-32-be", codecs.BOM_UTF32_BE),
],
)
def test_read_sql_file_decodes_bom_prefixed_file(
tmp_path_factory: pytest.TempPathFactory, encoding, bom
):
f = tmp_path_factory.mktemp("a") / "f.sql"
# Non-ASCII content ensures a naive UTF-8 decode of a UTF-16/UTF-32 file
# would raise UnicodeDecodeError — before the fix this was exactly the
# failure mode reported in the issue for PowerShell-redirected files.
text = "-- コメント\nselect 1;"
f.write_bytes(bom + text.encode(encoding))

assert _read_sql_file(SecurePath(f)) == text


def test_read_sql_file_utf8_without_bom(tmp_path_factory: pytest.TempPathFactory):
# Regression guard: plain UTF-8 (the common case) stays on the fast path.
f = tmp_path_factory.mktemp("a") / "f.sql"
text = "-- ascii only\nselect 3;"
f.write_bytes(text.encode("utf-8"))

assert _read_sql_file(SecurePath(f)) == text


def test_read_sql_file_strips_utf8_bom(tmp_path_factory: pytest.TempPathFactory):
# If the UTF-8 BOM leaked through, the first statement would start with
# U+FEFF and Snowflake would reject it as a syntax error.
f = tmp_path_factory.mktemp("a") / "f.sql"
f.write_bytes(b"\xef\xbb\xbfselect 1;")

result = _read_sql_file(SecurePath(f))

assert result == "select 1;"
assert "" not in result


@pytest.mark.parametrize("encoding", ["utf-16", "utf-8-sig"])
def test_files_reader_decodes_bom_prefixed_sql(
tmp_path_factory: pytest.TempPathFactory, encoding
):
f = tmp_path_factory.mktemp("a") / "f.sql"
f.write_bytes("select 1;".encode(encoding))

errors, cnt, compiled = compile_statements(
files_reader((SecurePath(f),), WORKING_OPERATOR_FUNCS),
)

assert not errors, errors
assert cnt == 1
assert compiled == [CompiledStatement(statement="select 1;")]


@pytest.mark.parametrize("encoding", ["utf-16", "utf-8-sig"])
def test_source_directive_decodes_bom_prefixed_file(
tmp_path_factory: pytest.TempPathFactory, encoding
):
# ``!source <file>`` goes through ParsedStatement.from_file, a different
# code path from files_reader. Both must honor BOMs.
included = tmp_path_factory.mktemp("a") / "included.sql"
included.write_bytes("select 2;\n".encode(encoding))

query = f"!source {included.as_posix()};"
source = parse_statement(query, WORKING_OPERATOR_FUNCS)

assert source.error is None
assert source.statement_type == StatementType.FILE
assert source.statement.read() == "select 2;\n"
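One last aside on the codec names in the parametrized test above, since the split is easy to trip over: Python's generic "utf-16"/"utf-32" encoders emit the BOM themselves, while the endian-specific names do not, which is why the test prepends the mark manually for the latter group. A minimal check of that assumption:

import codecs

# Generic codec names write a BOM on encode...
with_bom = "select 1;".encode("utf-16")
assert with_bom[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)

# ...endian-specific names do not, so the test mimics PowerShell/Notepad-style
# writers by prepending the mark by hand.
without_bom = "select 1;".encode("utf-16-le")
assert not without_bom.startswith(codecs.BOM_UTF16_LE)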