From 886ef7fb1ef22424bf26e2e24ded93fd88654e7f Mon Sep 17 00:00:00 2001 From: zivxx Date: Thu, 18 Jun 2026 18:51:01 +0300 Subject: [PATCH 01/12] fix: redact credentials in logs at the model layer Follow-up to #902. A customer reported credentials still leaking into logs, this time via opal_client/data/updater.py (logger.exception serializing the full DataSourceEntry, whose `config` carries auth headers and `data` the payload). A full codebase scan found several more sites with the same pattern. Root cause: log statements interpolate credential-bearing pydantic models (`{entry}`, `{config}`, `model=...`) and loguru's serialize=True sink dumps record["extra"] as JSON, falling back to str() for non-JSON objects. Central fix: a RedactedReprMixin that overrides __repr_args__ on the credential-bearing models so repr()/str() (and thus the loguru serialize path) mask secret fields: - FetcherConfig / HttpFetcherConfig -> headers, data - DataSourceEntry -> config, data - FetchEvent -> config This protects every current and future log site at once. Wire serialization (.json()/.dict()), hashing (calc_hash uses config.json()), dedup keys, and equality are untouched, so transport and behavior are unaffected. Belt-and-suspenders spot fixes (log only safe fields): - opal-client data/updater.py (x3): log entry.url instead of the entry - opal-server data_update_publisher.py: log entry.url instead of the entry - opal-server data/api.py: drop the data_sources_config model from the error - opal-common fastapi_rpc_fetch_provider.py: stop logging rpc_arguments Adds opal-common tests/redaction_test.py covering repr/str redaction, wire-payload integrity, and the loguru serialize=True leak scenario. Note for reviewers: the publish-data-update CLI (cli/commands.py:188) echoes str(update), which now shows config/data as . The published payload itself (update.json(), line 180) is unaffected; redacting the stdout echo is intentional since it can land in shell history / CI logs. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../opal-client/opal_client/data/updater.py | 12 +-- .../opal-common/opal_common/fetcher/events.py | 14 +++- .../providers/fastapi_rpc_fetch_provider.py | 3 +- .../fetcher/providers/http_fetch_provider.py | 6 +- .../opal_common/logging_utils/redaction.py | 29 +++++++ .../opal-common/opal_common/schemas/data.py | 10 ++- .../opal_common/tests/redaction_test.py | 83 +++++++++++++++++++ packages/opal-server/opal_server/data/api.py | 5 +- .../opal_server/data/data_update_publisher.py | 4 +- 9 files changed, 151 insertions(+), 15 deletions(-) create mode 100644 packages/opal-common/opal_common/logging_utils/redaction.py create mode 100644 packages/opal-common/opal_common/tests/redaction_test.py diff --git a/packages/opal-client/opal_client/data/updater.py b/packages/opal-client/opal_client/data/updater.py index 3563b40ab..54a34262e 100644 --- a/packages/opal-client/opal_client/data/updater.py +++ b/packages/opal-client/opal_client/data/updater.py @@ -475,14 +475,16 @@ async def _update_policy_data(self, update: DataUpdate) -> None: for entry in update.entries: if not entry.topics: - logger.debug("Data entry {entry} has no topics, skipping", entry=entry) + logger.debug( + "Data entry for url {url} has no topics, skipping", url=entry.url + ) continue # Only process entries that match one of our subscribed data topics if set(entry.topics).isdisjoint(set(self._data_topics)): logger.debug( - "Data entry {entry} has no topics matching the data topics, skipping", - entry=entry, + "Data entry for url {url} has no topics matching the data topics, skipping", + url=entry.url, ) continue @@ -589,8 +591,8 @@ async def _fetch_data(self, entry: DataSourceEntry) -> JsonableValue: ) except Exception as e: logger.exception( - "Failed to fetch data for entry {entry} with exception {exc}", - entry=entry, + "Failed to fetch data for entry url {url} with exception {exc}", + url=entry.url, exc=e, ) raise Exception(f"Failed to fetch data for entry {entry.url}: {e}") diff --git a/packages/opal-common/opal_common/fetcher/events.py b/packages/opal-common/opal_common/fetcher/events.py index 878a56d79..a7cb589e6 100644 --- a/packages/opal-common/opal_common/fetcher/events.py +++ b/packages/opal-common/opal_common/fetcher/events.py @@ -1,13 +1,18 @@ -from typing import List, Optional +from typing import ClassVar, List, Optional, Set +from opal_common.logging_utils.redaction import RedactedReprMixin from pydantic import BaseModel, Field -class FetcherConfig(BaseModel): +class FetcherConfig(RedactedReprMixin, BaseModel): """The configuration of a fetcher, used as part of a FetchEvent Fetch Provider's have their own uniqueue events and configurations. Configurations + + Note: subclasses commonly carry credentials (e.g. auth headers). They must + list any secret-bearing fields in ``_redacted_repr_fields`` so they never + leak into logs - see ``RedactedReprMixin``. """ fetcher: Optional[str] = Field( @@ -16,12 +21,15 @@ class FetcherConfig(BaseModel): ) -class FetchEvent(BaseModel): +class FetchEvent(RedactedReprMixin, BaseModel): """Event used to describe an queue fetching tasks Design note - By using a Pydantic model - we can create a potentially transfer FetchEvents to be handled by other network nodes (perhaps via RPC) """ + # ``config`` may carry a FetcherConfig with credentials - mask it in repr/str. + _redacted_repr_fields: ClassVar[Set[str]] = {"config"} + # Event id to be filled by the engine id: str = None # optional name of the specific event diff --git a/packages/opal-common/opal_common/fetcher/providers/fastapi_rpc_fetch_provider.py b/packages/opal-common/opal_common/fetcher/providers/fastapi_rpc_fetch_provider.py index 4b574a8ea..ab7261ac9 100644 --- a/packages/opal-common/opal_common/fetcher/providers/fastapi_rpc_fetch_provider.py +++ b/packages/opal-common/opal_common/fetcher/providers/fastapi_rpc_fetch_provider.py @@ -38,8 +38,9 @@ async def _fetch_(self): args = self._event.config.rpc_arguments method = self._event.config.rpc_method_name result = None + # Note: ``args`` (rpc_arguments) may carry credentials - never log it. logger.info( - f"{self.__class__.__name__} fetching from {self._url} with RPC call {method}({args})" + f"{self.__class__.__name__} fetching from {self._url} with RPC call {method}" ) async with WebSocketRpcClient( self._url, diff --git a/packages/opal-common/opal_common/fetcher/providers/http_fetch_provider.py b/packages/opal-common/opal_common/fetcher/providers/http_fetch_provider.py index 80ac50202..f9a6acaed 100644 --- a/packages/opal-common/opal_common/fetcher/providers/http_fetch_provider.py +++ b/packages/opal-common/opal_common/fetcher/providers/http_fetch_provider.py @@ -1,7 +1,7 @@ """Simple HTTP get data fetcher using requests supports.""" from enum import Enum -from typing import Any, Union, cast +from typing import Any, ClassVar, Set, Union, cast import httpx from aiohttp import ClientResponse, ClientSession, ClientTimeout @@ -28,6 +28,10 @@ class HttpMethods(Enum): class HttpFetcherConfig(FetcherConfig): """Config for HttpFetchProvider's Adding HTTP headers.""" + # ``headers`` carries Authorization tokens and ``data`` the (possibly + # sensitive) payload - mask both in repr/str so they never leak into logs. + _redacted_repr_fields: ClassVar[Set[str]] = {"headers", "data"} + headers: dict = None is_json: bool = True process_data: bool = True diff --git a/packages/opal-common/opal_common/logging_utils/redaction.py b/packages/opal-common/opal_common/logging_utils/redaction.py new file mode 100644 index 000000000..6a131cb9f --- /dev/null +++ b/packages/opal-common/opal_common/logging_utils/redaction.py @@ -0,0 +1,29 @@ +from typing import ClassVar, Set + + +class RedactedReprMixin: + """Mixin for pydantic (v1) models that may carry credentials. + + Overrides ``repr()`` / ``str()`` so that any field listed in + ``_redacted_repr_fields`` renders as ```` instead of its real + value. + + This is the central defense against credential leaks in logs: OPAL logs via + loguru, and most leaks come from interpolating these models into log + messages (e.g. ``logger.info("... {entry}", entry=entry)``) or from loguru's + ``serialize=True`` sink, which falls back to ``str()`` for non-JSON objects. + Both paths go through ``__repr_args__`` here, so masking once at the model + layer protects every current and future log site at once. + + Note: this only affects human / log rendering. Wire serialization uses + ``.dict()`` / ``.json()``, which are untouched, so transport is unaffected. + """ + + #: Field names whose values may carry secrets and must be masked in repr/str. + _redacted_repr_fields: ClassVar[Set[str]] = set() + + def __repr_args__(self): + return [ + (key, "" if key in self._redacted_repr_fields else value) + for key, value in super().__repr_args__() + ] diff --git a/packages/opal-common/opal_common/schemas/data.py b/packages/opal-common/opal_common/schemas/data.py index 32e62c98f..b74b00f69 100644 --- a/packages/opal-common/opal_common/schemas/data.py +++ b/packages/opal-common/opal_common/schemas/data.py @@ -1,9 +1,10 @@ from logging import basicConfig from pydoc import describe -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, ClassVar, Dict, List, Optional, Set, Tuple, Union from opal_common.fetcher.events import FetcherConfig from opal_common.fetcher.providers.http_fetch_provider import HttpFetcherConfig +from opal_common.logging_utils.redaction import RedactedReprMixin from opal_common.schemas.store import JSONPatchAction from pydantic import AnyHttpUrl, BaseModel, Field, root_validator, validator @@ -13,11 +14,16 @@ DEFAULT_DATA_TOPIC = "policy_data" -class DataSourceEntry(BaseModel): +class DataSourceEntry(RedactedReprMixin, BaseModel): """ Data source configuration - where client's should retrieve data from and how they should store it """ + # ``config`` may carry fetcher auth (e.g. Authorization headers) and + # ``data`` an inline payload - mask both in repr/str so they never leak into + # logs (entries are frequently interpolated into log messages). + _redacted_repr_fields: ClassVar[Set[str]] = {"config", "data"} + @validator("data") def validate_save_method(cls, value, values): if values["save_method"] not in ["PUT", "PATCH"]: diff --git a/packages/opal-common/opal_common/tests/redaction_test.py b/packages/opal-common/opal_common/tests/redaction_test.py new file mode 100644 index 000000000..825d3c122 --- /dev/null +++ b/packages/opal-common/opal_common/tests/redaction_test.py @@ -0,0 +1,83 @@ +"""Regression tests for credential redaction in repr/str of credential-bearing +models (follow-up to #901 / #902). + +OPAL logs via loguru. Credentials leaked because log statements interpolated +whole models (e.g. ``logger.info("... {entry}", entry=entry)``) or relied on +loguru's ``serialize=True`` sink, which dumps ``record["extra"]`` as JSON and +falls back to ``str()`` for non-JSON objects. ``RedactedReprMixin`` masks the +secret-bearing fields at the model layer so every current and future log site is +protected at once - while leaving ``.dict()`` / ``.json()`` (the wire format) +untouched. +""" + +import pytest +from loguru import logger +from opal_common.fetcher.events import FetchEvent +from opal_common.fetcher.providers.http_fetch_provider import HttpFetcherConfig +from opal_common.schemas.data import DataSourceEntry + +SECRET_TOKEN = "Bearer super-secret-token-must-not-leak" + + +def _models(): + """One instance of each credential-bearing model, all carrying SECRET_TOKEN.""" + return { + "HttpFetcherConfig": HttpFetcherConfig( + headers={"Authorization": SECRET_TOKEN}, data={"payload": SECRET_TOKEN} + ), + "DataSourceEntry": DataSourceEntry( + url="http://data.example.com", + save_method="PUT", + config={"headers": {"Authorization": SECRET_TOKEN}}, + data={"payload": SECRET_TOKEN}, + ), + "FetchEvent": FetchEvent( + fetcher="HttpFetchProvider", + url="http://data.example.com", + config={"headers": {"Authorization": SECRET_TOKEN}}, + ), + } + + +@pytest.mark.parametrize("name,model", list(_models().items())) +def test_repr_and_str_redact_secrets(name, model): + """repr() / str() (used by f-strings and ``"{}".format``) must never expose + the secret, and must show it was redacted.""" + assert SECRET_TOKEN not in repr(model), f"secret leaked in repr({name})" + assert SECRET_TOKEN not in str(model), f"secret leaked in str({name})" + assert SECRET_TOKEN not in f"{model}", f"secret leaked in f-string of {name}" + assert "" in repr(model), f"{name} did not redact anything" + + +def test_wire_serialization_is_not_redacted(): + """Redaction is only for human/log rendering - ``.dict()`` (the transport + format) must still carry the real values so fetching keeps working.""" + cfg = HttpFetcherConfig(headers={"Authorization": SECRET_TOKEN}) + entry = DataSourceEntry( + url="http://data.example.com", + save_method="PUT", + config={"headers": {"Authorization": SECRET_TOKEN}}, + ) + assert cfg.dict()["headers"]["Authorization"] == SECRET_TOKEN + assert entry.dict()["config"]["headers"]["Authorization"] == SECRET_TOKEN + + +def test_loguru_serialize_does_not_leak(): + """The actual #901 scenario: a sink with ``serialize=True`` dumps the full + record (including ``extra``) as JSON. Interpolating or binding any of these + models must not put the secret on the wire.""" + captured: list[str] = [] + sink_id = logger.add( + lambda message: captured.append(str(message)), + level="DEBUG", + serialize=True, + ) + try: + for model in _models().values(): + logger.info("logging model {model}", model=model) + logger.bind(bound=model).info("bound model") + finally: + logger.remove(sink_id) + + combined = "\n".join(captured) + assert SECRET_TOKEN not in combined, "secret leaked through loguru serialize=True" diff --git a/packages/opal-server/opal_server/data/api.py b/packages/opal-server/opal_server/data/api.py index da5d043a9..0b64a6a5e 100644 --- a/packages/opal-server/opal_server/data/api.py +++ b/packages/opal-server/opal_server/data/api.py @@ -98,7 +98,10 @@ async def get_data_sources_config(authorization: Optional[str] = Header(None)): redirect_url = set_url_query_param(url, "token", token) return RedirectResponse(url=redirect_url) else: - logger.error("pydantic model invalid", model=data_sources_config) + logger.error( + "data source configuration is invalid: neither an inline config " + "nor an external_source_url was provided" + ) raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Did not find a data source configuration!", diff --git a/packages/opal-server/opal_server/data/data_update_publisher.py b/packages/opal-server/opal_server/data/data_update_publisher.py index 64bc32bbe..5acc20ddb 100644 --- a/packages/opal-server/opal_server/data/data_update_publisher.py +++ b/packages/opal-server/opal_server/data/data_update_publisher.py @@ -94,9 +94,9 @@ async def publish_data_updates(self, update: DataUpdate): all_topic_combos.update(topic_combos) else: logger.warning( - "[{pid}] No topics were provided for the following entry: {entry}", + "[{pid}] No topics were provided for the entry with url: {url}", pid=os.getpid(), - entry=entry, + url=entry.url, ) # publish all topics with all their sub combinations From 1bdcef26dfaf8437bc85b9a1a4f105521feab769 Mon Sep 17 00:00:00 2001 From: zivxx Date: Thu, 18 Jun 2026 19:08:33 +0300 Subject: [PATCH 02/12] fix: harden redaction after expert review (MRO-union, RPC config, url userinfo) Addresses findings from a multi-agent adversarial review of the redaction fix: - RedactedReprMixin now unions `_redacted_repr_fields` across the whole MRO, so a subclass can only ever ADD fields to redact - it can never silently drop a parent's protection by redeclaring/omitting the set. Prevents the exact failure mode below. - FastApiRpcFetchConfig: redact `rpc_arguments` (a secret-bearing FetcherConfig subclass that was missed - a live leak of the same class as the original bug). - Add http_utils.redact_url() to strip embedded `user:password@` credentials from URLs, and apply it to every `entry.url` log/exception in data/updater.py and data_update_publisher.py (DataSourceEntry.url is a free string and may be a basic-auth URL). - Tests: add nested-container (DataUpdate), RPC-config, diagnose=True, and redact_url cases. - Nits: fix pre-existing "uniqueue" typo, drop unused imports in schemas/data.py. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../opal-client/opal_client/data/updater.py | 19 +++-- .../opal-common/opal_common/fetcher/events.py | 2 +- .../providers/fastapi_rpc_fetch_provider.py | 5 ++ .../opal-common/opal_common/http_utils.py | 24 ++++++ .../opal_common/logging_utils/redaction.py | 26 +++++-- .../opal-common/opal_common/schemas/data.py | 2 - .../opal_common/tests/redaction_test.py | 75 ++++++++++++++++++- .../opal_server/data/data_update_publisher.py | 5 +- 8 files changed, 138 insertions(+), 20 deletions(-) diff --git a/packages/opal-client/opal_client/data/updater.py b/packages/opal-client/opal_client/data/updater.py index 54a34262e..a62449575 100644 --- a/packages/opal-client/opal_client/data/updater.py +++ b/packages/opal-client/opal_client/data/updater.py @@ -26,7 +26,7 @@ ) from opal_common.async_utils import TasksPool, repeated_call from opal_common.config import opal_common_config -from opal_common.http_utils import is_http_error_response +from opal_common.http_utils import is_http_error_response, redact_url from opal_common.schemas.data import ( DataEntryReport, DataSourceConfig, @@ -476,7 +476,8 @@ async def _update_policy_data(self, update: DataUpdate) -> None: for entry in update.entries: if not entry.topics: logger.debug( - "Data entry for url {url} has no topics, skipping", url=entry.url + "Data entry for url {url} has no topics, skipping", + url=redact_url(entry.url), ) continue @@ -484,7 +485,7 @@ async def _update_policy_data(self, update: DataUpdate) -> None: if set(entry.topics).isdisjoint(set(self._data_topics)): logger.debug( "Data entry for url {url} has no topics matching the data topics, skipping", - url=entry.url, + url=redact_url(entry.url), ) continue @@ -592,13 +593,15 @@ async def _fetch_data(self, entry: DataSourceEntry) -> JsonableValue: except Exception as e: logger.exception( "Failed to fetch data for entry url {url} with exception {exc}", - url=entry.url, + url=redact_url(entry.url), exc=e, ) - raise Exception(f"Failed to fetch data for entry {entry.url}: {e}") + raise Exception( + f"Failed to fetch data for entry {redact_url(entry.url)}: {e}" + ) if result is None: - raise Exception(f"Fetched data is empty for entry {entry.url}") + raise Exception(f"Fetched data is empty for entry {redact_url(entry.url)}") if isinstance(result, aiohttp.ClientResponse) and is_http_error_response( result @@ -606,12 +609,12 @@ async def _fetch_data(self, entry: DataSourceEntry) -> JsonableValue: error_content = await result.text() logger.error( "Failed to decode response from url: '{url}', got response code {status} with response: {error}", - url=entry.url, + url=redact_url(entry.url), status=result.status, error=error_content, ) raise Exception( - f"Failed to decode response from url: '{entry.url}', got response code {result.status} with response: {error_content}" + f"Failed to decode response from url: '{redact_url(entry.url)}', got response code {result.status} with response: {error_content}" ) return result diff --git a/packages/opal-common/opal_common/fetcher/events.py b/packages/opal-common/opal_common/fetcher/events.py index a7cb589e6..d80ac3a11 100644 --- a/packages/opal-common/opal_common/fetcher/events.py +++ b/packages/opal-common/opal_common/fetcher/events.py @@ -6,7 +6,7 @@ class FetcherConfig(RedactedReprMixin, BaseModel): """The configuration of a fetcher, used as part of a FetchEvent Fetch - Provider's have their own uniqueue events and configurations. + Provider's have their own unique events and configurations. Configurations diff --git a/packages/opal-common/opal_common/fetcher/providers/fastapi_rpc_fetch_provider.py b/packages/opal-common/opal_common/fetcher/providers/fastapi_rpc_fetch_provider.py index ab7261ac9..09b640687 100644 --- a/packages/opal-common/opal_common/fetcher/providers/fastapi_rpc_fetch_provider.py +++ b/packages/opal-common/opal_common/fetcher/providers/fastapi_rpc_fetch_provider.py @@ -1,5 +1,7 @@ """Simple HTTP get data fetcher using requests supports.""" +from typing import ClassVar, Set + from fastapi_websocket_rpc.rpc_methods import RpcMethodsBase from fastapi_websocket_rpc.websocket_rpc_client import WebSocketRpcClient from opal_common.fetcher.events import FetcherConfig, FetchEvent @@ -12,6 +14,9 @@ class FastApiRpcFetchConfig(FetcherConfig): """Config for FastApiRpcFetchConfig's Adding HTTP headers.""" + # ``rpc_arguments`` may carry credentials - mask it in repr/str. + _redacted_repr_fields: ClassVar[Set[str]] = {"rpc_arguments"} + rpc_method_name: str rpc_arguments: dict diff --git a/packages/opal-common/opal_common/http_utils.py b/packages/opal-common/opal_common/http_utils.py index 9c2d35a76..5dcf15178 100644 --- a/packages/opal-common/opal_common/http_utils.py +++ b/packages/opal-common/opal_common/http_utils.py @@ -1,9 +1,33 @@ from typing import Union +from urllib.parse import urlsplit, urlunsplit import aiohttp import httpx +def redact_url(url: str) -> str: + """Strip any embedded credentials (``user:password@``) from a URL so it is + safe to log. + + Data source / policy repo URLs may be of the form + ``https://user:token@host/path``. Logging them verbatim leaks the + credentials, so we replace the userinfo component with ``***`` while keeping + the rest of the URL intact for debugging. Returns the input unchanged if it + cannot be parsed or carries no credentials. + """ + try: + parts = urlsplit(url) + except ValueError: + return url + if not parts.username and not parts.password: + return url + host = parts.hostname or "" + if parts.port is not None: + host = f"{host}:{parts.port}" + netloc = f"***@{host}" + return urlunsplit((parts.scheme, netloc, parts.path, parts.query, parts.fragment)) + + def is_http_error_response( response: Union[aiohttp.ClientResponse, httpx.Response] ) -> bool: diff --git a/packages/opal-common/opal_common/logging_utils/redaction.py b/packages/opal-common/opal_common/logging_utils/redaction.py index 6a131cb9f..698129e2b 100644 --- a/packages/opal-common/opal_common/logging_utils/redaction.py +++ b/packages/opal-common/opal_common/logging_utils/redaction.py @@ -5,25 +5,39 @@ class RedactedReprMixin: """Mixin for pydantic (v1) models that may carry credentials. Overrides ``repr()`` / ``str()`` so that any field listed in - ``_redacted_repr_fields`` renders as ```` instead of its real - value. + ``_redacted_repr_fields`` (on this class or *any* of its base classes) + renders as ```` instead of its real value. This is the central defense against credential leaks in logs: OPAL logs via loguru, and most leaks come from interpolating these models into log messages (e.g. ``logger.info("... {entry}", entry=entry)``) or from loguru's - ``serialize=True`` sink, which falls back to ``str()`` for non-JSON objects. - Both paths go through ``__repr_args__`` here, so masking once at the model - layer protects every current and future log site at once. + ``serialize=True`` sink, which dumps the record as JSON and falls back to + ``str()`` for non-JSON objects such as pydantic models. Masking once at the + model layer protects every such log site at once. + + The redaction set is the **union** of ``_redacted_repr_fields`` across the + whole MRO, so a subclass can only ever *add* fields to redact - it can never + accidentally drop protection inherited from a parent. Each subclass that + introduces a new secret-bearing field must still list it here. Note: this only affects human / log rendering. Wire serialization uses ``.dict()`` / ``.json()``, which are untouched, so transport is unaffected. + + Caveat: this masks whole *fields* of the model. It does not protect a secret + that is logged by reaching *into* the model (e.g. ``logger.info("{h}", + h=config.headers)``); avoid logging credential-bearing attributes directly. """ #: Field names whose values may carry secrets and must be masked in repr/str. _redacted_repr_fields: ClassVar[Set[str]] = set() def __repr_args__(self): + # Union across the MRO so subclasses are additive (never lose a parent's + # protection, even if they redeclare ``_redacted_repr_fields``). + redacted: Set[str] = set() + for klass in type(self).__mro__: + redacted |= klass.__dict__.get("_redacted_repr_fields", set()) return [ - (key, "" if key in self._redacted_repr_fields else value) + (key, "" if key in redacted else value) for key, value in super().__repr_args__() ] diff --git a/packages/opal-common/opal_common/schemas/data.py b/packages/opal-common/opal_common/schemas/data.py index b74b00f69..25cb3b727 100644 --- a/packages/opal-common/opal_common/schemas/data.py +++ b/packages/opal-common/opal_common/schemas/data.py @@ -1,5 +1,3 @@ -from logging import basicConfig -from pydoc import describe from typing import Any, ClassVar, Dict, List, Optional, Set, Tuple, Union from opal_common.fetcher.events import FetcherConfig diff --git a/packages/opal-common/opal_common/tests/redaction_test.py b/packages/opal-common/opal_common/tests/redaction_test.py index 825d3c122..11498e500 100644 --- a/packages/opal-common/opal_common/tests/redaction_test.py +++ b/packages/opal-common/opal_common/tests/redaction_test.py @@ -13,8 +13,12 @@ import pytest from loguru import logger from opal_common.fetcher.events import FetchEvent +from opal_common.fetcher.providers.fastapi_rpc_fetch_provider import ( + FastApiRpcFetchConfig, +) from opal_common.fetcher.providers.http_fetch_provider import HttpFetcherConfig -from opal_common.schemas.data import DataSourceEntry +from opal_common.http_utils import redact_url +from opal_common.schemas.data import DataSourceEntry, DataUpdate SECRET_TOKEN = "Bearer super-secret-token-must-not-leak" @@ -36,6 +40,12 @@ def _models(): url="http://data.example.com", config={"headers": {"Authorization": SECRET_TOKEN}}, ), + # rpc_arguments is a secret-bearing field on a different FetcherConfig + # subclass; it must be redacted too (regression for a missed subclass). + "FastApiRpcFetchConfig": FastApiRpcFetchConfig( + rpc_method_name="publish", + rpc_arguments={"token": SECRET_TOKEN}, + ), } @@ -62,6 +72,30 @@ def test_wire_serialization_is_not_redacted(): assert entry.dict()["config"]["headers"]["Authorization"] == SECRET_TOKEN +def test_nested_container_models_redact_transitively(): + """The objects actually published on the wire are containers like + ``DataUpdate`` that hold ``DataSourceEntry`` objects. They carry no mixin + themselves but must still redact via nested repr.""" + update = DataUpdate( + reason="test", + entries=[ + DataSourceEntry( + url="http://data.example.com", + save_method="PUT", + config={"headers": {"Authorization": SECRET_TOKEN}}, + data={"payload": SECRET_TOKEN}, + ) + ], + ) + assert SECRET_TOKEN not in repr(update) + assert SECRET_TOKEN not in str(update) + assert SECRET_TOKEN not in f"{update}" + # wire format still carries the real value + assert update.dict()["entries"][0]["config"]["headers"]["Authorization"] == ( + SECRET_TOKEN + ) + + def test_loguru_serialize_does_not_leak(): """The actual #901 scenario: a sink with ``serialize=True`` dumps the full record (including ``extra``) as JSON. Interpolating or binding any of these @@ -81,3 +115,42 @@ def test_loguru_serialize_does_not_leak(): combined = "\n".join(captured) assert SECRET_TOKEN not in combined, "secret leaked through loguru serialize=True" + + +def test_loguru_diagnose_does_not_leak_model_value(): + """With ``diagnose=True`` loguru renders local variable *values* in + tracebacks (the #901-class vector). The model's own value must render + redacted.""" + captured: list[str] = [] + sink_id = logger.add( + lambda message: captured.append(str(message)), + level="DEBUG", + backtrace=True, + diagnose=True, + ) + try: + entry = _models()["DataSourceEntry"] # noqa: F841 - referenced in traceback + try: + raise ValueError("boom") + except ValueError: + logger.exception("failed while handling entry") + finally: + logger.remove(sink_id) + + combined = "\n".join(captured) + assert SECRET_TOKEN not in combined, "secret leaked through loguru diagnose=True" + + +@pytest.mark.parametrize( + "url,expected", + [ + ("https://user:tok@host/path", "https://***@host/path"), + ("https://x-access-token:tok@github.com/o/r.git", "https://***@github.com/o/r.git"), + ("https://host:8443/p?q=1", "https://host:8443/p?q=1"), + ("http://plain.example.com/data", "http://plain.example.com/data"), + ], +) +def test_redact_url(url, expected): + redacted = redact_url(url) + assert redacted == expected + assert "tok" not in redacted diff --git a/packages/opal-server/opal_server/data/data_update_publisher.py b/packages/opal-server/opal_server/data/data_update_publisher.py index 5acc20ddb..34342d4ec 100644 --- a/packages/opal-server/opal_server/data/data_update_publisher.py +++ b/packages/opal-server/opal_server/data/data_update_publisher.py @@ -3,6 +3,7 @@ from typing import List from fastapi_utils.tasks import repeat_every +from opal_common.http_utils import redact_url from opal_common.logger import logger from opal_common.schemas.data import ( DataSourceEntryWithPollingInterval, @@ -75,7 +76,7 @@ async def publish_data_updates(self, update: DataUpdate): # a nicer format of entries to the log logged_entries = [ dict( - url=entry.url, + url=redact_url(entry.url), method=entry.save_method, path=entry.dst_path or "/", inline_data=(entry.data is not None), @@ -96,7 +97,7 @@ async def publish_data_updates(self, update: DataUpdate): logger.warning( "[{pid}] No topics were provided for the entry with url: {url}", pid=os.getpid(), - url=entry.url, + url=redact_url(entry.url), ) # publish all topics with all their sub combinations From 702187797155ef98a1a0c58a309e448780eaa997 Mon Sep 17 00:00:00 2001 From: zivxx Date: Sun, 21 Jun 2026 15:04:17 +0300 Subject: [PATCH 03/12] fix: close traceback + git-subsystem credential leaks Two leak vectors outside the model layer, found in the expert review: 1. LOG_DIAGNOSE defaulted to True. loguru's diagnose=True renders raw local variable VALUES in tracebacks (e.g. a bare headers dict, or the Bearer token returned by opa_client._get_auth_headers), bypassing the model-level repr redaction entirely. Default flipped to False (matches loguru's own production guidance); operators can still opt in for local debugging. 2. Git subsystem leaked credentials independently of the data models: - repo_cloner.py logged the raw clone URL (https://token@github.com/...) and the GitCommandError text (which embeds the failing command incl. the URL). - git_fetcher.py (scopes) logged source/remote URLs in many places. - git_utils/env.py forced GIT_TRACE=1 + GIT_CURL_VERBOSE=1, making git dump HTTP Authorization headers to captured stderr. Fix: add http_utils.redact_url_in_text(); redact every git URL log and the git error text; gate GIT_TRACE/GIT_CURL_VERBOSE behind LOG_DIAGNOSE. The functional clone_repository() call still receives the real URL. Adds redact_url_in_text tests. Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/opal-common/opal_common/config.py | 7 ++++++- .../opal-common/opal_common/git_utils/env.py | 11 +++++++--- .../opal_common/git_utils/repo_cloner.py | 13 +++++++++--- .../opal-common/opal_common/http_utils.py | 10 +++++++++ .../opal_common/tests/redaction_test.py | 11 +++++++++- .../opal-server/opal_server/git_fetcher.py | 21 ++++++++++++------- 6 files changed, 57 insertions(+), 16 deletions(-) diff --git a/packages/opal-common/opal_common/config.py b/packages/opal-common/opal_common/config.py index a65dd6211..9898c6466 100644 --- a/packages/opal-common/opal_common/config.py +++ b/packages/opal-common/opal_common/config.py @@ -34,7 +34,12 @@ class OpalCommonConfig(Confi): "LOG_TRACEBACK", True, description="Include traceback in log messages" ) LOG_DIAGNOSE = confi.bool( - "LOG_DIAGNOSE", True, description="Include diagnosis in log messages" + "LOG_DIAGNOSE", + False, + description="Include diagnosis (local variable values) in tracebacks. " + "Off by default because loguru renders raw variable values, which can " + "leak credentials (e.g. auth headers/tokens) into logs - only enable for " + "local debugging.", ) LOG_COLORIZE = confi.bool("LOG_COLORIZE", True, description="Colorize log messages") LOG_SERIALIZE = confi.bool( diff --git a/packages/opal-common/opal_common/git_utils/env.py b/packages/opal-common/opal_common/git_utils/env.py index 5123d04b6..a5de4015d 100644 --- a/packages/opal-common/opal_common/git_utils/env.py +++ b/packages/opal-common/opal_common/git_utils/env.py @@ -40,8 +40,13 @@ def provide_git_ssh_environment(url: str, ssh_key: str): if not is_ssh_repo_url(url) or ssh_key is None: return {} # no ssh config git_ssh_identity_file = save_ssh_key_to_pem_file(ssh_key) - return { + env = { "GIT_SSH_COMMAND": f"ssh -o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i {git_ssh_identity_file}", - "GIT_TRACE": "1", - "GIT_CURL_VERBOSE": "1", } + # GIT_TRACE / GIT_CURL_VERBOSE make git dump verbose protocol output to + # stderr - including HTTP Authorization headers - which OPAL captures into + # its logs. Only enable them when diagnosis logging is explicitly turned on. + if opal_common_config.LOG_DIAGNOSE: + env["GIT_TRACE"] = "1" + env["GIT_CURL_VERBOSE"] = "1" + return env diff --git a/packages/opal-common/opal_common/git_utils/repo_cloner.py b/packages/opal-common/opal_common/git_utils/repo_cloner.py index 174823aca..f1121881b 100644 --- a/packages/opal-common/opal_common/git_utils/repo_cloner.py +++ b/packages/opal-common/opal_common/git_utils/repo_cloner.py @@ -10,6 +10,7 @@ from opal_common.config import opal_common_config from opal_common.git_utils.env import provide_git_ssh_environment from opal_common.git_utils.exceptions import GitFailed +from opal_common.http_utils import redact_url, redact_url_in_text from opal_common.logger import logger from opal_common.utils import get_filepaths_with_glob from tenacity import RetryError, retry, stop, wait @@ -179,7 +180,7 @@ async def clone(self) -> CloneResult: """ logger.info( "Cloning repo from '{url}' to '{to_path}' (branch: '{branch}')", - url=self.url, + url=redact_url(self.url), to_path=self.path, branch=self.branch_name, ) @@ -196,7 +197,10 @@ def _attempt_clone_from_url(self) -> CloneResult: except (GitError, GitCommandError) as e: raise GitFailed(e) except RetryError as e: - logger.exception("cannot clone policy repo: {error}", error=e) + logger.exception( + "cannot clone policy repo: {error}", + error=redact_url_in_text(str(e), self.url), + ) raise GitFailed(e) else: logger.info("Clone succeeded", repo_path=self.path) @@ -208,5 +212,8 @@ def _clone(self, env) -> Repo: url=self.url, to_path=self.path, branch=self.branch_name, env=env ) except (GitError, GitCommandError) as e: - logger.error("cannot clone policy repo: {error}", error=e) + logger.error( + "cannot clone policy repo: {error}", + error=redact_url_in_text(str(e), self.url), + ) raise diff --git a/packages/opal-common/opal_common/http_utils.py b/packages/opal-common/opal_common/http_utils.py index 5dcf15178..67002f125 100644 --- a/packages/opal-common/opal_common/http_utils.py +++ b/packages/opal-common/opal_common/http_utils.py @@ -28,6 +28,16 @@ def redact_url(url: str) -> str: return urlunsplit((parts.scheme, netloc, parts.path, parts.query, parts.fragment)) +def redact_url_in_text(text: str, url: str) -> str: + """Replace occurrences of ``url`` (which may embed credentials) in free text + - such as a git command error message - with its redacted form, so the text + is safe to log. + """ + if not url: + return text + return text.replace(url, redact_url(url)) + + def is_http_error_response( response: Union[aiohttp.ClientResponse, httpx.Response] ) -> bool: diff --git a/packages/opal-common/opal_common/tests/redaction_test.py b/packages/opal-common/opal_common/tests/redaction_test.py index 11498e500..ba008a684 100644 --- a/packages/opal-common/opal_common/tests/redaction_test.py +++ b/packages/opal-common/opal_common/tests/redaction_test.py @@ -17,7 +17,7 @@ FastApiRpcFetchConfig, ) from opal_common.fetcher.providers.http_fetch_provider import HttpFetcherConfig -from opal_common.http_utils import redact_url +from opal_common.http_utils import redact_url, redact_url_in_text from opal_common.schemas.data import DataSourceEntry, DataUpdate SECRET_TOKEN = "Bearer super-secret-token-must-not-leak" @@ -154,3 +154,12 @@ def test_redact_url(url, expected): redacted = redact_url(url) assert redacted == expected assert "tok" not in redacted + + +def test_redact_url_in_text(): + """Git command errors embed the (credentialed) repo URL in free text.""" + url = "https://x-access-token:SECRETTOKEN@github.com/org/repo.git" + err = f"Cmd('git') failed: fatal: could not read from '{url}'" + scrubbed = redact_url_in_text(err, url) + assert "SECRETTOKEN" not in scrubbed + assert "https://***@github.com/org/repo.git" in scrubbed diff --git a/packages/opal-server/opal_server/git_fetcher.py b/packages/opal-server/opal_server/git_fetcher.py index e52083c98..f03a3b123 100644 --- a/packages/opal-server/opal_server/git_fetcher.py +++ b/packages/opal-server/opal_server/git_fetcher.py @@ -13,6 +13,7 @@ from git import Repo from opal_common.async_utils import run_sync from opal_common.git_utils.bundle_maker import BundleMaker +from opal_common.http_utils import redact_url from opal_common.logger import logger from opal_common.schemas.policy import PolicyBundle from opal_common.schemas.policy_source import ( @@ -106,10 +107,10 @@ def verify_found_repo_matches_remote( for remote in repo.remotes: if remote.url == expected_remote_url: logger.debug( - f"found target repo url is referred by remote: {remote.name}, url={remote.url}" + f"found target repo url is referred by remote: {remote.name}, url={redact_url(remote.url)}" ) return - error: str = f"Repo mismatch! No remote matches target url: {expected_remote_url}, found urls: {[remote.url for remote in repo.remotes]}" + error: str = f"Repo mismatch! No remote matches target url: {redact_url(expected_remote_url)}, found urls: {[redact_url(remote.url) for remote in repo.remotes]}" logger.error(error) raise ValueError(error) @@ -136,7 +137,7 @@ def __init__( self._remote = remote_name self._scope_id = scope_id logger.debug( - f"Initializing git fetcher: scope_id={scope_id}, url={source.url}, branch={self._source.branch}, source_id={self._source_id}" + f"Initializing git fetcher: scope_id={scope_id}, url={redact_url(source.url)}, branch={self._source.branch}, source_id={self._source_id}" ) async def _get_repo_lock(self): @@ -185,7 +186,7 @@ async def fetch_and_notify_on_changes( ) if should_fetch: logger.debug( - f"Fetching remote (force_fetch={force_fetch}): {self._remote} ({self._source.url})" + f"Fetching remote (force_fetch={force_fetch}): {self._remote} ({redact_url(self._source.url)})" ) GitPolicyFetcher.repos_last_fetched[ self._source_id @@ -194,7 +195,9 @@ async def fetch_and_notify_on_changes( repo.remotes[self._remote].fetch, callbacks=self._auth_callbacks, ) - logger.debug(f"Fetch completed: {self._source.url}") + logger.debug( + f"Fetch completed: {redact_url(self._source.url)}" + ) # New commits might be present because of a previous fetch made by another scope await self._notify_on_changes(repo) @@ -218,7 +221,7 @@ def _discover_repository(self, path: Path) -> bool: async def _clone(self): logger.info( "Cloning repo at '{url}' to '{path}'", - url=self._source.url, + url=redact_url(self._source.url), path=self._repo_path, ) try: @@ -229,9 +232,11 @@ async def _clone(self): callbacks=self._auth_callbacks, ) except pygit2.GitError: - logger.exception(f"Could not clone repo at {self._source.url}") + logger.exception( + f"Could not clone repo at {redact_url(self._source.url)}" + ) else: - logger.info(f"Clone completed: {self._source.url}") + logger.info(f"Clone completed: {redact_url(self._source.url)}") await self._notify_on_changes(repo) def _get_repo(self) -> Repository: From 9d122d21c981688db22f39d79ae28d6c92efcf3b Mon Sep 17 00:00:00 2001 From: zivxx Date: Sun, 21 Jun 2026 15:24:45 +0300 Subject: [PATCH 04/12] fix: close fetch-path URL leaks + harden redact_url (round 3 review) Second adversarial review pass found residual leaks on the ACTUAL fetch path that the first sweep missed, plus correctness gaps in redact_url: - Fetch-path URL logs now redacted: http_fetch_provider._fetch_ (logged self._url raw), opal_client/data/fetcher.handle_url (3 sites), and sources/git_policy_source remote_urls. - redact_url hardened: * fix IPv6 host (brackets were dropped, producing an invalid URL) * also mask sensitive query-string params (?token=, ?access_token=, ...), not just user:password@ userinfo * username-only userinfo is stripped; non-sensitive URLs returned unchanged - redact_url_in_text is now regex-based (scrubs any scheme://userinfo@ in the text), so it catches credentials even when git normalizes/encodes the URL differently from the value we pass; still also replaces the known URL. - LOG_DIAGNOSE description now notes it also gates git protocol tracing. - Tests: IPv6, query-param, username-only, byte-identity, regex-scrub, and env-gating (GIT_TRACE/GIT_CURL_VERBOSE) cases. - Nits: drop unused FetcherConfig import; List[str] over list[str]; black/isort. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../opal-client/opal_client/data/fetcher.py | 7 +- packages/opal-common/opal_common/config.py | 3 +- .../fetcher/providers/http_fetch_provider.py | 4 +- .../opal-common/opal_common/http_utils.py | 87 +++++++++++++++---- .../opal-common/opal_common/schemas/data.py | 1 - .../opal_common/sources/git_policy_source.py | 5 +- .../opal_common/tests/redaction_test.py | 72 +++++++++++++-- .../opal-server/opal_server/git_fetcher.py | 4 +- 8 files changed, 144 insertions(+), 39 deletions(-) diff --git a/packages/opal-client/opal_client/data/fetcher.py b/packages/opal-client/opal_client/data/fetcher.py index edb38944b..df0bba8cd 100644 --- a/packages/opal-client/opal_client/data/fetcher.py +++ b/packages/opal-client/opal_client/data/fetcher.py @@ -7,6 +7,7 @@ from opal_common.fetcher import FetchingEngine from opal_common.fetcher.events import FetcherConfig from opal_common.fetcher.providers.http_fetch_provider import HttpFetcherConfig +from opal_common.http_utils import redact_url from opal_common.logger import logger from opal_common.utils import get_authorization_header, tuple_to_dict @@ -62,20 +63,20 @@ async def handle_url( ) -> Optional[JsonableValue]: """Helper function wrapping self._engine.handle_url.""" if data is not None: - logger.info("Data provided inline for url: {url}", url=url) + logger.info("Data provided inline for url: {url}", url=redact_url(url)) return data if url is None: logger.error("Invalid data update: no embedded data or URL") return None - logger.info("Fetching data from url: {url}", url=url) + logger.info("Fetching data from url: {url}", url=redact_url(url)) try: # ask the engine to get our data response = await self._engine.handle_url(url, config=config) return response except asyncio.TimeoutError as e: - logger.exception("Timeout while fetching url: {url}", url=url) + logger.exception("Timeout while fetching url: {url}", url=redact_url(url)) raise async def handle_urls( diff --git a/packages/opal-common/opal_common/config.py b/packages/opal-common/opal_common/config.py index 9898c6466..ff6f98a64 100644 --- a/packages/opal-common/opal_common/config.py +++ b/packages/opal-common/opal_common/config.py @@ -39,7 +39,8 @@ class OpalCommonConfig(Confi): description="Include diagnosis (local variable values) in tracebacks. " "Off by default because loguru renders raw variable values, which can " "leak credentials (e.g. auth headers/tokens) into logs - only enable for " - "local debugging.", + "local debugging. Also gates verbose git protocol tracing " + "(GIT_TRACE/GIT_CURL_VERBOSE), which can leak auth headers.", ) LOG_COLORIZE = confi.bool("LOG_COLORIZE", True, description="Colorize log messages") LOG_SERIALIZE = confi.bool( diff --git a/packages/opal-common/opal_common/fetcher/providers/http_fetch_provider.py b/packages/opal-common/opal_common/fetcher/providers/http_fetch_provider.py index f9a6acaed..6fb5f658a 100644 --- a/packages/opal-common/opal_common/fetcher/providers/http_fetch_provider.py +++ b/packages/opal-common/opal_common/fetcher/providers/http_fetch_provider.py @@ -9,7 +9,7 @@ from opal_common.fetcher.events import FetcherConfig, FetchEvent from opal_common.fetcher.fetch_provider import BaseFetchProvider from opal_common.fetcher.logger import get_logger -from opal_common.http_utils import is_http_error_response +from opal_common.http_utils import is_http_error_response, redact_url from opal_common.security.sslcontext import get_custom_ssl_context from pydantic import validator @@ -95,7 +95,7 @@ async def __aexit__(self, exc_type=None, exc_val=None, tb=None): await self._session.__aexit__(exc_type, exc_val, tb) async def _fetch_(self): - logger.debug(f"{self.__class__.__name__} fetching from {self._url}") + logger.debug(f"{self.__class__.__name__} fetching from {redact_url(self._url)}") http_method = self.match_http_method_from_type( self._session, self._event.config.method ) diff --git a/packages/opal-common/opal_common/http_utils.py b/packages/opal-common/opal_common/http_utils.py index 67002f125..ba9d94dfd 100644 --- a/packages/opal-common/opal_common/http_utils.py +++ b/packages/opal-common/opal_common/http_utils.py @@ -1,41 +1,90 @@ +import re from typing import Union -from urllib.parse import urlsplit, urlunsplit +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit import aiohttp import httpx +#: Query-string parameter names whose values may carry credentials. +SENSITIVE_QUERY_PARAMS = frozenset( + { + "token", + "access_token", + "api_key", + "apikey", + "key", + "password", + "secret", + "sig", + "signature", + } +) + +#: Matches ``scheme://userinfo@`` anywhere in free text. +_USERINFO_RE = re.compile(r"(?P[a-zA-Z][a-zA-Z0-9+.\-]*://)[^/@\s]+@") + def redact_url(url: str) -> str: - """Strip any embedded credentials (``user:password@``) from a URL so it is - safe to log. + """Strip embedded credentials from a URL so it is safe to log. Data source / policy repo URLs may be of the form - ``https://user:token@host/path``. Logging them verbatim leaks the - credentials, so we replace the userinfo component with ``***`` while keeping - the rest of the URL intact for debugging. Returns the input unchanged if it - cannot be parsed or carries no credentials. + ``https://user:token@host/path`` or carry a credential in a query parameter + (e.g. ``?token=...``). We replace any ``user:password@`` userinfo with + ``***@`` and mask the values of known sensitive query parameters, while + keeping the host, port, path and non-sensitive params intact for debugging. + Returns the input byte-for-byte unchanged if it is empty, cannot be parsed, + or carries nothing sensitive. """ + if not url: + return url try: parts = urlsplit(url) except ValueError: return url - if not parts.username and not parts.password: + + changed = False + netloc = parts.netloc + if parts.username or parts.password: + host = parts.hostname or "" + if ":" in host: # IPv6 literal - urlsplit strips the surrounding brackets + host = f"[{host}]" + if parts.port is not None: + host = f"{host}:{parts.port}" + netloc = f"***@{host}" + changed = True + + query = parts.query + if query: + pairs = parse_qsl(query, keep_blank_values=True) + if any(key.lower() in SENSITIVE_QUERY_PARAMS for key, _ in pairs): + query = urlencode( + [ + (key, "***" if key.lower() in SENSITIVE_QUERY_PARAMS else value) + for key, value in pairs + ], + safe="*", + ) + changed = True + + if not changed: return url - host = parts.hostname or "" - if parts.port is not None: - host = f"{host}:{parts.port}" - netloc = f"***@{host}" - return urlunsplit((parts.scheme, netloc, parts.path, parts.query, parts.fragment)) + return urlunsplit((parts.scheme, netloc, parts.path, query, parts.fragment)) -def redact_url_in_text(text: str, url: str) -> str: - """Replace occurrences of ``url`` (which may embed credentials) in free text - - such as a git command error message - with its redacted form, so the text - is safe to log. +def redact_url_in_text(text: str, url: str = "") -> str: + """Redact embedded credentials from free text such as a git error message. + + Scrubs any ``scheme://user:password@`` userinfo found anywhere in the text + (a regex, so it is robust to the exact URL form git happens to print), and + additionally replaces verbatim occurrences of ``url`` with its fully redacted + form so query-string tokens of a known URL are masked too. """ - if not url: + if not text: return text - return text.replace(url, redact_url(url)) + scrubbed = _USERINFO_RE.sub(lambda m: f"{m.group('scheme')}***@", text) + if url: + scrubbed = scrubbed.replace(url, redact_url(url)) + return scrubbed def is_http_error_response( diff --git a/packages/opal-common/opal_common/schemas/data.py b/packages/opal-common/opal_common/schemas/data.py index 25cb3b727..e6a210d93 100644 --- a/packages/opal-common/opal_common/schemas/data.py +++ b/packages/opal-common/opal_common/schemas/data.py @@ -1,6 +1,5 @@ from typing import Any, ClassVar, Dict, List, Optional, Set, Tuple, Union -from opal_common.fetcher.events import FetcherConfig from opal_common.fetcher.providers.http_fetch_provider import HttpFetcherConfig from opal_common.logging_utils.redaction import RedactedReprMixin from opal_common.schemas.store import JSONPatchAction diff --git a/packages/opal-common/opal_common/sources/git_policy_source.py b/packages/opal-common/opal_common/sources/git_policy_source.py index 59d8adfc2..2fa31e6dc 100644 --- a/packages/opal-common/opal_common/sources/git_policy_source.py +++ b/packages/opal-common/opal_common/sources/git_policy_source.py @@ -4,6 +4,7 @@ from opal_common.git_utils.branch_tracker import BranchTracker from opal_common.git_utils.exceptions import GitFailed from opal_common.git_utils.repo_cloner import RepoCloner +from opal_common.http_utils import redact_url from opal_common.logger import logger from opal_common.sources.base_policy_source import BasePolicySource @@ -69,14 +70,14 @@ async def get_initial_policy_state_from_remote(self): # Don't bother with remove and reclone because this case shouldn't happen on reasobable usage raise GitFailed( RuntimeError( - f"Existing repo has wrong remote url: {remote_urls}" + f"Existing repo has wrong remote url: {[redact_url(u) for u in remote_urls]}" ) ) else: logger.info( "SKIPPED cloning policy repo, found existing repo at '{path}' with remotes: {remote_urls})", path=self._cloner.path, - remote_urls=remote_urls, + remote_urls=[redact_url(u) for u in remote_urls], ) except GitFailed as e: await self._on_git_failed(e) diff --git a/packages/opal-common/opal_common/tests/redaction_test.py b/packages/opal-common/opal_common/tests/redaction_test.py index ba008a684..9d45ffb56 100644 --- a/packages/opal-common/opal_common/tests/redaction_test.py +++ b/packages/opal-common/opal_common/tests/redaction_test.py @@ -10,6 +10,8 @@ untouched. """ +from typing import List + import pytest from loguru import logger from opal_common.fetcher.events import FetchEvent @@ -100,7 +102,7 @@ def test_loguru_serialize_does_not_leak(): """The actual #901 scenario: a sink with ``serialize=True`` dumps the full record (including ``extra``) as JSON. Interpolating or binding any of these models must not put the secret on the wire.""" - captured: list[str] = [] + captured: List[str] = [] sink_id = logger.add( lambda message: captured.append(str(message)), level="DEBUG", @@ -121,7 +123,7 @@ def test_loguru_diagnose_does_not_leak_model_value(): """With ``diagnose=True`` loguru renders local variable *values* in tracebacks (the #901-class vector). The model's own value must render redacted.""" - captured: list[str] = [] + captured: List[str] = [] sink_id = logger.add( lambda message: captured.append(str(message)), level="DEBUG", @@ -144,16 +146,33 @@ def test_loguru_diagnose_does_not_leak_model_value(): @pytest.mark.parametrize( "url,expected", [ + # userinfo (user:password) is stripped ("https://user:tok@host/path", "https://***@host/path"), - ("https://x-access-token:tok@github.com/o/r.git", "https://***@github.com/o/r.git"), - ("https://host:8443/p?q=1", "https://host:8443/p?q=1"), - ("http://plain.example.com/data", "http://plain.example.com/data"), + ( + "https://x-access-token:tok@github.com/o/r.git", + "https://***@github.com/o/r.git", + ), + # username-only userinfo is still stripped (the user can be a token) + ("ssh://gittok@host/repo.git", "ssh://***@host/repo.git"), + # IPv6 literal must keep its brackets and stay a valid URL + ("https://u:p@[::1]:8443/x", "https://***@[::1]:8443/x"), + # sensitive query params are masked, non-sensitive kept + ( + "https://host/p?token=secret&page=2", + "https://host/p?token=***&page=2", + ), + ("https://host/p?access_token=secret", "https://host/p?access_token=***"), + # nothing sensitive -> returned byte-for-byte unchanged + ("https://host:8443/p?q=1#frag", "https://host:8443/p?q=1#frag"), + ("http://plain.example.com/data/", "http://plain.example.com/data/"), + # scp-style git url has no parseable userinfo and no password -> untouched + ("git@github.com:org/repo.git", "git@github.com:org/repo.git"), + ("", ""), ], ) def test_redact_url(url, expected): - redacted = redact_url(url) - assert redacted == expected - assert "tok" not in redacted + # expected values never contain a secret, so equality fully validates + assert redact_url(url) == expected def test_redact_url_in_text(): @@ -163,3 +182,40 @@ def test_redact_url_in_text(): scrubbed = redact_url_in_text(err, url) assert "SECRETTOKEN" not in scrubbed assert "https://***@github.com/org/repo.git" in scrubbed + + +def test_redact_url_in_text_scrubs_creds_not_in_known_url(): + """The regex scrub must catch a credentialed URL even when it differs from + the URL we passed (e.g. git injected creds / normalized the string).""" + # self.url has no creds, but git prints a credentialed variant + text = ( + "fatal: could not read from 'https://x-access-token:LEAKED@github.com/o/r.git/'" + ) + scrubbed = redact_url_in_text(text, "https://github.com/o/r.git") + assert "LEAKED" not in scrubbed + assert "https://***@github.com/o/r.git/" in scrubbed + + +def test_git_ssh_env_gates_verbose_tracing_on_log_diagnose(tmp_path): + """GIT_TRACE/GIT_CURL_VERBOSE dump auth headers; they must only be set when + LOG_DIAGNOSE is explicitly enabled.""" + from opal_common.config import opal_common_config + from opal_common.git_utils import env + + ssh_url = "git@github.com:org/repo.git" + original_diagnose = opal_common_config.LOG_DIAGNOSE + original_key_file = opal_common_config.GIT_SSH_KEY_FILE + # point the pem-file writer at a throwaway path so we never touch a real key + opal_common_config.GIT_SSH_KEY_FILE = str(tmp_path / "id_test") + try: + opal_common_config.LOG_DIAGNOSE = False + off = env.provide_git_ssh_environment(ssh_url, "DUMMY_KEY") + assert "GIT_TRACE" not in off and "GIT_CURL_VERBOSE" not in off + assert "GIT_SSH_COMMAND" in off + + opal_common_config.LOG_DIAGNOSE = True + on = env.provide_git_ssh_environment(ssh_url, "DUMMY_KEY") + assert on.get("GIT_TRACE") == "1" and on.get("GIT_CURL_VERBOSE") == "1" + finally: + opal_common_config.LOG_DIAGNOSE = original_diagnose + opal_common_config.GIT_SSH_KEY_FILE = original_key_file diff --git a/packages/opal-server/opal_server/git_fetcher.py b/packages/opal-server/opal_server/git_fetcher.py index f03a3b123..923bc423e 100644 --- a/packages/opal-server/opal_server/git_fetcher.py +++ b/packages/opal-server/opal_server/git_fetcher.py @@ -232,9 +232,7 @@ async def _clone(self): callbacks=self._auth_callbacks, ) except pygit2.GitError: - logger.exception( - f"Could not clone repo at {redact_url(self._source.url)}" - ) + logger.exception(f"Could not clone repo at {redact_url(self._source.url)}") else: logger.info(f"Clone completed: {redact_url(self._source.url)}") await self._notify_on_changes(repo) From 0bfacbebc72a47617fe785140938f1bb9054c2cb Mon Sep 17 00:00:00 2001 From: zivxx Date: Sun, 21 Jun 2026 15:32:18 +0300 Subject: [PATCH 05/12] style: apply docformatter to redaction_test docstrings The docformatter pre-commit hook (v1.7.5) reformats docstrings: capitalizes the summary line and splits summary from body. Pure formatting, no logic change. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../opal_common/tests/redaction_test.py | 25 +++++++++++++------ 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/packages/opal-common/opal_common/tests/redaction_test.py b/packages/opal-common/opal_common/tests/redaction_test.py index 9d45ffb56..03f24e61f 100644 --- a/packages/opal-common/opal_common/tests/redaction_test.py +++ b/packages/opal-common/opal_common/tests/redaction_test.py @@ -26,7 +26,8 @@ def _models(): - """One instance of each credential-bearing model, all carrying SECRET_TOKEN.""" + """One instance of each credential-bearing model, all carrying + SECRET_TOKEN.""" return { "HttpFetcherConfig": HttpFetcherConfig( headers={"Authorization": SECRET_TOKEN}, data={"payload": SECRET_TOKEN} @@ -53,7 +54,7 @@ def _models(): @pytest.mark.parametrize("name,model", list(_models().items())) def test_repr_and_str_redact_secrets(name, model): - """repr() / str() (used by f-strings and ``"{}".format``) must never expose + """Repr() / str() (used by f-strings and ``"{}".format``) must never expose the secret, and must show it was redacted.""" assert SECRET_TOKEN not in repr(model), f"secret leaked in repr({name})" assert SECRET_TOKEN not in str(model), f"secret leaked in str({name})" @@ -76,8 +77,11 @@ def test_wire_serialization_is_not_redacted(): def test_nested_container_models_redact_transitively(): """The objects actually published on the wire are containers like - ``DataUpdate`` that hold ``DataSourceEntry`` objects. They carry no mixin - themselves but must still redact via nested repr.""" + ``DataUpdate`` that hold ``DataSourceEntry`` objects. + + They carry no mixin themselves but must still redact via nested + repr. + """ update = DataUpdate( reason="test", entries=[ @@ -100,8 +104,11 @@ def test_nested_container_models_redact_transitively(): def test_loguru_serialize_does_not_leak(): """The actual #901 scenario: a sink with ``serialize=True`` dumps the full - record (including ``extra``) as JSON. Interpolating or binding any of these - models must not put the secret on the wire.""" + record (including ``extra``) as JSON. + + Interpolating or binding any of these models must not put the secret + on the wire. + """ captured: List[str] = [] sink_id = logger.add( lambda message: captured.append(str(message)), @@ -121,8 +128,10 @@ def test_loguru_serialize_does_not_leak(): def test_loguru_diagnose_does_not_leak_model_value(): """With ``diagnose=True`` loguru renders local variable *values* in - tracebacks (the #901-class vector). The model's own value must render - redacted.""" + tracebacks (the #901-class vector). + + The model's own value must render redacted. + """ captured: List[str] = [] sink_id = logger.add( lambda message: captured.append(str(message)), From 51786adecc7c8b837bc828249d448cbf725d461b Mon Sep 17 00:00:00 2001 From: zivxx Date: Tue, 23 Jun 2026 13:22:27 +0300 Subject: [PATCH 06/12] fix(http-utils): harden redact_url/redact_url_in_text against raise, ordering, and fragment leaks - redact_url no longer raises from a log/except path: accessing parts.username/.password/.port is what actually validates a URL (urlsplit is lazy), so an out-of-range port raised ValueError straight out of an except block. Guard the credential-detection block. - redact_url_in_text now does the known-URL replacement *before* the userinfo regex; previously the regex rewrote user:pw@ -> ***@ first, destroying the verbatim URL so its query token was never masked. - redact_url now masks sensitive params in the URL *fragment* too, and masks query/fragment values in-place so untouched params keep their exact original encoding (no urlencode %20 -> + normalization). Addresses review comments: - https://github.com/permitio/opal/pull/921#discussion_r3448558075 (@zeevmoney) - https://github.com/permitio/opal/pull/921#discussion_r3448558097 (@zeevmoney) - https://github.com/permitio/opal/pull/921#discussion_r3448558193 (@zeevmoney) Co-Authored-By: Claude Opus 4.8 (1M context) --- .../opal-common/opal_common/http_utils.py | 90 ++++++++++++------- .../opal_common/tests/redaction_test.py | 29 ++++++ 2 files changed, 87 insertions(+), 32 deletions(-) diff --git a/packages/opal-common/opal_common/http_utils.py b/packages/opal-common/opal_common/http_utils.py index ba9d94dfd..ae3d20e6d 100644 --- a/packages/opal-common/opal_common/http_utils.py +++ b/packages/opal-common/opal_common/http_utils.py @@ -1,6 +1,6 @@ import re -from typing import Union -from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit +from typing import Tuple, Union +from urllib.parse import urlsplit, urlunsplit import aiohttp import httpx @@ -24,16 +24,40 @@ _USERINFO_RE = re.compile(r"(?P[a-zA-Z][a-zA-Z0-9+.\-]*://)[^/@\s]+@") +def _mask_sensitive_params(component: str) -> Tuple[str, bool]: + """Mask values of known sensitive params in a raw query/fragment component. + + Operates directly on the raw ``key=value&...`` text (rather than + ``parse_qsl`` + ``urlencode``) so the encoding of untouched params is + preserved byte-for-byte - only the sensitive values are replaced with + ``***``. Returns the (possibly rewritten) component and whether anything + changed. + """ + changed = False + out = [] + for pair in component.split("&"): + key, sep, _ = pair.partition("=") + if sep and key.lower() in SENSITIVE_QUERY_PARAMS: + out.append(f"{key}=***") + changed = True + else: + out.append(pair) + return "&".join(out), changed + + def redact_url(url: str) -> str: """Strip embedded credentials from a URL so it is safe to log. Data source / policy repo URLs may be of the form ``https://user:token@host/path`` or carry a credential in a query parameter (e.g. ``?token=...``). We replace any ``user:password@`` userinfo with - ``***@`` and mask the values of known sensitive query parameters, while - keeping the host, port, path and non-sensitive params intact for debugging. - Returns the input byte-for-byte unchanged if it is empty, cannot be parsed, - or carries nothing sensitive. + ``***@`` and mask the values of known sensitive query (and fragment) + parameters, while keeping the host, port, path and non-sensitive params + intact for debugging. Returns the input byte-for-byte unchanged if it is + empty, cannot be parsed, or carries nothing sensitive. + + Never raises: this helper is called from log/except paths, so any parsing + error yields the input unchanged rather than propagating. """ if not url: return url @@ -44,46 +68,48 @@ def redact_url(url: str) -> str: changed = False netloc = parts.netloc - if parts.username or parts.password: - host = parts.hostname or "" - if ":" in host: # IPv6 literal - urlsplit strips the surrounding brackets - host = f"[{host}]" - if parts.port is not None: - host = f"{host}:{parts.port}" - netloc = f"***@{host}" - changed = True - - query = parts.query - if query: - pairs = parse_qsl(query, keep_blank_values=True) - if any(key.lower() in SENSITIVE_QUERY_PARAMS for key, _ in pairs): - query = urlencode( - [ - (key, "***" if key.lower() in SENSITIVE_QUERY_PARAMS else value) - for key, value in pairs - ], - safe="*", - ) + try: + if parts.username or parts.password: + host = parts.hostname or "" + if ":" in host: # IPv6 literal - urlsplit strips the surrounding brackets + host = f"[{host}]" + if parts.port is not None: + host = f"{host}:{parts.port}" + netloc = f"***@{host}" changed = True + except ValueError: + # ``urlsplit`` is lazy - accessing ``.username``/``.password``/``.port`` + # is what actually validates and can raise (e.g. "Port out of range"). + # We must never throw from a log path, so bail out unchanged. + return url + + query, query_changed = _mask_sensitive_params(parts.query) + fragment, fragment_changed = _mask_sensitive_params(parts.fragment) + changed = changed or query_changed or fragment_changed if not changed: return url - return urlunsplit((parts.scheme, netloc, parts.path, query, parts.fragment)) + return urlunsplit((parts.scheme, netloc, parts.path, query, fragment)) def redact_url_in_text(text: str, url: str = "") -> str: """Redact embedded credentials from free text such as a git error message. - Scrubs any ``scheme://user:password@`` userinfo found anywhere in the text - (a regex, so it is robust to the exact URL form git happens to print), and - additionally replaces verbatim occurrences of ``url`` with its fully redacted - form so query-string tokens of a known URL are masked too. + Replaces verbatim occurrences of a known ``url`` with its fully redacted + form (so query-string tokens of that URL are masked too), then scrubs any + remaining ``scheme://user:password@`` userinfo found anywhere in the text + (a regex, so it is robust to the exact URL form git happens to print). + + The known-URL replacement runs *first*: the regex rewrites + ``user:pw@`` -> ``***@``, which would otherwise destroy the verbatim ``url`` + before its query tokens could be masked. """ if not text: return text - scrubbed = _USERINFO_RE.sub(lambda m: f"{m.group('scheme')}***@", text) + scrubbed = text if url: scrubbed = scrubbed.replace(url, redact_url(url)) + scrubbed = _USERINFO_RE.sub(lambda m: f"{m.group('scheme')}***@", scrubbed) return scrubbed diff --git a/packages/opal-common/opal_common/tests/redaction_test.py b/packages/opal-common/opal_common/tests/redaction_test.py index 03f24e61f..0b56c92bf 100644 --- a/packages/opal-common/opal_common/tests/redaction_test.py +++ b/packages/opal-common/opal_common/tests/redaction_test.py @@ -171,6 +171,14 @@ def test_loguru_diagnose_does_not_leak_model_value(): "https://host/p?token=***&page=2", ), ("https://host/p?access_token=secret", "https://host/p?access_token=***"), + # sensitive params in the *fragment* are masked too + ("https://host/p#access_token=secret", "https://host/p#access_token=***"), + # non-sensitive params keep their exact original encoding (no urlencode + # round-trip normalizing "%20" -> "+") + ( + "https://host/p?name=John%20Doe&token=secret", + "https://host/p?name=John%20Doe&token=***", + ), # nothing sensitive -> returned byte-for-byte unchanged ("https://host:8443/p?q=1#frag", "https://host:8443/p?q=1#frag"), ("http://plain.example.com/data/", "http://plain.example.com/data/"), @@ -184,6 +192,27 @@ def test_redact_url(url, expected): assert redact_url(url) == expected +def test_redact_url_never_raises_on_malformed_url(): + """redact_url is called from log/except paths and must never throw - e.g. an + out-of-range port only raises when .port is accessed (urlsplit is lazy).""" + bad = "https://u:p@host:99999999999/x" + # would raise ValueError("Port out of range") without the guard + assert redact_url(bad) == bad + + +def test_redact_url_in_text_masks_query_token_with_userinfo(): + """When a known URL carries *both* userinfo and a sensitive query param, + the query token must still be masked (regression: the userinfo regex used + to run first and destroy the verbatim URL before the query scrub could + match).""" + url = "https://user:SECRETTOK@host/path?token=SECRETTOK2" + text = "fatal: could not read from '" + url + "'" + scrubbed = redact_url_in_text(text, url) + assert "SECRETTOK" not in scrubbed + assert "SECRETTOK2" not in scrubbed + assert "https://***@host/path?token=***" in scrubbed + + def test_redact_url_in_text(): """Git command errors embed the (credentialed) repo URL in free text.""" url = "https://x-access-token:SECRETTOKEN@github.com/org/repo.git" From 08c354722dea2dc4755e803e57382b0734226681 Mon Sep 17 00:00:00 2001 From: zivxx Date: Tue, 23 Jun 2026 13:22:55 +0300 Subject: [PATCH 07/12] fix(redaction): mask url field of credential-bearing models in repr/str DataSourceEntry.url and FetchEvent.url can embed credentials (user:token@host or ?token=...), but RedactedReprMixin only masked config/data - so whole-model logging (repr/str, loguru serialize) still leaked the URL. Add a generic _redacted_url_fields set to the mixin that runs the value through redact_url (keeps host/path visible, unlike blanket masking), unioned across the MRO like the existing set, and declare {"url"} on both models. Wire serialization (.dict()/.json()) is untouched. Addresses review comments: - https://github.com/permitio/opal/pull/921#discussion_r3448558124 (@zeevmoney) - https://github.com/permitio/opal/pull/921#discussion_r3448558140 (@zeevmoney) Co-Authored-By: Claude Opus 4.8 (1M context) --- .../opal-common/opal_common/fetcher/events.py | 3 ++ .../opal_common/logging_utils/redaction.py | 50 +++++++++++++------ .../opal-common/opal_common/schemas/data.py | 3 ++ .../opal_common/tests/redaction_test.py | 34 +++++++++++++ 4 files changed, 76 insertions(+), 14 deletions(-) diff --git a/packages/opal-common/opal_common/fetcher/events.py b/packages/opal-common/opal_common/fetcher/events.py index d80ac3a11..51e4e930b 100644 --- a/packages/opal-common/opal_common/fetcher/events.py +++ b/packages/opal-common/opal_common/fetcher/events.py @@ -29,6 +29,9 @@ class FetchEvent(RedactedReprMixin, BaseModel): # ``config`` may carry a FetcherConfig with credentials - mask it in repr/str. _redacted_repr_fields: ClassVar[Set[str]] = {"config"} + # ``url`` can embed credentials (``user:token@host`` / ``?token=``); strip + # them via redact_url while keeping host/path visible for debugging. + _redacted_url_fields: ClassVar[Set[str]] = {"url"} # Event id to be filled by the engine id: str = None diff --git a/packages/opal-common/opal_common/logging_utils/redaction.py b/packages/opal-common/opal_common/logging_utils/redaction.py index 698129e2b..01c64dc26 100644 --- a/packages/opal-common/opal_common/logging_utils/redaction.py +++ b/packages/opal-common/opal_common/logging_utils/redaction.py @@ -1,12 +1,22 @@ from typing import ClassVar, Set +from opal_common.http_utils import redact_url + class RedactedReprMixin: """Mixin for pydantic (v1) models that may carry credentials. - Overrides ``repr()`` / ``str()`` so that any field listed in - ``_redacted_repr_fields`` (on this class or *any* of its base classes) - renders as ```` instead of its real value. + Overrides ``repr()`` / ``str()`` so that sensitive fields are masked instead + of rendering their real value. Two flavours of masking are supported: + + - ``_redacted_repr_fields`` - the value is wholly replaced with + ```` (use for opaque secret carriers such as ``config``/``data``). + - ``_redacted_url_fields`` - the value is passed through ``redact_url`` so + embedded credentials are stripped while the host/path stay visible for + debugging (use for URL fields, which can embed ``user:token@`` or + ``?token=`` credentials but are otherwise useful to see). + + Both apply on this class or *any* of its base classes. This is the central defense against credential leaks in logs: OPAL logs via loguru, and most leaks come from interpolating these models into log @@ -15,8 +25,8 @@ class RedactedReprMixin: ``str()`` for non-JSON objects such as pydantic models. Masking once at the model layer protects every such log site at once. - The redaction set is the **union** of ``_redacted_repr_fields`` across the - whole MRO, so a subclass can only ever *add* fields to redact - it can never + Each redaction set is the **union** of its declarations across the whole + MRO, so a subclass can only ever *add* fields to redact - it can never accidentally drop protection inherited from a parent. Each subclass that introduces a new secret-bearing field must still list it here. @@ -30,14 +40,26 @@ class RedactedReprMixin: #: Field names whose values may carry secrets and must be masked in repr/str. _redacted_repr_fields: ClassVar[Set[str]] = set() + #: URL field names whose embedded credentials must be stripped via redact_url. + _redacted_url_fields: ClassVar[Set[str]] = set() - def __repr_args__(self): + @classmethod + def _union_over_mro(cls, attr: str) -> Set[str]: # Union across the MRO so subclasses are additive (never lose a parent's - # protection, even if they redeclare ``_redacted_repr_fields``). - redacted: Set[str] = set() - for klass in type(self).__mro__: - redacted |= klass.__dict__.get("_redacted_repr_fields", set()) - return [ - (key, "" if key in redacted else value) - for key, value in super().__repr_args__() - ] + # protection, even if they redeclare the set). + result: Set[str] = set() + for klass in cls.__mro__: + result |= klass.__dict__.get(attr, set()) + return result + + def __repr_args__(self): + redacted = type(self)._union_over_mro("_redacted_repr_fields") + url_redacted = type(self)._union_over_mro("_redacted_url_fields") + args = [] + for key, value in super().__repr_args__(): + if key in redacted: + value = "" + elif key in url_redacted and isinstance(value, str): + value = redact_url(value) + args.append((key, value)) + return args diff --git a/packages/opal-common/opal_common/schemas/data.py b/packages/opal-common/opal_common/schemas/data.py index e6a210d93..82c6232ed 100644 --- a/packages/opal-common/opal_common/schemas/data.py +++ b/packages/opal-common/opal_common/schemas/data.py @@ -20,6 +20,9 @@ class DataSourceEntry(RedactedReprMixin, BaseModel): # ``data`` an inline payload - mask both in repr/str so they never leak into # logs (entries are frequently interpolated into log messages). _redacted_repr_fields: ClassVar[Set[str]] = {"config", "data"} + # ``url`` can embed credentials (``user:token@host`` / ``?token=``); strip + # them via redact_url while keeping host/path visible for debugging. + _redacted_url_fields: ClassVar[Set[str]] = {"url"} @validator("data") def validate_save_method(cls, value, values): diff --git a/packages/opal-common/opal_common/tests/redaction_test.py b/packages/opal-common/opal_common/tests/redaction_test.py index 0b56c92bf..e7e4a5790 100644 --- a/packages/opal-common/opal_common/tests/redaction_test.py +++ b/packages/opal-common/opal_common/tests/redaction_test.py @@ -257,3 +257,37 @@ def test_git_ssh_env_gates_verbose_tracing_on_log_diagnose(tmp_path): finally: opal_common_config.LOG_DIAGNOSE = original_diagnose opal_common_config.GIT_SSH_KEY_FILE = original_key_file + + +_URL_SECRET = "url-secret-must-not-leak" + + +@pytest.mark.parametrize( + "model", + [ + DataSourceEntry( + url=f"https://user:{_URL_SECRET}@host/path?token={_URL_SECRET}", + save_method="PUT", + ), + FetchEvent( + fetcher="HttpFetchProvider", + url=f"https://user:{_URL_SECRET}@host/path?token={_URL_SECRET}", + ), + ], +) +def test_url_field_credentials_redacted_in_repr(model): + """The ``url`` field can embed credentials; repr/str must strip them (via + redact_url) while keeping the host/path visible for debugging.""" + rendered = repr(model) + assert _URL_SECRET not in rendered + assert _URL_SECRET not in str(model) + # host/path stay visible (we redact, not blanket-mask, urls) + assert "***@host/path" in rendered + + +def test_url_field_wire_serialization_keeps_real_url(): + """Redaction is log-only: ``.dict()`` must keep the real (credentialed) url + so fetching still works.""" + url = f"https://user:{_URL_SECRET}@host/path" + entry = DataSourceEntry(url=url, save_method="PUT") + assert entry.dict()["url"] == url From 7cedb07a5589d1427a26763e79ed5602d8811bcb Mon Sep 17 00:00:00 2001 From: zivxx Date: Tue, 23 Jun 2026 13:23:12 +0300 Subject: [PATCH 08/12] fix(logs): redact credentialed URLs across log sites (in-diff and beyond) Wrap raw URL log/exception sites with redact_url / redact_url_in_text so a credentialed data-source, policy-repo or callback URL (user:token@host or ?token=...) never reaches the logs: In-diff inline review: - fastapi_rpc_fetch_provider: redact self._url in the fetch log line. - updater._fetch_data: drop the raw {exc} binding (logger.exception already records the traceback; str(e) on a 3rd-party error can carry a credentialed URL and would serialize verbatim under LOG_SERIALIZE) and redact the re-raised message. - git_policy_source: drop a stray ")" in the "found existing repo" log format. Out-of-diff sites (PR-level review): - api_policy_source: redact the 404 bundle URL (log + HTTPException detail) and the "Fetching changes from remote" log. - scopes/service + scopes/loader: redact source.url / POLICY_REPO_URL. - updater: redact entry.url passed to _update_remote_status (it is persisted into the transaction log) and the error strings; stop logging the raw inline data payload in calc_hash; redact the data-sources config URL. - callbacks/reporter: redact the user-supplied callback URLs. Addresses review comments: - https://github.com/permitio/opal/pull/921#discussion_r3448497185 (@copilot-pull-request-reviewer) - https://github.com/permitio/opal/pull/921#discussion_r3448497191 (@copilot-pull-request-reviewer) - https://github.com/permitio/opal/pull/921#discussion_r3448497173 (@copilot-pull-request-reviewer) - https://github.com/permitio/opal/pull/921#issuecomment-4762263235 (@zeevmoney) Co-Authored-By: Claude Opus 4.8 (1M context) --- .../opal_client/callbacks/reporter.py | 11 ++--- .../opal-client/opal_client/data/updater.py | 45 ++++++++++++++----- .../providers/fastapi_rpc_fetch_provider.py | 3 +- .../opal_common/sources/api_policy_source.py | 7 +-- .../opal_common/sources/git_policy_source.py | 2 +- .../opal-server/opal_server/scopes/loader.py | 3 +- .../opal-server/opal_server/scopes/service.py | 3 +- 7 files changed, 50 insertions(+), 24 deletions(-) diff --git a/packages/opal-client/opal_client/callbacks/reporter.py b/packages/opal-client/opal_client/callbacks/reporter.py index c2122d732..163c4ad96 100644 --- a/packages/opal-client/opal_client/callbacks/reporter.py +++ b/packages/opal-client/opal_client/callbacks/reporter.py @@ -5,7 +5,7 @@ from opal_client.callbacks.register import CallbackConfig, CallbacksRegister from opal_client.data.fetcher import DataFetcher from opal_common.fetcher.providers.http_fetch_provider import HttpFetcherConfig -from opal_common.http_utils import is_http_error_response +from opal_common.http_utils import is_http_error_response, redact_url from opal_common.logger import logger from opal_common.schemas.data import DataUpdateReport @@ -60,8 +60,9 @@ async def report_update_results( config.data = report_data callback_requests.append((url, config, None)) - # log only the URLs — FetcherConfig may carry Authorization headers and the full report payload - urls = [request[0] for request in callback_requests] + # log only the URLs — FetcherConfig may carry Authorization headers and the full report payload. + # Callback URLs are user-supplied and can embed credentials, so redact them too. + urls = [redact_url(request[0]) for request in callback_requests] logger.info("Reporting the update to requested callbacks", urls=urls) report_results = await self._fetcher.handle_urls(callback_requests) # log reports which we failed to send @@ -69,7 +70,7 @@ async def report_update_results( if isinstance(result, Exception): logger.error( "Failed to send report to {url}, info={exc_info}", - url=url, + url=redact_url(url), exc_info=repr(result), ) if isinstance( @@ -83,7 +84,7 @@ async def report_update_results( error_content = await result.text() logger.error( "Failed to send report to {url}, got response code {status} with error: {error}", - url=url, + url=redact_url(url), status=result.status, error=error_content, ) diff --git a/packages/opal-client/opal_client/data/updater.py b/packages/opal-client/opal_client/data/updater.py index a62449575..8fedbf71a 100644 --- a/packages/opal-client/opal_client/data/updater.py +++ b/packages/opal-client/opal_client/data/updater.py @@ -26,7 +26,11 @@ ) from opal_common.async_utils import TasksPool, repeated_call from opal_common.config import opal_common_config -from opal_common.http_utils import is_http_error_response, redact_url +from opal_common.http_utils import ( + is_http_error_response, + redact_url, + redact_url_in_text, +) from opal_common.schemas.data import ( DataEntryReport, DataSourceConfig, @@ -227,7 +231,10 @@ async def get_policy_data_config(self, url: str = None) -> DataSourceConfig: """ if url is None: url = self._data_sources_config_url - logger.info("Getting data-sources configuration from '{source}'", source=url) + logger.info( + "Getting data-sources configuration from '{source}'", + source=redact_url(url), + ) try: async with ClientSession( @@ -446,8 +453,10 @@ def calc_hash(data: JsonableValue) -> str: if not isinstance(data, str): data = json.dumps(data, default=pydantic_encoder) return hashlib.sha256(data.encode("utf-8")).hexdigest() - except Exception as e: - logger.exception(f"Failed to calculate hash for data {data}: {e}") + except Exception: + # Don't interpolate ``data`` - it may be an inline credential-bearing + # payload. logger.exception already records the traceback. + logger.exception("Failed to calculate hash for data") return "" async def _update_policy_data(self, update: DataUpdate) -> None: @@ -549,26 +558,34 @@ async def _fetch_and_save_data( try: result = await self._fetch_data(entry) except Exception as e: + # url is persisted into the transaction log and the OPA-failed error + # log; redact it (and the exception text) - the functional fetch + # already used the real entry.url above. store_transaction._update_remote_status( - url=entry.url, status=False, error=str(e) + url=redact_url(entry.url), + status=False, + error=redact_url_in_text(str(e), entry.url), ) return DataEntryReport(entry=entry, fetched=False, saved=False) try: await self._store_fetched_data(entry, result, store_transaction) except Exception as e: - logger.exception("Failed to save data update to policy-store: {exc}", exc=e) + # logger.exception already records the traceback; don't bind the raw + # exception (str(e) can carry a credentialed URL). + logger.exception("Failed to save data update to policy-store") store_transaction._update_remote_status( - url=entry.url, + url=redact_url(entry.url), status=False, - error=f"Failed to save data to policy store: {e}", + error=f"Failed to save data to policy store: " + f"{redact_url_in_text(str(e), entry.url)}", ) return DataEntryReport( entry=entry, hash=self.calc_hash(result), fetched=True, saved=False ) else: store_transaction._update_remote_status( - url=entry.url, status=True, error="" + url=redact_url(entry.url), status=True, error="" ) return DataEntryReport( entry=entry, hash=self.calc_hash(result), fetched=True, saved=True @@ -591,13 +608,17 @@ async def _fetch_data(self, entry: DataSourceEntry) -> JsonableValue: data=entry.data, ) except Exception as e: + # logger.exception already records the active traceback; don't bind + # the raw exception into the message - str(e) on a 3rd-party error + # (e.g. aiohttp) can carry a credentialed URL and would be serialized + # verbatim under LOG_SERIALIZE. logger.exception( - "Failed to fetch data for entry url {url} with exception {exc}", + "Failed to fetch data for entry url {url}", url=redact_url(entry.url), - exc=e, ) raise Exception( - f"Failed to fetch data for entry {redact_url(entry.url)}: {e}" + f"Failed to fetch data for entry {redact_url(entry.url)}: " + f"{redact_url_in_text(str(e), entry.url)}" ) if result is None: diff --git a/packages/opal-common/opal_common/fetcher/providers/fastapi_rpc_fetch_provider.py b/packages/opal-common/opal_common/fetcher/providers/fastapi_rpc_fetch_provider.py index 09b640687..027a71deb 100644 --- a/packages/opal-common/opal_common/fetcher/providers/fastapi_rpc_fetch_provider.py +++ b/packages/opal-common/opal_common/fetcher/providers/fastapi_rpc_fetch_provider.py @@ -7,6 +7,7 @@ from opal_common.fetcher.events import FetcherConfig, FetchEvent from opal_common.fetcher.fetch_provider import BaseFetchProvider from opal_common.fetcher.logger import get_logger +from opal_common.http_utils import redact_url logger = get_logger("rpc_fetch_provider") @@ -45,7 +46,7 @@ async def _fetch_(self): result = None # Note: ``args`` (rpc_arguments) may carry credentials - never log it. logger.info( - f"{self.__class__.__name__} fetching from {self._url} with RPC call {method}" + f"{self.__class__.__name__} fetching from {redact_url(self._url)} with RPC call {method}" ) async with WebSocketRpcClient( self._url, diff --git a/packages/opal-common/opal_common/sources/api_policy_source.py b/packages/opal-common/opal_common/sources/api_policy_source.py index 312db4b11..b549d45b3 100644 --- a/packages/opal-common/opal_common/sources/api_policy_source.py +++ b/packages/opal-common/opal_common/sources/api_policy_source.py @@ -9,6 +9,7 @@ from opal_common.git_utils.tar_file_to_local_git_extractor import ( TarFileToLocalGitExtractor, ) +from opal_common.http_utils import redact_url from opal_common.logger import logger from opal_common.sources.base_policy_source import BasePolicySource from opal_common.utils import ( @@ -188,11 +189,11 @@ async def fetch_policy_bundle_from_api_source( if response.status == status.HTTP_404_NOT_FOUND: logger.warning( "requested url not found: {full_url}", - full_url=full_url, + full_url=redact_url(full_url), ) raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, - detail=f"requested url not found: {full_url}", + detail=f"requested url not found: {redact_url(full_url)}", ) if response.status == status.HTTP_304_NOT_MODIFIED: logger.info( @@ -263,7 +264,7 @@ async def check_for_changes(self): """ logger.info( "Fetching changes from remote: '{remote}'", - remote=self.remote_source_url, + remote=redact_url(self.remote_source_url), ) ( has_changes, diff --git a/packages/opal-common/opal_common/sources/git_policy_source.py b/packages/opal-common/opal_common/sources/git_policy_source.py index 2fa31e6dc..87611b1de 100644 --- a/packages/opal-common/opal_common/sources/git_policy_source.py +++ b/packages/opal-common/opal_common/sources/git_policy_source.py @@ -75,7 +75,7 @@ async def get_initial_policy_state_from_remote(self): ) else: logger.info( - "SKIPPED cloning policy repo, found existing repo at '{path}' with remotes: {remote_urls})", + "SKIPPED cloning policy repo, found existing repo at '{path}' with remotes: {remote_urls}", path=self._cloner.path, remote_urls=[redact_url(u) for u in remote_urls], ) diff --git a/packages/opal-server/opal_server/scopes/loader.py b/packages/opal-server/opal_server/scopes/loader.py index 485d57489..a50dc9631 100644 --- a/packages/opal-server/opal_server/scopes/loader.py +++ b/packages/opal-server/opal_server/scopes/loader.py @@ -1,3 +1,4 @@ +from opal_common.http_utils import redact_url from opal_common.logger import logger from opal_common.schemas.policy_source import ( GitPolicyScopeSource, @@ -21,7 +22,7 @@ async def _load_env_scope(repo: ScopeRepository): if opal_server_config.POLICY_REPO_URL is not None: logger.info( "Adding default scope from env: {url}", - url=opal_server_config.POLICY_REPO_URL, + url=redact_url(opal_server_config.POLICY_REPO_URL), ) auth = NoAuthData() diff --git a/packages/opal-server/opal_server/scopes/service.py b/packages/opal-server/opal_server/scopes/service.py index d3df4972f..637e9f6c5 100644 --- a/packages/opal-server/opal_server/scopes/service.py +++ b/packages/opal-server/opal_server/scopes/service.py @@ -8,6 +8,7 @@ from ddtrace import tracer from fastapi_websocket_pubsub import PubSubEndpoint from opal_common.git_utils.commit_viewer import VersionedFile +from opal_common.http_utils import redact_url from opal_common.logger import logger from opal_common.schemas.policy import PolicyUpdateMessageNotification from opal_common.schemas.policy_source import GitPolicyScopeSource @@ -130,7 +131,7 @@ async def sync_scope( source = cast(GitPolicyScopeSource, scope.policy) logger.debug( - f"Sync scope: {scope.scope_id} (remote: {source.url}, branch: {source.branch}, req_time: {req_time})" + f"Sync scope: {scope.scope_id} (remote: {redact_url(source.url)}, branch: {source.branch}, req_time: {req_time})" ) callbacks = PolicyFetcherCallbacks() From e593e784af096e82ab43a7e37855f0fb2dd953f6 Mon Sep 17 00:00:00 2001 From: zivxx Date: Tue, 23 Jun 2026 13:23:32 +0300 Subject: [PATCH 09/12] test(redaction): give the loguru-diagnose test teeth The diagnose test raised a bare ValueError("boom") that never referenced the model, so loguru's diagnose never rendered it - the assertion held even for an unprotected model (a tautology). Raise ValueError(entry) so the whole model is rendered in the traceback. Deliberately do NOT reach into entry.config: loguru diagnose evaluates that sub-expression and renders the raw dict, the one vector the mixin documents it cannot cover (verified - the reviewer's suggested entry.config[...] variant leaks). Addresses review comments: - https://github.com/permitio/opal/pull/921#discussion_r3448558152 (@zeevmoney) Co-Authored-By: Claude Opus 4.8 (1M context) --- .../opal-common/opal_common/tests/redaction_test.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/packages/opal-common/opal_common/tests/redaction_test.py b/packages/opal-common/opal_common/tests/redaction_test.py index e7e4a5790..27b26d21a 100644 --- a/packages/opal-common/opal_common/tests/redaction_test.py +++ b/packages/opal-common/opal_common/tests/redaction_test.py @@ -140,9 +140,16 @@ def test_loguru_diagnose_does_not_leak_model_value(): diagnose=True, ) try: - entry = _models()["DataSourceEntry"] # noqa: F841 - referenced in traceback + entry = _models()["DataSourceEntry"] try: - raise ValueError("boom") + # The raising line must *reference* ``entry`` (the whole model) so + # loguru's diagnose renders its repr in the traceback - otherwise the + # test is a tautology (an unprotected model would pass too). We must + # NOT reach into a credential field here (e.g. ``entry.config[...]``): + # diagnose evaluates that sub-expression and renders the raw dict, + # which the model-layer mixin documents it cannot protect. Rendering + # ``entry`` exercises exactly what the mixin defends. + raise ValueError(entry) except ValueError: logger.exception("failed while handling entry") finally: From 18df6ce874c9f3d1d8e126e1ff372af8d9dc15a7 Mon Sep 17 00:00:00 2001 From: zivxx Date: Tue, 23 Jun 2026 13:23:54 +0300 Subject: [PATCH 10/12] docs(logging): correct SSH-only rationale for git trace gating provide_git_ssh_environment early-returns for non-SSH URLs, so it only ever runs for SSH clones - which use key auth, not HTTP Authorization headers. The comment (and the LOG_DIAGNOSE docstring) claimed gating GIT_TRACE/ GIT_CURL_VERBOSE here closes an HTTP Authorization-header leak; it does not (that path is HTTPS-only and is not handled here). Correct the comment, the config docstring and the test docstring to state the real value: reducing verbose SSH protocol/host disclosure in logs. Addresses review comments: - https://github.com/permitio/opal/pull/921#discussion_r3448558164 (@zeevmoney) Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/opal-common/opal_common/config.py | 5 +++-- packages/opal-common/opal_common/git_utils/env.py | 11 ++++++++--- .../opal-common/opal_common/tests/redaction_test.py | 5 +++-- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/packages/opal-common/opal_common/config.py b/packages/opal-common/opal_common/config.py index ff6f98a64..aa79967f1 100644 --- a/packages/opal-common/opal_common/config.py +++ b/packages/opal-common/opal_common/config.py @@ -39,8 +39,9 @@ class OpalCommonConfig(Confi): description="Include diagnosis (local variable values) in tracebacks. " "Off by default because loguru renders raw variable values, which can " "leak credentials (e.g. auth headers/tokens) into logs - only enable for " - "local debugging. Also gates verbose git protocol tracing " - "(GIT_TRACE/GIT_CURL_VERBOSE), which can leak auth headers.", + "local debugging. Also gates verbose git SSH protocol tracing " + "(GIT_TRACE/GIT_CURL_VERBOSE on SSH clones), which adds noisy protocol/" + "host disclosure to logs.", ) LOG_COLORIZE = confi.bool("LOG_COLORIZE", True, description="Colorize log messages") LOG_SERIALIZE = confi.bool( diff --git a/packages/opal-common/opal_common/git_utils/env.py b/packages/opal-common/opal_common/git_utils/env.py index a5de4015d..ea12d9ca1 100644 --- a/packages/opal-common/opal_common/git_utils/env.py +++ b/packages/opal-common/opal_common/git_utils/env.py @@ -43,9 +43,14 @@ def provide_git_ssh_environment(url: str, ssh_key: str): env = { "GIT_SSH_COMMAND": f"ssh -o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i {git_ssh_identity_file}", } - # GIT_TRACE / GIT_CURL_VERBOSE make git dump verbose protocol output to - # stderr - including HTTP Authorization headers - which OPAL captures into - # its logs. Only enable them when diagnosis logging is explicitly turned on. + # This function only runs for SSH clones (non-SSH urls early-return above), + # so the value of gating GIT_TRACE / GIT_CURL_VERBOSE here is reducing the + # verbose SSH protocol noise / host disclosure git dumps to stderr (which + # OPAL captures into its logs) - not closing an HTTP Authorization-header + # leak, since SSH uses key auth, not HTTP headers. (HTTPS clones, where + # GIT_CURL_VERBOSE would dump Authorization headers, never enter this path; + # that would have to be handled where the HTTPS clone env is built.) + # Only enable verbose tracing when diagnosis logging is explicitly turned on. if opal_common_config.LOG_DIAGNOSE: env["GIT_TRACE"] = "1" env["GIT_CURL_VERBOSE"] = "1" diff --git a/packages/opal-common/opal_common/tests/redaction_test.py b/packages/opal-common/opal_common/tests/redaction_test.py index 27b26d21a..22274504e 100644 --- a/packages/opal-common/opal_common/tests/redaction_test.py +++ b/packages/opal-common/opal_common/tests/redaction_test.py @@ -242,8 +242,9 @@ def test_redact_url_in_text_scrubs_creds_not_in_known_url(): def test_git_ssh_env_gates_verbose_tracing_on_log_diagnose(tmp_path): - """GIT_TRACE/GIT_CURL_VERBOSE dump auth headers; they must only be set when - LOG_DIAGNOSE is explicitly enabled.""" + """On SSH clones GIT_TRACE/GIT_CURL_VERBOSE add verbose protocol/host + disclosure to logs; they must only be set when LOG_DIAGNOSE is explicitly + enabled.""" from opal_common.config import opal_common_config from opal_common.git_utils import env From 9c73269e4f9057fae20de4645eb9dd7665492388 Mon Sep 17 00:00:00 2001 From: zivxx Date: Tue, 23 Jun 2026 13:24:37 +0300 Subject: [PATCH 11/12] feat(logging): scrub credentials at the loguru sink + LOG_DIAGNOSE note Third-party exceptions (most notably aiohttp ClientResponseError) render the full request URL - https://user:pw@host/path?token=... - inside the traceback loguru formats. The model-layer redaction can't reach those (the leaking object isn't an OPAL model), and this fires under default config (backtrace=True). Close the vector at the sink layer: - CredentialScrubbingStream wraps the stdout/stderr sink so the final formatted record (message + exception traceback, and the JSON blob under serialize=True) is run through redact_url_in_text before write. - A loguru patcher additionally scrubs record["message"] for every record, so the optional rotating file sink (which is path-based and can't be wrapped) still gets message-level scrubbing without losing rotation/retention. - Document that OPAL_LOG_DIAGNOSE now defaults to False (operators upgrading opt back in via OPAL_LOG_DIAGNOSE=true). Addresses review comments: - https://github.com/permitio/opal/pull/921#issuecomment-4762263235 (@zeevmoney) Co-Authored-By: Claude Opus 4.8 (1M context) --- .../docs/getting-started/configuration.mdx | 11 ++++- packages/opal-common/opal_common/logger.py | 26 +++++++++++- .../opal_common/logging_utils/scrubbing.py | 40 +++++++++++++++++++ .../opal_common/tests/redaction_test.py | 36 +++++++++++++++++ 4 files changed, 110 insertions(+), 3 deletions(-) create mode 100644 packages/opal-common/opal_common/logging_utils/scrubbing.py diff --git a/documentation/docs/getting-started/configuration.mdx b/documentation/docs/getting-started/configuration.mdx index d71278338..322456468 100644 --- a/documentation/docs/getting-started/configuration.mdx +++ b/documentation/docs/getting-started/configuration.mdx @@ -220,12 +220,19 @@ Whether to include traceback information in log messages. #### OPAL_LOG_DIAGNOSE -Default: `True` +Default: `False` -Include diagnosis in log messages. +Include diagnosis (local variable values) in tracebacks. Also gates verbose git +SSH protocol tracing (`GIT_TRACE`/`GIT_CURL_VERBOSE`). _Added in OPAL v0.6.0_ +:::caution Default changed +The default is now `False` (previously `True`). Operators upgrading lose +diagnostic tracebacks and verbose git tracing unless they opt back in with +`OPAL_LOG_DIAGNOSE=true`. +::: + :::danger When `OPAL_LOG_DIAGNOSE` is enabled, logs may contain sensitive information such as secrets. ::: diff --git a/packages/opal-common/opal_common/logger.py b/packages/opal-common/opal_common/logger.py index 6e2bc948f..ed8ab4bf6 100644 --- a/packages/opal-common/opal_common/logger.py +++ b/packages/opal-common/opal_common/logger.py @@ -3,13 +3,30 @@ from loguru import logger from opal_common.config import opal_common_config +from opal_common.http_utils import redact_url_in_text from opal_common.logging_utils.filter import ModuleFilter from opal_common.logging_utils.formatter import Formatter from opal_common.logging_utils.intercept import InterceptHandler +from opal_common.logging_utils.scrubbing import CredentialScrubbingStream from opal_common.logging_utils.thirdparty import hijack_uvicorn_logs from opal_common.monitoring.apm import fix_ddtrace_logging +def _scrub_record_message(record): + """loguru patcher: scrub embedded URL credentials from every record's + message before it reaches any sink. + + Runs once per record (order-independent across sinks), so it also protects + the optional rotating file sink, which is path-based and cannot be + stream-wrapped. The stream sink is *additionally* wrapped with + ``CredentialScrubbingStream`` to scrub rendered exception tracebacks, which + this message-only pass does not touch. + """ + message = record.get("message") + if message: + record["message"] = redact_url_in_text(message) + + def configure_logs(): """Takeover process logs and create a logger with Loguru according to the configuration.""" @@ -28,10 +45,17 @@ def configure_logs(): hijack_uvicorn_logs(intercept_handler) # Clean slate logger.remove() + # Scrub credentialed URLs from every record's message, for all sinks. + logger.configure(patcher=_scrub_record_message) # Logger configuration pipe = sys.stderr if opal_common_config.LOG_PIPE_TO_STDERR else sys.stdout + # Wrap the stream so embedded URL credentials are scrubbed from the final + # formatted record - including third-party exception tracebacks (e.g. aiohttp + # rendering ``https://user:pw@host?token=...``) that the model-layer + # redaction cannot reach. Message-level scrubbing for all sinks (incl. the + # rotating file sink, which cannot be wrapped) is handled by the patcher above. logger.add( - pipe, + CredentialScrubbingStream(pipe), filter=filter.filter, format=formatter.format, level=opal_common_config.LOG_LEVEL, diff --git a/packages/opal-common/opal_common/logging_utils/scrubbing.py b/packages/opal-common/opal_common/logging_utils/scrubbing.py new file mode 100644 index 000000000..983915510 --- /dev/null +++ b/packages/opal-common/opal_common/logging_utils/scrubbing.py @@ -0,0 +1,40 @@ +from opal_common.http_utils import redact_url_in_text + + +class CredentialScrubbingStream: + """A writable-stream wrapper that scrubs embedded URL credentials from every + fully-formatted log record before it reaches the underlying stream. + + This is the catch-all that the model-layer redaction (``RedactedReprMixin``) + cannot provide: third-party exceptions - most notably aiohttp + ``ClientResponseError`` - render the full request URL + (``https://user:pw@host/path?token=...``) inside the traceback that loguru + formats into the ``{exception}`` slot. The leaking object is not an OPAL + model, so masking ``repr()`` does nothing for it. + + Because loguru hands a stream sink the *final* formatted text (message + + traceback, and the JSON blob when ``serialize=True``), scrubbing on + ``write`` covers every record shape on this sink. ``redact_url_in_text`` is + idempotent, so the explicit call-site redaction upstream is unaffected. + + Note: this wraps a *stream* sink. loguru's path-based file sink (with + rotation/retention) opens the file itself and cannot be wrapped this way - + message-level scrubbing for that sink is handled in ``Formatter.format``. + """ + + def __init__(self, stream): + self._stream = stream + + def write(self, message: str): + return self._stream.write(redact_url_in_text(message)) + + def flush(self): + # loguru flushes the sink after each record when possible. + flush = getattr(self._stream, "flush", None) + if callable(flush): + flush() + + def __getattr__(self, name): + # Delegate everything else (isatty/fileno/encoding/...) to the wrapped + # stream so loguru treats this exactly like the underlying stream. + return getattr(self._stream, name) diff --git a/packages/opal-common/opal_common/tests/redaction_test.py b/packages/opal-common/opal_common/tests/redaction_test.py index 22274504e..2db5f7df4 100644 --- a/packages/opal-common/opal_common/tests/redaction_test.py +++ b/packages/opal-common/opal_common/tests/redaction_test.py @@ -299,3 +299,39 @@ def test_url_field_wire_serialization_keeps_real_url(): url = f"https://user:{_URL_SECRET}@host/path" entry = DataSourceEntry(url=url, save_method="PUT") assert entry.dict()["url"] == url + + +def test_scrubbing_stream_redacts_thirdparty_exception_traceback(): + """The model-layer mixin cannot mask a third-party exception (e.g. aiohttp) + that renders a credentialed URL in its traceback. + + The stream sink wrapper scrubs the fully-formatted record, closing + that vector. + """ + import io + + from opal_common.logging_utils.scrubbing import CredentialScrubbingStream + + buf = io.StringIO() + sink_id = logger.add( + CredentialScrubbingStream(buf), + level="DEBUG", + backtrace=True, + diagnose=False, + colorize=False, + format="{message}\n{exception}", + ) + try: + url = "https://user:THIRDPARTYSECRET@host/path" + try: + # str() of this exception carries the credentialed url, like a real + # aiohttp ClientResponseError would. + raise RuntimeError(f"connection failed for {url}") + except RuntimeError: + logger.exception("fetch failed") + finally: + logger.remove(sink_id) + + out = buf.getvalue() + assert "THIRDPARTYSECRET" not in out, "secret leaked through exception traceback" + assert "***@host/path" in out From d1136ec3cb93e45353723a585b053819b20f2e8b Mon Sep 17 00:00:00 2001 From: zivxx Date: Wed, 24 Jun 2026 03:16:56 +0300 Subject: [PATCH 12/12] style(logging): satisfy docformatter on scrubbing.py docstring CI fixes: - pre-commit / docformatter: reflow the CredentialScrubbingStream summary line so it conforms to docformatter (new file was missed locally because it was untracked when the formatters ran). Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/opal-common/opal_common/logging_utils/scrubbing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/opal-common/opal_common/logging_utils/scrubbing.py b/packages/opal-common/opal_common/logging_utils/scrubbing.py index 983915510..0772f8602 100644 --- a/packages/opal-common/opal_common/logging_utils/scrubbing.py +++ b/packages/opal-common/opal_common/logging_utils/scrubbing.py @@ -2,8 +2,8 @@ class CredentialScrubbingStream: - """A writable-stream wrapper that scrubs embedded URL credentials from every - fully-formatted log record before it reaches the underlying stream. + """A writable-stream wrapper that scrubs embedded URL credentials from + every fully-formatted log record before it reaches the underlying stream. This is the catch-all that the model-layer redaction (``RedactedReprMixin``) cannot provide: third-party exceptions - most notably aiohttp