diff --git a/documentation/docs/getting-started/configuration.mdx b/documentation/docs/getting-started/configuration.mdx index 79d09b7cc..e40638b56 100644 --- a/documentation/docs/getting-started/configuration.mdx +++ b/documentation/docs/getting-started/configuration.mdx @@ -220,12 +220,19 @@ Whether to include traceback information in log messages. #### OPAL_LOG_DIAGNOSE -Default: `True` +Default: `False` -Include diagnosis in log messages. +Include diagnosis (local variable values) in tracebacks. Also gates verbose git +SSH protocol tracing (`GIT_TRACE`/`GIT_CURL_VERBOSE`). _Added in OPAL v0.6.0_ +:::caution Default changed +The default is now `False` (previously `True`). Operators upgrading lose +diagnostic tracebacks and verbose git tracing unless they opt back in with +`OPAL_LOG_DIAGNOSE=true`. +::: + :::danger When `OPAL_LOG_DIAGNOSE` is enabled, logs may contain sensitive information such as secrets. ::: diff --git a/packages/opal-client/opal_client/callbacks/reporter.py b/packages/opal-client/opal_client/callbacks/reporter.py index c2122d732..163c4ad96 100644 --- a/packages/opal-client/opal_client/callbacks/reporter.py +++ b/packages/opal-client/opal_client/callbacks/reporter.py @@ -5,7 +5,7 @@ from opal_client.callbacks.register import CallbackConfig, CallbacksRegister from opal_client.data.fetcher import DataFetcher from opal_common.fetcher.providers.http_fetch_provider import HttpFetcherConfig -from opal_common.http_utils import is_http_error_response +from opal_common.http_utils import is_http_error_response, redact_url from opal_common.logger import logger from opal_common.schemas.data import DataUpdateReport @@ -60,8 +60,9 @@ async def report_update_results( config.data = report_data callback_requests.append((url, config, None)) - # log only the URLs — FetcherConfig may carry Authorization headers and the full report payload - urls = [request[0] for request in callback_requests] + # log only the URLs — FetcherConfig may carry Authorization headers and the full report payload. + # Callback URLs are user-supplied and can embed credentials, so redact them too. + urls = [redact_url(request[0]) for request in callback_requests] logger.info("Reporting the update to requested callbacks", urls=urls) report_results = await self._fetcher.handle_urls(callback_requests) # log reports which we failed to send @@ -69,7 +70,7 @@ async def report_update_results( if isinstance(result, Exception): logger.error( "Failed to send report to {url}, info={exc_info}", - url=url, + url=redact_url(url), exc_info=repr(result), ) if isinstance( @@ -83,7 +84,7 @@ async def report_update_results( error_content = await result.text() logger.error( "Failed to send report to {url}, got response code {status} with error: {error}", - url=url, + url=redact_url(url), status=result.status, error=error_content, ) diff --git a/packages/opal-client/opal_client/data/fetcher.py b/packages/opal-client/opal_client/data/fetcher.py index edb38944b..df0bba8cd 100644 --- a/packages/opal-client/opal_client/data/fetcher.py +++ b/packages/opal-client/opal_client/data/fetcher.py @@ -7,6 +7,7 @@ from opal_common.fetcher import FetchingEngine from opal_common.fetcher.events import FetcherConfig from opal_common.fetcher.providers.http_fetch_provider import HttpFetcherConfig +from opal_common.http_utils import redact_url from opal_common.logger import logger from opal_common.utils import get_authorization_header, tuple_to_dict @@ -62,20 +63,20 @@ async def handle_url( ) -> Optional[JsonableValue]: """Helper function wrapping self._engine.handle_url.""" if data is not None: - logger.info("Data provided inline for url: {url}", url=url) + logger.info("Data provided inline for url: {url}", url=redact_url(url)) return data if url is None: logger.error("Invalid data update: no embedded data or URL") return None - logger.info("Fetching data from url: {url}", url=url) + logger.info("Fetching data from url: {url}", url=redact_url(url)) try: # ask the engine to get our data response = await self._engine.handle_url(url, config=config) return response except asyncio.TimeoutError as e: - logger.exception("Timeout while fetching url: {url}", url=url) + logger.exception("Timeout while fetching url: {url}", url=redact_url(url)) raise async def handle_urls( diff --git a/packages/opal-client/opal_client/data/updater.py b/packages/opal-client/opal_client/data/updater.py index 3563b40ab..8fedbf71a 100644 --- a/packages/opal-client/opal_client/data/updater.py +++ b/packages/opal-client/opal_client/data/updater.py @@ -26,7 +26,11 @@ ) from opal_common.async_utils import TasksPool, repeated_call from opal_common.config import opal_common_config -from opal_common.http_utils import is_http_error_response +from opal_common.http_utils import ( + is_http_error_response, + redact_url, + redact_url_in_text, +) from opal_common.schemas.data import ( DataEntryReport, DataSourceConfig, @@ -227,7 +231,10 @@ async def get_policy_data_config(self, url: str = None) -> DataSourceConfig: """ if url is None: url = self._data_sources_config_url - logger.info("Getting data-sources configuration from '{source}'", source=url) + logger.info( + "Getting data-sources configuration from '{source}'", + source=redact_url(url), + ) try: async with ClientSession( @@ -446,8 +453,10 @@ def calc_hash(data: JsonableValue) -> str: if not isinstance(data, str): data = json.dumps(data, default=pydantic_encoder) return hashlib.sha256(data.encode("utf-8")).hexdigest() - except Exception as e: - logger.exception(f"Failed to calculate hash for data {data}: {e}") + except Exception: + # Don't interpolate ``data`` - it may be an inline credential-bearing + # payload. logger.exception already records the traceback. + logger.exception("Failed to calculate hash for data") return "" async def _update_policy_data(self, update: DataUpdate) -> None: @@ -475,14 +484,17 @@ async def _update_policy_data(self, update: DataUpdate) -> None: for entry in update.entries: if not entry.topics: - logger.debug("Data entry {entry} has no topics, skipping", entry=entry) + logger.debug( + "Data entry for url {url} has no topics, skipping", + url=redact_url(entry.url), + ) continue # Only process entries that match one of our subscribed data topics if set(entry.topics).isdisjoint(set(self._data_topics)): logger.debug( - "Data entry {entry} has no topics matching the data topics, skipping", - entry=entry, + "Data entry for url {url} has no topics matching the data topics, skipping", + url=redact_url(entry.url), ) continue @@ -546,26 +558,34 @@ async def _fetch_and_save_data( try: result = await self._fetch_data(entry) except Exception as e: + # url is persisted into the transaction log and the OPA-failed error + # log; redact it (and the exception text) - the functional fetch + # already used the real entry.url above. store_transaction._update_remote_status( - url=entry.url, status=False, error=str(e) + url=redact_url(entry.url), + status=False, + error=redact_url_in_text(str(e), entry.url), ) return DataEntryReport(entry=entry, fetched=False, saved=False) try: await self._store_fetched_data(entry, result, store_transaction) except Exception as e: - logger.exception("Failed to save data update to policy-store: {exc}", exc=e) + # logger.exception already records the traceback; don't bind the raw + # exception (str(e) can carry a credentialed URL). + logger.exception("Failed to save data update to policy-store") store_transaction._update_remote_status( - url=entry.url, + url=redact_url(entry.url), status=False, - error=f"Failed to save data to policy store: {e}", + error=f"Failed to save data to policy store: " + f"{redact_url_in_text(str(e), entry.url)}", ) return DataEntryReport( entry=entry, hash=self.calc_hash(result), fetched=True, saved=False ) else: store_transaction._update_remote_status( - url=entry.url, status=True, error="" + url=redact_url(entry.url), status=True, error="" ) return DataEntryReport( entry=entry, hash=self.calc_hash(result), fetched=True, saved=True @@ -588,15 +608,21 @@ async def _fetch_data(self, entry: DataSourceEntry) -> JsonableValue: data=entry.data, ) except Exception as e: + # logger.exception already records the active traceback; don't bind + # the raw exception into the message - str(e) on a 3rd-party error + # (e.g. aiohttp) can carry a credentialed URL and would be serialized + # verbatim under LOG_SERIALIZE. logger.exception( - "Failed to fetch data for entry {entry} with exception {exc}", - entry=entry, - exc=e, + "Failed to fetch data for entry url {url}", + url=redact_url(entry.url), + ) + raise Exception( + f"Failed to fetch data for entry {redact_url(entry.url)}: " + f"{redact_url_in_text(str(e), entry.url)}" ) - raise Exception(f"Failed to fetch data for entry {entry.url}: {e}") if result is None: - raise Exception(f"Fetched data is empty for entry {entry.url}") + raise Exception(f"Fetched data is empty for entry {redact_url(entry.url)}") if isinstance(result, aiohttp.ClientResponse) and is_http_error_response( result @@ -604,12 +630,12 @@ async def _fetch_data(self, entry: DataSourceEntry) -> JsonableValue: error_content = await result.text() logger.error( "Failed to decode response from url: '{url}', got response code {status} with response: {error}", - url=entry.url, + url=redact_url(entry.url), status=result.status, error=error_content, ) raise Exception( - f"Failed to decode response from url: '{entry.url}', got response code {result.status} with response: {error_content}" + f"Failed to decode response from url: '{redact_url(entry.url)}', got response code {result.status} with response: {error_content}" ) return result diff --git a/packages/opal-common/opal_common/config.py b/packages/opal-common/opal_common/config.py index a65dd6211..aa79967f1 100644 --- a/packages/opal-common/opal_common/config.py +++ b/packages/opal-common/opal_common/config.py @@ -34,7 +34,14 @@ class OpalCommonConfig(Confi): "LOG_TRACEBACK", True, description="Include traceback in log messages" ) LOG_DIAGNOSE = confi.bool( - "LOG_DIAGNOSE", True, description="Include diagnosis in log messages" + "LOG_DIAGNOSE", + False, + description="Include diagnosis (local variable values) in tracebacks. " + "Off by default because loguru renders raw variable values, which can " + "leak credentials (e.g. auth headers/tokens) into logs - only enable for " + "local debugging. Also gates verbose git SSH protocol tracing " + "(GIT_TRACE/GIT_CURL_VERBOSE on SSH clones), which adds noisy protocol/" + "host disclosure to logs.", ) LOG_COLORIZE = confi.bool("LOG_COLORIZE", True, description="Colorize log messages") LOG_SERIALIZE = confi.bool( diff --git a/packages/opal-common/opal_common/fetcher/events.py b/packages/opal-common/opal_common/fetcher/events.py index 878a56d79..51e4e930b 100644 --- a/packages/opal-common/opal_common/fetcher/events.py +++ b/packages/opal-common/opal_common/fetcher/events.py @@ -1,13 +1,18 @@ -from typing import List, Optional +from typing import ClassVar, List, Optional, Set +from opal_common.logging_utils.redaction import RedactedReprMixin from pydantic import BaseModel, Field -class FetcherConfig(BaseModel): +class FetcherConfig(RedactedReprMixin, BaseModel): """The configuration of a fetcher, used as part of a FetchEvent Fetch - Provider's have their own uniqueue events and configurations. + Provider's have their own unique events and configurations. Configurations + + Note: subclasses commonly carry credentials (e.g. auth headers). They must + list any secret-bearing fields in ``_redacted_repr_fields`` so they never + leak into logs - see ``RedactedReprMixin``. """ fetcher: Optional[str] = Field( @@ -16,12 +21,18 @@ class FetcherConfig(BaseModel): ) -class FetchEvent(BaseModel): +class FetchEvent(RedactedReprMixin, BaseModel): """Event used to describe an queue fetching tasks Design note - By using a Pydantic model - we can create a potentially transfer FetchEvents to be handled by other network nodes (perhaps via RPC) """ + # ``config`` may carry a FetcherConfig with credentials - mask it in repr/str. + _redacted_repr_fields: ClassVar[Set[str]] = {"config"} + # ``url`` can embed credentials (``user:token@host`` / ``?token=``); strip + # them via redact_url while keeping host/path visible for debugging. + _redacted_url_fields: ClassVar[Set[str]] = {"url"} + # Event id to be filled by the engine id: str = None # optional name of the specific event diff --git a/packages/opal-common/opal_common/fetcher/providers/fastapi_rpc_fetch_provider.py b/packages/opal-common/opal_common/fetcher/providers/fastapi_rpc_fetch_provider.py index 4b574a8ea..027a71deb 100644 --- a/packages/opal-common/opal_common/fetcher/providers/fastapi_rpc_fetch_provider.py +++ b/packages/opal-common/opal_common/fetcher/providers/fastapi_rpc_fetch_provider.py @@ -1,10 +1,13 @@ """Simple HTTP get data fetcher using requests supports.""" +from typing import ClassVar, Set + from fastapi_websocket_rpc.rpc_methods import RpcMethodsBase from fastapi_websocket_rpc.websocket_rpc_client import WebSocketRpcClient from opal_common.fetcher.events import FetcherConfig, FetchEvent from opal_common.fetcher.fetch_provider import BaseFetchProvider from opal_common.fetcher.logger import get_logger +from opal_common.http_utils import redact_url logger = get_logger("rpc_fetch_provider") @@ -12,6 +15,9 @@ class FastApiRpcFetchConfig(FetcherConfig): """Config for FastApiRpcFetchConfig's Adding HTTP headers.""" + # ``rpc_arguments`` may carry credentials - mask it in repr/str. + _redacted_repr_fields: ClassVar[Set[str]] = {"rpc_arguments"} + rpc_method_name: str rpc_arguments: dict @@ -38,8 +44,9 @@ async def _fetch_(self): args = self._event.config.rpc_arguments method = self._event.config.rpc_method_name result = None + # Note: ``args`` (rpc_arguments) may carry credentials - never log it. logger.info( - f"{self.__class__.__name__} fetching from {self._url} with RPC call {method}({args})" + f"{self.__class__.__name__} fetching from {redact_url(self._url)} with RPC call {method}" ) async with WebSocketRpcClient( self._url, diff --git a/packages/opal-common/opal_common/fetcher/providers/http_fetch_provider.py b/packages/opal-common/opal_common/fetcher/providers/http_fetch_provider.py index 80ac50202..6fb5f658a 100644 --- a/packages/opal-common/opal_common/fetcher/providers/http_fetch_provider.py +++ b/packages/opal-common/opal_common/fetcher/providers/http_fetch_provider.py @@ -1,7 +1,7 @@ """Simple HTTP get data fetcher using requests supports.""" from enum import Enum -from typing import Any, Union, cast +from typing import Any, ClassVar, Set, Union, cast import httpx from aiohttp import ClientResponse, ClientSession, ClientTimeout @@ -9,7 +9,7 @@ from opal_common.fetcher.events import FetcherConfig, FetchEvent from opal_common.fetcher.fetch_provider import BaseFetchProvider from opal_common.fetcher.logger import get_logger -from opal_common.http_utils import is_http_error_response +from opal_common.http_utils import is_http_error_response, redact_url from opal_common.security.sslcontext import get_custom_ssl_context from pydantic import validator @@ -28,6 +28,10 @@ class HttpMethods(Enum): class HttpFetcherConfig(FetcherConfig): """Config for HttpFetchProvider's Adding HTTP headers.""" + # ``headers`` carries Authorization tokens and ``data`` the (possibly + # sensitive) payload - mask both in repr/str so they never leak into logs. + _redacted_repr_fields: ClassVar[Set[str]] = {"headers", "data"} + headers: dict = None is_json: bool = True process_data: bool = True @@ -91,7 +95,7 @@ async def __aexit__(self, exc_type=None, exc_val=None, tb=None): await self._session.__aexit__(exc_type, exc_val, tb) async def _fetch_(self): - logger.debug(f"{self.__class__.__name__} fetching from {self._url}") + logger.debug(f"{self.__class__.__name__} fetching from {redact_url(self._url)}") http_method = self.match_http_method_from_type( self._session, self._event.config.method ) diff --git a/packages/opal-common/opal_common/git_utils/env.py b/packages/opal-common/opal_common/git_utils/env.py index 5123d04b6..ea12d9ca1 100644 --- a/packages/opal-common/opal_common/git_utils/env.py +++ b/packages/opal-common/opal_common/git_utils/env.py @@ -40,8 +40,18 @@ def provide_git_ssh_environment(url: str, ssh_key: str): if not is_ssh_repo_url(url) or ssh_key is None: return {} # no ssh config git_ssh_identity_file = save_ssh_key_to_pem_file(ssh_key) - return { + env = { "GIT_SSH_COMMAND": f"ssh -o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i {git_ssh_identity_file}", - "GIT_TRACE": "1", - "GIT_CURL_VERBOSE": "1", } + # This function only runs for SSH clones (non-SSH urls early-return above), + # so the value of gating GIT_TRACE / GIT_CURL_VERBOSE here is reducing the + # verbose SSH protocol noise / host disclosure git dumps to stderr (which + # OPAL captures into its logs) - not closing an HTTP Authorization-header + # leak, since SSH uses key auth, not HTTP headers. (HTTPS clones, where + # GIT_CURL_VERBOSE would dump Authorization headers, never enter this path; + # that would have to be handled where the HTTPS clone env is built.) + # Only enable verbose tracing when diagnosis logging is explicitly turned on. + if opal_common_config.LOG_DIAGNOSE: + env["GIT_TRACE"] = "1" + env["GIT_CURL_VERBOSE"] = "1" + return env diff --git a/packages/opal-common/opal_common/git_utils/repo_cloner.py b/packages/opal-common/opal_common/git_utils/repo_cloner.py index 174823aca..f1121881b 100644 --- a/packages/opal-common/opal_common/git_utils/repo_cloner.py +++ b/packages/opal-common/opal_common/git_utils/repo_cloner.py @@ -10,6 +10,7 @@ from opal_common.config import opal_common_config from opal_common.git_utils.env import provide_git_ssh_environment from opal_common.git_utils.exceptions import GitFailed +from opal_common.http_utils import redact_url, redact_url_in_text from opal_common.logger import logger from opal_common.utils import get_filepaths_with_glob from tenacity import RetryError, retry, stop, wait @@ -179,7 +180,7 @@ async def clone(self) -> CloneResult: """ logger.info( "Cloning repo from '{url}' to '{to_path}' (branch: '{branch}')", - url=self.url, + url=redact_url(self.url), to_path=self.path, branch=self.branch_name, ) @@ -196,7 +197,10 @@ def _attempt_clone_from_url(self) -> CloneResult: except (GitError, GitCommandError) as e: raise GitFailed(e) except RetryError as e: - logger.exception("cannot clone policy repo: {error}", error=e) + logger.exception( + "cannot clone policy repo: {error}", + error=redact_url_in_text(str(e), self.url), + ) raise GitFailed(e) else: logger.info("Clone succeeded", repo_path=self.path) @@ -208,5 +212,8 @@ def _clone(self, env) -> Repo: url=self.url, to_path=self.path, branch=self.branch_name, env=env ) except (GitError, GitCommandError) as e: - logger.error("cannot clone policy repo: {error}", error=e) + logger.error( + "cannot clone policy repo: {error}", + error=redact_url_in_text(str(e), self.url), + ) raise diff --git a/packages/opal-common/opal_common/http_utils.py b/packages/opal-common/opal_common/http_utils.py index 9c2d35a76..ae3d20e6d 100644 --- a/packages/opal-common/opal_common/http_utils.py +++ b/packages/opal-common/opal_common/http_utils.py @@ -1,8 +1,117 @@ -from typing import Union +import re +from typing import Tuple, Union +from urllib.parse import urlsplit, urlunsplit import aiohttp import httpx +#: Query-string parameter names whose values may carry credentials. +SENSITIVE_QUERY_PARAMS = frozenset( + { + "token", + "access_token", + "api_key", + "apikey", + "key", + "password", + "secret", + "sig", + "signature", + } +) + +#: Matches ``scheme://userinfo@`` anywhere in free text. +_USERINFO_RE = re.compile(r"(?P[a-zA-Z][a-zA-Z0-9+.\-]*://)[^/@\s]+@") + + +def _mask_sensitive_params(component: str) -> Tuple[str, bool]: + """Mask values of known sensitive params in a raw query/fragment component. + + Operates directly on the raw ``key=value&...`` text (rather than + ``parse_qsl`` + ``urlencode``) so the encoding of untouched params is + preserved byte-for-byte - only the sensitive values are replaced with + ``***``. Returns the (possibly rewritten) component and whether anything + changed. + """ + changed = False + out = [] + for pair in component.split("&"): + key, sep, _ = pair.partition("=") + if sep and key.lower() in SENSITIVE_QUERY_PARAMS: + out.append(f"{key}=***") + changed = True + else: + out.append(pair) + return "&".join(out), changed + + +def redact_url(url: str) -> str: + """Strip embedded credentials from a URL so it is safe to log. + + Data source / policy repo URLs may be of the form + ``https://user:token@host/path`` or carry a credential in a query parameter + (e.g. ``?token=...``). We replace any ``user:password@`` userinfo with + ``***@`` and mask the values of known sensitive query (and fragment) + parameters, while keeping the host, port, path and non-sensitive params + intact for debugging. Returns the input byte-for-byte unchanged if it is + empty, cannot be parsed, or carries nothing sensitive. + + Never raises: this helper is called from log/except paths, so any parsing + error yields the input unchanged rather than propagating. + """ + if not url: + return url + try: + parts = urlsplit(url) + except ValueError: + return url + + changed = False + netloc = parts.netloc + try: + if parts.username or parts.password: + host = parts.hostname or "" + if ":" in host: # IPv6 literal - urlsplit strips the surrounding brackets + host = f"[{host}]" + if parts.port is not None: + host = f"{host}:{parts.port}" + netloc = f"***@{host}" + changed = True + except ValueError: + # ``urlsplit`` is lazy - accessing ``.username``/``.password``/``.port`` + # is what actually validates and can raise (e.g. "Port out of range"). + # We must never throw from a log path, so bail out unchanged. + return url + + query, query_changed = _mask_sensitive_params(parts.query) + fragment, fragment_changed = _mask_sensitive_params(parts.fragment) + changed = changed or query_changed or fragment_changed + + if not changed: + return url + return urlunsplit((parts.scheme, netloc, parts.path, query, fragment)) + + +def redact_url_in_text(text: str, url: str = "") -> str: + """Redact embedded credentials from free text such as a git error message. + + Replaces verbatim occurrences of a known ``url`` with its fully redacted + form (so query-string tokens of that URL are masked too), then scrubs any + remaining ``scheme://user:password@`` userinfo found anywhere in the text + (a regex, so it is robust to the exact URL form git happens to print). + + The known-URL replacement runs *first*: the regex rewrites + ``user:pw@`` -> ``***@``, which would otherwise destroy the verbatim ``url`` + before its query tokens could be masked. + """ + if not text: + return text + scrubbed = text + if url: + scrubbed = scrubbed.replace(url, redact_url(url)) + scrubbed = _USERINFO_RE.sub(lambda m: f"{m.group('scheme')}***@", scrubbed) + return scrubbed + def is_http_error_response( response: Union[aiohttp.ClientResponse, httpx.Response] diff --git a/packages/opal-common/opal_common/logger.py b/packages/opal-common/opal_common/logger.py index 6e2bc948f..ed8ab4bf6 100644 --- a/packages/opal-common/opal_common/logger.py +++ b/packages/opal-common/opal_common/logger.py @@ -3,13 +3,30 @@ from loguru import logger from opal_common.config import opal_common_config +from opal_common.http_utils import redact_url_in_text from opal_common.logging_utils.filter import ModuleFilter from opal_common.logging_utils.formatter import Formatter from opal_common.logging_utils.intercept import InterceptHandler +from opal_common.logging_utils.scrubbing import CredentialScrubbingStream from opal_common.logging_utils.thirdparty import hijack_uvicorn_logs from opal_common.monitoring.apm import fix_ddtrace_logging +def _scrub_record_message(record): + """loguru patcher: scrub embedded URL credentials from every record's + message before it reaches any sink. + + Runs once per record (order-independent across sinks), so it also protects + the optional rotating file sink, which is path-based and cannot be + stream-wrapped. The stream sink is *additionally* wrapped with + ``CredentialScrubbingStream`` to scrub rendered exception tracebacks, which + this message-only pass does not touch. + """ + message = record.get("message") + if message: + record["message"] = redact_url_in_text(message) + + def configure_logs(): """Takeover process logs and create a logger with Loguru according to the configuration.""" @@ -28,10 +45,17 @@ def configure_logs(): hijack_uvicorn_logs(intercept_handler) # Clean slate logger.remove() + # Scrub credentialed URLs from every record's message, for all sinks. + logger.configure(patcher=_scrub_record_message) # Logger configuration pipe = sys.stderr if opal_common_config.LOG_PIPE_TO_STDERR else sys.stdout + # Wrap the stream so embedded URL credentials are scrubbed from the final + # formatted record - including third-party exception tracebacks (e.g. aiohttp + # rendering ``https://user:pw@host?token=...``) that the model-layer + # redaction cannot reach. Message-level scrubbing for all sinks (incl. the + # rotating file sink, which cannot be wrapped) is handled by the patcher above. logger.add( - pipe, + CredentialScrubbingStream(pipe), filter=filter.filter, format=formatter.format, level=opal_common_config.LOG_LEVEL, diff --git a/packages/opal-common/opal_common/logging_utils/redaction.py b/packages/opal-common/opal_common/logging_utils/redaction.py new file mode 100644 index 000000000..01c64dc26 --- /dev/null +++ b/packages/opal-common/opal_common/logging_utils/redaction.py @@ -0,0 +1,65 @@ +from typing import ClassVar, Set + +from opal_common.http_utils import redact_url + + +class RedactedReprMixin: + """Mixin for pydantic (v1) models that may carry credentials. + + Overrides ``repr()`` / ``str()`` so that sensitive fields are masked instead + of rendering their real value. Two flavours of masking are supported: + + - ``_redacted_repr_fields`` - the value is wholly replaced with + ```` (use for opaque secret carriers such as ``config``/``data``). + - ``_redacted_url_fields`` - the value is passed through ``redact_url`` so + embedded credentials are stripped while the host/path stay visible for + debugging (use for URL fields, which can embed ``user:token@`` or + ``?token=`` credentials but are otherwise useful to see). + + Both apply on this class or *any* of its base classes. + + This is the central defense against credential leaks in logs: OPAL logs via + loguru, and most leaks come from interpolating these models into log + messages (e.g. ``logger.info("... {entry}", entry=entry)``) or from loguru's + ``serialize=True`` sink, which dumps the record as JSON and falls back to + ``str()`` for non-JSON objects such as pydantic models. Masking once at the + model layer protects every such log site at once. + + Each redaction set is the **union** of its declarations across the whole + MRO, so a subclass can only ever *add* fields to redact - it can never + accidentally drop protection inherited from a parent. Each subclass that + introduces a new secret-bearing field must still list it here. + + Note: this only affects human / log rendering. Wire serialization uses + ``.dict()`` / ``.json()``, which are untouched, so transport is unaffected. + + Caveat: this masks whole *fields* of the model. It does not protect a secret + that is logged by reaching *into* the model (e.g. ``logger.info("{h}", + h=config.headers)``); avoid logging credential-bearing attributes directly. + """ + + #: Field names whose values may carry secrets and must be masked in repr/str. + _redacted_repr_fields: ClassVar[Set[str]] = set() + #: URL field names whose embedded credentials must be stripped via redact_url. + _redacted_url_fields: ClassVar[Set[str]] = set() + + @classmethod + def _union_over_mro(cls, attr: str) -> Set[str]: + # Union across the MRO so subclasses are additive (never lose a parent's + # protection, even if they redeclare the set). + result: Set[str] = set() + for klass in cls.__mro__: + result |= klass.__dict__.get(attr, set()) + return result + + def __repr_args__(self): + redacted = type(self)._union_over_mro("_redacted_repr_fields") + url_redacted = type(self)._union_over_mro("_redacted_url_fields") + args = [] + for key, value in super().__repr_args__(): + if key in redacted: + value = "" + elif key in url_redacted and isinstance(value, str): + value = redact_url(value) + args.append((key, value)) + return args diff --git a/packages/opal-common/opal_common/logging_utils/scrubbing.py b/packages/opal-common/opal_common/logging_utils/scrubbing.py new file mode 100644 index 000000000..0772f8602 --- /dev/null +++ b/packages/opal-common/opal_common/logging_utils/scrubbing.py @@ -0,0 +1,40 @@ +from opal_common.http_utils import redact_url_in_text + + +class CredentialScrubbingStream: + """A writable-stream wrapper that scrubs embedded URL credentials from + every fully-formatted log record before it reaches the underlying stream. + + This is the catch-all that the model-layer redaction (``RedactedReprMixin``) + cannot provide: third-party exceptions - most notably aiohttp + ``ClientResponseError`` - render the full request URL + (``https://user:pw@host/path?token=...``) inside the traceback that loguru + formats into the ``{exception}`` slot. The leaking object is not an OPAL + model, so masking ``repr()`` does nothing for it. + + Because loguru hands a stream sink the *final* formatted text (message + + traceback, and the JSON blob when ``serialize=True``), scrubbing on + ``write`` covers every record shape on this sink. ``redact_url_in_text`` is + idempotent, so the explicit call-site redaction upstream is unaffected. + + Note: this wraps a *stream* sink. loguru's path-based file sink (with + rotation/retention) opens the file itself and cannot be wrapped this way - + message-level scrubbing for that sink is handled in ``Formatter.format``. + """ + + def __init__(self, stream): + self._stream = stream + + def write(self, message: str): + return self._stream.write(redact_url_in_text(message)) + + def flush(self): + # loguru flushes the sink after each record when possible. + flush = getattr(self._stream, "flush", None) + if callable(flush): + flush() + + def __getattr__(self, name): + # Delegate everything else (isatty/fileno/encoding/...) to the wrapped + # stream so loguru treats this exactly like the underlying stream. + return getattr(self._stream, name) diff --git a/packages/opal-common/opal_common/schemas/data.py b/packages/opal-common/opal_common/schemas/data.py index 32e62c98f..82c6232ed 100644 --- a/packages/opal-common/opal_common/schemas/data.py +++ b/packages/opal-common/opal_common/schemas/data.py @@ -1,9 +1,7 @@ -from logging import basicConfig -from pydoc import describe -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, ClassVar, Dict, List, Optional, Set, Tuple, Union -from opal_common.fetcher.events import FetcherConfig from opal_common.fetcher.providers.http_fetch_provider import HttpFetcherConfig +from opal_common.logging_utils.redaction import RedactedReprMixin from opal_common.schemas.store import JSONPatchAction from pydantic import AnyHttpUrl, BaseModel, Field, root_validator, validator @@ -13,11 +11,19 @@ DEFAULT_DATA_TOPIC = "policy_data" -class DataSourceEntry(BaseModel): +class DataSourceEntry(RedactedReprMixin, BaseModel): """ Data source configuration - where client's should retrieve data from and how they should store it """ + # ``config`` may carry fetcher auth (e.g. Authorization headers) and + # ``data`` an inline payload - mask both in repr/str so they never leak into + # logs (entries are frequently interpolated into log messages). + _redacted_repr_fields: ClassVar[Set[str]] = {"config", "data"} + # ``url`` can embed credentials (``user:token@host`` / ``?token=``); strip + # them via redact_url while keeping host/path visible for debugging. + _redacted_url_fields: ClassVar[Set[str]] = {"url"} + @validator("data") def validate_save_method(cls, value, values): if values["save_method"] not in ["PUT", "PATCH"]: diff --git a/packages/opal-common/opal_common/sources/api_policy_source.py b/packages/opal-common/opal_common/sources/api_policy_source.py index 312db4b11..b549d45b3 100644 --- a/packages/opal-common/opal_common/sources/api_policy_source.py +++ b/packages/opal-common/opal_common/sources/api_policy_source.py @@ -9,6 +9,7 @@ from opal_common.git_utils.tar_file_to_local_git_extractor import ( TarFileToLocalGitExtractor, ) +from opal_common.http_utils import redact_url from opal_common.logger import logger from opal_common.sources.base_policy_source import BasePolicySource from opal_common.utils import ( @@ -188,11 +189,11 @@ async def fetch_policy_bundle_from_api_source( if response.status == status.HTTP_404_NOT_FOUND: logger.warning( "requested url not found: {full_url}", - full_url=full_url, + full_url=redact_url(full_url), ) raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, - detail=f"requested url not found: {full_url}", + detail=f"requested url not found: {redact_url(full_url)}", ) if response.status == status.HTTP_304_NOT_MODIFIED: logger.info( @@ -263,7 +264,7 @@ async def check_for_changes(self): """ logger.info( "Fetching changes from remote: '{remote}'", - remote=self.remote_source_url, + remote=redact_url(self.remote_source_url), ) ( has_changes, diff --git a/packages/opal-common/opal_common/sources/git_policy_source.py b/packages/opal-common/opal_common/sources/git_policy_source.py index 59d8adfc2..87611b1de 100644 --- a/packages/opal-common/opal_common/sources/git_policy_source.py +++ b/packages/opal-common/opal_common/sources/git_policy_source.py @@ -4,6 +4,7 @@ from opal_common.git_utils.branch_tracker import BranchTracker from opal_common.git_utils.exceptions import GitFailed from opal_common.git_utils.repo_cloner import RepoCloner +from opal_common.http_utils import redact_url from opal_common.logger import logger from opal_common.sources.base_policy_source import BasePolicySource @@ -69,14 +70,14 @@ async def get_initial_policy_state_from_remote(self): # Don't bother with remove and reclone because this case shouldn't happen on reasobable usage raise GitFailed( RuntimeError( - f"Existing repo has wrong remote url: {remote_urls}" + f"Existing repo has wrong remote url: {[redact_url(u) for u in remote_urls]}" ) ) else: logger.info( - "SKIPPED cloning policy repo, found existing repo at '{path}' with remotes: {remote_urls})", + "SKIPPED cloning policy repo, found existing repo at '{path}' with remotes: {remote_urls}", path=self._cloner.path, - remote_urls=remote_urls, + remote_urls=[redact_url(u) for u in remote_urls], ) except GitFailed as e: await self._on_git_failed(e) diff --git a/packages/opal-common/opal_common/tests/redaction_test.py b/packages/opal-common/opal_common/tests/redaction_test.py new file mode 100644 index 000000000..2db5f7df4 --- /dev/null +++ b/packages/opal-common/opal_common/tests/redaction_test.py @@ -0,0 +1,337 @@ +"""Regression tests for credential redaction in repr/str of credential-bearing +models (follow-up to #901 / #902). + +OPAL logs via loguru. Credentials leaked because log statements interpolated +whole models (e.g. ``logger.info("... {entry}", entry=entry)``) or relied on +loguru's ``serialize=True`` sink, which dumps ``record["extra"]`` as JSON and +falls back to ``str()`` for non-JSON objects. ``RedactedReprMixin`` masks the +secret-bearing fields at the model layer so every current and future log site is +protected at once - while leaving ``.dict()`` / ``.json()`` (the wire format) +untouched. +""" + +from typing import List + +import pytest +from loguru import logger +from opal_common.fetcher.events import FetchEvent +from opal_common.fetcher.providers.fastapi_rpc_fetch_provider import ( + FastApiRpcFetchConfig, +) +from opal_common.fetcher.providers.http_fetch_provider import HttpFetcherConfig +from opal_common.http_utils import redact_url, redact_url_in_text +from opal_common.schemas.data import DataSourceEntry, DataUpdate + +SECRET_TOKEN = "Bearer super-secret-token-must-not-leak" + + +def _models(): + """One instance of each credential-bearing model, all carrying + SECRET_TOKEN.""" + return { + "HttpFetcherConfig": HttpFetcherConfig( + headers={"Authorization": SECRET_TOKEN}, data={"payload": SECRET_TOKEN} + ), + "DataSourceEntry": DataSourceEntry( + url="http://data.example.com", + save_method="PUT", + config={"headers": {"Authorization": SECRET_TOKEN}}, + data={"payload": SECRET_TOKEN}, + ), + "FetchEvent": FetchEvent( + fetcher="HttpFetchProvider", + url="http://data.example.com", + config={"headers": {"Authorization": SECRET_TOKEN}}, + ), + # rpc_arguments is a secret-bearing field on a different FetcherConfig + # subclass; it must be redacted too (regression for a missed subclass). + "FastApiRpcFetchConfig": FastApiRpcFetchConfig( + rpc_method_name="publish", + rpc_arguments={"token": SECRET_TOKEN}, + ), + } + + +@pytest.mark.parametrize("name,model", list(_models().items())) +def test_repr_and_str_redact_secrets(name, model): + """Repr() / str() (used by f-strings and ``"{}".format``) must never expose + the secret, and must show it was redacted.""" + assert SECRET_TOKEN not in repr(model), f"secret leaked in repr({name})" + assert SECRET_TOKEN not in str(model), f"secret leaked in str({name})" + assert SECRET_TOKEN not in f"{model}", f"secret leaked in f-string of {name}" + assert "" in repr(model), f"{name} did not redact anything" + + +def test_wire_serialization_is_not_redacted(): + """Redaction is only for human/log rendering - ``.dict()`` (the transport + format) must still carry the real values so fetching keeps working.""" + cfg = HttpFetcherConfig(headers={"Authorization": SECRET_TOKEN}) + entry = DataSourceEntry( + url="http://data.example.com", + save_method="PUT", + config={"headers": {"Authorization": SECRET_TOKEN}}, + ) + assert cfg.dict()["headers"]["Authorization"] == SECRET_TOKEN + assert entry.dict()["config"]["headers"]["Authorization"] == SECRET_TOKEN + + +def test_nested_container_models_redact_transitively(): + """The objects actually published on the wire are containers like + ``DataUpdate`` that hold ``DataSourceEntry`` objects. + + They carry no mixin themselves but must still redact via nested + repr. + """ + update = DataUpdate( + reason="test", + entries=[ + DataSourceEntry( + url="http://data.example.com", + save_method="PUT", + config={"headers": {"Authorization": SECRET_TOKEN}}, + data={"payload": SECRET_TOKEN}, + ) + ], + ) + assert SECRET_TOKEN not in repr(update) + assert SECRET_TOKEN not in str(update) + assert SECRET_TOKEN not in f"{update}" + # wire format still carries the real value + assert update.dict()["entries"][0]["config"]["headers"]["Authorization"] == ( + SECRET_TOKEN + ) + + +def test_loguru_serialize_does_not_leak(): + """The actual #901 scenario: a sink with ``serialize=True`` dumps the full + record (including ``extra``) as JSON. + + Interpolating or binding any of these models must not put the secret + on the wire. + """ + captured: List[str] = [] + sink_id = logger.add( + lambda message: captured.append(str(message)), + level="DEBUG", + serialize=True, + ) + try: + for model in _models().values(): + logger.info("logging model {model}", model=model) + logger.bind(bound=model).info("bound model") + finally: + logger.remove(sink_id) + + combined = "\n".join(captured) + assert SECRET_TOKEN not in combined, "secret leaked through loguru serialize=True" + + +def test_loguru_diagnose_does_not_leak_model_value(): + """With ``diagnose=True`` loguru renders local variable *values* in + tracebacks (the #901-class vector). + + The model's own value must render redacted. + """ + captured: List[str] = [] + sink_id = logger.add( + lambda message: captured.append(str(message)), + level="DEBUG", + backtrace=True, + diagnose=True, + ) + try: + entry = _models()["DataSourceEntry"] + try: + # The raising line must *reference* ``entry`` (the whole model) so + # loguru's diagnose renders its repr in the traceback - otherwise the + # test is a tautology (an unprotected model would pass too). We must + # NOT reach into a credential field here (e.g. ``entry.config[...]``): + # diagnose evaluates that sub-expression and renders the raw dict, + # which the model-layer mixin documents it cannot protect. Rendering + # ``entry`` exercises exactly what the mixin defends. + raise ValueError(entry) + except ValueError: + logger.exception("failed while handling entry") + finally: + logger.remove(sink_id) + + combined = "\n".join(captured) + assert SECRET_TOKEN not in combined, "secret leaked through loguru diagnose=True" + + +@pytest.mark.parametrize( + "url,expected", + [ + # userinfo (user:password) is stripped + ("https://user:tok@host/path", "https://***@host/path"), + ( + "https://x-access-token:tok@github.com/o/r.git", + "https://***@github.com/o/r.git", + ), + # username-only userinfo is still stripped (the user can be a token) + ("ssh://gittok@host/repo.git", "ssh://***@host/repo.git"), + # IPv6 literal must keep its brackets and stay a valid URL + ("https://u:p@[::1]:8443/x", "https://***@[::1]:8443/x"), + # sensitive query params are masked, non-sensitive kept + ( + "https://host/p?token=secret&page=2", + "https://host/p?token=***&page=2", + ), + ("https://host/p?access_token=secret", "https://host/p?access_token=***"), + # sensitive params in the *fragment* are masked too + ("https://host/p#access_token=secret", "https://host/p#access_token=***"), + # non-sensitive params keep their exact original encoding (no urlencode + # round-trip normalizing "%20" -> "+") + ( + "https://host/p?name=John%20Doe&token=secret", + "https://host/p?name=John%20Doe&token=***", + ), + # nothing sensitive -> returned byte-for-byte unchanged + ("https://host:8443/p?q=1#frag", "https://host:8443/p?q=1#frag"), + ("http://plain.example.com/data/", "http://plain.example.com/data/"), + # scp-style git url has no parseable userinfo and no password -> untouched + ("git@github.com:org/repo.git", "git@github.com:org/repo.git"), + ("", ""), + ], +) +def test_redact_url(url, expected): + # expected values never contain a secret, so equality fully validates + assert redact_url(url) == expected + + +def test_redact_url_never_raises_on_malformed_url(): + """redact_url is called from log/except paths and must never throw - e.g. an + out-of-range port only raises when .port is accessed (urlsplit is lazy).""" + bad = "https://u:p@host:99999999999/x" + # would raise ValueError("Port out of range") without the guard + assert redact_url(bad) == bad + + +def test_redact_url_in_text_masks_query_token_with_userinfo(): + """When a known URL carries *both* userinfo and a sensitive query param, + the query token must still be masked (regression: the userinfo regex used + to run first and destroy the verbatim URL before the query scrub could + match).""" + url = "https://user:SECRETTOK@host/path?token=SECRETTOK2" + text = "fatal: could not read from '" + url + "'" + scrubbed = redact_url_in_text(text, url) + assert "SECRETTOK" not in scrubbed + assert "SECRETTOK2" not in scrubbed + assert "https://***@host/path?token=***" in scrubbed + + +def test_redact_url_in_text(): + """Git command errors embed the (credentialed) repo URL in free text.""" + url = "https://x-access-token:SECRETTOKEN@github.com/org/repo.git" + err = f"Cmd('git') failed: fatal: could not read from '{url}'" + scrubbed = redact_url_in_text(err, url) + assert "SECRETTOKEN" not in scrubbed + assert "https://***@github.com/org/repo.git" in scrubbed + + +def test_redact_url_in_text_scrubs_creds_not_in_known_url(): + """The regex scrub must catch a credentialed URL even when it differs from + the URL we passed (e.g. git injected creds / normalized the string).""" + # self.url has no creds, but git prints a credentialed variant + text = ( + "fatal: could not read from 'https://x-access-token:LEAKED@github.com/o/r.git/'" + ) + scrubbed = redact_url_in_text(text, "https://github.com/o/r.git") + assert "LEAKED" not in scrubbed + assert "https://***@github.com/o/r.git/" in scrubbed + + +def test_git_ssh_env_gates_verbose_tracing_on_log_diagnose(tmp_path): + """On SSH clones GIT_TRACE/GIT_CURL_VERBOSE add verbose protocol/host + disclosure to logs; they must only be set when LOG_DIAGNOSE is explicitly + enabled.""" + from opal_common.config import opal_common_config + from opal_common.git_utils import env + + ssh_url = "git@github.com:org/repo.git" + original_diagnose = opal_common_config.LOG_DIAGNOSE + original_key_file = opal_common_config.GIT_SSH_KEY_FILE + # point the pem-file writer at a throwaway path so we never touch a real key + opal_common_config.GIT_SSH_KEY_FILE = str(tmp_path / "id_test") + try: + opal_common_config.LOG_DIAGNOSE = False + off = env.provide_git_ssh_environment(ssh_url, "DUMMY_KEY") + assert "GIT_TRACE" not in off and "GIT_CURL_VERBOSE" not in off + assert "GIT_SSH_COMMAND" in off + + opal_common_config.LOG_DIAGNOSE = True + on = env.provide_git_ssh_environment(ssh_url, "DUMMY_KEY") + assert on.get("GIT_TRACE") == "1" and on.get("GIT_CURL_VERBOSE") == "1" + finally: + opal_common_config.LOG_DIAGNOSE = original_diagnose + opal_common_config.GIT_SSH_KEY_FILE = original_key_file + + +_URL_SECRET = "url-secret-must-not-leak" + + +@pytest.mark.parametrize( + "model", + [ + DataSourceEntry( + url=f"https://user:{_URL_SECRET}@host/path?token={_URL_SECRET}", + save_method="PUT", + ), + FetchEvent( + fetcher="HttpFetchProvider", + url=f"https://user:{_URL_SECRET}@host/path?token={_URL_SECRET}", + ), + ], +) +def test_url_field_credentials_redacted_in_repr(model): + """The ``url`` field can embed credentials; repr/str must strip them (via + redact_url) while keeping the host/path visible for debugging.""" + rendered = repr(model) + assert _URL_SECRET not in rendered + assert _URL_SECRET not in str(model) + # host/path stay visible (we redact, not blanket-mask, urls) + assert "***@host/path" in rendered + + +def test_url_field_wire_serialization_keeps_real_url(): + """Redaction is log-only: ``.dict()`` must keep the real (credentialed) url + so fetching still works.""" + url = f"https://user:{_URL_SECRET}@host/path" + entry = DataSourceEntry(url=url, save_method="PUT") + assert entry.dict()["url"] == url + + +def test_scrubbing_stream_redacts_thirdparty_exception_traceback(): + """The model-layer mixin cannot mask a third-party exception (e.g. aiohttp) + that renders a credentialed URL in its traceback. + + The stream sink wrapper scrubs the fully-formatted record, closing + that vector. + """ + import io + + from opal_common.logging_utils.scrubbing import CredentialScrubbingStream + + buf = io.StringIO() + sink_id = logger.add( + CredentialScrubbingStream(buf), + level="DEBUG", + backtrace=True, + diagnose=False, + colorize=False, + format="{message}\n{exception}", + ) + try: + url = "https://user:THIRDPARTYSECRET@host/path" + try: + # str() of this exception carries the credentialed url, like a real + # aiohttp ClientResponseError would. + raise RuntimeError(f"connection failed for {url}") + except RuntimeError: + logger.exception("fetch failed") + finally: + logger.remove(sink_id) + + out = buf.getvalue() + assert "THIRDPARTYSECRET" not in out, "secret leaked through exception traceback" + assert "***@host/path" in out diff --git a/packages/opal-server/opal_server/data/api.py b/packages/opal-server/opal_server/data/api.py index da5d043a9..0b64a6a5e 100644 --- a/packages/opal-server/opal_server/data/api.py +++ b/packages/opal-server/opal_server/data/api.py @@ -98,7 +98,10 @@ async def get_data_sources_config(authorization: Optional[str] = Header(None)): redirect_url = set_url_query_param(url, "token", token) return RedirectResponse(url=redirect_url) else: - logger.error("pydantic model invalid", model=data_sources_config) + logger.error( + "data source configuration is invalid: neither an inline config " + "nor an external_source_url was provided" + ) raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Did not find a data source configuration!", diff --git a/packages/opal-server/opal_server/data/data_update_publisher.py b/packages/opal-server/opal_server/data/data_update_publisher.py index 64bc32bbe..34342d4ec 100644 --- a/packages/opal-server/opal_server/data/data_update_publisher.py +++ b/packages/opal-server/opal_server/data/data_update_publisher.py @@ -3,6 +3,7 @@ from typing import List from fastapi_utils.tasks import repeat_every +from opal_common.http_utils import redact_url from opal_common.logger import logger from opal_common.schemas.data import ( DataSourceEntryWithPollingInterval, @@ -75,7 +76,7 @@ async def publish_data_updates(self, update: DataUpdate): # a nicer format of entries to the log logged_entries = [ dict( - url=entry.url, + url=redact_url(entry.url), method=entry.save_method, path=entry.dst_path or "/", inline_data=(entry.data is not None), @@ -94,9 +95,9 @@ async def publish_data_updates(self, update: DataUpdate): all_topic_combos.update(topic_combos) else: logger.warning( - "[{pid}] No topics were provided for the following entry: {entry}", + "[{pid}] No topics were provided for the entry with url: {url}", pid=os.getpid(), - entry=entry, + url=redact_url(entry.url), ) # publish all topics with all their sub combinations diff --git a/packages/opal-server/opal_server/git_fetcher.py b/packages/opal-server/opal_server/git_fetcher.py index e52083c98..923bc423e 100644 --- a/packages/opal-server/opal_server/git_fetcher.py +++ b/packages/opal-server/opal_server/git_fetcher.py @@ -13,6 +13,7 @@ from git import Repo from opal_common.async_utils import run_sync from opal_common.git_utils.bundle_maker import BundleMaker +from opal_common.http_utils import redact_url from opal_common.logger import logger from opal_common.schemas.policy import PolicyBundle from opal_common.schemas.policy_source import ( @@ -106,10 +107,10 @@ def verify_found_repo_matches_remote( for remote in repo.remotes: if remote.url == expected_remote_url: logger.debug( - f"found target repo url is referred by remote: {remote.name}, url={remote.url}" + f"found target repo url is referred by remote: {remote.name}, url={redact_url(remote.url)}" ) return - error: str = f"Repo mismatch! No remote matches target url: {expected_remote_url}, found urls: {[remote.url for remote in repo.remotes]}" + error: str = f"Repo mismatch! No remote matches target url: {redact_url(expected_remote_url)}, found urls: {[redact_url(remote.url) for remote in repo.remotes]}" logger.error(error) raise ValueError(error) @@ -136,7 +137,7 @@ def __init__( self._remote = remote_name self._scope_id = scope_id logger.debug( - f"Initializing git fetcher: scope_id={scope_id}, url={source.url}, branch={self._source.branch}, source_id={self._source_id}" + f"Initializing git fetcher: scope_id={scope_id}, url={redact_url(source.url)}, branch={self._source.branch}, source_id={self._source_id}" ) async def _get_repo_lock(self): @@ -185,7 +186,7 @@ async def fetch_and_notify_on_changes( ) if should_fetch: logger.debug( - f"Fetching remote (force_fetch={force_fetch}): {self._remote} ({self._source.url})" + f"Fetching remote (force_fetch={force_fetch}): {self._remote} ({redact_url(self._source.url)})" ) GitPolicyFetcher.repos_last_fetched[ self._source_id @@ -194,7 +195,9 @@ async def fetch_and_notify_on_changes( repo.remotes[self._remote].fetch, callbacks=self._auth_callbacks, ) - logger.debug(f"Fetch completed: {self._source.url}") + logger.debug( + f"Fetch completed: {redact_url(self._source.url)}" + ) # New commits might be present because of a previous fetch made by another scope await self._notify_on_changes(repo) @@ -218,7 +221,7 @@ def _discover_repository(self, path: Path) -> bool: async def _clone(self): logger.info( "Cloning repo at '{url}' to '{path}'", - url=self._source.url, + url=redact_url(self._source.url), path=self._repo_path, ) try: @@ -229,9 +232,9 @@ async def _clone(self): callbacks=self._auth_callbacks, ) except pygit2.GitError: - logger.exception(f"Could not clone repo at {self._source.url}") + logger.exception(f"Could not clone repo at {redact_url(self._source.url)}") else: - logger.info(f"Clone completed: {self._source.url}") + logger.info(f"Clone completed: {redact_url(self._source.url)}") await self._notify_on_changes(repo) def _get_repo(self) -> Repository: diff --git a/packages/opal-server/opal_server/scopes/loader.py b/packages/opal-server/opal_server/scopes/loader.py index 485d57489..a50dc9631 100644 --- a/packages/opal-server/opal_server/scopes/loader.py +++ b/packages/opal-server/opal_server/scopes/loader.py @@ -1,3 +1,4 @@ +from opal_common.http_utils import redact_url from opal_common.logger import logger from opal_common.schemas.policy_source import ( GitPolicyScopeSource, @@ -21,7 +22,7 @@ async def _load_env_scope(repo: ScopeRepository): if opal_server_config.POLICY_REPO_URL is not None: logger.info( "Adding default scope from env: {url}", - url=opal_server_config.POLICY_REPO_URL, + url=redact_url(opal_server_config.POLICY_REPO_URL), ) auth = NoAuthData() diff --git a/packages/opal-server/opal_server/scopes/service.py b/packages/opal-server/opal_server/scopes/service.py index d3df4972f..637e9f6c5 100644 --- a/packages/opal-server/opal_server/scopes/service.py +++ b/packages/opal-server/opal_server/scopes/service.py @@ -8,6 +8,7 @@ from ddtrace import tracer from fastapi_websocket_pubsub import PubSubEndpoint from opal_common.git_utils.commit_viewer import VersionedFile +from opal_common.http_utils import redact_url from opal_common.logger import logger from opal_common.schemas.policy import PolicyUpdateMessageNotification from opal_common.schemas.policy_source import GitPolicyScopeSource @@ -130,7 +131,7 @@ async def sync_scope( source = cast(GitPolicyScopeSource, scope.policy) logger.debug( - f"Sync scope: {scope.scope_id} (remote: {source.url}, branch: {source.branch}, req_time: {req_time})" + f"Sync scope: {scope.scope_id} (remote: {redact_url(source.url)}, branch: {source.branch}, req_time: {req_time})" ) callbacks = PolicyFetcherCallbacks()