Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions .claude/plans/docs/05-config-reference.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# 05 — OPAL Config Reference (private)

Private, internal supplement to the public operator docs under `documentation/docs/`. Tracks
`OPAL_*` env vars added or clarified by the OPAL Server Git Fixes work, with the declaring
`file:line` so contributors can jump straight to the `Confi` declaration. Each key maps to an
`OPAL_<NAME>` env var (the `OPAL_` prefix is added once by the component's `Confi(prefix="OPAL_")`
instantiation — the bare name is what appears in the table).

## 4. opal-server keys

| Env var | Type | Default | Purpose | Declared at |
|---|---|---|---|---|
| `OPAL_SCOPES_GIT_FETCH_TIMEOUT` | float (seconds) | `120.0` | Hard timeout for a single scope git clone/fetch. On timeout the operation is abandoned and the scope is marked failed, so one unreachable repo can never block boot or other scopes. `0` = no timeout. | `packages/opal-server/opal_server/config.py:150-156` |
| `OPAL_SCOPES_GIT_MAX_WORKERS` | int | `10` | Size of the dedicated `ThreadPoolExecutor` for scope git operations. Isolating git work keeps a hung fetch from starving bundle serving and other server work that uses the default executor. | `packages/opal-server/opal_server/config.py:157-163` |

> **Caveat (timeout is soft, not a hard kill).** `OPAL_SCOPES_GIT_FETCH_TIMEOUT` is enforced via
> `asyncio.wait_for`, which cancels the *await* — unblocking the event loop and the awaiting
> coroutine — but the underlying pygit2 call keeps running on its pool thread until the OS network
> timeout. The dedicated pool (`OPAL_SCOPES_GIT_MAX_WORKERS`) bounds and isolates those lingering
> threads so they cannot affect bundle serving or other scopes. Hard-kill via subprocess is out of
> scope. See spec §6.
14 changes: 14 additions & 0 deletions packages/opal-server/opal_server/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,20 @@ class OpalServerConfig(Confi):
0,
description="The timeout for cloning the policy repository (0 means wait forever)",
)
SCOPES_GIT_FETCH_TIMEOUT = confi.float(
"SCOPES_GIT_FETCH_TIMEOUT",
120.0,
description="Hard timeout in seconds for a single scope git clone/fetch. "
"On timeout the operation is abandoned and the scope is marked failed so "
"one unreachable repo can never block boot or other scopes (0 = no timeout).",
)
SCOPES_GIT_MAX_WORKERS = confi.int(
"SCOPES_GIT_MAX_WORKERS",
10,
description="Size of the dedicated thread pool for scope git operations. "
"Isolating git work keeps a hung fetch from starving bundle serving and "
"other server work that uses the default executor.",
)
LEADER_LOCK_FILE_PATH = confi.str(
"LEADER_LOCK_FILE_PATH",
"/tmp/opal_server_leader.lock",
Expand Down
51 changes: 46 additions & 5 deletions packages/opal-server/opal_server/git_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@
import hashlib
import shutil
import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from pathlib import Path
from typing import Optional, cast

import aiofiles.os
import pygit2
from ddtrace import tracer
from git import Repo
from opal_common.async_utils import run_sync
from opal_common.git_utils.bundle_maker import BundleMaker
from opal_common.logger import logger
from opal_common.schemas.policy import PolicyBundle
Expand All @@ -34,6 +35,40 @@
)


_git_executor: Optional[ThreadPoolExecutor] = None


def _get_git_executor() -> ThreadPoolExecutor:
"""Lazily build the dedicated pool for scope git operations.

Isolated from the default executor so a hung clone/fetch can never starve
bundle serving or other server work.
"""
global _git_executor
if _git_executor is None:
_git_executor = ThreadPoolExecutor(
max_workers=opal_server_config.SCOPES_GIT_MAX_WORKERS,
thread_name_prefix="opal-git",
)
return _git_executor


async def run_in_git_executor(func, *args, timeout: float, **kwargs):
"""Run a blocking git call on the dedicated pool with a hard timeout.

Raises ``TimeoutError`` (via ``asyncio.wait_for``) when the call exceeds
``timeout`` seconds. ``timeout <= 0`` means no limit. NOTE: the timeout
unblocks the event loop and the awaiting coroutine, but the underlying
pygit2 call keeps running on its pool thread until the OS network timeout;
the dedicated pool keeps that lingering thread isolated.
"""
loop = asyncio.get_event_loop()
fut = loop.run_in_executor(_get_git_executor(), partial(func, *args, **kwargs))

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch — switched to asyncio.get_running_loop() in 5dd53a2.

if timeout and timeout > 0:
return await asyncio.wait_for(fut, timeout=timeout)
return await fut


class PolicyFetcherCallbacks:
async def on_update(self, old_head: Optional[str], head: str):
pass
Expand Down Expand Up @@ -190,9 +225,10 @@ async def fetch_and_notify_on_changes(
GitPolicyFetcher.repos_last_fetched[
self._source_id
] = datetime.datetime.now()
await run_sync(
await run_in_git_executor(
repo.remotes[self._remote].fetch,

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 5dd53a2repos_last_fetched is now updated only after the fetch completes successfully, so a timeout/error leaves it stale and won't suppress a later force_fetch via _was_fetched_after.

callbacks=self._auth_callbacks,
timeout=opal_server_config.SCOPES_GIT_FETCH_TIMEOUT,
)
logger.debug(f"Fetch completed: {self._source.url}")

Expand Down Expand Up @@ -222,14 +258,19 @@ async def _clone(self):
path=self._repo_path,
)
try:
repo: Repository = await run_sync(
repo: Repository = await run_in_git_executor(
clone_repository,
self._source.url,
str(self._repo_path),
callbacks=self._auth_callbacks,
timeout=opal_server_config.SCOPES_GIT_FETCH_TIMEOUT,
)
except (pygit2.GitError, asyncio.TimeoutError) as exc:
logger.error(
"Could not clone repo at {url}: {err}",
url=self._source.url,
err=repr(exc),
)
except pygit2.GitError:
logger.exception(f"Could not clone repo at {self._source.url}")
else:
logger.info(f"Clone completed: {self._source.url}")
await self._notify_on_changes(repo)
Expand Down
22 changes: 22 additions & 0 deletions packages/opal-server/opal_server/tests/fetch_timeout_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import time

import pytest

from opal_server.config import opal_server_config
from opal_server.git_fetcher import run_in_git_executor


@pytest.mark.asyncio
async def test_hanging_git_op_raises_timeout(monkeypatch):
"""A clone/fetch that hangs must surface TimeoutError, not block forever."""
monkeypatch.setattr(opal_server_config, "SCOPES_GIT_FETCH_TIMEOUT", 0.2)

def _hang():
time.sleep(5)

start = time.time()
with pytest.raises(TimeoutError):
await run_in_git_executor(
_hang, timeout=opal_server_config.SCOPES_GIT_FETCH_TIMEOUT
)
assert time.time() - start < 2, "wait_for did not unblock promptly"
Comment thread
Copilot marked this conversation as resolved.
Outdated
30 changes: 30 additions & 0 deletions packages/opal-server/opal_server/tests/git_executor_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import time

import pytest

from opal_server.config import OpalServerConfig
from opal_server.git_fetcher import run_in_git_executor


def test_git_resilience_config_defaults():
clean = OpalServerConfig(prefix="OPAL_")
assert clean.SCOPES_GIT_FETCH_TIMEOUT == 120.0
assert clean.SCOPES_GIT_MAX_WORKERS == 10


@pytest.mark.asyncio
async def test_run_in_git_executor_returns_value():
result = await run_in_git_executor(lambda: 21 * 2, timeout=5)
assert result == 42


@pytest.mark.asyncio
async def test_run_in_git_executor_times_out():
with pytest.raises(TimeoutError):
await run_in_git_executor(lambda: time.sleep(2), timeout=0.1)


@pytest.mark.asyncio
async def test_zero_timeout_means_no_limit():
result = await run_in_git_executor(lambda: "ok", timeout=0)
assert result == "ok"
Loading