Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Keep smoke patch fixtures byte-stable across Windows checkouts.
eval/benchmarks/swe_verified/smoke/fixtures/official_patches/*.patch text eol=lf

# Scenario YAML files contain UTF-8 content and should not be rewritten to CRLF.
eval/scenarios/*.yaml text eol=lf
eval/benchmarks/**/*.yaml text eol=lf
32 changes: 32 additions & 0 deletions backend/monitor/api/http/global_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,11 +161,43 @@ def evaluation_batch_detail_snapshot(batch_id: str):
return _or_404(monitor_gateway.get_evaluation_batch_detail, batch_id)


@router.get("/evaluation/batches/{batch_id}/aggregate")
def evaluation_batch_aggregate_snapshot(batch_id: str):
return _or_404(monitor_gateway.get_evaluation_batch_aggregate, batch_id)


@router.get("/evaluation/runs/{run_id}")
def evaluation_run_detail_snapshot(run_id: str):
return _or_404(monitor_gateway.get_evaluation_run_detail, run_id)


@router.get("/evaluation/runs/{run_id}/artifacts")
def evaluation_run_artifacts_snapshot(run_id: str):
return _or_404(monitor_gateway.get_evaluation_run_artifacts, run_id)


@router.get("/evaluation/compare")
def evaluation_compare_snapshot(
baseline_batch_id: str = Query(..., min_length=1),
candidate_batch_id: str = Query(..., min_length=1),
):
try:
return monitor_gateway.compare_evaluation_batches(
baseline_batch_id=baseline_batch_id,
candidate_batch_id=candidate_batch_id,
)
except KeyError as exc:
raise HTTPException(status_code=404, detail=str(exc)) from exc


@router.get("/evaluation/batches/{batch_id}/export")
def evaluation_batch_export_snapshot(batch_id: str, format: str | None = Query(default=None)):
try:
return monitor_gateway.export_evaluation_batch(batch_id=batch_id, export_format=format)
except KeyError as exc:
raise HTTPException(status_code=404, detail=str(exc)) from exc


@router.get("/resources")
def resources_overview():
return monitor_gateway.get_resource_overview()
Expand Down
148 changes: 142 additions & 6 deletions backend/monitor/application/use_cases/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,22 @@

from __future__ import annotations

import logging
from typing import Any

from backend.monitor.infrastructure.evaluation import evaluation_execution_service, evaluation_read_service
from backend.monitor.infrastructure.evaluation.evaluation_scheduler import EvaluationJobScheduler, EvaluationJobSpec
from eval.exporter import build_batch_export

logger = logging.getLogger(__name__)

def _build_monitor_evaluation_run_fact_rows(metrics_rows: list[dict[str, Any]]) -> list[dict[str, str]]:

def _build_monitor_evaluation_run_fact_rows(
metrics_rows: list[dict[str, Any]],
*,
judge_result: dict[str, Any] | None = None,
artifacts: list[dict[str, Any]] | None = None,
) -> list[dict[str, str]]:
metrics_by_tier = {str(row.get("tier") or "").strip().lower(): row.get("metrics") or {} for row in metrics_rows}
system_metrics = metrics_by_tier.get("system") or {}
objective_metrics = metrics_by_tier.get("objective") or {}
Expand All @@ -25,18 +34,41 @@ def _build_monitor_evaluation_run_fact_rows(metrics_rows: list[dict[str, Any]])
if total_duration_ms is not None:
duration_value = int(total_duration_ms) if float(total_duration_ms).is_integer() else total_duration_ms
facts.append({"label": "Duration (ms)", "value": str(duration_value)})
if judge_result:
facts.append({"label": "Judge verdict", "value": str(judge_result.get("verdict") or "-")})
if judge_result.get("scores"):
first_score_key = sorted(dict(judge_result["scores"]).keys())[0]
facts.append(
{
"label": f"Judge {first_score_key}",
"value": str(dict(judge_result["scores"]).get(first_score_key)),
}
)
if artifacts is not None:
facts.append({"label": "Artifacts", "value": str(len(artifacts))})
return facts


def _build_monitor_evaluation_run_row(run: dict[str, Any], metrics_rows: list[dict[str, Any]]) -> dict[str, Any]:
def _build_monitor_evaluation_run_row(
run: dict[str, Any],
metrics_rows: list[dict[str, Any]],
*,
judge_result: dict[str, Any] | None = None,
artifacts: list[dict[str, Any]] | None = None,
benchmark: dict[str, Any] | None = None,
) -> dict[str, Any]:
return {
"run_id": str(run.get("id") or "") or None,
"thread_id": str(run.get("thread_id") or "") or None,
"status": str(run.get("status") or "") or None,
"started_at": str(run.get("started_at") or "") or None,
"finished_at": str(run.get("finished_at") or "") or None,
"user_message": str(run.get("user_message") or "") or None,
"facts": _build_monitor_evaluation_run_fact_rows(metrics_rows),
"final_response": str(run.get("final_response") or "") or None,
"facts": _build_monitor_evaluation_run_fact_rows(metrics_rows, judge_result=judge_result, artifacts=artifacts),
"judge_result": judge_result,
"artifact_count": len(artifacts or []),
"benchmark": benchmark,
}


Expand Down Expand Up @@ -70,7 +102,16 @@ def get_monitor_evaluation_workbench() -> dict[str, Any]:
completed_runs += 1
elif status in {"error", "failed", "cancelled"}:
failed_runs += 1
run_rows.append(_build_monitor_evaluation_run_row(run, store.get_metrics(str(run.get("id") or ""))))
run_id = str(run.get("id") or "")
run_rows.append(
_build_monitor_evaluation_run_row(
run,
store.get_metrics(run_id),
judge_result=_dump_model(store.get_judge_result(run_id)),
artifacts=[artifact.model_dump(mode="json") for artifact in store.get_artifacts(run_id)],
benchmark=_dump_model(store.get_benchmark_info(run_id)),
)
)

return {
"headline": "Evaluation Workbench",
Expand All @@ -92,9 +133,21 @@ def get_monitor_evaluation_run_detail(run_id: str) -> dict[str, Any]:
run = store.get_run(run_id)
if run is None:
raise KeyError(f"Evaluation run not found: {run_id}")
run_row = _build_monitor_evaluation_run_row(run, store.get_metrics(run_id))
artifacts = [artifact.model_dump(mode="json") for artifact in store.get_artifacts(run_id)]
judge_result = _dump_model(store.get_judge_result(run_id))
benchmark = _dump_model(store.get_benchmark_info(run_id))
run_row = _build_monitor_evaluation_run_row(
run,
store.get_metrics(run_id),
judge_result=judge_result,
artifacts=artifacts,
benchmark=benchmark,
)
detail = {"run": run_row, "facts": run_row["facts"], "limitations": []}
detail["batch_run"] = evaluation_read_service.make_eval_batch_service().get_batch_run_for_eval_run(run_id)
detail["judge_result"] = judge_result
detail["artifacts"] = artifacts
detail["benchmark"] = benchmark
return detail


Expand All @@ -115,6 +168,10 @@ def get_monitor_evaluation_scenarios() -> dict[str, Any]:
"sandbox": scenario.sandbox,
"message_count": len(scenario.messages),
"timeout_seconds": scenario.timeout_seconds,
"benchmark": scenario.benchmark.model_dump(mode="json") if scenario.benchmark else None,
"workspace": scenario.workspace.model_dump(mode="json") if scenario.workspace else None,
"judge_type": scenario.judge_config.type if scenario.judge_config else None,
"export_format": scenario.export.format if scenario.export else None,
}
for scenario in evaluation_execution_service.load_monitor_eval_scenarios()
]
Expand All @@ -129,12 +186,14 @@ def create_monitor_evaluation_batch(
sandbox: str,
max_concurrent: int,
) -> dict[str, Any]:
scenarios = evaluation_execution_service.select_monitor_eval_scenarios(list(scenario_ids), sandbox=sandbox)
batch = evaluation_read_service.make_eval_batch_service().create_batch(
submitted_by_user_id=submitted_by_user_id,
agent_user_id=agent_user_id,
scenario_ids=scenario_ids,
sandbox=sandbox,
max_concurrent=max_concurrent,
scenario_refs=scenarios,
)
return {"batch": batch}

Expand All @@ -152,6 +211,7 @@ def start_monitor_evaluation_batch(
config = batch.get("config_json") or {}
scenario_ids = config.get("scenario_ids")
sandbox = config.get("sandbox")
max_concurrent = int(config.get("max_concurrent") or 1)
agent_user_id = batch.get("agent_user_id")
if not scenario_ids:
raise ValueError("Evaluation batch is missing scenario_ids")
Expand All @@ -171,10 +231,86 @@ def start_monitor_evaluation_batch(
execution_base_url=execution_base_url.rstrip("/"),
token=token,
agent_user_id=str(agent_user_id),
max_concurrent=max_concurrent,
)
)
return {"accepted": True, "batch": updated}


def get_monitor_evaluation_batch_detail(batch_id: str) -> dict[str, Any]:
return evaluation_read_service.make_eval_batch_service().get_batch_detail(batch_id)
batch_service = evaluation_read_service.make_eval_batch_service()
detail = batch_service.get_batch_detail(batch_id)
detail["aggregate"] = batch_service.get_batch_summary(batch_id)["summary"]
return detail


def get_monitor_evaluation_batch_aggregate(batch_id: str) -> dict[str, Any]:
return evaluation_read_service.make_eval_batch_service().get_batch_summary(batch_id)


def compare_monitor_evaluation_batches(*, baseline_batch_id: str, candidate_batch_id: str) -> dict[str, Any]:
return evaluation_read_service.make_eval_batch_service().compare_batches(baseline_batch_id, candidate_batch_id)


def get_monitor_evaluation_run_artifacts(run_id: str) -> dict[str, Any]:
store = evaluation_read_service.make_trajectory_store()
if store.get_run(run_id) is None:
raise KeyError(f"Evaluation run not found: {run_id}")
return {
"run_id": run_id,
"artifacts": [artifact.model_dump(mode="json") for artifact in store.get_artifacts(run_id)],
"judge_result": _dump_model(store.get_judge_result(run_id)),
"benchmark": _dump_model(store.get_benchmark_info(run_id)),
}


def export_monitor_evaluation_batch(batch_id: str, *, export_format: str | None = None) -> dict[str, Any]:
batch_service = evaluation_read_service.make_eval_batch_service()
store = evaluation_read_service.make_trajectory_store()
detail = batch_service.get_batch_detail(batch_id)
batch = detail["batch"]
aggregate = batch_service.get_batch_summary(batch_id)["summary"]
resolved_format = export_format or _resolve_batch_export_format(batch)
run_records: list[dict[str, Any]] = []
for batch_run in detail["runs"]:
run_id = str(batch_run.get("eval_run_id") or "")
if not run_id:
continue
run = store.get_run(run_id)
if run is None:
logger.warning("Skipping export for missing evaluation run %s in batch %s", run_id, batch_id)
continue
run_records.append(
{
"run_id": run_id,
"scenario_id": batch_run.get("scenario_id"),
"batch_run": batch_run,
"run": {
"run_id": run_id,
"thread_id": run.get("thread_id"),
"status": run.get("status"),
"final_response": run.get("final_response"),
},
"judge_result": _dump_model(store.get_judge_result(run_id)),
"artifacts": [artifact.model_dump(mode="json") for artifact in store.get_artifacts(run_id)],
"benchmark": _dump_model(store.get_benchmark_info(run_id)),
}
)
return build_batch_export(batch=batch, aggregate=aggregate, run_records=run_records, export_format=resolved_format)


def _resolve_batch_export_format(batch: dict[str, Any]) -> str:
config = batch.get("config_json") or {}
scenario_refs = list(config.get("scenario_refs") or [])
for scenario_ref in scenario_refs:
export_config = dict(scenario_ref.get("export") or {})
export_format = str(export_config.get("format") or "").strip()
if export_format:
return export_format
return "generic_json"


def _dump_model(value: Any) -> dict[str, Any] | None:
if value is None:
return None
return value.model_dump(mode="json") if hasattr(value, "model_dump") else dict(value)
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,6 @@ def submit(self, spec: EvaluationJobSpec) -> None:
execution_base_url=spec.execution_base_url,
token=spec.token,
agent_user_id=spec.agent_user_id,
max_concurrent=spec.max_concurrent,
batch_service=make_eval_batch_service(),
)
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,32 @@

from __future__ import annotations

import logging
import os
from pathlib import Path

from backend.monitor.infrastructure.evaluation import evaluation_storage_service
from eval.batch_executor import EvaluationBatchExecutor
from eval.batch_service import EvaluationBatchService
from eval.harness.client import EvalClient
from eval.harness.runner import EvalRunner
from eval.harness.scenario import load_scenarios_from_dir
from eval.harness.scenario import load_scenarios_from_dirs, parse_scenario_dirs
from eval.models import EvalScenario

EVAL_SCENARIO_DIR = Path(__file__).resolve().parents[4] / "eval" / "scenarios"
logger = logging.getLogger(__name__)

_EVAL_ROOT = Path(__file__).resolve().parents[4] / "eval"
EVAL_SCENARIO_DIRS = [_EVAL_ROOT / "scenarios", _EVAL_ROOT / "benchmarks"]


def resolve_monitor_eval_scenario_dirs() -> list[Path]:
return parse_scenario_dirs(os.getenv("LEON_EVAL_SCENARIO_DIRS"), default_dirs=EVAL_SCENARIO_DIRS)


def load_monitor_eval_scenarios() -> list[EvalScenario]:
return load_scenarios_from_dir(EVAL_SCENARIO_DIR)
scenario_dirs = resolve_monitor_eval_scenario_dirs()
logger.info("Loading monitor evaluation scenarios from %s", ", ".join(str(path) for path in scenario_dirs))
return load_scenarios_from_dirs(scenario_dirs)


def select_monitor_eval_scenarios(scenario_ids: list[str], *, sandbox: str) -> list[EvalScenario]:
Expand All @@ -34,6 +45,7 @@ async def run_monitor_evaluation_batch(
execution_base_url: str,
token: str,
agent_user_id: str,
max_concurrent: int,
batch_service: EvaluationBatchService,
) -> None:
client = EvalClient(base_url=execution_base_url, token=token)
Expand All @@ -44,6 +56,6 @@ async def run_monitor_evaluation_batch(
store=evaluation_storage_service.make_trajectory_store(),
)
executor = EvaluationBatchExecutor(runner=runner, batch_service=batch_service)
await executor.run_batch(batch_id, scenarios)
await executor.run_batch(batch_id, scenarios, max_concurrent=max_concurrent)
finally:
await client.close()
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ class EvaluationJobSpec:
execution_base_url: str
token: str
agent_user_id: str
max_concurrent: int = 1


class EvaluationJobScheduler(Protocol):
Expand Down
19 changes: 19 additions & 0 deletions backend/monitor/infrastructure/web/gateway.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,25 @@ def get_evaluation_run_detail(run_id: str) -> dict[str, Any]:
return monitor_evaluation.get_monitor_evaluation_run_detail(run_id)


def get_evaluation_batch_aggregate(batch_id: str) -> dict[str, Any]:
return monitor_evaluation.get_monitor_evaluation_batch_aggregate(batch_id)


def compare_evaluation_batches(*, baseline_batch_id: str, candidate_batch_id: str) -> dict[str, Any]:
return monitor_evaluation.compare_monitor_evaluation_batches(
baseline_batch_id=baseline_batch_id,
candidate_batch_id=candidate_batch_id,
)


def get_evaluation_run_artifacts(run_id: str) -> dict[str, Any]:
return monitor_evaluation.get_monitor_evaluation_run_artifacts(run_id)


def export_evaluation_batch(*, batch_id: str, export_format: str | None = None) -> dict[str, Any]:
return monitor_evaluation.export_monitor_evaluation_batch(batch_id, export_format=export_format)


def get_resource_overview() -> dict[str, Any]:
return monitor_resources.get_monitor_resource_overview()

Expand Down
Loading
Loading