diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..c21db2d08 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,6 @@ +# Keep smoke patch fixtures byte-stable across Windows checkouts. +eval/benchmarks/swe_verified/smoke/fixtures/official_patches/*.patch text eol=lf + +# Scenario YAML files contain UTF-8 content and should not be rewritten to CRLF. +eval/scenarios/*.yaml text eol=lf +eval/benchmarks/**/*.yaml text eol=lf diff --git a/backend/monitor/api/http/global_router.py b/backend/monitor/api/http/global_router.py index e8570b332..94ac32352 100644 --- a/backend/monitor/api/http/global_router.py +++ b/backend/monitor/api/http/global_router.py @@ -161,11 +161,43 @@ def evaluation_batch_detail_snapshot(batch_id: str): return _or_404(monitor_gateway.get_evaluation_batch_detail, batch_id) +@router.get("/evaluation/batches/{batch_id}/aggregate") +def evaluation_batch_aggregate_snapshot(batch_id: str): + return _or_404(monitor_gateway.get_evaluation_batch_aggregate, batch_id) + + @router.get("/evaluation/runs/{run_id}") def evaluation_run_detail_snapshot(run_id: str): return _or_404(monitor_gateway.get_evaluation_run_detail, run_id) +@router.get("/evaluation/runs/{run_id}/artifacts") +def evaluation_run_artifacts_snapshot(run_id: str): + return _or_404(monitor_gateway.get_evaluation_run_artifacts, run_id) + + +@router.get("/evaluation/compare") +def evaluation_compare_snapshot( + baseline_batch_id: str = Query(..., min_length=1), + candidate_batch_id: str = Query(..., min_length=1), +): + try: + return monitor_gateway.compare_evaluation_batches( + baseline_batch_id=baseline_batch_id, + candidate_batch_id=candidate_batch_id, + ) + except KeyError as exc: + raise HTTPException(status_code=404, detail=str(exc)) from exc + + +@router.get("/evaluation/batches/{batch_id}/export") +def evaluation_batch_export_snapshot(batch_id: str, format: str | None = Query(default=None)): + try: + return monitor_gateway.export_evaluation_batch(batch_id=batch_id, 
export_format=format) + except KeyError as exc: + raise HTTPException(status_code=404, detail=str(exc)) from exc + + @router.get("/resources") def resources_overview(): return monitor_gateway.get_resource_overview() diff --git a/backend/monitor/application/use_cases/evaluation.py b/backend/monitor/application/use_cases/evaluation.py index 25560312d..24755b9a9 100644 --- a/backend/monitor/application/use_cases/evaluation.py +++ b/backend/monitor/application/use_cases/evaluation.py @@ -2,13 +2,22 @@ from __future__ import annotations +import logging from typing import Any from backend.monitor.infrastructure.evaluation import evaluation_execution_service, evaluation_read_service from backend.monitor.infrastructure.evaluation.evaluation_scheduler import EvaluationJobScheduler, EvaluationJobSpec +from eval.exporter import build_batch_export +logger = logging.getLogger(__name__) -def _build_monitor_evaluation_run_fact_rows(metrics_rows: list[dict[str, Any]]) -> list[dict[str, str]]: + +def _build_monitor_evaluation_run_fact_rows( + metrics_rows: list[dict[str, Any]], + *, + judge_result: dict[str, Any] | None = None, + artifacts: list[dict[str, Any]] | None = None, +) -> list[dict[str, str]]: metrics_by_tier = {str(row.get("tier") or "").strip().lower(): row.get("metrics") or {} for row in metrics_rows} system_metrics = metrics_by_tier.get("system") or {} objective_metrics = metrics_by_tier.get("objective") or {} @@ -25,10 +34,29 @@ def _build_monitor_evaluation_run_fact_rows(metrics_rows: list[dict[str, Any]]) if total_duration_ms is not None: duration_value = int(total_duration_ms) if float(total_duration_ms).is_integer() else total_duration_ms facts.append({"label": "Duration (ms)", "value": str(duration_value)}) + if judge_result: + facts.append({"label": "Judge verdict", "value": str(judge_result.get("verdict") or "-")}) + if judge_result.get("scores"): + first_score_key = sorted(dict(judge_result["scores"]).keys())[0] + facts.append( + { + "label": f"Judge 
{first_score_key}", + "value": str(dict(judge_result["scores"]).get(first_score_key)), + } + ) + if artifacts is not None: + facts.append({"label": "Artifacts", "value": str(len(artifacts))}) return facts -def _build_monitor_evaluation_run_row(run: dict[str, Any], metrics_rows: list[dict[str, Any]]) -> dict[str, Any]: +def _build_monitor_evaluation_run_row( + run: dict[str, Any], + metrics_rows: list[dict[str, Any]], + *, + judge_result: dict[str, Any] | None = None, + artifacts: list[dict[str, Any]] | None = None, + benchmark: dict[str, Any] | None = None, +) -> dict[str, Any]: return { "run_id": str(run.get("id") or "") or None, "thread_id": str(run.get("thread_id") or "") or None, @@ -36,7 +64,11 @@ def _build_monitor_evaluation_run_row(run: dict[str, Any], metrics_rows: list[di "started_at": str(run.get("started_at") or "") or None, "finished_at": str(run.get("finished_at") or "") or None, "user_message": str(run.get("user_message") or "") or None, - "facts": _build_monitor_evaluation_run_fact_rows(metrics_rows), + "final_response": str(run.get("final_response") or "") or None, + "facts": _build_monitor_evaluation_run_fact_rows(metrics_rows, judge_result=judge_result, artifacts=artifacts), + "judge_result": judge_result, + "artifact_count": len(artifacts or []), + "benchmark": benchmark, } @@ -70,7 +102,16 @@ def get_monitor_evaluation_workbench() -> dict[str, Any]: completed_runs += 1 elif status in {"error", "failed", "cancelled"}: failed_runs += 1 - run_rows.append(_build_monitor_evaluation_run_row(run, store.get_metrics(str(run.get("id") or "")))) + run_id = str(run.get("id") or "") + run_rows.append( + _build_monitor_evaluation_run_row( + run, + store.get_metrics(run_id), + judge_result=_dump_model(store.get_judge_result(run_id)), + artifacts=[artifact.model_dump(mode="json") for artifact in store.get_artifacts(run_id)], + benchmark=_dump_model(store.get_benchmark_info(run_id)), + ) + ) return { "headline": "Evaluation Workbench", @@ -92,9 +133,21 @@ def 
get_monitor_evaluation_run_detail(run_id: str) -> dict[str, Any]: run = store.get_run(run_id) if run is None: raise KeyError(f"Evaluation run not found: {run_id}") - run_row = _build_monitor_evaluation_run_row(run, store.get_metrics(run_id)) + artifacts = [artifact.model_dump(mode="json") for artifact in store.get_artifacts(run_id)] + judge_result = _dump_model(store.get_judge_result(run_id)) + benchmark = _dump_model(store.get_benchmark_info(run_id)) + run_row = _build_monitor_evaluation_run_row( + run, + store.get_metrics(run_id), + judge_result=judge_result, + artifacts=artifacts, + benchmark=benchmark, + ) detail = {"run": run_row, "facts": run_row["facts"], "limitations": []} detail["batch_run"] = evaluation_read_service.make_eval_batch_service().get_batch_run_for_eval_run(run_id) + detail["judge_result"] = judge_result + detail["artifacts"] = artifacts + detail["benchmark"] = benchmark return detail @@ -115,6 +168,10 @@ def get_monitor_evaluation_scenarios() -> dict[str, Any]: "sandbox": scenario.sandbox, "message_count": len(scenario.messages), "timeout_seconds": scenario.timeout_seconds, + "benchmark": scenario.benchmark.model_dump(mode="json") if scenario.benchmark else None, + "workspace": scenario.workspace.model_dump(mode="json") if scenario.workspace else None, + "judge_type": scenario.judge_config.type if scenario.judge_config else None, + "export_format": scenario.export.format if scenario.export else None, } for scenario in evaluation_execution_service.load_monitor_eval_scenarios() ] @@ -129,12 +186,14 @@ def create_monitor_evaluation_batch( sandbox: str, max_concurrent: int, ) -> dict[str, Any]: + scenarios = evaluation_execution_service.select_monitor_eval_scenarios(list(scenario_ids), sandbox=sandbox) batch = evaluation_read_service.make_eval_batch_service().create_batch( submitted_by_user_id=submitted_by_user_id, agent_user_id=agent_user_id, scenario_ids=scenario_ids, sandbox=sandbox, max_concurrent=max_concurrent, + scenario_refs=scenarios, ) 
return {"batch": batch} @@ -152,6 +211,7 @@ def start_monitor_evaluation_batch( config = batch.get("config_json") or {} scenario_ids = config.get("scenario_ids") sandbox = config.get("sandbox") + max_concurrent = int(config.get("max_concurrent") or 1) agent_user_id = batch.get("agent_user_id") if not scenario_ids: raise ValueError("Evaluation batch is missing scenario_ids") @@ -171,10 +231,86 @@ def start_monitor_evaluation_batch( execution_base_url=execution_base_url.rstrip("/"), token=token, agent_user_id=str(agent_user_id), + max_concurrent=max_concurrent, ) ) return {"accepted": True, "batch": updated} def get_monitor_evaluation_batch_detail(batch_id: str) -> dict[str, Any]: - return evaluation_read_service.make_eval_batch_service().get_batch_detail(batch_id) + batch_service = evaluation_read_service.make_eval_batch_service() + detail = batch_service.get_batch_detail(batch_id) + detail["aggregate"] = batch_service.get_batch_summary(batch_id)["summary"] + return detail + + +def get_monitor_evaluation_batch_aggregate(batch_id: str) -> dict[str, Any]: + return evaluation_read_service.make_eval_batch_service().get_batch_summary(batch_id) + + +def compare_monitor_evaluation_batches(*, baseline_batch_id: str, candidate_batch_id: str) -> dict[str, Any]: + return evaluation_read_service.make_eval_batch_service().compare_batches(baseline_batch_id, candidate_batch_id) + + +def get_monitor_evaluation_run_artifacts(run_id: str) -> dict[str, Any]: + store = evaluation_read_service.make_trajectory_store() + if store.get_run(run_id) is None: + raise KeyError(f"Evaluation run not found: {run_id}") + return { + "run_id": run_id, + "artifacts": [artifact.model_dump(mode="json") for artifact in store.get_artifacts(run_id)], + "judge_result": _dump_model(store.get_judge_result(run_id)), + "benchmark": _dump_model(store.get_benchmark_info(run_id)), + } + + +def export_monitor_evaluation_batch(batch_id: str, *, export_format: str | None = None) -> dict[str, Any]: + batch_service = 
evaluation_read_service.make_eval_batch_service() + store = evaluation_read_service.make_trajectory_store() + detail = batch_service.get_batch_detail(batch_id) + batch = detail["batch"] + aggregate = batch_service.get_batch_summary(batch_id)["summary"] + resolved_format = export_format or _resolve_batch_export_format(batch) + run_records: list[dict[str, Any]] = [] + for batch_run in detail["runs"]: + run_id = str(batch_run.get("eval_run_id") or "") + if not run_id: + continue + run = store.get_run(run_id) + if run is None: + logger.warning("Skipping export for missing evaluation run %s in batch %s", run_id, batch_id) + continue + run_records.append( + { + "run_id": run_id, + "scenario_id": batch_run.get("scenario_id"), + "batch_run": batch_run, + "run": { + "run_id": run_id, + "thread_id": run.get("thread_id"), + "status": run.get("status"), + "final_response": run.get("final_response"), + }, + "judge_result": _dump_model(store.get_judge_result(run_id)), + "artifacts": [artifact.model_dump(mode="json") for artifact in store.get_artifacts(run_id)], + "benchmark": _dump_model(store.get_benchmark_info(run_id)), + } + ) + return build_batch_export(batch=batch, aggregate=aggregate, run_records=run_records, export_format=resolved_format) + + +def _resolve_batch_export_format(batch: dict[str, Any]) -> str: + config = batch.get("config_json") or {} + scenario_refs = list(config.get("scenario_refs") or []) + for scenario_ref in scenario_refs: + export_config = dict(scenario_ref.get("export") or {}) + export_format = str(export_config.get("format") or "").strip() + if export_format: + return export_format + return "generic_json" + + +def _dump_model(value: Any) -> dict[str, Any] | None: + if value is None: + return None + return value.model_dump(mode="json") if hasattr(value, "model_dump") else dict(value) diff --git a/backend/monitor/infrastructure/evaluation/background_task_scheduler.py b/backend/monitor/infrastructure/evaluation/background_task_scheduler.py index 
87f5deadc..06f56b157 100644 --- a/backend/monitor/infrastructure/evaluation/background_task_scheduler.py +++ b/backend/monitor/infrastructure/evaluation/background_task_scheduler.py @@ -19,5 +19,6 @@ def submit(self, spec: EvaluationJobSpec) -> None: execution_base_url=spec.execution_base_url, token=spec.token, agent_user_id=spec.agent_user_id, + max_concurrent=spec.max_concurrent, batch_service=make_eval_batch_service(), ) diff --git a/backend/monitor/infrastructure/evaluation/evaluation_execution_service.py b/backend/monitor/infrastructure/evaluation/evaluation_execution_service.py index cb9de7566..add53ff6b 100644 --- a/backend/monitor/infrastructure/evaluation/evaluation_execution_service.py +++ b/backend/monitor/infrastructure/evaluation/evaluation_execution_service.py @@ -2,6 +2,8 @@ from __future__ import annotations +import logging +import os from pathlib import Path from backend.monitor.infrastructure.evaluation import evaluation_storage_service @@ -9,14 +11,23 @@ from eval.batch_service import EvaluationBatchService from eval.harness.client import EvalClient from eval.harness.runner import EvalRunner -from eval.harness.scenario import load_scenarios_from_dir +from eval.harness.scenario import load_scenarios_from_dirs, parse_scenario_dirs from eval.models import EvalScenario -EVAL_SCENARIO_DIR = Path(__file__).resolve().parents[4] / "eval" / "scenarios" +logger = logging.getLogger(__name__) + +_EVAL_ROOT = Path(__file__).resolve().parents[4] / "eval" +EVAL_SCENARIO_DIRS = [_EVAL_ROOT / "scenarios", _EVAL_ROOT / "benchmarks"] + + +def resolve_monitor_eval_scenario_dirs() -> list[Path]: + return parse_scenario_dirs(os.getenv("LEON_EVAL_SCENARIO_DIRS"), default_dirs=EVAL_SCENARIO_DIRS) def load_monitor_eval_scenarios() -> list[EvalScenario]: - return load_scenarios_from_dir(EVAL_SCENARIO_DIR) + scenario_dirs = resolve_monitor_eval_scenario_dirs() + logger.info("Loading monitor evaluation scenarios from %s", ", ".join(str(path) for path in scenario_dirs)) + 
return load_scenarios_from_dirs(scenario_dirs) def select_monitor_eval_scenarios(scenario_ids: list[str], *, sandbox: str) -> list[EvalScenario]: @@ -34,6 +45,7 @@ async def run_monitor_evaluation_batch( execution_base_url: str, token: str, agent_user_id: str, + max_concurrent: int, batch_service: EvaluationBatchService, ) -> None: client = EvalClient(base_url=execution_base_url, token=token) @@ -44,6 +56,6 @@ async def run_monitor_evaluation_batch( store=evaluation_storage_service.make_trajectory_store(), ) executor = EvaluationBatchExecutor(runner=runner, batch_service=batch_service) - await executor.run_batch(batch_id, scenarios) + await executor.run_batch(batch_id, scenarios, max_concurrent=max_concurrent) finally: await client.close() diff --git a/backend/monitor/infrastructure/evaluation/evaluation_scheduler.py b/backend/monitor/infrastructure/evaluation/evaluation_scheduler.py index 0bae0c3cd..3ddf94816 100644 --- a/backend/monitor/infrastructure/evaluation/evaluation_scheduler.py +++ b/backend/monitor/infrastructure/evaluation/evaluation_scheduler.py @@ -15,6 +15,7 @@ class EvaluationJobSpec: execution_base_url: str token: str agent_user_id: str + max_concurrent: int = 1 class EvaluationJobScheduler(Protocol): diff --git a/backend/monitor/infrastructure/web/gateway.py b/backend/monitor/infrastructure/web/gateway.py index 91e8d123a..ac3c2d61f 100644 --- a/backend/monitor/infrastructure/web/gateway.py +++ b/backend/monitor/infrastructure/web/gateway.py @@ -138,6 +138,25 @@ def get_evaluation_run_detail(run_id: str) -> dict[str, Any]: return monitor_evaluation.get_monitor_evaluation_run_detail(run_id) +def get_evaluation_batch_aggregate(batch_id: str) -> dict[str, Any]: + return monitor_evaluation.get_monitor_evaluation_batch_aggregate(batch_id) + + +def compare_evaluation_batches(*, baseline_batch_id: str, candidate_batch_id: str) -> dict[str, Any]: + return monitor_evaluation.compare_monitor_evaluation_batches( + baseline_batch_id=baseline_batch_id, + 
candidate_batch_id=candidate_batch_id, + ) + + +def get_evaluation_run_artifacts(run_id: str) -> dict[str, Any]: + return monitor_evaluation.get_monitor_evaluation_run_artifacts(run_id) + + +def export_evaluation_batch(*, batch_id: str, export_format: str | None = None) -> dict[str, Any]: + return monitor_evaluation.export_monitor_evaluation_batch(batch_id, export_format=export_format) + + def get_resource_overview() -> dict[str, Any]: return monitor_resources.get_monitor_resource_overview() diff --git a/core/tools/command/posix_executor.py b/core/tools/command/posix_executor.py index 2f92dc5a9..0bae0d801 100644 --- a/core/tools/command/posix_executor.py +++ b/core/tools/command/posix_executor.py @@ -3,6 +3,7 @@ from __future__ import annotations import asyncio +import logging import os import uuid from typing import ClassVar @@ -11,6 +12,8 @@ from .base import require_subprocess_pipe +logger = logging.getLogger(__name__) + class PosixShellExecutor(BaseExecutor): """Executor for bash/zsh-style shells with a persistent blocking session.""" @@ -116,7 +119,7 @@ async def execute_async( ) async_cmd = AsyncCommand(command_id=command_id, command_line=command, cwd=work_dir, process=proc) self._running_commands[command_id] = async_cmd - asyncio.create_task(self._monitor_process(async_cmd)) + async_cmd.monitor_task = asyncio.create_task(self._monitor_process(async_cmd)) return async_cmd async def _monitor_process(self, async_cmd: AsyncCommand) -> None: @@ -124,14 +127,35 @@ async def _monitor_process(self, async_cmd: AsyncCommand) -> None: if proc is None: return - stdout_bytes, stderr_bytes = await proc.communicate() + try: + stdout_bytes, stderr_bytes = await proc.communicate() + except Exception: + logger.exception("Failed to monitor async POSIX command %s", async_cmd.command_id) + async_cmd.exit_code = proc.returncode if proc.returncode is not None else 1 + async_cmd.done = True + raise + async_cmd.stdout_buffer.append(stdout_bytes.decode("utf-8", errors="replace")) 
async_cmd.stderr_buffer.append(stderr_bytes.decode("utf-8", errors="replace")) async_cmd.exit_code = proc.returncode async_cmd.done = True + async def _sync_status(self, async_cmd: AsyncCommand, *, timeout: float = 0.5) -> None: + monitor_task = async_cmd.monitor_task + if async_cmd.done or monitor_task is None: + return + + try: + await asyncio.wait_for(asyncio.shield(monitor_task), timeout=timeout) + except TimeoutError: + logger.debug("Async POSIX command %s is still running after status sync window", async_cmd.command_id) + async def get_status(self, command_id: str) -> AsyncCommand | None: - return self._running_commands.get(command_id) + async_cmd = self._running_commands.get(command_id) + if async_cmd is None: + return None + await self._sync_status(async_cmd) + return async_cmd async def wait_for(self, command_id: str, timeout: float | None = None) -> ExecuteResult | None: async_cmd = self._running_commands.get(command_id) @@ -140,7 +164,11 @@ async def wait_for(self, command_id: str, timeout: float | None = None) -> Execu if not async_cmd.done: try: - await asyncio.wait_for(self._wait_until_done(async_cmd), timeout=timeout) + monitor_task = async_cmd.monitor_task + if monitor_task is None: + await asyncio.wait_for(self._wait_until_done(async_cmd), timeout=timeout) + else: + await asyncio.wait_for(asyncio.shield(monitor_task), timeout=timeout) except TimeoutError: return ExecuteResult( exit_code=-1, diff --git a/core/tools/command/powershell/executor.py b/core/tools/command/powershell/executor.py index eb8968cba..f3a8e1137 100644 --- a/core/tools/command/powershell/executor.py +++ b/core/tools/command/powershell/executor.py @@ -3,12 +3,14 @@ from __future__ import annotations import asyncio +import logging import os import uuid from sandbox.interfaces.executor import AsyncCommand, BaseExecutor, ExecuteResult _RUNNING_COMMANDS: dict[str, AsyncCommand] = {} +logger = logging.getLogger(__name__) class PowerShellExecutor(BaseExecutor): @@ -112,7 +114,7 @@ async 
def execute_async( ) _RUNNING_COMMANDS[command_id] = async_cmd - asyncio.create_task(self._monitor_process(async_cmd)) + async_cmd.monitor_task = asyncio.create_task(self._monitor_process(async_cmd)) return async_cmd @@ -122,7 +124,13 @@ async def _monitor_process(self, async_cmd: AsyncCommand) -> None: if proc is None: return - stdout_bytes, stderr_bytes = await proc.communicate() + try: + stdout_bytes, stderr_bytes = await proc.communicate() + except Exception: + logger.exception("Failed to monitor async PowerShell command %s", async_cmd.command_id) + async_cmd.exit_code = proc.returncode if proc.returncode is not None else 1 + async_cmd.done = True + raise async_cmd.stdout_buffer.append(stdout_bytes.decode("utf-8", errors="replace")) async_cmd.stderr_buffer.append(stderr_bytes.decode("utf-8", errors="replace")) @@ -130,7 +138,24 @@ async def _monitor_process(self, async_cmd: AsyncCommand) -> None: async_cmd.done = True async def get_status(self, command_id: str) -> AsyncCommand | None: - return _RUNNING_COMMANDS.get(command_id) + async_cmd = _RUNNING_COMMANDS.get(command_id) + if async_cmd is None: + return None + await self._sync_status(async_cmd) + return async_cmd + + async def _sync_status(self, async_cmd: AsyncCommand, *, timeout: float = 0.5) -> None: + monitor_task = async_cmd.monitor_task + if async_cmd.done or monitor_task is None: + return + + try: + await asyncio.wait_for(asyncio.shield(monitor_task), timeout=timeout) + except TimeoutError: + logger.debug( + "Async PowerShell command %s is still running after status sync window", + async_cmd.command_id, + ) async def wait_for( self, @@ -143,10 +168,14 @@ async def wait_for( if not async_cmd.done: try: - await asyncio.wait_for( - self._wait_until_done(async_cmd), - timeout=timeout, - ) + monitor_task = async_cmd.monitor_task + if monitor_task is None: + await asyncio.wait_for( + self._wait_until_done(async_cmd), + timeout=timeout, + ) + else: + await asyncio.wait_for(asyncio.shield(monitor_task), 
timeout=timeout) except TimeoutError: return ExecuteResult( exit_code=-1, diff --git a/docs/docs.json b/docs/docs.json index 6e193f8f5..7185a029d 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -42,7 +42,8 @@ "en/concepts", "en/multi-agent-chat", "en/sandbox", - "en/memory" + "en/memory", + "en/evaluation-benchmarks" ] }, { @@ -80,7 +81,8 @@ "zh/concepts", "zh/multi-agent-chat", "zh/sandbox", - "zh/memory" + "zh/memory", + "zh/evaluation-benchmarks" ] }, { diff --git a/docs/en/evaluation-benchmarks.mdx b/docs/en/evaluation-benchmarks.mdx new file mode 100644 index 000000000..6a9ead616 --- /dev/null +++ b/docs/en/evaluation-benchmarks.mdx @@ -0,0 +1,88 @@ +--- +title: Evaluation Benchmarks +sidebarTitle: Evaluation Benchmarks +description: Current benchmark support, validation commands, and live-environment limits +icon: book-open +keywords: [evaluation, benchmark, swe-bench, monitor, playwright] +--- + +## What is currently proven + +Mycel now has a real benchmark acceptance path for a local `SWE-bench Verified` smoke slice. + +The validated path covers: + +- structured benchmark scenarios published by `/api/monitor/evaluation/scenarios` +- batch creation and start via `/api/monitor/evaluation/batches` +- background execution through `EvaluationBatchExecutor` +- trace, metrics, and artifact persistence +- command-judge scoring +- batch aggregate and compare APIs +- export via `/api/monitor/evaluation/batches/{batch_id}/export` +- frontend monitor verification with Playwright CLI + +## Validation environment + +The acceptance run does **not** use the full production `backend.web` stack. 
+ +Instead it uses: + +- `eval.benchmarks.swe_verified.acceptance` as a local FastAPI harness +- product monitor routers, batch service, executor, judge, and export code +- an in-memory fake Supabase client +- fake thread endpoints that still exercise the real `EvalClient` HTTP + SSE path + +This is enough to prove that the monitor evaluation chain is wired correctly for: + +- scenario intake +- orchestration +- trajectory and artifact collection +- judge execution +- aggregation and comparison +- export + +It does **not** prove the live-only parts of the main web runtime such as: + +- Postgres checkpoint boot for `backend.web` +- real Supabase auth and storage wiring +- actual repo checkout and patch capture +- official SWE-bench evaluator execution + +## Support Matrix + +| Benchmark family | Current status | Proven path | Not yet supported | +| --- | --- | --- | --- | +| SWE-bench Verified smoke | Partial | scenario catalog, batch create/start, command judge, compare, `predictions_json` export, monitor Playwright flow | official evaluator bridge, repo checkout, patch/test-log capture, `predictions_jsonl` export | +| Terminal-Bench | Not supported | none | task loader, verifier bridge, transcript artifacts, export | +| GAIA | Not supported | none | dataset loader, attachment ingest, scorer, export | +| tau2-bench / AppWorld | Not supported | none | simulator bridge, state validator, reward integration | +| WebArena / VisualWebArena / WorkArena | Not supported | none | browser env adapter, DOM/network/screenshot artifacts, validator bridge | +| OSWorld | Not supported | none | desktop runtime, recording, GUI validator | + +## Commands used in acceptance + +```bash +./.venv/bin/python -m pytest \ + tests/Unit/eval/test_swe_verified_acceptance.py \ + tests/Unit/eval/test_judge.py \ + tests/Unit/eval/test_exporter.py \ + tests/Unit/eval/test_swe_verified_assets.py + +./.venv/bin/python -m eval.benchmarks.swe_verified.acceptance serve --port 8765 + +./.venv/bin/python 
-m eval.benchmarks.swe_verified.acceptance rpc \ + --request eval/benchmarks/swe_verified/smoke/rpc/judge_request.json + +./.venv/bin/python -m eval.benchmarks.swe_verified.acceptance rpc \ + --request eval/benchmarks/swe_verified/smoke/rpc/export_request.json + +cd frontend/monitor +npx playwright test -c playwright.acceptance.config.ts +``` + +## Known limits + +- `backend.web` still needs real Postgres and Supabase runtime config. The acceptance harness proves monitor evaluation logic, not full app bootstrap. +- The smoke slice is scenario-backed. The JSON asset bundle under `eval/benchmarks/swe_verified/smoke/` still needs an official evaluator bridge for live scoring. +- Artifact coverage is currently limited to final response, benchmark metadata, workspace metadata, and requested placeholders. It is not yet a full SWE patch/test-log artifact stack. +- The backend export surface currently proves `predictions_json`. The sample asset bundle still documents `sample_predictions.jsonl` for future official evaluator integration. 
diff --git a/docs/zh/evaluation-benchmarks.mdx b/docs/zh/evaluation-benchmarks.mdx new file mode 100644 index 000000000..2c8410c48 --- /dev/null +++ b/docs/zh/evaluation-benchmarks.mdx @@ -0,0 +1,88 @@ +--- +title: 评测 Benchmark +sidebarTitle: 评测 Benchmark +description: 当前 benchmark 支持情况、验收命令与 live 环境边界 +icon: book-open +keywords: [评测, benchmark, swe-bench, monitor, playwright] +--- + +## 当前已被真实验证的能力 + +Mycel 目前已经有一条本地 `SWE-bench Verified` smoke slice 的真实验收链路。 + +已验证环节包括: + +- `/api/monitor/evaluation/scenarios` 发布结构化 benchmark scenario +- `/api/monitor/evaluation/batches` 创建与启动 batch +- `EvaluationBatchExecutor` 后台执行 +- 轨迹、指标与 artifact 持久化 +- command judge 打分 +- batch aggregate 与 compare API +- `/api/monitor/evaluation/batches/{batch_id}/export` 导出 +- monitor 前端的 Playwright CLI 浏览器验证 + +## 实际验证环境 + +本次验收**没有**直接启动完整生产形态的 `backend.web`。 + +实际使用的是: + +- `eval.benchmarks.swe_verified.acceptance` 本地 FastAPI harness +- 产品里的 monitor router、batch service、executor、judge、export 代码 +- 内存版 fake Supabase client +- fake thread endpoint,但仍然走真实 `EvalClient` 的 HTTP + SSE 路径 + +这足以证明以下 monitor 评测主链成立: + +- 输入 +- 编排 +- 轨迹与 artifact 采集 +- judge 执行 +- 汇总与对比 +- 导出 + +但它**不能**证明主 web runtime 的 live-only 部分,例如: + +- `backend.web` 的 Postgres checkpoint 启动 +- 真实 Supabase auth / storage wiring +- 实际 repo checkout 与 patch 捕获 +- 官方 SWE-bench evaluator 执行 + +## 支持矩阵 + +| Benchmark 家族 | 当前状态 | 已证明的路径 | 仍不支持 | +| --- | --- | --- | --- | +| SWE-bench Verified smoke | 部分支持 | scenario catalog、batch create/start、command judge、compare、`predictions_json` export、monitor Playwright 流程 | official evaluator bridge、repo checkout、patch/test-log 捕获、`predictions_jsonl` export | +| Terminal-Bench | 不支持 | 无 | task loader、verifier bridge、transcript artifact、export | +| GAIA | 不支持 | 无 | dataset loader、附件接入、scorer、export | +| tau2-bench / AppWorld | 不支持 | 无 | simulator bridge、state validator、reward integration | +| WebArena / VisualWebArena / WorkArena | 不支持 | 无 | browser env adapter、DOM/network/screenshot 
artifact、validator bridge | +| OSWorld | 不支持 | 无 | desktop runtime、录屏、GUI validator | + +## 本次验收使用的命令 + +```bash +./.venv/bin/python -m pytest \ + tests/Unit/eval/test_swe_verified_acceptance.py \ + tests/Unit/eval/test_judge.py \ + tests/Unit/eval/test_exporter.py \ + tests/Unit/eval/test_swe_verified_assets.py + +./.venv/bin/python -m eval.benchmarks.swe_verified.acceptance serve --port 8765 + +./.venv/bin/python -m eval.benchmarks.swe_verified.acceptance rpc \ + --request eval/benchmarks/swe_verified/smoke/rpc/judge_request.json + +./.venv/bin/python -m eval.benchmarks.swe_verified.acceptance rpc \ + --request eval/benchmarks/swe_verified/smoke/rpc/export_request.json + +cd frontend/monitor +npx playwright test -c playwright.acceptance.config.ts +``` + +## 已知限制 + +- `backend.web` 仍然需要真实的 Postgres 与 Supabase runtime config。本次 harness 证明的是 monitor evaluation 逻辑,不是完整 app bootstrap。 +- smoke slice 目前仍是 scenario-backed。`eval/benchmarks/swe_verified/smoke/` 里的 JSON 资产还需要 official evaluator bridge 才能进入 live 打分。 +- 当前 artifact 只证明了 final response、benchmark metadata、workspace metadata 和 requested placeholder,不是完整的 SWE patch/test-log artifact 栈。 +- 后端 export 目前真实证明的是 `predictions_json`。样例资产里的 `sample_predictions.jsonl` 仍然属于后续 official evaluator 对接材料。 diff --git a/eval/batch_executor.py b/eval/batch_executor.py index b6bceb5b9..eb833a84f 100644 --- a/eval/batch_executor.py +++ b/eval/batch_executor.py @@ -1,28 +1,45 @@ from __future__ import annotations +import asyncio +import logging + from eval.models import EvalResult, EvalScenario +logger = logging.getLogger(__name__) + class EvaluationBatchExecutor: def __init__(self, *, runner, batch_service) -> None: self._runner = runner self._batch_service = batch_service - async def run_batch(self, batch_id: str, scenarios: list[EvalScenario]) -> list[EvalResult]: + async def run_batch(self, batch_id: str, scenarios: list[EvalScenario], *, max_concurrent: int = 1) -> list[EvalResult]: 
self._batch_service.update_batch_status(batch_id, "running") + semaphore = asyncio.Semaphore(max(1, max_concurrent)) results: list[EvalResult] = [] - try: - for scenario in scenarios: + failed_scenarios: list[str] = [] + + async def _run_single_scenario(scenario: EvalScenario) -> EvalResult | None: + async with semaphore: + logger.info("Running evaluation scenario %s in batch %s", scenario.id, batch_id) self._batch_service.mark_batch_run_running_for_scenario(batch_id, scenario.id) try: result = await self._runner.run_scenario(scenario) except Exception as exc: + logger.exception("Evaluation scenario %s failed in batch %s", scenario.id, batch_id) self._batch_service.record_eval_error(batch_id, scenario.id, exc) - raise + failed_scenarios.append(scenario.id) + return None self._batch_service.record_eval_result(batch_id, result) + return result + + for task in asyncio.as_completed([asyncio.create_task(_run_single_scenario(scenario)) for scenario in scenarios]): + result = await task + if result is not None: results.append(result) - except Exception: - self._batch_service.update_batch_status(batch_id, "failed") - raise - self._batch_service.update_batch_status(batch_id, "completed") + + final_status = "failed" if failed_scenarios else "completed" + if failed_scenarios: + logger.warning("Batch %s completed with failed scenarios: %s", batch_id, ", ".join(failed_scenarios)) + self._batch_service.update_batch_status(batch_id, final_status) return results diff --git a/eval/batch_service.py b/eval/batch_service.py index bdfe2186d..04c960a24 100644 --- a/eval/batch_service.py +++ b/eval/batch_service.py @@ -4,6 +4,8 @@ from typing import Any from uuid import uuid4 +from eval.models import EvalScenario + class EvaluationBatchService: def __init__(self, *, batch_repo) -> None: @@ -17,19 +19,23 @@ def create_batch( scenario_ids: list[str], sandbox: str, max_concurrent: int, + scenario_refs: list[EvalScenario] | None = None, ) -> dict: now = datetime.now(UTC).isoformat() batch_id 
= f"eval-batch-{uuid4().hex[:12]}" + scenario_refs = scenario_refs or [] + kind = "benchmark_batch" if any(scenario.benchmark for scenario in scenario_refs) else "scenario_batch" batch = self._batch_repo.create_batch( { "batch_id": batch_id, - "kind": "scenario_batch", + "kind": kind, "submitted_by_user_id": submitted_by_user_id, "agent_user_id": agent_user_id, "config_json": { "scenario_ids": list(scenario_ids), "sandbox": sandbox, "max_concurrent": max_concurrent, + "scenario_refs": [self._serialize_scenario_ref(scenario) for scenario in scenario_refs], }, "status": "pending", "created_at": now, @@ -95,6 +101,43 @@ def refresh_batch_summary(self, batch_id: str) -> dict: "completed_runs": sum(1 for row in batch_runs if row.get("status") == "completed"), "failed_runs": sum(1 for row in batch_runs if row.get("status") in {"failed", "cancelled"}), } + scored_runs = 0 + passed_runs = 0 + failed_judges = 0 + total_tokens = 0 + total_artifacts = 0 + score_totals: dict[str, float] = {} + score_counts: dict[str, int] = {} + benchmark_families: set[str] = set() + benchmark_splits: set[str] = set() + for row in batch_runs: + row_summary = row.get("summary_json") or {} + benchmark_family = str(row_summary.get("benchmark_family") or "").strip() + benchmark_split = str(row_summary.get("benchmark_split") or "").strip() + if benchmark_family: + benchmark_families.add(benchmark_family) + if benchmark_split: + benchmark_splits.add(benchmark_split) + total_tokens += int(row_summary.get("total_tokens") or 0) + total_artifacts += int(row_summary.get("artifact_count") or 0) + verdict = str(row_summary.get("judge_verdict") or "").strip().lower() + if verdict: + scored_runs += 1 + if verdict == "passed": + passed_runs += 1 + elif verdict in {"failed", "error"}: + failed_judges += 1 + for key, value in dict(row_summary.get("scores") or {}).items(): + score_totals[str(key)] = score_totals.get(str(key), 0.0) + float(value) + score_counts[str(key)] = score_counts.get(str(key), 0) + 1 + 
summary["judge_passed_runs"] = passed_runs + summary["judge_failed_runs"] = failed_judges + summary["pass_rate"] = passed_runs / scored_runs if scored_runs else 0.0 + summary["avg_total_tokens"] = total_tokens / max(1, summary["completed_runs"]) + summary["artifact_count_total"] = total_artifacts + summary["avg_scores"] = {key: score_totals[key] / score_counts[key] for key in sorted(score_totals) if score_counts.get(key)} + summary["benchmark_families"] = sorted(benchmark_families) + summary["benchmark_splits"] = sorted(benchmark_splits) updated = self._batch_repo.update_batch( batch_id, summary_json=summary, @@ -154,6 +197,17 @@ def record_eval_result(self, batch_id: str, result: Any) -> dict: summary = { "total_tokens": int(result.system_metrics.total_tokens), "tool_call_count": int(result.system_metrics.tool_call_count), + "artifact_count": len(result.artifacts), + "benchmark_family": result.benchmark.family if result.benchmark else "", + "benchmark_name": result.benchmark.name if result.benchmark else "", + "benchmark_split": result.benchmark.split if result.benchmark else "", + "instance_id": result.benchmark.instance_id if result.benchmark else "", + "judge_type": result.judge_result.judge_type if result.judge_result else "", + "judge_status": result.judge_result.status if result.judge_result else "", + "judge_verdict": result.judge_result.verdict if result.judge_result else "", + "scores": result.judge_result.scores if result.judge_result else {}, + "export_format": result.export_config.format if result.export_config else "", + "export_key": result.export_config.key if result.export_config else "", } updated = self._batch_repo.update_batch_run( batch_run["batch_run_id"], @@ -186,3 +240,61 @@ def _find_batch_run_for_scenario(self, batch_id: str, scenario_id: str) -> dict: if str(batch_run.get("scenario_id") or "") == scenario_id: return batch_run raise KeyError(f"Evaluation batch run not found for scenario {scenario_id} in batch {batch_id}") + + def 
get_batch_summary(self, batch_id: str) -> dict[str, Any]: + batch = self._batch_repo.get_batch(batch_id) + if batch is None: + raise KeyError(f"Evaluation batch not found: {batch_id}") + return { + "batch_id": batch_id, + "status": batch.get("status"), + "summary": self.refresh_batch_summary(batch_id), + } + + def compare_batches(self, baseline_batch_id: str, candidate_batch_id: str) -> dict[str, Any]: + baseline = self.get_batch_summary(baseline_batch_id) + candidate = self.get_batch_summary(candidate_batch_id) + baseline_summary = baseline["summary"] + candidate_summary = candidate["summary"] + deltas = {} + for key in ("pass_rate", "judge_passed_runs", "judge_failed_runs", "avg_total_tokens", "artifact_count_total"): + baseline_value = float(baseline_summary.get(key) or 0.0) + candidate_value = float(candidate_summary.get(key) or 0.0) + deltas[key] = { + "baseline": baseline_value, + "candidate": candidate_value, + "delta": candidate_value - baseline_value, + } + score_keys = sorted( + set(dict(baseline_summary.get("avg_scores") or {}).keys()) | set(dict(candidate_summary.get("avg_scores") or {}).keys()) + ) + deltas["avg_scores"] = { + key: { + "baseline": float(dict(baseline_summary.get("avg_scores") or {}).get(key) or 0.0), + "candidate": float(dict(candidate_summary.get("avg_scores") or {}).get(key) or 0.0), + "delta": float(dict(candidate_summary.get("avg_scores") or {}).get(key) or 0.0) + - float(dict(baseline_summary.get("avg_scores") or {}).get(key) or 0.0), + } + for key in score_keys + } + return { + "baseline_batch_id": baseline_batch_id, + "candidate_batch_id": candidate_batch_id, + "baseline": baseline_summary, + "candidate": candidate_summary, + "delta": deltas, + } + + @staticmethod + def _serialize_scenario_ref(scenario: EvalScenario) -> dict[str, Any]: + return { + "scenario_id": scenario.id, + "name": scenario.name, + "category": scenario.category, + "sandbox": scenario.sandbox, + "benchmark": scenario.benchmark.model_dump(mode="json") if 
scenario.benchmark else None, + "workspace": scenario.workspace.model_dump(mode="json") if scenario.workspace else None, + "judge_config": scenario.judge_config.model_dump(mode="json") if scenario.judge_config else None, + "artifact_policy": scenario.artifact_policy.model_dump(mode="json") if scenario.artifact_policy else None, + "export": scenario.export.model_dump(mode="json") if scenario.export else None, + } diff --git a/eval/benchmarks/__init__.py b/eval/benchmarks/__init__.py new file mode 100644 index 000000000..f479f4533 --- /dev/null +++ b/eval/benchmarks/__init__.py @@ -0,0 +1 @@ +"""Benchmark-specific assets and helpers.""" diff --git a/eval/benchmarks/swe_verified/README.md b/eval/benchmarks/swe_verified/README.md new file mode 100644 index 000000000..fd5a557cb --- /dev/null +++ b/eval/benchmarks/swe_verified/README.md @@ -0,0 +1,63 @@ +# SWE-bench Verified Smoke Slice + +This directory freezes the `#13` P0 sample assets for a `SWE-bench Verified` smoke slice. It does not add judge or export platform code; it fixes the sample inputs that `#12` and `#14` will consume later. 
+ +## Data source + +- Dataset: `SWE-bench/SWE-bench_Verified` +- Dataset split: `test` +- Dataset revision: `91aa3ed51b709be6457e12d00300a6a596d4c6a3` +- Official dataset guide: `https://www.swebench.com/SWE-bench/guides/datasets/` +- Official evaluation guide: `https://www.swebench.com/SWE-bench/guides/evaluation/` +- Official repo: `https://github.com/SWE-bench/SWE-bench` + +## Slice rules + +- Single repo only: `pytest-dev/pytest` +- Single `environment_setup_commit` only: `634cde9506eb1f48dec3ec77974ee8dc952207c6` +- Three verified instances: + - `pytest-dev__pytest-7521` + - `pytest-dev__pytest-7571` + - `pytest-dev__pytest-7490` +- Selection bias: + - prefer `<15 min fix` or `15 min - 1 hour` + - prefer `1-2` `FAIL_TO_PASS` tests + - keep one repo + one env pin so smoke prep can reuse a single checkout and environment + +## Files + +- `smoke/manifest.json`: frozen instance definitions with `repo`, `base_commit`, test mappings, and official patch hashes +- `smoke/judge_config.json`: P0 judge profile and official evaluator invocation template +- `smoke/sample_evaluator_input.json`: concrete evaluator input envelope for the smoke slice +- `smoke/sample_predictions.jsonl`: gold-format prediction records using the official dataset patches +- `smoke/export_contract.json`: minimum backend export fields that `#12` must align with +- `smoke/export_golden.json`: golden export fixture for contract verification +- `smoke/rpc/*.json`: JSON-RPC request/response fixtures that simulate judge/export preparation calls +- `smoke/fixtures/official_patches/*`: official solution patches and test patches pinned by sha256 + +## Validation entrypoints + +- Static asset + branch coverage under the project Python 3.12 environment: + - `./.venv/bin/python3.12 -m pytest tests/Unit/eval/test_swe_verified_assets.py` +- Smoke asset verification without the optional `datasets` dependency: + - `./.venv/bin/python3.12 -m eval.benchmarks.swe_verified.verify_smoke_assets 
--skip-official-dataset` +- Full alignment against the upstream dataset in an environment where `datasets` is installed: + - `python -m eval.benchmarks.swe_verified.verify_smoke_assets` + +## Current boundary + +- Completed here: + - sample instances + - repo / commit pins + - judge config + - evaluator input assets + - export contract and golden fixture + - JSON-RPC preparation fixtures +- Still blocked on `#12`: + - no benchmark-aware judge bridge in product code + - no backend export API that emits the contract shape + - no repo checkout / evaluator orchestration wired into monitor batches +- Once `#12` is ready: + - feed `smoke/sample_evaluator_input.json` into the new judge bridge + - emit an export payload and diff it against `smoke/export_golden.json` + - use `smoke/rpc/*.json` as request/response contract fixtures for backend tests diff --git a/eval/benchmarks/swe_verified/__init__.py b/eval/benchmarks/swe_verified/__init__.py new file mode 100644 index 000000000..6e3aff355 --- /dev/null +++ b/eval/benchmarks/swe_verified/__init__.py @@ -0,0 +1 @@ +"""SWE-bench Verified smoke slice assets.""" diff --git a/eval/benchmarks/swe_verified/acceptance.py b/eval/benchmarks/swe_verified/acceptance.py new file mode 100644 index 000000000..781c49d6a --- /dev/null +++ b/eval/benchmarks/swe_verified/acceptance.py @@ -0,0 +1,434 @@ +from __future__ import annotations + +import argparse +import asyncio +import json +import logging +import re +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +import uvicorn +from fastapi import APIRouter, FastAPI, HTTPException, Query +from fastapi.responses import JSONResponse, StreamingResponse +from pydantic import BaseModel + +from backend.monitor.api.http import global_router +from backend.monitor.infrastructure.evaluation import evaluation_storage_service +from eval.batch_service import EvaluationBatchService +from eval.benchmarks.swe_verified.assets import load_smoke_asset_bundle, 
resolve_repo_path +from eval.storage import TrajectoryStore +from storage.container import StorageContainer +from tests.fakes.supabase import FakeSupabaseClient + +logger = logging.getLogger(__name__) + +REPO_ROOT = Path(__file__).resolve().parents[3] +_TOKEN = "token-1" +_CLIENT = FakeSupabaseClient() +_MARKER_RE = re.compile(r"\[\[SWE_SMOKE::(?P<instance_id>[^:\]]+)::(?P<mode>pass|fail|error)\]\]") + + +def create_fake_supabase_client() -> FakeSupabaseClient: + return _CLIENT + + +def reset_fake_supabase_client() -> FakeSupabaseClient: + global _CLIENT + _CLIENT = FakeSupabaseClient() + return _CLIENT + + +def evaluate_smoke_judge_payload( + *, + instance_id: str, + payload: dict[str, Any], + profile_id: str = "swe_verified_pytest_smoke_gold_v1", +) -> dict[str, Any]: + bundle = load_smoke_asset_bundle() + known_instances = {instance.instance_id for instance in bundle.manifest.instances} + if instance_id not in known_instances: + raise ValueError(f"Unknown SWE-bench Verified smoke instance: {instance_id}") + + result = dict(payload.get("result") or {}) + final_response = str(result.get("final_response") or "") + artifacts = list(result.get("artifacts") or []) + artifact_names = {str(artifact.get("name") or "") for artifact in artifacts if isinstance(artifact, dict)} + required_artifacts = {"final-response", "benchmark-instance", "workspace"} + missing_artifacts = sorted(required_artifacts - artifact_names) + + passed = f"PATCH_OK::{instance_id}" in final_response and not missing_artifacts + rationale_parts = [] + rationale_parts.append("Found patch marker." 
if f"PATCH_OK::{instance_id}" in final_response else "Patch marker missing.") + if missing_artifacts: + rationale_parts.append(f"Missing artifacts: {', '.join(missing_artifacts)}.") + else: + rationale_parts.append("Required artifacts present.") + verdict = "passed" if passed else "failed" + artifact_coverage = (len(required_artifacts) - len(missing_artifacts)) / len(required_artifacts) + + return { + "status": "completed", + "verdict": verdict, + "rationale": " ".join(rationale_parts), + "scores": { + "resolved": 1.0 if passed else 0.0, + "artifact_coverage": artifact_coverage, + }, + "metadata": { + "instance_id": instance_id, + "judge_profile": profile_id, + "missing_artifacts": missing_artifacts, + }, + } + + +def simulate_jsonrpc_request(request: dict[str, Any]) -> dict[str, Any]: + bundle = load_smoke_asset_bundle() + request_id = request.get("id") + method = str(request.get("method") or "") + params = dict(request.get("params") or {}) + + def _error(code: int, message: str) -> dict[str, Any]: + return {"jsonrpc": "2.0", "id": request_id, "error": {"code": code, "message": message}} + + if request.get("jsonrpc") != "2.0": + return _error(-32600, "Only JSON-RPC 2.0 requests are supported.") + + if method == "eval.prepareJudgeRun": + if params.get("benchmark") != "swe_verified": + return _error(-32602, "benchmark must be swe_verified") + for key in ("judge_config_path", "evaluator_input_path"): + target = params.get(key) + if not target or not resolve_repo_path(str(target)).exists(): + return _error(-32602, f"{key} is missing or does not exist") + response = dict(bundle.rpc["judge_response"]) + response["id"] = request_id + return response + + if method == "eval.previewExport": + for key in ("contract_path", "source_slice_path"): + target = params.get(key) + if not target or not resolve_repo_path(str(target)).exists(): + return _error(-32602, f"{key} is missing or does not exist") + response = dict(bundle.rpc["export_response"]) + response["id"] = request_id + 
return response + + return _error(-32601, f"Unsupported method: {method}") + + +class _FakeAuthService: + def verify_token(self, token: str) -> dict[str, str]: + if token != _TOKEN: + raise ValueError("Unknown acceptance token") + return {"user_id": "owner-1"} + + +class _CreateThreadRequest(BaseModel): + agent_user_id: str + sandbox: str = "local" + cwd: str | None = None + + +class _RunMessageRequest(BaseModel): + message: str + enable_trajectory: bool = True + + +@dataclass +class _ThreadRecord: + thread_id: str + agent_user_id: str + sandbox: str + cwd: str | None + runtime_status: dict[str, Any] = field(default_factory=lambda: {"context": {"usage_percent": 0.42}}) + conversation: list[dict[str, Any]] = field(default_factory=list) + trace_events: list[dict[str, Any]] = field(default_factory=list) + sse_events: list[dict[str, Any]] = field(default_factory=list) + deleted: bool = False + + +class _ThreadHarness: + def __init__(self) -> None: + self._counter = 1 + self._threads: dict[str, _ThreadRecord] = {} + + def create_thread(self, payload: _CreateThreadRequest) -> str: + thread_id = f"thread-{self._counter}" + self._counter += 1 + self._threads[thread_id] = _ThreadRecord( + thread_id=thread_id, + agent_user_id=payload.agent_user_id, + sandbox=payload.sandbox, + cwd=payload.cwd, + ) + return thread_id + + def run_message(self, thread_id: str, message: str) -> None: + record = self._threads.get(thread_id) + if record is None: + raise KeyError(f"Thread not found: {thread_id}") + + user_text, instance_id, mode = self._parse_message(message) + status_payload = { + "tokens": { + "input_tokens": 120, + "output_tokens": 80, + "total_tokens": 200, + "total_cost_usd": 0.02, + }, + "context": {"usage_percent": 0.42}, + } + if mode == "error": + sse_events = [ + self._sse_event(1, "status", status_payload), + self._sse_event(2, "error", {"error": f"simulated runtime failure for {instance_id}"}), + ] + assistant_text = "" + else: + assistant_lines = [f"Inspecting 
repository checkout for {instance_id}."] + if mode == "pass": + assistant_lines.append(f"PATCH_OK::{instance_id}") + assistant_lines.append("Focused tests: 1 passed.") + else: + assistant_lines.append("Unable to confirm the requested fix.") + assistant_lines.append("Focused tests: 1 failed.") + assistant_text = "\n".join(assistant_lines) + sse_events = [ + self._sse_event( + 1, + "tool_call", + { + "id": f"tool-{thread_id}-1", + "name": "inspect_repo", + "args": {"cwd": record.cwd or "/workspace/pytest", "instance_id": instance_id}, + }, + ), + self._sse_event( + 2, + "tool_result", + { + "tool_call_id": f"tool-{thread_id}-1", + "content": f"Repository metadata collected for {instance_id}.", + }, + ), + self._sse_event(3, "text", {"content": assistant_text}), + self._sse_event(4, "status", status_payload), + self._sse_event(5, "run_done", {"status": "completed"}), + ] + + record.runtime_status = status_payload + record.conversation = [ + {"role": "user", "content": user_text}, + {"role": "assistant", "content": assistant_text}, + ] + record.sse_events = sse_events + record.trace_events = [ + { + "seq": event["id"], + "actor": "agent" if event["event"] != "tool_result" else "tool", + "event_type": event["event"], + "summary": event["event"], + "payload": event["data"], + } + for event in sse_events + ] + + def events_after(self, thread_id: str, after: int) -> list[dict[str, Any]]: + record = self._threads.get(thread_id) + if record is None: + raise KeyError(f"Thread not found: {thread_id}") + return [event for event in record.sse_events if int(event["id"]) > after] + + def runtime(self, thread_id: str) -> dict[str, Any]: + record = self._threads.get(thread_id) + if record is None: + raise KeyError(f"Thread not found: {thread_id}") + return record.runtime_status + + def delete_thread(self, thread_id: str) -> None: + record = self._threads.get(thread_id) + if record is None: + raise KeyError(f"Thread not found: {thread_id}") + record.deleted = True + + def 
monitor_thread_detail(self, thread_id: str) -> dict[str, Any]: + record = self._threads.get(thread_id) + if record is None: + raise KeyError(f"Thread not found: {thread_id}") + return { + "thread": { + "thread_id": record.thread_id, + "agent_user_id": record.agent_user_id, + "sandbox": record.sandbox, + "cwd": record.cwd, + "status": "deleted" if record.deleted else "active", + }, + "trajectory": { + "conversation": record.conversation, + "events": record.trace_events, + }, + } + + @staticmethod + def _parse_message(message: str) -> tuple[str, str, str]: + match = _MARKER_RE.search(message) + if match is None: + raise ValueError("Acceptance harness expected a [[SWE_SMOKE::<instance_id>::<mode>]] marker in the message.") + user_text = _MARKER_RE.sub("", message, count=1).strip() + return user_text, match.group("instance_id"), match.group("mode") + + @staticmethod + def _sse_event(event_id: int, event_type: str, payload: dict[str, Any]) -> dict[str, Any]: + return {"id": event_id, "event": event_type, "data": payload} + + +def create_acceptance_app() -> FastAPI: + reset_fake_supabase_client() + thread_harness = _ThreadHarness() + storage_container = StorageContainer(supabase_client=create_fake_supabase_client()) + + app = FastAPI(title="SWE-bench Verified Acceptance Harness") + app.state.auth_service = _FakeAuthService() + + def _make_trajectory_store() -> TrajectoryStore: + return TrajectoryStore(eval_repo=storage_container.eval_repo()) + + def _make_eval_batch_service() -> EvaluationBatchService: + return EvaluationBatchService(batch_repo=storage_container.evaluation_batch_repo()) + + evaluation_storage_service.make_trajectory_store = _make_trajectory_store + evaluation_storage_service.make_eval_batch_service = _make_eval_batch_service + + monitor_thread_router = APIRouter() + thread_router = APIRouter() + + @monitor_thread_router.get("/threads") + async def monitor_threads() -> dict[str, Any]: + return { + "threads": [ + { + "thread_id": record.thread_id, + "agent_user_id": 
record.agent_user_id, + "sandbox": record.sandbox, + "status": "deleted" if record.deleted else "active", + } + for record in thread_harness._threads.values() + ] + } + + @monitor_thread_router.get("/threads/{thread_id}") + async def monitor_thread_detail(thread_id: str) -> dict[str, Any]: + try: + return thread_harness.monitor_thread_detail(thread_id) + except KeyError as exc: + raise HTTPException(status_code=404, detail=str(exc)) from exc + + @thread_router.post("/api/threads") + async def create_thread(payload: _CreateThreadRequest) -> dict[str, str]: + return {"thread_id": thread_harness.create_thread(payload)} + + @thread_router.post("/api/threads/{thread_id}/messages") + async def run_message(thread_id: str, payload: _RunMessageRequest) -> dict[str, Any]: + try: + thread_harness.run_message(thread_id, payload.message) + except KeyError as exc: + raise HTTPException(status_code=404, detail=str(exc)) from exc + return {"accepted": True, "thread_id": thread_id} + + @thread_router.get("/api/threads/{thread_id}/events") + async def stream_events(thread_id: str, after: int = Query(default=0, ge=0)) -> StreamingResponse: + try: + events = thread_harness.events_after(thread_id, after) + except KeyError as exc: + raise HTTPException(status_code=404, detail=str(exc)) from exc + + async def _emit() -> Any: + for event in events: + yield f"id: {event['id']}\n".encode() + yield f"event: {event['event']}\n".encode() + yield f"data: {json.dumps(event['data'])}\n\n".encode() + await asyncio.sleep(0) + + return StreamingResponse(_emit(), media_type="text/event-stream") + + @thread_router.get("/api/threads/{thread_id}/runtime") + async def thread_runtime(thread_id: str) -> dict[str, Any]: + try: + return thread_harness.runtime(thread_id) + except KeyError as exc: + raise HTTPException(status_code=404, detail=str(exc)) from exc + + @thread_router.delete("/api/threads/{thread_id}") + async def delete_thread(thread_id: str) -> JSONResponse: + try: + 
thread_harness.delete_thread(thread_id) + except KeyError as exc: + raise HTTPException(status_code=404, detail=str(exc)) from exc + return JSONResponse({"deleted": True}) + + @thread_router.get("/healthz") + async def healthz() -> dict[str, str]: + return {"status": "ok"} + + app.include_router(global_router.router, prefix="/api/monitor") + app.include_router(monitor_thread_router, prefix="/api/monitor") + app.include_router(thread_router) + return app + + +def _run_judge_cli(args: argparse.Namespace) -> int: + payload = json.load(args.stdin) + result = evaluate_smoke_judge_payload( + instance_id=args.instance_id, + payload=payload, + profile_id=args.profile_id, + ) + print(json.dumps(result)) + return 0 + + +def _run_rpc_cli(args: argparse.Namespace) -> int: + with Path(args.request).open(encoding="utf-8") as handle: + request = json.load(handle) + print(json.dumps(simulate_jsonrpc_request(request), indent=2)) + return 0 + + +def _run_server_cli(args: argparse.Namespace) -> int: + logger.info("Starting SWE-bench Verified acceptance harness on port %s", args.port) + app = create_acceptance_app() + uvicorn.run(app, host="127.0.0.1", port=args.port, log_level="info") + return 0 + + +def build_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="SWE-bench Verified acceptance helpers.") + subparsers = parser.add_subparsers(dest="command", required=True) + + serve = subparsers.add_parser("serve", help="Start the local acceptance harness.") + serve.add_argument("--port", type=int, default=8765) + serve.set_defaults(handler=_run_server_cli) + + judge = subparsers.add_parser("judge", help="Run the smoke command judge.") + judge.add_argument("--instance-id", required=True) + judge.add_argument("--profile-id", default="swe_verified_pytest_smoke_gold_v1") + judge.add_argument("--stdin", type=argparse.FileType("r"), default="-") + judge.set_defaults(handler=_run_judge_cli) + + rpc = subparsers.add_parser("rpc", help="Simulate a JSON-RPC 
benchmark preparation call.") + rpc.add_argument("--request", required=True) + rpc.set_defaults(handler=_run_rpc_cli) + return parser + + +def main() -> int: + parser = build_arg_parser() + args = parser.parse_args() + return int(args.handler(args)) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/eval/benchmarks/swe_verified/assets.py b/eval/benchmarks/swe_verified/assets.py new file mode 100644 index 000000000..ee0422890 --- /dev/null +++ b/eval/benchmarks/swe_verified/assets.py @@ -0,0 +1,398 @@ +from __future__ import annotations + +import copy +import hashlib +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +from pydantic import BaseModel, Field + +REPO_ROOT = Path(__file__).resolve().parents[3] +SMOKE_ROOT = Path(__file__).resolve().parent / "smoke" + + +class SmokeBenchmark(BaseModel): + family: str + dataset_name: str + dataset_split: str + dataset_revision: str + source_urls: dict[str, str] = Field(default_factory=dict) + + +class SmokeSelection(BaseModel): + repo: str + environment_setup_commit: str + max_instances: int + selection_rules: list[str] = Field(default_factory=list) + + +class SmokeJudgeHints(BaseModel): + prediction_key: str + success_requirements: list[str] = Field(default_factory=list) + prediction_format: dict[str, str] = Field(default_factory=dict) + + +class SmokeRuntimeHints(BaseModel): + checkout_url: str + working_directory: str + language: str + + +class SmokeInstance(BaseModel): + instance_id: str + repo: str + base_commit: str + environment_setup_commit: str + difficulty: str + created_at: str + version: str + problem_statement: str + hints_text: str = "" + fail_to_pass: list[str] = Field(default_factory=list) + pass_to_pass: list[str] = Field(default_factory=list) + fail_to_pass_count: int + pass_to_pass_count: int + official_patch_path: str + official_patch_sha256: str + official_test_patch_path: str + official_test_patch_sha256: str + judge: 
SmokeJudgeHints + runtime: SmokeRuntimeHints + selection_rank: int + + +class SmokeManifest(BaseModel): + slice_id: str + benchmark: SmokeBenchmark + selection: SmokeSelection + instances: list[SmokeInstance] = Field(default_factory=list) + + +class OfficialEvaluatorConfig(BaseModel): + module: str + command_template: list[str] = Field(default_factory=list) + prediction_format: str + required_prediction_fields: list[str] = Field(default_factory=list) + gold_predictions_supported: bool = False + + +class JudgeScoringConfig(BaseModel): + resolved_field: str + required_test_sets: list[str] = Field(default_factory=list) + failure_policy: str + + +class JudgeArtifactsConfig(BaseModel): + prediction_records_path: str + export_contract_path: str + golden_export_path: str + + +class JudgeConfig(BaseModel): + profile_id: str + benchmark: str + slice_manifest_path: str + dataset_name: str + dataset_split: str + dataset_revision: str + repo: str + environment_setup_commit: str + instance_ids: list[str] = Field(default_factory=list) + official_evaluator: OfficialEvaluatorConfig + scoring: JudgeScoringConfig + artifacts: JudgeArtifactsConfig + + +class SampleEvaluatorInput(BaseModel): + judge_profile: str + slice_id: str + run_id: str + max_workers: int + dataset_name: str + dataset_split: str + dataset_revision: str + repo: str + environment_setup_commit: str + instance_ids: list[str] = Field(default_factory=list) + predictions_path: str + official_patch_mode: str + + +class ExportContract(BaseModel): + contract_id: str + description: str + top_level_required: list[str] = Field(default_factory=list) + instance_required: list[str] = Field(default_factory=list) + prediction_record_required: list[str] = Field(default_factory=list) + judge_inputs_required: list[str] = Field(default_factory=list) + judge_result_required: list[str] = Field(default_factory=list) + artifacts_required: list[str] = Field(default_factory=list) + + +@dataclass +class SmokeAssetBundle: + manifest: 
SmokeManifest + judge_config: JudgeConfig + sample_evaluator_input: SampleEvaluatorInput + export_contract: ExportContract + export_golden: dict[str, Any] + predictions: list[dict[str, Any]] + rpc: dict[str, dict[str, Any]] + + +def resolve_repo_path(path: str) -> Path: + resolved = Path(path) + if not resolved.is_absolute(): + resolved = REPO_ROOT / resolved + return resolved + + +def _load_json(path: Path) -> Any: + with path.open(encoding="utf-8") as handle: + return json.load(handle) + + +def _load_jsonl(path: Path) -> list[dict[str, Any]]: + records: list[dict[str, Any]] = [] + with path.open(encoding="utf-8") as handle: + for line in handle: + line = line.strip() + if not line: + continue + records.append(json.loads(line)) + return records + + +def load_smoke_asset_bundle(smoke_root: Path = SMOKE_ROOT) -> SmokeAssetBundle: + manifest = SmokeManifest.model_validate(_load_json(smoke_root / "manifest.json")) + judge_config = JudgeConfig.model_validate(_load_json(smoke_root / "judge_config.json")) + sample_evaluator_input = SampleEvaluatorInput.model_validate(_load_json(smoke_root / "sample_evaluator_input.json")) + export_contract = ExportContract.model_validate(_load_json(smoke_root / "export_contract.json")) + export_golden = _load_json(smoke_root / "export_golden.json") + predictions = _load_jsonl(smoke_root / "sample_predictions.jsonl") + rpc = { + "judge_request": _load_json(smoke_root / "rpc/judge_request.json"), + "judge_response": _load_json(smoke_root / "rpc/judge_response.json"), + "export_request": _load_json(smoke_root / "rpc/export_request.json"), + "export_response": _load_json(smoke_root / "rpc/export_response.json"), + } + return SmokeAssetBundle( + manifest=manifest, + judge_config=judge_config, + sample_evaluator_input=sample_evaluator_input, + export_contract=export_contract, + export_golden=export_golden, + predictions=predictions, + rpc=rpc, + ) + + +def clone_bundle(bundle: SmokeAssetBundle) -> SmokeAssetBundle: + return 
copy.deepcopy(bundle) + + +def _sha256_for_path(path: Path) -> str: + return hashlib.sha256(path.read_bytes()).hexdigest() + + +def _require_keys(payload: dict[str, Any], required: list[str], *, label: str, issues: list[str]) -> None: + missing = [key for key in required if key not in payload] + if missing: + issues.append(f"{label} missing keys: {', '.join(missing)}") + + +def validate_smoke_assets(bundle: SmokeAssetBundle) -> list[str]: + issues: list[str] = [] + manifest = bundle.manifest + instance_ids = [instance.instance_id for instance in manifest.instances] + + if not manifest.instances: + issues.append("manifest must contain at least one instance") + return issues + + if len(instance_ids) != len(set(instance_ids)): + issues.append("manifest contains duplicate instance_id values") + + if len(manifest.instances) > manifest.selection.max_instances: + issues.append("manifest exceeds selection.max_instances") + + if bundle.judge_config.instance_ids != instance_ids: + issues.append("judge config instance_ids do not match manifest order") + + if bundle.sample_evaluator_input.instance_ids != instance_ids: + issues.append("sample evaluator input instance_ids do not match manifest order") + + if bundle.judge_config.profile_id != bundle.sample_evaluator_input.judge_profile: + issues.append("judge profile mismatch between judge_config and sample_evaluator_input") + + if bundle.judge_config.dataset_name != manifest.benchmark.dataset_name: + issues.append("judge config dataset_name does not match manifest") + + if bundle.judge_config.dataset_revision != manifest.benchmark.dataset_revision: + issues.append("judge config dataset_revision does not match manifest") + + if bundle.sample_evaluator_input.dataset_name != manifest.benchmark.dataset_name: + issues.append("sample evaluator input dataset_name does not match manifest") + + if bundle.sample_evaluator_input.dataset_revision != manifest.benchmark.dataset_revision: + issues.append("sample evaluator input 
dataset_revision does not match manifest") + + if bundle.sample_evaluator_input.max_workers != 1: + issues.append("sample evaluator input must pin max_workers to 1 for smoke validation") + + predictions_path = resolve_repo_path(bundle.sample_evaluator_input.predictions_path) + if not predictions_path.exists(): + issues.append(f"sample predictions file is missing: {predictions_path}") + + if len(bundle.predictions) != len(manifest.instances): + issues.append("sample predictions line count does not match manifest instances") + + prediction_lookup = {record.get("instance_id"): record for record in bundle.predictions} + if list(prediction_lookup) != instance_ids: + issues.append("sample predictions instance_ids do not match manifest order") + + for instance in manifest.instances: + if instance.repo != manifest.selection.repo: + issues.append(f"{instance.instance_id} repo does not match slice repo") + if instance.environment_setup_commit != manifest.selection.environment_setup_commit: + issues.append(f"{instance.instance_id} environment_setup_commit does not match slice pin") + if instance.fail_to_pass_count != len(instance.fail_to_pass): + issues.append(f"{instance.instance_id} fail_to_pass_count is inconsistent") + if instance.pass_to_pass_count != len(instance.pass_to_pass): + issues.append(f"{instance.instance_id} pass_to_pass_count is inconsistent") + if not instance.fail_to_pass: + issues.append(f"{instance.instance_id} must include at least one FAIL_TO_PASS test") + if instance.judge.prediction_key != instance.instance_id: + issues.append(f"{instance.instance_id} judge prediction_key must equal instance_id") + if set(instance.judge.prediction_format) != {"instance_id", "model_name_or_path", "model_patch"}: + issues.append(f"{instance.instance_id} prediction_format keys are incomplete") + + patch_path = resolve_repo_path(instance.official_patch_path) + test_patch_path = resolve_repo_path(instance.official_test_patch_path) + if not patch_path.exists(): + 
issues.append(f"{instance.instance_id} patch fixture is missing: {patch_path}") + else: + actual_patch_sha = _sha256_for_path(patch_path) + if actual_patch_sha != instance.official_patch_sha256: + issues.append(f"{instance.instance_id} patch fixture sha256 mismatch") + if not test_patch_path.exists(): + issues.append(f"{instance.instance_id} test patch fixture is missing: {test_patch_path}") + else: + actual_test_patch_sha = _sha256_for_path(test_patch_path) + if actual_test_patch_sha != instance.official_test_patch_sha256: + issues.append(f"{instance.instance_id} test patch fixture sha256 mismatch") + + prediction = prediction_lookup.get(instance.instance_id) + if prediction is None: + issues.append(f"{instance.instance_id} missing from sample predictions") + else: + expected_prediction_fields = set(bundle.judge_config.official_evaluator.required_prediction_fields) + if set(prediction) != expected_prediction_fields: + issues.append(f"{instance.instance_id} prediction fields do not match judge config") + if prediction.get("model_name_or_path") != "gold": + issues.append(f"{instance.instance_id} sample prediction must use the gold label") + if hashlib.sha256(prediction.get("model_patch", "").encode("utf-8")).hexdigest() != instance.official_patch_sha256: + issues.append(f"{instance.instance_id} sample prediction patch does not match fixture sha256") + + _require_keys(bundle.export_golden, bundle.export_contract.top_level_required, label="export_golden", issues=issues) + + export_instances = bundle.export_golden.get("instances", []) + if [row.get("instance_id") for row in export_instances] != instance_ids: + issues.append("export_golden instances do not match manifest order") + + for row in export_instances: + instance_id = row.get("instance_id", "") + _require_keys(row, bundle.export_contract.instance_required, label=f"export_golden[{instance_id}]", issues=issues) + _require_keys( + row.get("prediction_record", {}), + 
bundle.export_contract.prediction_record_required, + label=f"export_golden[{instance_id}].prediction_record", + issues=issues, + ) + _require_keys( + row.get("judge_inputs", {}), + bundle.export_contract.judge_inputs_required, + label=f"export_golden[{instance_id}].judge_inputs", + issues=issues, + ) + _require_keys( + row.get("judge_result", {}), + bundle.export_contract.judge_result_required, + label=f"export_golden[{instance_id}].judge_result", + issues=issues, + ) + _require_keys( + row.get("artifacts", {}), + bundle.export_contract.artifacts_required, + label=f"export_golden[{instance_id}].artifacts", + issues=issues, + ) + + judge_request = bundle.rpc["judge_request"] + judge_response = bundle.rpc["judge_response"] + export_request = bundle.rpc["export_request"] + export_response = bundle.rpc["export_response"] + if judge_request.get("jsonrpc") != "2.0" or judge_response.get("jsonrpc") != "2.0": + issues.append("judge rpc fixtures must use jsonrpc=2.0") + if export_request.get("jsonrpc") != "2.0" or export_response.get("jsonrpc") != "2.0": + issues.append("export rpc fixtures must use jsonrpc=2.0") + if judge_request.get("id") != judge_response.get("id"): + issues.append("judge rpc request/response ids do not match") + if export_request.get("id") != export_response.get("id"): + issues.append("export rpc request/response ids do not match") + for rpc_key, payload_key in ( + ("judge_request", "judge_config_path"), + ("judge_request", "evaluator_input_path"), + ("export_request", "contract_path"), + ("export_request", "source_slice_path"), + ): + target = bundle.rpc[rpc_key]["params"][payload_key] + if not resolve_repo_path(target).exists(): + issues.append(f"{rpc_key} references a missing file: {target}") + + return issues + + +def validate_official_dataset_alignment(bundle: SmokeAssetBundle, dataset_rows: list[dict[str, Any]] | None = None) -> list[str]: + if dataset_rows is None: + from datasets import load_dataset + + dataset_rows = list( + load_dataset( + 
bundle.manifest.benchmark.dataset_name, + split=bundle.manifest.benchmark.dataset_split, + ) + ) + + row_lookup = {row["instance_id"]: row for row in dataset_rows} + issues: list[str] = [] + for instance in bundle.manifest.instances: + row = row_lookup.get(instance.instance_id) + if row is None: + issues.append(f"{instance.instance_id} is missing from the official dataset") + continue + if row["repo"] != instance.repo: + issues.append(f"{instance.instance_id} repo does not match official dataset") + if row["base_commit"] != instance.base_commit: + issues.append(f"{instance.instance_id} base_commit does not match official dataset") + if row["environment_setup_commit"] != instance.environment_setup_commit: + issues.append(f"{instance.instance_id} environment_setup_commit does not match official dataset") + if row["difficulty"] != instance.difficulty: + issues.append(f"{instance.instance_id} difficulty does not match official dataset") + if row["problem_statement"] != instance.problem_statement: + issues.append(f"{instance.instance_id} problem_statement does not match official dataset") + if row.get("hints_text", "") != instance.hints_text: + issues.append(f"{instance.instance_id} hints_text does not match official dataset") + fail_to_pass = json.loads(row["FAIL_TO_PASS"]) + pass_to_pass = json.loads(row["PASS_TO_PASS"]) + if fail_to_pass != instance.fail_to_pass: + issues.append(f"{instance.instance_id} FAIL_TO_PASS does not match official dataset") + if pass_to_pass != instance.pass_to_pass: + issues.append(f"{instance.instance_id} PASS_TO_PASS does not match official dataset") + if hashlib.sha256(row["patch"].encode("utf-8")).hexdigest() != instance.official_patch_sha256: + issues.append(f"{instance.instance_id} patch sha256 does not match official dataset") + if hashlib.sha256(row["test_patch"].encode("utf-8")).hexdigest() != instance.official_test_patch_sha256: + issues.append(f"{instance.instance_id} test_patch sha256 does not match official dataset") + return 
issues diff --git a/eval/benchmarks/swe_verified/smoke/export_contract.json b/eval/benchmarks/swe_verified/smoke/export_contract.json new file mode 100644 index 000000000..d4f447f37 --- /dev/null +++ b/eval/benchmarks/swe_verified/smoke/export_contract.json @@ -0,0 +1,47 @@ +{ + "contract_id": "mycel.swe_verified.export.v1", + "description": "Minimum fields the backend export must align with for the SWE-bench Verified smoke slice.", + "top_level_required": [ + "export_version", + "benchmark", + "slice_id", + "dataset_name", + "dataset_split", + "dataset_revision", + "judge_profile", + "run_id", + "generated_at", + "predictions_path", + "instances" + ], + "instance_required": [ + "instance_id", + "repo", + "base_commit", + "environment_setup_commit", + "problem_statement", + "status", + "prediction_record", + "judge_inputs", + "judge_result", + "artifacts" + ], + "prediction_record_required": [ + "instance_id", + "model_name_or_path", + "model_patch_sha256" + ], + "judge_inputs_required": [ + "fail_to_pass", + "pass_to_pass" + ], + "judge_result_required": [ + "status", + "resolved" + ], + "artifacts_required": [ + "patch_diff", + "test_patch", + "predictions_record" + ] +} diff --git a/eval/benchmarks/swe_verified/smoke/export_golden.json b/eval/benchmarks/swe_verified/smoke/export_golden.json new file mode 100644 index 000000000..f3eca7cfc --- /dev/null +++ b/eval/benchmarks/swe_verified/smoke/export_golden.json @@ -0,0 +1,347 @@ +{ + "export_version": "1.0", + "benchmark": "swe_verified", + "slice_id": "swe_verified_pytest_smoke_v1", + "dataset_name": "SWE-bench/SWE-bench_Verified", + "dataset_split": "test", + "dataset_revision": "91aa3ed51b709be6457e12d00300a6a596d4c6a3", + "judge_profile": "swe_verified_pytest_smoke_gold_v1", + "run_id": "sample-swe-verified-pytest-gold", + "generated_at": "2026-04-21T00:00:00Z", + "predictions_path": "eval/benchmarks/swe_verified/smoke/sample_predictions.jsonl", + "instances": [ + { + "instance_id": "pytest-dev__pytest-7521", 
+ "repo": "pytest-dev/pytest", + "base_commit": "41d211c24a6781843b174379d6d6538f5c17adb9", + "environment_setup_commit": "634cde9506eb1f48dec3ec77974ee8dc952207c6", + "problem_statement": "pytest 6.0.0rc1: capfd.readouterr() converts \\r to \\n\nI am testing pytest 6.0.0rc1 with Fedora packages. This is the first failure I get, from borgbackup 1.1.13.\r\n\r\n```\r\n______________________ test_progress_percentage_sameline _______________________\r\n\r\ncapfd = <_pytest.capture.CaptureFixture object at 0x7f9bd55e4d00>\r\nmonkeypatch = <_pytest.monkeypatch.MonkeyPatch object at 0x7f9bcbbced60>\r\n\r\n def test_progress_percentage_sameline(capfd, monkeypatch):\r\n # run the test as if it was in a 4x1 terminal\r\n monkeypatch.setenv('COLUMNS', '4')\r\n monkeypatch.setenv('LINES', '1')\r\n pi = ProgressIndicatorPercent(1000, step=5, start=0, msg=\"%3.0f%%\")\r\n pi.logger.setLevel('INFO')\r\n pi.show(0)\r\n out, err = capfd.readouterr()\r\n> assert err == ' 0%\\r'\r\nE AssertionError: assert ' 0%\\n' == ' 0%\\r'\r\nE - 0%\r\nE ? ^\r\nE + 0%\r\nE ? ^\r\n\r\nbuild/lib.linux-x86_64-3.9/borg/testsuite/helpers.py:748: AssertionError\r\n```\r\n\r\nI've distilled a reproducer:\r\n\r\n```python\r\ndef test_cafd_includes_carriage_return(capfd):\r\n print('Greetings from DOS', end='\\r')\r\n out, err = capfd.readouterr()\r\n assert out.endswith('\\r')\r\n```\r\n\r\npytest 5:\r\n\r\n```\r\n============================= test session starts ==============================\r\nplatform linux -- Python 3.8.4, pytest-5.4.3, py-1.9.0, pluggy-0.13.1\r\nrootdir: /home/churchyard/tmp/pytest_reproducers\r\ncollected 1 item\r\n\r\ntest_capfd.py . 
[100%]\r\n\r\n============================== 1 passed in 0.00s ===============================\r\n\r\n\r\nPackage Version\r\n-------------- -------\r\nattrs 19.3.0 \r\nmore-itertools 8.4.0 \r\npackaging 20.4 \r\npip 19.3.1 \r\npluggy 0.13.1 \r\npy 1.9.0 \r\npyparsing 2.4.7 \r\npytest 5.4.3 \r\nsetuptools 41.6.0 \r\nsix 1.15.0 \r\nwcwidth 0.2.5 \r\n\r\n```\r\n\r\npytest 6:\r\n\r\n```\r\n============================= test session starts ==============================\r\nplatform linux -- Python 3.8.4, pytest-6.0.0rc1, py-1.9.0, pluggy-0.13.1\r\nrootdir: /home/churchyard/tmp/pytest_reproducers\r\ncollected 1 item\r\n\r\ntest_capfd.py F [100%]\r\n\r\n=================================== FAILURES ===================================\r\n______________________ test_cafd_includes_carriage_return ______________________\r\n\r\ncapfd = <_pytest.capture.CaptureFixture object at 0x7f1ddd3219a0>\r\n\r\n def test_cafd_includes_carriage_return(capfd):\r\n print('Greetings from DOS', end='\\r')\r\n out, err = capfd.readouterr()\r\n> assert out.endswith('\\r')\r\nE AssertionError: assert False\r\nE + where False = ('\\r')\r\nE + where = 'Greetings from DOS\\n'.endswith\r\n\r\ntest_capfd.py:4: AssertionError\r\n=========================== short test summary info ============================\r\nFAILED test_capfd.py::test_cafd_includes_carriage_return - AssertionError: as...\r\n============================== 1 failed in 0.01s ===============================\r\n\r\n\r\nPackage Version \r\n-------------- --------\r\nattrs 19.3.0 \r\niniconfig 1.0.0 \r\nmore-itertools 8.4.0 \r\npackaging 20.4 \r\npip 19.3.1 \r\npluggy 0.13.1 \r\npy 1.9.0 \r\npyparsing 3.0.0a2 \r\npytest 6.0.0rc1\r\nsetuptools 41.6.0 \r\nsix 1.15.0 \r\ntoml 0.10.1 \r\n```\r\n\r\nThis is Fedora 32 with Python 3.8 (the original failure in borgbackup is Fedora 33 with Python 3.9).\r\n\r\n\r\nI could have not found anything about this change in the changelog nor at https://docs.pytest.org/en/latest/capture.html hence I assume 
this is a regression. I've labeled it as such, but feel free to change that.\n", + "status": "prepared", + "prediction_record": { + "instance_id": "pytest-dev__pytest-7521", + "model_name_or_path": "gold", + "model_patch_sha256": "e1f62165b6ecc14c60b08bb71a59b927adba0c3b9a8393394481cc6a4f0f8f0e" + }, + "judge_inputs": { + "fail_to_pass": [ + "testing/test_capture.py::TestCaptureFixture::test_cafd_preserves_newlines[\\r\\n]", + "testing/test_capture.py::TestCaptureFixture::test_cafd_preserves_newlines[\\r]" + ], + "pass_to_pass": [ + "test_capsysbinary.py::test_hello", + "[100%]", + "testing/test_capture.py::TestCaptureManager::test_capturing_basic_api[no]", + "testing/test_capture.py::TestCaptureManager::test_capturing_basic_api[sys]", + "testing/test_capture.py::TestCaptureManager::test_capturing_basic_api[fd]", + "testing/test_capture.py::TestCaptureManager::test_init_capturing", + "testing/test_capture.py::TestCaptureFixture::test_cafd_preserves_newlines[\\n]", + "testing/test_capture.py::TestCaptureIO::test_text", + "testing/test_capture.py::TestCaptureIO::test_unicode_and_str_mixture", + "testing/test_capture.py::TestCaptureIO::test_write_bytes_to_buffer", + "testing/test_capture.py::TestTeeCaptureIO::test_write_bytes_to_buffer", + "testing/test_capture.py::TestTeeCaptureIO::test_text", + "testing/test_capture.py::TestTeeCaptureIO::test_unicode_and_str_mixture", + "testing/test_capture.py::test_dontreadfrominput", + "testing/test_capture.py::TestFDCapture::test_stderr", + "testing/test_capture.py::TestFDCapture::test_stdin", + "testing/test_capture.py::TestFDCapture::test_simple_resume_suspend", + "testing/test_capture.py::TestFDCapture::test_capfd_sys_stdout_mode", + "testing/test_capture.py::TestStdCapture::test_capturing_done_simple", + "testing/test_capture.py::TestStdCapture::test_capturing_reset_simple", + "testing/test_capture.py::TestStdCapture::test_capturing_readouterr", + 
"testing/test_capture.py::TestStdCapture::test_capture_results_accessible_by_attribute", + "testing/test_capture.py::TestStdCapture::test_capturing_readouterr_unicode", + "testing/test_capture.py::TestStdCapture::test_reset_twice_error", + "testing/test_capture.py::TestStdCapture::test_capturing_modify_sysouterr_in_between", + "testing/test_capture.py::TestStdCapture::test_capturing_error_recursive", + "testing/test_capture.py::TestStdCapture::test_just_out_capture", + "testing/test_capture.py::TestStdCapture::test_just_err_capture", + "testing/test_capture.py::TestStdCapture::test_stdin_restored", + "testing/test_capture.py::TestStdCapture::test_stdin_nulled_by_default", + "testing/test_capture.py::TestTeeStdCapture::test_capturing_done_simple", + "testing/test_capture.py::TestTeeStdCapture::test_capturing_reset_simple", + "testing/test_capture.py::TestTeeStdCapture::test_capturing_readouterr", + "testing/test_capture.py::TestTeeStdCapture::test_capture_results_accessible_by_attribute", + "testing/test_capture.py::TestTeeStdCapture::test_capturing_readouterr_unicode", + "testing/test_capture.py::TestTeeStdCapture::test_reset_twice_error", + "testing/test_capture.py::TestTeeStdCapture::test_capturing_modify_sysouterr_in_between", + "testing/test_capture.py::TestTeeStdCapture::test_just_out_capture", + "testing/test_capture.py::TestTeeStdCapture::test_just_err_capture", + "testing/test_capture.py::TestTeeStdCapture::test_stdin_restored", + "testing/test_capture.py::TestTeeStdCapture::test_stdin_nulled_by_default", + "testing/test_capture.py::TestTeeStdCapture::test_capturing_error_recursive", + "testing/test_capture.py::TestStdCaptureFD::test_capturing_done_simple", + "testing/test_capture.py::TestStdCaptureFD::test_capturing_reset_simple", + "testing/test_capture.py::TestStdCaptureFD::test_capturing_readouterr", + "testing/test_capture.py::TestStdCaptureFD::test_capture_results_accessible_by_attribute", + 
"testing/test_capture.py::TestStdCaptureFD::test_capturing_readouterr_unicode", + "testing/test_capture.py::TestStdCaptureFD::test_reset_twice_error", + "testing/test_capture.py::TestStdCaptureFD::test_capturing_modify_sysouterr_in_between", + "testing/test_capture.py::TestStdCaptureFD::test_capturing_error_recursive", + "testing/test_capture.py::TestStdCaptureFD::test_just_out_capture", + "testing/test_capture.py::TestStdCaptureFD::test_just_err_capture", + "testing/test_capture.py::TestStdCaptureFD::test_stdin_restored", + "testing/test_capture.py::TestStdCaptureFD::test_stdin_nulled_by_default", + "testing/test_capture.py::TestStdCaptureFD::test_intermingling", + "testing/test_capture.py::test_capture_not_started_but_reset", + "testing/test_capture.py::test_using_capsys_fixture_works_with_sys_stdout_encoding", + "testing/test_capture.py::test_capsys_results_accessible_by_attribute", + "testing/test_capture.py::test_fdcapture_tmpfile_remains_the_same", + "testing/test_capture.py::test_stderr_write_returns_len", + "testing/test_capture.py::test__get_multicapture", + "testing/test_capture.py::test_capturing_unicode[fd]", + "testing/test_capture.py::test_capturing_unicode[sys]", + "testing/test_capture.py::test_capturing_bytes_in_utf8_encoding[fd]", + "testing/test_capture.py::test_capturing_bytes_in_utf8_encoding[sys]", + "testing/test_capture.py::test_collect_capturing", + "testing/test_capture.py::TestPerTestCapturing::test_capture_and_fixtures", + "testing/test_capture.py::TestPerTestCapturing::test_no_carry_over", + "testing/test_capture.py::TestPerTestCapturing::test_teardown_capturing", + "testing/test_capture.py::TestPerTestCapturing::test_teardown_capturing_final", + "testing/test_capture.py::TestPerTestCapturing::test_capturing_outerr", + "testing/test_capture.py::TestCaptureFixture::test_std_functional[opt0]", + "testing/test_capture.py::TestCaptureFixture::test_std_functional[opt1]", + "testing/test_capture.py::TestCaptureFixture::test_capsyscapfd", + 
"testing/test_capture.py::TestCaptureFixture::test_capturing_getfixturevalue", + "testing/test_capture.py::TestCaptureFixture::test_capsyscapfdbinary", + "testing/test_capture.py::TestCaptureFixture::test_capture_is_represented_on_failure_issue128[sys]", + "testing/test_capture.py::TestCaptureFixture::test_capture_is_represented_on_failure_issue128[fd]", + "testing/test_capture.py::TestCaptureFixture::test_stdfd_functional", + "testing/test_capture.py::TestCaptureFixture::test_capfdbinary", + "testing/test_capture.py::TestCaptureFixture::test_capsysbinary", + "testing/test_capture.py::TestCaptureFixture::test_partial_setup_failure", + "testing/test_capture.py::TestCaptureFixture::test_fixture_use_by_other_fixtures_teardown[capsys]", + "testing/test_capture.py::TestCaptureFixture::test_fixture_use_by_other_fixtures_teardown[capfd]", + "testing/test_capture.py::test_setup_failure_does_not_kill_capturing", + "testing/test_capture.py::test_capture_conftest_runtest_setup", + "testing/test_capture.py::test_capture_badoutput_issue412", + "testing/test_capture.py::test_capture_early_option_parsing", + "testing/test_capture.py::test_capture_binary_output", + "testing/test_capture.py::TestFDCapture::test_simple", + "testing/test_capture.py::TestFDCapture::test_simple_many", + "testing/test_capture.py::TestFDCapture::test_simple_fail_second_start", + "testing/test_capture.py::TestFDCapture::test_writeorg", + "testing/test_capture.py::TestStdCaptureFDinvalidFD::test_fdcapture_invalid_fd_with_fd_reuse", + "testing/test_capture.py::TestStdCaptureFDinvalidFD::test_fdcapture_invalid_fd_without_fd_reuse", + "testing/test_capture.py::test_capturing_and_logging_fundamentals[SysCapture(2)]", + "testing/test_capture.py::test_capturing_and_logging_fundamentals[SysCapture(2,", + "testing/test_capture.py::test_capturing_and_logging_fundamentals[FDCapture(2)]", + "testing/test_capture.py::test_error_attribute_issue555", + "testing/test_capture.py::test_dontreadfrominput_has_encoding", + 
"testing/test_capture.py::test_typeerror_encodedfile_write", + "testing/test_capture.py::test_encodedfile_writelines", + "testing/test_capture.py::TestLoggingInteraction::test_logging_stream_ownership", + "testing/test_capture.py::TestLoggingInteraction::test_logging_and_immediate_setupteardown", + "testing/test_capture.py::TestLoggingInteraction::test_logging_and_crossscope_fixtures", + "testing/test_capture.py::TestLoggingInteraction::test_conftestlogging_is_shown", + "testing/test_capture.py::TestLoggingInteraction::test_conftestlogging_and_test_logging", + "testing/test_capture.py::TestLoggingInteraction::test_logging_after_cap_stopped", + "testing/test_capture.py::TestCaptureFixture::test_keyboardinterrupt_disables_capturing", + "testing/test_capture.py::TestCaptureFixture::test_capture_and_logging", + "testing/test_capture.py::TestCaptureFixture::test_disabled_capture_fixture[True-capsys]", + "testing/test_capture.py::TestCaptureFixture::test_disabled_capture_fixture[True-capfd]", + "testing/test_capture.py::TestCaptureFixture::test_disabled_capture_fixture[False-capsys]", + "testing/test_capture.py::TestCaptureFixture::test_disabled_capture_fixture[False-capfd]", + "testing/test_capture.py::TestCaptureFixture::test_fixture_use_by_other_fixtures[capsys]", + "testing/test_capture.py::TestCaptureFixture::test_fixture_use_by_other_fixtures[capfd]", + "testing/test_capture.py::test_error_during_readouterr", + "testing/test_capture.py::TestStdCaptureFD::test_simple_only_fd", + "testing/test_capture.py::TestStdCaptureFDinvalidFD::test_stdcapture_fd_invalid_fd", + "testing/test_capture.py::test_close_and_capture_again", + "testing/test_capture.py::test_crash_on_closing_tmpfile_py27", + "testing/test_capture.py::test_global_capture_with_live_logging", + "testing/test_capture.py::test_capture_with_live_logging[capsys]", + "testing/test_capture.py::test_capture_with_live_logging[capfd]", + "testing/test_capture.py::test_logging_while_collecting" + ] + }, + 
"judge_result": { + "status": "not_run", + "resolved": null + }, + "artifacts": { + "patch_diff": { + "path": "eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7521.patch", + "sha256": "e1f62165b6ecc14c60b08bb71a59b927adba0c3b9a8393394481cc6a4f0f8f0e" + }, + "test_patch": { + "path": "eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7521.test.patch", + "sha256": "5d7b9e51ea700508725976f0643bd1c601afcb5373481118f8741e4660d64e59" + }, + "predictions_record": { + "path": "eval/benchmarks/swe_verified/smoke/sample_predictions.jsonl", + "lookup_key": "pytest-dev__pytest-7521" + } + } + }, + { + "instance_id": "pytest-dev__pytest-7571", + "repo": "pytest-dev/pytest", + "base_commit": "422685d0bdc110547535036c1ff398b5e1c44145", + "environment_setup_commit": "634cde9506eb1f48dec3ec77974ee8dc952207c6", + "problem_statement": "caplog fixture doesn't restore log level after test\nFrom the documentation at https://docs.pytest.org/en/6.0.0/logging.html#caplog-fixture, \"The log levels set are restored automatically at the end of the test\".\r\nIt used to work, but looks broken in new 6.0 release. 
Minimal example to reproduce:\r\n\r\n```\r\ndef test_foo(caplog):\r\n caplog.set_level(42)\r\n\r\ndef test_bar(caplog):\r\n print(caplog.handler.level)\r\n```\r\n\r\nIt prints \"0\" for pytest<6, \"42\" after.\n", + "status": "prepared", + "prediction_record": { + "instance_id": "pytest-dev__pytest-7571", + "model_name_or_path": "gold", + "model_patch_sha256": "37e8f7df78a7ad5ab31b32b2785a8009b5d9ebf99c1762e999e6e1cc3384ec24" + }, + "judge_inputs": { + "fail_to_pass": [ + "testing/logging/test_fixture.py::test_change_level_undos_handler_level" + ], + "pass_to_pass": [ + "testing/logging/test_fixture.py::test_change_level", + "testing/logging/test_fixture.py::test_with_statement", + "testing/logging/test_fixture.py::test_log_access", + "testing/logging/test_fixture.py::test_messages", + "testing/logging/test_fixture.py::test_record_tuples", + "testing/logging/test_fixture.py::test_unicode", + "testing/logging/test_fixture.py::test_clear", + "testing/logging/test_fixture.py::test_caplog_captures_for_all_stages", + "testing/logging/test_fixture.py::test_fixture_help", + "testing/logging/test_fixture.py::test_change_level_undo", + "testing/logging/test_fixture.py::test_ini_controls_global_log_level", + "testing/logging/test_fixture.py::test_caplog_can_override_global_log_level", + "testing/logging/test_fixture.py::test_caplog_captures_despite_exception", + "testing/logging/test_fixture.py::test_log_report_captures_according_to_config_option_upon_failure" + ] + }, + "judge_result": { + "status": "not_run", + "resolved": null + }, + "artifacts": { + "patch_diff": { + "path": "eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7571.patch", + "sha256": "37e8f7df78a7ad5ab31b32b2785a8009b5d9ebf99c1762e999e6e1cc3384ec24" + }, + "test_patch": { + "path": "eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7571.test.patch", + "sha256": "3cd1ee3c3825b1cdda389712041a55e668f53ab63ed2ac99707ef7b36d4928db" + }, + 
"predictions_record": { + "path": "eval/benchmarks/swe_verified/smoke/sample_predictions.jsonl", + "lookup_key": "pytest-dev__pytest-7571" + } + } + }, + { + "instance_id": "pytest-dev__pytest-7490", + "repo": "pytest-dev/pytest", + "base_commit": "7f7a36478abe7dd1fa993b115d22606aa0e35e88", + "environment_setup_commit": "634cde9506eb1f48dec3ec77974ee8dc952207c6", + "problem_statement": "Pytest 6: Dynamically adding xfail marker in test no longer ignores failure\n\r\n\r\n## Description\r\n\r\nWith pytest 5.x, we can dynamically add an xfail to a test `request` object using `request.node.add_marker(mark)` (see example below). In 5.x this treated the failing test like a a test marked statically with an `xfail`. With 6.0.0rc0 it raises. \r\n\r\n## Versions\r\n\r\n
\r\n\r\n```\r\n$ pip list\r\nPackage Version Location \r\n----------------------------- ------------------------------- --------------------------------------------------------------\r\na 1.0 \r\naioftp 0.13.0 \r\naiohttp 3.6.2 \r\nalabaster 0.7.12 \r\napipkg 1.5 \r\naplus 0.11.0 \r\nappdirs 1.4.3 \r\nappnope 0.1.0 \r\narrow 0.15.7 \r\naspy.yaml 1.3.0 \r\nastropy 3.2.3 \r\nasv 0.4.1 \r\nasync-timeout 3.0.1 \r\natomicwrites 1.3.0 \r\nattrs 19.1.0 \r\naws-sam-translator 1.15.1 \r\naws-xray-sdk 0.95 \r\nBabel 2.7.0 \r\nbackcall 0.1.0 \r\nbinaryornot 0.4.4 \r\nblack 19.10b0 \r\nbleach 3.1.0 \r\nblurb 1.0.7 \r\nbokeh 1.3.4 \r\nboto 2.49.0 \r\nboto3 1.7.84 \r\nbotocore 1.10.84 \r\nbqplot 0.12.12 \r\nbranca 0.3.1 \r\ncachetools 4.1.0 \r\ncertifi 2019.9.11 \r\ncffi 1.13.2 \r\ncfgv 2.0.1 \r\ncfn-lint 0.25.0 \r\ncftime 1.0.4.2 \r\nchardet 3.0.4 \r\nClick 7.0 \r\nclick-plugins 1.1.1 \r\ncligj 0.5.0 \r\ncloudpickle 1.2.2 \r\ncolorama 0.4.3 \r\ncolorcet 2.0.2 \r\ncoloredlogs 14.0 \r\ncookiecutter 1.7.2 \r\ncookies 2.2.1 \r\ncoverage 4.5.4 \r\ncryptography 2.8 \r\ncycler 0.10.0 \r\nCython 3.0a5 \r\ncytoolz 0.10.1 \r\ndask 2.4.0 /Users/taugspurger/Envs/pandas-dev/lib/python3.7/site-packages\r\nDateTime 4.3 \r\ndecorator 4.4.0 \r\ndefusedxml 0.6.0 \r\nDeprecated 1.2.7 \r\ndistributed 2.4.0 \r\ndocker 4.1.0 \r\ndocutils 0.15.2 \r\necdsa 0.14.1 \r\nentrypoints 0.3 \r\net-xmlfile 1.0.1 \r\nexecnet 1.7.1 \r\nfastparquet 0.3.3 /Users/taugspurger/sandbox/fastparquet \r\nfeedparser 5.2.1 \r\nFiona 1.8.8 \r\nflake8 3.7.9 \r\nflake8-rst 0.7.1 \r\nfletcher 0.3.1 \r\nflit 2.1.0 \r\nflit-core 2.1.0 \r\nfsspec 0.7.4 \r\nfuture 0.18.2 \r\ngcsfs 0.6.2 \r\ngeopandas 0.6.0+1.g95b8e1a.dirty /Users/taugspurger/sandbox/geopandas \r\ngitdb2 2.0.5 \r\nGitPython 3.0.2 \r\ngoogle-auth 1.16.1 \r\ngoogle-auth-oauthlib 0.4.1 \r\ngraphviz 0.13 \r\nh5py 2.10.0 \r\nHeapDict 1.0.1 \r\nholoviews 1.12.6 \r\nhumanfriendly 8.1 \r\nhunter 3.1.3 \r\nhvplot 0.5.2 \r\nhypothesis 4.36.2 \r\nidentify 1.4.7 \r\nidna 2.8 
\r\nimagesize 1.1.0 \r\nimportlib-metadata 0.23 \r\nimportlib-resources 1.0.2 \r\niniconfig 1.0.0 \r\nintake 0.5.3 \r\nipydatawidgets 4.0.1 \r\nipykernel 5.1.2 \r\nipyleaflet 0.13.0 \r\nipympl 0.5.6 \r\nipython 7.11.1 \r\nipython-genutils 0.2.0 \r\nipyvolume 0.5.2 \r\nipyvue 1.3.2 \r\nipyvuetify 1.4.0 \r\nipywebrtc 0.5.0 \r\nipywidgets 7.5.1 \r\nisort 4.3.21 \r\njdcal 1.4.1 \r\njedi 0.16.0 \r\nJinja2 2.11.2 \r\njinja2-time 0.2.0 \r\njmespath 0.9.4 \r\njoblib 0.14.1 \r\njson5 0.9.4 \r\njsondiff 1.1.1 \r\njsonpatch 1.24 \r\njsonpickle 1.2 \r\njsonpointer 2.0 \r\njsonschema 3.0.2 \r\njupyter 1.0.0 \r\njupyter-client 5.3.3 \r\njupyter-console 6.0.0 \r\njupyter-core 4.5.0 \r\njupyterlab 2.1.2 \r\njupyterlab-server 1.1.4 \r\nkiwisolver 1.1.0 \r\nline-profiler 2.1.1 \r\nllvmlite 0.33.0 \r\nlocket 0.2.0 /Users/taugspurger/sandbox/locket.py \r\nlxml 4.5.0 \r\nmanhole 1.6.0 \r\nMarkdown 3.1.1 \r\nMarkupSafe 1.1.1 \r\nmatplotlib 3.2.2 \r\nmccabe 0.6.1 \r\nmemory-profiler 0.55.0 \r\nmistune 0.8.4 \r\nmock 3.0.5 \r\nmore-itertools 7.2.0 \r\nmoto 1.3.6 \r\nmsgpack 0.6.2 \r\nmultidict 4.5.2 \r\nmunch 2.3.2 \r\nmypy 0.730 \r\nmypy-extensions 0.4.1 \r\nnbconvert 5.6.0 \r\nnbformat 4.4.0 \r\nnbsphinx 0.4.2 \r\nnest-asyncio 1.3.3 \r\nnodeenv 1.3.3 \r\nnotebook 6.0.1 \r\nnumexpr 2.7.1 \r\nnumpy 1.19.0 \r\nnumpydoc 1.0.0.dev0 \r\noauthlib 3.1.0 \r\nodfpy 1.4.0 \r\nopenpyxl 3.0.3 \r\npackaging 20.4 \r\npandas 1.1.0.dev0+1758.g035e1fe831 /Users/taugspurger/sandbox/pandas \r\npandas-sphinx-theme 0.0.1.dev0 /Users/taugspurger/sandbox/pandas-sphinx-theme \r\npandocfilters 1.4.2 \r\nparam 1.9.2 \r\nparfive 1.0.0 \r\nparso 0.6.0 \r\npartd 1.0.0 \r\npathspec 0.8.0 \r\npatsy 0.5.1 \r\npexpect 4.7.0 \r\npickleshare 0.7.5 \r\nPillow 6.1.0 \r\npip 20.0.2 \r\npluggy 0.13.0 \r\npoyo 0.5.0 \r\npre-commit 1.18.3 \r\nprogressbar2 3.51.3 \r\nprometheus-client 0.7.1 \r\nprompt-toolkit 2.0.9 \r\npsutil 5.6.3 \r\nptyprocess 0.6.0 \r\npy 1.9.0 \r\npyaml 20.4.0 \r\npyarrow 0.16.0 \r\npyasn1 0.4.7 
\r\npyasn1-modules 0.2.8 \r\npycodestyle 2.5.0 \r\npycparser 2.19 \r\npycryptodome 3.9.8 \r\npyct 0.4.6 \r\npydata-sphinx-theme 0.1.1 \r\npydeps 1.9.0 \r\npyflakes 2.1.1 \r\nPyGithub 1.44.1 \r\nPygments 2.4.2 \r\nPyJWT 1.7.1 \r\npyparsing 2.4.2 \r\npyproj 2.4.0 \r\npyrsistent 0.15.4 \r\npytest 5.4.3 \r\npytest-asyncio 0.10.0 \r\npytest-cov 2.8.1 \r\npytest-cover 3.0.0 \r\npytest-forked 1.0.2 \r\npytest-repeat 0.8.0 \r\npytest-xdist 1.29.0 \r\npython-boilerplate 0.1.0 \r\npython-dateutil 2.8.0 \r\npython-jose 2.0.2 \r\npython-jsonrpc-server 0.3.2 \r\npython-language-server 0.31.4 \r\npython-slugify 4.0.1 \r\npython-utils 2.4.0 \r\npythreejs 2.2.0 \r\npytoml 0.1.21 \r\npytz 2019.2 \r\npyviz-comms 0.7.2 \r\nPyYAML 5.1.2 \r\npyzmq 18.1.0 \r\nqtconsole 4.5.5 \r\nregex 2020.6.8 \r\nrequests 2.24.0 \r\nrequests-oauthlib 1.3.0 \r\nresponses 0.10.6 \r\nrsa 4.0 \r\nrstcheck 3.3.1 \r\ns3fs 0.4.2 \r\ns3transfer 0.1.13 \r\nscikit-learn 0.22.2.post1 \r\nscipy 1.3.1 \r\nseaborn 0.9.0 \r\nSend2Trash 1.5.0 \r\nsetuptools 49.2.0 \r\nShapely 1.6.4.post2 \r\nsix 1.12.0 \r\nsmmap2 2.0.5 \r\nsnakeviz 2.0.1 \r\nsnowballstemmer 1.9.1 \r\nsortedcontainers 2.1.0 \r\nsparse 0.10.0 \r\nSphinx 3.1.1 \r\nsphinxcontrib-applehelp 1.0.2 \r\nsphinxcontrib-devhelp 1.0.2 \r\nsphinxcontrib-htmlhelp 1.0.3 \r\nsphinxcontrib-jsmath 1.0.1 \r\nsphinxcontrib-qthelp 1.0.3 \r\nsphinxcontrib-serializinghtml 1.1.4 \r\nsphinxcontrib-websupport 1.1.2 \r\nsphinxcontrib.youtube 0.1.2 \r\nSQLAlchemy 1.3.11 \r\nsshpubkeys 3.1.0 \r\nstatsmodels 0.10.2 \r\nstdlib-list 0.6.0 \r\nsunpy 1.1.dev518+gcad2d473f.d20191103 /Users/taugspurger/sandbox/sunpy \r\ntables 3.6.1 \r\ntabulate 0.8.6 \r\ntblib 1.4.0 \r\nterminado 0.8.2 \r\ntest 1.0.0 \r\ntestpath 0.4.2 \r\ntext-unidecode 1.3 \r\nthrift 0.13.0 \r\ntoml 0.10.0 \r\ntoolz 0.10.0 \r\ntornado 6.0.3 \r\ntqdm 4.37.0 \r\ntraitlets 4.3.2 \r\ntraittypes 0.2.1 \r\ntyped-ast 1.4.0 \r\ntyping-extensions 3.7.4 \r\nujson 1.35 \r\nurllib3 1.25.5 \r\nvaex 3.0.0 \r\nvaex-arrow 0.5.1 
\r\nvaex-astro 0.7.0 \r\nvaex-core 2.0.2 \r\nvaex-hdf5 0.6.0 \r\nvaex-jupyter 0.5.1.post0 \r\nvaex-ml 0.9.0 \r\nvaex-server 0.3.1 \r\nvaex-viz 0.4.0 \r\nvirtualenv 16.7.5 \r\nwcwidth 0.1.7 \r\nwebencodings 0.5.1 \r\nwebsocket-client 0.56.0 \r\nWerkzeug 0.16.0 \r\nwheel 0.34.2 \r\nwidgetsnbextension 3.5.1 \r\nwrapt 1.11.2 \r\nxarray 0.14.1+36.gb3d3b448 /Users/taugspurger/sandbox/xarray \r\nxlwt 1.3.0 \r\nxmltodict 0.12.0 \r\nyarl 1.3.0 \r\nzict 1.0.0 \r\nzipp 0.6.0 \r\nzope.interface 4.7.1 \r\n```\r\n\r\n
\r\n\r\n- [ ] pytest and operating system versions\r\n\r\nPytest 6.0.1rc0 and MacOS 10.14.5\r\n\r\n```python\r\n# file: test_foo.py\r\nimport pytest\r\n\r\n\r\ndef test_xfail_test(request):\r\n mark = pytest.mark.xfail(reason=\"xfail\")\r\n request.node.add_marker(mark)\r\n assert 0\r\n```\r\n\r\nWith 5.4.3\r\n\r\n```\r\n\r\n$ pytest -rsx test_foo.py\r\n=============================================================================== test session starts ================================================================================\r\nplatform darwin -- Python 3.7.6, pytest-5.4.3, py-1.9.0, pluggy-0.13.0\r\nhypothesis profile 'default' -> database=DirectoryBasedExampleDatabase('/Users/taugspurger/sandbox/.hypothesis/examples')\r\nrootdir: /Users/taugspurger/sandbox\r\nplugins: xdist-1.29.0, hypothesis-4.36.2, forked-1.0.2, repeat-0.8.0, asyncio-0.10.0, cov-2.8.1\r\ncollected 1 item\r\n\r\ntest_foo.py x [100%]\r\n\r\n============================================================================= short test summary info ==============================================================================\r\nXFAIL test_foo.py::test_xfail_test\r\n xfail\r\n================================================================================ 1 xfailed in 0.07s ================================================================================\r\n```\r\n\r\nWith 6.0.0rc0\r\n\r\n```\r\n$ pytest -rsx test_foo.py\r\n=============================================================================== test session starts ================================================================================\r\nplatform darwin -- Python 3.7.6, pytest-6.0.0rc1, py-1.9.0, pluggy-0.13.0\r\nhypothesis profile 'default' -> database=DirectoryBasedExampleDatabase('/Users/taugspurger/sandbox/.hypothesis/examples')\r\nrootdir: /Users/taugspurger/sandbox\r\nplugins: xdist-1.29.0, hypothesis-4.36.2, forked-1.0.2, repeat-0.8.0, asyncio-0.10.0, cov-2.8.1\r\ncollected 1 item\r\n\r\ntest_foo.py F 
[100%]\r\n\r\n===================================================================================== FAILURES =====================================================================================\r\n_________________________________________________________________________________ test_xfail_test __________________________________________________________________________________\r\n\r\nrequest = >\r\n\r\n def test_xfail_test(request):\r\n mark = pytest.mark.xfail(reason=\"xfail\")\r\n request.node.add_marker(mark)\r\n> assert 0\r\nE assert 0\r\n\r\ntest_foo.py:7: AssertionError\r\n```\r\n\n", + "status": "prepared", + "prediction_record": { + "instance_id": "pytest-dev__pytest-7490", + "model_name_or_path": "gold", + "model_patch_sha256": "88a7af7e123619306d887c6a9bd1f905acc4872b454df914094c038b71356e30" + }, + "judge_inputs": { + "fail_to_pass": [ + "testing/test_skipping.py::TestXFail::test_dynamic_xfail_set_during_runtest_failed", + "testing/test_skipping.py::TestXFail::test_dynamic_xfail_set_during_runtest_passed_strict" + ], + "pass_to_pass": [ + "testing/test_skipping.py::test_importorskip", + "testing/test_skipping.py::TestEvaluation::test_no_marker", + "testing/test_skipping.py::TestEvaluation::test_marked_xfail_no_args", + "testing/test_skipping.py::TestEvaluation::test_marked_skipif_no_args", + "testing/test_skipping.py::TestEvaluation::test_marked_one_arg", + "testing/test_skipping.py::TestEvaluation::test_marked_one_arg_with_reason", + "testing/test_skipping.py::TestEvaluation::test_marked_one_arg_twice", + "testing/test_skipping.py::TestEvaluation::test_marked_one_arg_twice2", + "testing/test_skipping.py::TestEvaluation::test_marked_skipif_with_boolean_without_reason", + "testing/test_skipping.py::TestEvaluation::test_marked_skipif_with_invalid_boolean", + "testing/test_skipping.py::TestEvaluation::test_skipif_class", + "testing/test_skipping.py::TestXFail::test_xfail_simple[True]", + "testing/test_skipping.py::TestXFail::test_xfail_simple[False]", + 
"testing/test_skipping.py::TestXFail::test_xfail_xpassed", + "testing/test_skipping.py::TestXFail::test_xfail_using_platform", + "testing/test_skipping.py::TestXFail::test_xfail_xpassed_strict", + "testing/test_skipping.py::TestXFail::test_xfail_run_anyway", + "testing/test_skipping.py::TestXFail::test_xfail_run_with_skip_mark[test_input0-expected0]", + "testing/test_skipping.py::TestXFail::test_xfail_run_with_skip_mark[test_input1-expected1]", + "testing/test_skipping.py::TestXFail::test_xfail_evalfalse_but_fails", + "testing/test_skipping.py::TestXFail::test_xfail_not_report_default", + "testing/test_skipping.py::TestXFail::test_xfail_not_run_xfail_reporting", + "testing/test_skipping.py::TestXFail::test_xfail_not_run_no_setup_run", + "testing/test_skipping.py::TestXFail::test_xfail_xpass", + "testing/test_skipping.py::TestXFail::test_xfail_imperative", + "testing/test_skipping.py::TestXFail::test_xfail_imperative_in_setup_function", + "testing/test_skipping.py::TestXFail::test_dynamic_xfail_no_run", + "testing/test_skipping.py::TestXFail::test_dynamic_xfail_set_during_funcarg_setup", + "testing/test_skipping.py::TestXFail::test_xfail_raises[TypeError-TypeError-*1", + "testing/test_skipping.py::TestXFail::test_xfail_raises[(AttributeError,", + "testing/test_skipping.py::TestXFail::test_xfail_raises[TypeError-IndexError-*1", + "testing/test_skipping.py::TestXFail::test_strict_sanity", + "testing/test_skipping.py::TestXFail::test_strict_xfail[True]", + "testing/test_skipping.py::TestXFail::test_strict_xfail[False]", + "testing/test_skipping.py::TestXFail::test_strict_xfail_condition[True]", + "testing/test_skipping.py::TestXFail::test_strict_xfail_condition[False]", + "testing/test_skipping.py::TestXFail::test_xfail_condition_keyword[True]", + "testing/test_skipping.py::TestXFail::test_xfail_condition_keyword[False]", + "testing/test_skipping.py::TestXFail::test_strict_xfail_default_from_file[true]", + 
"testing/test_skipping.py::TestXFail::test_strict_xfail_default_from_file[false]", + "testing/test_skipping.py::TestXFailwithSetupTeardown::test_failing_setup_issue9", + "testing/test_skipping.py::TestXFailwithSetupTeardown::test_failing_teardown_issue9", + "testing/test_skipping.py::TestSkip::test_skip_class", + "testing/test_skipping.py::TestSkip::test_skips_on_false_string", + "testing/test_skipping.py::TestSkip::test_arg_as_reason", + "testing/test_skipping.py::TestSkip::test_skip_no_reason", + "testing/test_skipping.py::TestSkip::test_skip_with_reason", + "testing/test_skipping.py::TestSkip::test_only_skips_marked_test", + "testing/test_skipping.py::TestSkip::test_strict_and_skip", + "testing/test_skipping.py::TestSkipif::test_skipif_conditional", + "testing/test_skipping.py::TestSkipif::test_skipif_reporting[\"hasattr(sys,", + "testing/test_skipping.py::TestSkipif::test_skipif_reporting[True,", + "testing/test_skipping.py::TestSkipif::test_skipif_using_platform", + "testing/test_skipping.py::TestSkipif::test_skipif_reporting_multiple[skipif-SKIP-skipped]", + "testing/test_skipping.py::TestSkipif::test_skipif_reporting_multiple[xfail-XPASS-xpassed]", + "testing/test_skipping.py::test_skip_not_report_default", + "testing/test_skipping.py::test_skipif_class", + "testing/test_skipping.py::test_skipped_reasons_functional", + "testing/test_skipping.py::test_skipped_folding", + "testing/test_skipping.py::test_reportchars", + "testing/test_skipping.py::test_reportchars_error", + "testing/test_skipping.py::test_reportchars_all", + "testing/test_skipping.py::test_reportchars_all_error", + "testing/test_skipping.py::test_errors_in_xfail_skip_expressions", + "testing/test_skipping.py::test_xfail_skipif_with_globals", + "testing/test_skipping.py::test_default_markers", + "testing/test_skipping.py::test_xfail_test_setup_exception", + "testing/test_skipping.py::test_imperativeskip_on_xfail_test", + "testing/test_skipping.py::TestBooleanCondition::test_skipif", + 
"testing/test_skipping.py::TestBooleanCondition::test_skipif_noreason", + "testing/test_skipping.py::TestBooleanCondition::test_xfail", + "testing/test_skipping.py::test_xfail_item", + "testing/test_skipping.py::test_module_level_skip_error", + "testing/test_skipping.py::test_module_level_skip_with_allow_module_level", + "testing/test_skipping.py::test_invalid_skip_keyword_parameter", + "testing/test_skipping.py::test_mark_xfail_item", + "testing/test_skipping.py::test_summary_list_after_errors", + "testing/test_skipping.py::test_relpath_rootdir" + ] + }, + "judge_result": { + "status": "not_run", + "resolved": null + }, + "artifacts": { + "patch_diff": { + "path": "eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7490.patch", + "sha256": "88a7af7e123619306d887c6a9bd1f905acc4872b454df914094c038b71356e30" + }, + "test_patch": { + "path": "eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7490.test.patch", + "sha256": "fe5323cfe9d6be9648be22ffb13a95079c6e17c13f09987f4c1b8589d27c690c" + }, + "predictions_record": { + "path": "eval/benchmarks/swe_verified/smoke/sample_predictions.jsonl", + "lookup_key": "pytest-dev__pytest-7490" + } + } + } + ] +} diff --git a/eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7490.patch b/eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7490.patch new file mode 100644 index 000000000..c696853a1 --- /dev/null +++ b/eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7490.patch @@ -0,0 +1,45 @@ +diff --git a/src/_pytest/skipping.py b/src/_pytest/skipping.py +--- a/src/_pytest/skipping.py ++++ b/src/_pytest/skipping.py +@@ -231,17 +231,14 @@ def evaluate_xfail_marks(item: Item) -> Optional[Xfail]: + + @hookimpl(tryfirst=True) + def pytest_runtest_setup(item: Item) -> None: +- item._store[skipped_by_mark_key] = False +- + skipped = evaluate_skip_marks(item) ++ item._store[skipped_by_mark_key] 
= skipped is not None + if skipped: +- item._store[skipped_by_mark_key] = True + skip(skipped.reason) + +- if not item.config.option.runxfail: +- item._store[xfailed_key] = xfailed = evaluate_xfail_marks(item) +- if xfailed and not xfailed.run: +- xfail("[NOTRUN] " + xfailed.reason) ++ item._store[xfailed_key] = xfailed = evaluate_xfail_marks(item) ++ if xfailed and not item.config.option.runxfail and not xfailed.run: ++ xfail("[NOTRUN] " + xfailed.reason) + + + @hookimpl(hookwrapper=True) +@@ -250,12 +247,16 @@ def pytest_runtest_call(item: Item) -> Generator[None, None, None]: + if xfailed is None: + item._store[xfailed_key] = xfailed = evaluate_xfail_marks(item) + +- if not item.config.option.runxfail: +- if xfailed and not xfailed.run: +- xfail("[NOTRUN] " + xfailed.reason) ++ if xfailed and not item.config.option.runxfail and not xfailed.run: ++ xfail("[NOTRUN] " + xfailed.reason) + + yield + ++ # The test run may have added an xfail mark dynamically. ++ xfailed = item._store.get(xfailed_key, None) ++ if xfailed is None: ++ item._store[xfailed_key] = xfailed = evaluate_xfail_marks(item) ++ + + @hookimpl(hookwrapper=True) + def pytest_runtest_makereport(item: Item, call: CallInfo[None]): diff --git a/eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7490.test.patch b/eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7490.test.patch new file mode 100644 index 000000000..1fec890e3 --- /dev/null +++ b/eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7490.test.patch @@ -0,0 +1,45 @@ +diff --git a/testing/test_skipping.py b/testing/test_skipping.py +--- a/testing/test_skipping.py ++++ b/testing/test_skipping.py +@@ -1,6 +1,7 @@ + import sys + + import pytest ++from _pytest.pytester import Testdir + from _pytest.runner import runtestprotocol + from _pytest.skipping import evaluate_skip_marks + from _pytest.skipping import evaluate_xfail_marks +@@ -425,6 +426,33 @@ def 
test_this2(arg): + result = testdir.runpytest(p) + result.stdout.fnmatch_lines(["*1 xfailed*"]) + ++ def test_dynamic_xfail_set_during_runtest_failed(self, testdir: Testdir) -> None: ++ # Issue #7486. ++ p = testdir.makepyfile( ++ """ ++ import pytest ++ def test_this(request): ++ request.node.add_marker(pytest.mark.xfail(reason="xfail")) ++ assert 0 ++ """ ++ ) ++ result = testdir.runpytest(p) ++ result.assert_outcomes(xfailed=1) ++ ++ def test_dynamic_xfail_set_during_runtest_passed_strict( ++ self, testdir: Testdir ++ ) -> None: ++ # Issue #7486. ++ p = testdir.makepyfile( ++ """ ++ import pytest ++ def test_this(request): ++ request.node.add_marker(pytest.mark.xfail(reason="xfail", strict=True)) ++ """ ++ ) ++ result = testdir.runpytest(p) ++ result.assert_outcomes(failed=1) ++ + @pytest.mark.parametrize( + "expected, actual, matchline", + [ diff --git a/eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7521.patch b/eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7521.patch new file mode 100644 index 000000000..1cd62e517 --- /dev/null +++ b/eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7521.patch @@ -0,0 +1,11 @@ +diff --git a/src/_pytest/capture.py b/src/_pytest/capture.py +--- a/src/_pytest/capture.py ++++ b/src/_pytest/capture.py +@@ -388,6 +388,7 @@ def __init__(self, targetfd: int) -> None: + TemporaryFile(buffering=0), # type: ignore[arg-type] + encoding="utf-8", + errors="replace", ++ newline="", + write_through=True, + ) + if targetfd in patchsysdict: diff --git a/eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7521.test.patch b/eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7521.test.patch new file mode 100644 index 000000000..21cdac5b3 --- /dev/null +++ b/eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7521.test.patch @@ -0,0 +1,16 @@ +diff --git 
a/testing/test_capture.py b/testing/test_capture.py +--- a/testing/test_capture.py ++++ b/testing/test_capture.py +@@ -514,6 +514,12 @@ def test_hello(capfd): + ) + reprec.assertoutcome(passed=1) + ++ @pytest.mark.parametrize("nl", ("\n", "\r\n", "\r")) ++ def test_cafd_preserves_newlines(self, capfd, nl): ++ print("test", end=nl) ++ out, err = capfd.readouterr() ++ assert out.endswith(nl) ++ + def test_capfdbinary(self, testdir): + reprec = testdir.inline_runsource( + """\ diff --git a/eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7571.patch b/eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7571.patch new file mode 100644 index 000000000..4edeece51 --- /dev/null +++ b/eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7571.patch @@ -0,0 +1,28 @@ +diff --git a/src/_pytest/logging.py b/src/_pytest/logging.py +--- a/src/_pytest/logging.py ++++ b/src/_pytest/logging.py +@@ -345,6 +345,7 @@ def __init__(self, item: nodes.Node) -> None: + """Creates a new funcarg.""" + self._item = item + # dict of log name -> log level ++ self._initial_handler_level = None # type: Optional[int] + self._initial_logger_levels = {} # type: Dict[Optional[str], int] + + def _finalize(self) -> None: +@@ -353,6 +354,8 @@ def _finalize(self) -> None: + This restores the log levels changed by :meth:`set_level`. 
+ """ + # restore log levels ++ if self._initial_handler_level is not None: ++ self.handler.setLevel(self._initial_handler_level) + for logger_name, level in self._initial_logger_levels.items(): + logger = logging.getLogger(logger_name) + logger.setLevel(level) +@@ -434,6 +437,7 @@ def set_level(self, level: Union[int, str], logger: Optional[str] = None) -> Non + # save the original log-level to restore it during teardown + self._initial_logger_levels.setdefault(logger, logger_obj.level) + logger_obj.setLevel(level) ++ self._initial_handler_level = self.handler.level + self.handler.setLevel(level) + + @contextmanager diff --git a/eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7571.test.patch b/eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7571.test.patch new file mode 100644 index 000000000..c1cb1186a --- /dev/null +++ b/eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7571.test.patch @@ -0,0 +1,59 @@ +diff --git a/testing/logging/test_fixture.py b/testing/logging/test_fixture.py +--- a/testing/logging/test_fixture.py ++++ b/testing/logging/test_fixture.py +@@ -2,6 +2,7 @@ + + import pytest + from _pytest.logging import caplog_records_key ++from _pytest.pytester import Testdir + + logger = logging.getLogger(__name__) + sublogger = logging.getLogger(__name__ + ".baz") +@@ -27,8 +28,11 @@ def test_change_level(caplog): + assert "CRITICAL" in caplog.text + + +-def test_change_level_undo(testdir): +- """Ensure that 'set_level' is undone after the end of the test""" ++def test_change_level_undo(testdir: Testdir) -> None: ++ """Ensure that 'set_level' is undone after the end of the test. ++ ++ Tests the logging output themselves (affacted both by logger and handler levels). 
++ """ + testdir.makepyfile( + """ + import logging +@@ -50,6 +54,33 @@ def test2(caplog): + result.stdout.no_fnmatch_line("*log from test2*") + + ++def test_change_level_undos_handler_level(testdir: Testdir) -> None: ++ """Ensure that 'set_level' is undone after the end of the test (handler). ++ ++ Issue #7569. Tests the handler level specifically. ++ """ ++ testdir.makepyfile( ++ """ ++ import logging ++ ++ def test1(caplog): ++ assert caplog.handler.level == 0 ++ caplog.set_level(41) ++ assert caplog.handler.level == 41 ++ ++ def test2(caplog): ++ assert caplog.handler.level == 0 ++ ++ def test3(caplog): ++ assert caplog.handler.level == 0 ++ caplog.set_level(43) ++ assert caplog.handler.level == 43 ++ """ ++ ) ++ result = testdir.runpytest() ++ result.assert_outcomes(passed=3) ++ ++ + def test_with_statement(caplog): + with caplog.at_level(logging.INFO): + logger.debug("handler DEBUG level") diff --git a/eval/benchmarks/swe_verified/smoke/judge_config.json b/eval/benchmarks/swe_verified/smoke/judge_config.json new file mode 100644 index 000000000..b911bfe49 --- /dev/null +++ b/eval/benchmarks/swe_verified/smoke/judge_config.json @@ -0,0 +1,53 @@ +{ + "profile_id": "swe_verified_pytest_smoke_gold_v1", + "benchmark": "swe_verified", + "slice_manifest_path": "eval/benchmarks/swe_verified/smoke/manifest.json", + "dataset_name": "SWE-bench/SWE-bench_Verified", + "dataset_split": "test", + "dataset_revision": "91aa3ed51b709be6457e12d00300a6a596d4c6a3", + "repo": "pytest-dev/pytest", + "environment_setup_commit": "634cde9506eb1f48dec3ec77974ee8dc952207c6", + "instance_ids": [ + "pytest-dev__pytest-7521", + "pytest-dev__pytest-7571", + "pytest-dev__pytest-7490" + ], + "official_evaluator": { + "module": "swebench.harness.run_evaluation", + "command_template": [ + "python", + "-m", + "swebench.harness.run_evaluation", + "--dataset_name", + "SWE-bench/SWE-bench_Verified", + "--predictions_path", + "{predictions_path}", + "--max_workers", + "{max_workers}", + 
"--instance_ids", + "{instance_ids_csv}", + "--run_id", + "{run_id}" + ], + "prediction_format": "jsonl", + "required_prediction_fields": [ + "instance_id", + "model_name_or_path", + "model_patch" + ], + "gold_predictions_supported": true + }, + "scoring": { + "resolved_field": "resolved", + "required_test_sets": [ + "FAIL_TO_PASS", + "PASS_TO_PASS" + ], + "failure_policy": "any FAIL_TO_PASS miss or PASS_TO_PASS regression marks the instance unresolved" + }, + "artifacts": { + "prediction_records_path": "eval/benchmarks/swe_verified/smoke/sample_predictions.jsonl", + "export_contract_path": "eval/benchmarks/swe_verified/smoke/export_contract.json", + "golden_export_path": "eval/benchmarks/swe_verified/smoke/export_golden.json" + } +} diff --git a/eval/benchmarks/swe_verified/smoke/manifest.json b/eval/benchmarks/swe_verified/smoke/manifest.json new file mode 100644 index 000000000..35030f1de --- /dev/null +++ b/eval/benchmarks/swe_verified/smoke/manifest.json @@ -0,0 +1,367 @@ +{ + "slice_id": "swe_verified_pytest_smoke_v1", + "benchmark": { + "family": "SWE-bench Verified", + "dataset_name": "SWE-bench/SWE-bench_Verified", + "dataset_split": "test", + "dataset_revision": "91aa3ed51b709be6457e12d00300a6a596d4c6a3", + "source_urls": { + "dataset": "https://huggingface.co/datasets/SWE-bench/SWE-bench_Verified", + "dataset_guide": "https://www.swebench.com/SWE-bench/guides/datasets/", + "evaluation_guide": "https://www.swebench.com/SWE-bench/guides/evaluation/", + "official_repo": "https://github.com/SWE-bench/SWE-bench" + } + }, + "selection": { + "repo": "pytest-dev/pytest", + "environment_setup_commit": "634cde9506eb1f48dec3ec77974ee8dc952207c6", + "max_instances": 3, + "selection_rules": [ + "restrict to a single repo so smoke runs reuse one checkout shape", + "restrict to a single environment_setup_commit so judge prep can reuse one pinned environment", + "prefer <15 min or 15 min - 1 hour verified tasks for quick smoke validation", + "prefer 1-2 FAIL_TO_PASS 
tests per instance to keep evaluator turnaround short" + ] + }, + "instances": [ + { + "instance_id": "pytest-dev__pytest-7521", + "repo": "pytest-dev/pytest", + "base_commit": "41d211c24a6781843b174379d6d6538f5c17adb9", + "environment_setup_commit": "634cde9506eb1f48dec3ec77974ee8dc952207c6", + "difficulty": "<15 min fix", + "created_at": "2020-07-20T15:55:11Z", + "version": "6.0", + "problem_statement": "pytest 6.0.0rc1: capfd.readouterr() converts \\r to \\n\nI am testing pytest 6.0.0rc1 with Fedora packages. This is the first failure I get, from borgbackup 1.1.13.\r\n\r\n```\r\n______________________ test_progress_percentage_sameline _______________________\r\n\r\ncapfd = <_pytest.capture.CaptureFixture object at 0x7f9bd55e4d00>\r\nmonkeypatch = <_pytest.monkeypatch.MonkeyPatch object at 0x7f9bcbbced60>\r\n\r\n def test_progress_percentage_sameline(capfd, monkeypatch):\r\n # run the test as if it was in a 4x1 terminal\r\n monkeypatch.setenv('COLUMNS', '4')\r\n monkeypatch.setenv('LINES', '1')\r\n pi = ProgressIndicatorPercent(1000, step=5, start=0, msg=\"%3.0f%%\")\r\n pi.logger.setLevel('INFO')\r\n pi.show(0)\r\n out, err = capfd.readouterr()\r\n> assert err == ' 0%\\r'\r\nE AssertionError: assert ' 0%\\n' == ' 0%\\r'\r\nE - 0%\r\nE ? ^\r\nE + 0%\r\nE ? ^\r\n\r\nbuild/lib.linux-x86_64-3.9/borg/testsuite/helpers.py:748: AssertionError\r\n```\r\n\r\nI've distilled a reproducer:\r\n\r\n```python\r\ndef test_cafd_includes_carriage_return(capfd):\r\n print('Greetings from DOS', end='\\r')\r\n out, err = capfd.readouterr()\r\n assert out.endswith('\\r')\r\n```\r\n\r\npytest 5:\r\n\r\n```\r\n============================= test session starts ==============================\r\nplatform linux -- Python 3.8.4, pytest-5.4.3, py-1.9.0, pluggy-0.13.1\r\nrootdir: /home/churchyard/tmp/pytest_reproducers\r\ncollected 1 item\r\n\r\ntest_capfd.py . 
[100%]\r\n\r\n============================== 1 passed in 0.00s ===============================\r\n\r\n\r\nPackage Version\r\n-------------- -------\r\nattrs 19.3.0 \r\nmore-itertools 8.4.0 \r\npackaging 20.4 \r\npip 19.3.1 \r\npluggy 0.13.1 \r\npy 1.9.0 \r\npyparsing 2.4.7 \r\npytest 5.4.3 \r\nsetuptools 41.6.0 \r\nsix 1.15.0 \r\nwcwidth 0.2.5 \r\n\r\n```\r\n\r\npytest 6:\r\n\r\n```\r\n============================= test session starts ==============================\r\nplatform linux -- Python 3.8.4, pytest-6.0.0rc1, py-1.9.0, pluggy-0.13.1\r\nrootdir: /home/churchyard/tmp/pytest_reproducers\r\ncollected 1 item\r\n\r\ntest_capfd.py F [100%]\r\n\r\n=================================== FAILURES ===================================\r\n______________________ test_cafd_includes_carriage_return ______________________\r\n\r\ncapfd = <_pytest.capture.CaptureFixture object at 0x7f1ddd3219a0>\r\n\r\n def test_cafd_includes_carriage_return(capfd):\r\n print('Greetings from DOS', end='\\r')\r\n out, err = capfd.readouterr()\r\n> assert out.endswith('\\r')\r\nE AssertionError: assert False\r\nE + where False = ('\\r')\r\nE + where = 'Greetings from DOS\\n'.endswith\r\n\r\ntest_capfd.py:4: AssertionError\r\n=========================== short test summary info ============================\r\nFAILED test_capfd.py::test_cafd_includes_carriage_return - AssertionError: as...\r\n============================== 1 failed in 0.01s ===============================\r\n\r\n\r\nPackage Version \r\n-------------- --------\r\nattrs 19.3.0 \r\niniconfig 1.0.0 \r\nmore-itertools 8.4.0 \r\npackaging 20.4 \r\npip 19.3.1 \r\npluggy 0.13.1 \r\npy 1.9.0 \r\npyparsing 3.0.0a2 \r\npytest 6.0.0rc1\r\nsetuptools 41.6.0 \r\nsix 1.15.0 \r\ntoml 0.10.1 \r\n```\r\n\r\nThis is Fedora 32 with Python 3.8 (the original failure in borgbackup is Fedora 33 with Python 3.9).\r\n\r\n\r\nI could have not found anything about this change in the changelog nor at https://docs.pytest.org/en/latest/capture.html hence I assume 
this is a regression. I've labeled it as such, but feel free to change that.\n", + "hints_text": "Bisected to 29e4cb5d45f44379aba948c2cd791b3b97210e31 (#6899 / \"Remove safe_text_dupfile() and simplify EncodedFile\") - cc @bluetech \nThanks for trying the rc @hroncok and @The-Compiler for the bisection (which is very helpful). It does look like a regression to me, i.e. the previous behavior seems better. I'll take a look soon.\nI've got a fix for this, PR incoming!", + "fail_to_pass": [ + "testing/test_capture.py::TestCaptureFixture::test_cafd_preserves_newlines[\\r\\n]", + "testing/test_capture.py::TestCaptureFixture::test_cafd_preserves_newlines[\\r]" + ], + "pass_to_pass": [ + "test_capsysbinary.py::test_hello", + "[100%]", + "testing/test_capture.py::TestCaptureManager::test_capturing_basic_api[no]", + "testing/test_capture.py::TestCaptureManager::test_capturing_basic_api[sys]", + "testing/test_capture.py::TestCaptureManager::test_capturing_basic_api[fd]", + "testing/test_capture.py::TestCaptureManager::test_init_capturing", + "testing/test_capture.py::TestCaptureFixture::test_cafd_preserves_newlines[\\n]", + "testing/test_capture.py::TestCaptureIO::test_text", + "testing/test_capture.py::TestCaptureIO::test_unicode_and_str_mixture", + "testing/test_capture.py::TestCaptureIO::test_write_bytes_to_buffer", + "testing/test_capture.py::TestTeeCaptureIO::test_write_bytes_to_buffer", + "testing/test_capture.py::TestTeeCaptureIO::test_text", + "testing/test_capture.py::TestTeeCaptureIO::test_unicode_and_str_mixture", + "testing/test_capture.py::test_dontreadfrominput", + "testing/test_capture.py::TestFDCapture::test_stderr", + "testing/test_capture.py::TestFDCapture::test_stdin", + "testing/test_capture.py::TestFDCapture::test_simple_resume_suspend", + "testing/test_capture.py::TestFDCapture::test_capfd_sys_stdout_mode", + "testing/test_capture.py::TestStdCapture::test_capturing_done_simple", + "testing/test_capture.py::TestStdCapture::test_capturing_reset_simple", + 
"testing/test_capture.py::TestStdCapture::test_capturing_readouterr", + "testing/test_capture.py::TestStdCapture::test_capture_results_accessible_by_attribute", + "testing/test_capture.py::TestStdCapture::test_capturing_readouterr_unicode", + "testing/test_capture.py::TestStdCapture::test_reset_twice_error", + "testing/test_capture.py::TestStdCapture::test_capturing_modify_sysouterr_in_between", + "testing/test_capture.py::TestStdCapture::test_capturing_error_recursive", + "testing/test_capture.py::TestStdCapture::test_just_out_capture", + "testing/test_capture.py::TestStdCapture::test_just_err_capture", + "testing/test_capture.py::TestStdCapture::test_stdin_restored", + "testing/test_capture.py::TestStdCapture::test_stdin_nulled_by_default", + "testing/test_capture.py::TestTeeStdCapture::test_capturing_done_simple", + "testing/test_capture.py::TestTeeStdCapture::test_capturing_reset_simple", + "testing/test_capture.py::TestTeeStdCapture::test_capturing_readouterr", + "testing/test_capture.py::TestTeeStdCapture::test_capture_results_accessible_by_attribute", + "testing/test_capture.py::TestTeeStdCapture::test_capturing_readouterr_unicode", + "testing/test_capture.py::TestTeeStdCapture::test_reset_twice_error", + "testing/test_capture.py::TestTeeStdCapture::test_capturing_modify_sysouterr_in_between", + "testing/test_capture.py::TestTeeStdCapture::test_just_out_capture", + "testing/test_capture.py::TestTeeStdCapture::test_just_err_capture", + "testing/test_capture.py::TestTeeStdCapture::test_stdin_restored", + "testing/test_capture.py::TestTeeStdCapture::test_stdin_nulled_by_default", + "testing/test_capture.py::TestTeeStdCapture::test_capturing_error_recursive", + "testing/test_capture.py::TestStdCaptureFD::test_capturing_done_simple", + "testing/test_capture.py::TestStdCaptureFD::test_capturing_reset_simple", + "testing/test_capture.py::TestStdCaptureFD::test_capturing_readouterr", + 
"testing/test_capture.py::TestStdCaptureFD::test_capture_results_accessible_by_attribute", + "testing/test_capture.py::TestStdCaptureFD::test_capturing_readouterr_unicode", + "testing/test_capture.py::TestStdCaptureFD::test_reset_twice_error", + "testing/test_capture.py::TestStdCaptureFD::test_capturing_modify_sysouterr_in_between", + "testing/test_capture.py::TestStdCaptureFD::test_capturing_error_recursive", + "testing/test_capture.py::TestStdCaptureFD::test_just_out_capture", + "testing/test_capture.py::TestStdCaptureFD::test_just_err_capture", + "testing/test_capture.py::TestStdCaptureFD::test_stdin_restored", + "testing/test_capture.py::TestStdCaptureFD::test_stdin_nulled_by_default", + "testing/test_capture.py::TestStdCaptureFD::test_intermingling", + "testing/test_capture.py::test_capture_not_started_but_reset", + "testing/test_capture.py::test_using_capsys_fixture_works_with_sys_stdout_encoding", + "testing/test_capture.py::test_capsys_results_accessible_by_attribute", + "testing/test_capture.py::test_fdcapture_tmpfile_remains_the_same", + "testing/test_capture.py::test_stderr_write_returns_len", + "testing/test_capture.py::test__get_multicapture", + "testing/test_capture.py::test_capturing_unicode[fd]", + "testing/test_capture.py::test_capturing_unicode[sys]", + "testing/test_capture.py::test_capturing_bytes_in_utf8_encoding[fd]", + "testing/test_capture.py::test_capturing_bytes_in_utf8_encoding[sys]", + "testing/test_capture.py::test_collect_capturing", + "testing/test_capture.py::TestPerTestCapturing::test_capture_and_fixtures", + "testing/test_capture.py::TestPerTestCapturing::test_no_carry_over", + "testing/test_capture.py::TestPerTestCapturing::test_teardown_capturing", + "testing/test_capture.py::TestPerTestCapturing::test_teardown_capturing_final", + "testing/test_capture.py::TestPerTestCapturing::test_capturing_outerr", + "testing/test_capture.py::TestCaptureFixture::test_std_functional[opt0]", + 
"testing/test_capture.py::TestCaptureFixture::test_std_functional[opt1]", + "testing/test_capture.py::TestCaptureFixture::test_capsyscapfd", + "testing/test_capture.py::TestCaptureFixture::test_capturing_getfixturevalue", + "testing/test_capture.py::TestCaptureFixture::test_capsyscapfdbinary", + "testing/test_capture.py::TestCaptureFixture::test_capture_is_represented_on_failure_issue128[sys]", + "testing/test_capture.py::TestCaptureFixture::test_capture_is_represented_on_failure_issue128[fd]", + "testing/test_capture.py::TestCaptureFixture::test_stdfd_functional", + "testing/test_capture.py::TestCaptureFixture::test_capfdbinary", + "testing/test_capture.py::TestCaptureFixture::test_capsysbinary", + "testing/test_capture.py::TestCaptureFixture::test_partial_setup_failure", + "testing/test_capture.py::TestCaptureFixture::test_fixture_use_by_other_fixtures_teardown[capsys]", + "testing/test_capture.py::TestCaptureFixture::test_fixture_use_by_other_fixtures_teardown[capfd]", + "testing/test_capture.py::test_setup_failure_does_not_kill_capturing", + "testing/test_capture.py::test_capture_conftest_runtest_setup", + "testing/test_capture.py::test_capture_badoutput_issue412", + "testing/test_capture.py::test_capture_early_option_parsing", + "testing/test_capture.py::test_capture_binary_output", + "testing/test_capture.py::TestFDCapture::test_simple", + "testing/test_capture.py::TestFDCapture::test_simple_many", + "testing/test_capture.py::TestFDCapture::test_simple_fail_second_start", + "testing/test_capture.py::TestFDCapture::test_writeorg", + "testing/test_capture.py::TestStdCaptureFDinvalidFD::test_fdcapture_invalid_fd_with_fd_reuse", + "testing/test_capture.py::TestStdCaptureFDinvalidFD::test_fdcapture_invalid_fd_without_fd_reuse", + "testing/test_capture.py::test_capturing_and_logging_fundamentals[SysCapture(2)]", + "testing/test_capture.py::test_capturing_and_logging_fundamentals[SysCapture(2,", + 
"testing/test_capture.py::test_capturing_and_logging_fundamentals[FDCapture(2)]", + "testing/test_capture.py::test_error_attribute_issue555", + "testing/test_capture.py::test_dontreadfrominput_has_encoding", + "testing/test_capture.py::test_typeerror_encodedfile_write", + "testing/test_capture.py::test_encodedfile_writelines", + "testing/test_capture.py::TestLoggingInteraction::test_logging_stream_ownership", + "testing/test_capture.py::TestLoggingInteraction::test_logging_and_immediate_setupteardown", + "testing/test_capture.py::TestLoggingInteraction::test_logging_and_crossscope_fixtures", + "testing/test_capture.py::TestLoggingInteraction::test_conftestlogging_is_shown", + "testing/test_capture.py::TestLoggingInteraction::test_conftestlogging_and_test_logging", + "testing/test_capture.py::TestLoggingInteraction::test_logging_after_cap_stopped", + "testing/test_capture.py::TestCaptureFixture::test_keyboardinterrupt_disables_capturing", + "testing/test_capture.py::TestCaptureFixture::test_capture_and_logging", + "testing/test_capture.py::TestCaptureFixture::test_disabled_capture_fixture[True-capsys]", + "testing/test_capture.py::TestCaptureFixture::test_disabled_capture_fixture[True-capfd]", + "testing/test_capture.py::TestCaptureFixture::test_disabled_capture_fixture[False-capsys]", + "testing/test_capture.py::TestCaptureFixture::test_disabled_capture_fixture[False-capfd]", + "testing/test_capture.py::TestCaptureFixture::test_fixture_use_by_other_fixtures[capsys]", + "testing/test_capture.py::TestCaptureFixture::test_fixture_use_by_other_fixtures[capfd]", + "testing/test_capture.py::test_error_during_readouterr", + "testing/test_capture.py::TestStdCaptureFD::test_simple_only_fd", + "testing/test_capture.py::TestStdCaptureFDinvalidFD::test_stdcapture_fd_invalid_fd", + "testing/test_capture.py::test_close_and_capture_again", + "testing/test_capture.py::test_crash_on_closing_tmpfile_py27", + "testing/test_capture.py::test_global_capture_with_live_logging", + 
"testing/test_capture.py::test_capture_with_live_logging[capsys]", + "testing/test_capture.py::test_capture_with_live_logging[capfd]", + "testing/test_capture.py::test_logging_while_collecting" + ], + "fail_to_pass_count": 2, + "pass_to_pass_count": 125, + "official_patch_path": "eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7521.patch", + "official_patch_sha256": "e1f62165b6ecc14c60b08bb71a59b927adba0c3b9a8393394481cc6a4f0f8f0e", + "official_test_patch_path": "eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7521.test.patch", + "official_test_patch_sha256": "5d7b9e51ea700508725976f0643bd1c601afcb5373481118f8741e4660d64e59", + "judge": { + "prediction_key": "pytest-dev__pytest-7521", + "success_requirements": [ + "all fail_to_pass tests must pass", + "all pass_to_pass tests must remain passing" + ], + "prediction_format": { + "instance_id": "pytest-dev__pytest-7521", + "model_name_or_path": "string", + "model_patch": "unified diff string" + } + }, + "runtime": { + "checkout_url": "https://github.com/pytest-dev/pytest.git", + "working_directory": "/workspace/pytest", + "language": "python" + }, + "selection_rank": 1 + }, + { + "instance_id": "pytest-dev__pytest-7571", + "repo": "pytest-dev/pytest", + "base_commit": "422685d0bdc110547535036c1ff398b5e1c44145", + "environment_setup_commit": "634cde9506eb1f48dec3ec77974ee8dc952207c6", + "difficulty": "15 min - 1 hour", + "created_at": "2020-07-29T12:00:47Z", + "version": "6.0", + "problem_statement": "caplog fixture doesn't restore log level after test\nFrom the documentation at https://docs.pytest.org/en/6.0.0/logging.html#caplog-fixture, \"The log levels set are restored automatically at the end of the test\".\r\nIt used to work, but looks broken in new 6.0 release. 
Minimal example to reproduce:\r\n\r\n```\r\ndef test_foo(caplog):\r\n caplog.set_level(42)\r\n\r\ndef test_bar(caplog):\r\n print(caplog.handler.level)\r\n```\r\n\r\nIt prints \"0\" for pytest<6, \"42\" after.\n", + "hints_text": "This probably regressed in fcbaab8b0b89abc622dbfb7982cf9bd8c91ef301. I will take a look.", + "fail_to_pass": [ + "testing/logging/test_fixture.py::test_change_level_undos_handler_level" + ], + "pass_to_pass": [ + "testing/logging/test_fixture.py::test_change_level", + "testing/logging/test_fixture.py::test_with_statement", + "testing/logging/test_fixture.py::test_log_access", + "testing/logging/test_fixture.py::test_messages", + "testing/logging/test_fixture.py::test_record_tuples", + "testing/logging/test_fixture.py::test_unicode", + "testing/logging/test_fixture.py::test_clear", + "testing/logging/test_fixture.py::test_caplog_captures_for_all_stages", + "testing/logging/test_fixture.py::test_fixture_help", + "testing/logging/test_fixture.py::test_change_level_undo", + "testing/logging/test_fixture.py::test_ini_controls_global_log_level", + "testing/logging/test_fixture.py::test_caplog_can_override_global_log_level", + "testing/logging/test_fixture.py::test_caplog_captures_despite_exception", + "testing/logging/test_fixture.py::test_log_report_captures_according_to_config_option_upon_failure" + ], + "fail_to_pass_count": 1, + "pass_to_pass_count": 14, + "official_patch_path": "eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7571.patch", + "official_patch_sha256": "37e8f7df78a7ad5ab31b32b2785a8009b5d9ebf99c1762e999e6e1cc3384ec24", + "official_test_patch_path": "eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7571.test.patch", + "official_test_patch_sha256": "3cd1ee3c3825b1cdda389712041a55e668f53ab63ed2ac99707ef7b36d4928db", + "judge": { + "prediction_key": "pytest-dev__pytest-7571", + "success_requirements": [ + "all fail_to_pass tests must pass", + "all pass_to_pass tests 
must remain passing" + ], + "prediction_format": { + "instance_id": "pytest-dev__pytest-7571", + "model_name_or_path": "string", + "model_patch": "unified diff string" + } + }, + "runtime": { + "checkout_url": "https://github.com/pytest-dev/pytest.git", + "working_directory": "/workspace/pytest", + "language": "python" + }, + "selection_rank": 2 + }, + { + "instance_id": "pytest-dev__pytest-7490", + "repo": "pytest-dev/pytest", + "base_commit": "7f7a36478abe7dd1fa993b115d22606aa0e35e88", + "environment_setup_commit": "634cde9506eb1f48dec3ec77974ee8dc952207c6", + "difficulty": "15 min - 1 hour", + "created_at": "2020-07-13T22:20:10Z", + "version": "6.0", + "problem_statement": "Pytest 6: Dynamically adding xfail marker in test no longer ignores failure\n\r\n\r\n## Description\r\n\r\nWith pytest 5.x, we can dynamically add an xfail to a test `request` object using `request.node.add_marker(mark)` (see example below). In 5.x this treated the failing test like a a test marked statically with an `xfail`. With 6.0.0rc0 it raises. \r\n\r\n## Versions\r\n\r\n
\r\n\r\n```\r\n$ pip list\r\nPackage Version Location \r\n----------------------------- ------------------------------- --------------------------------------------------------------\r\na 1.0 \r\naioftp 0.13.0 \r\naiohttp 3.6.2 \r\nalabaster 0.7.12 \r\napipkg 1.5 \r\naplus 0.11.0 \r\nappdirs 1.4.3 \r\nappnope 0.1.0 \r\narrow 0.15.7 \r\naspy.yaml 1.3.0 \r\nastropy 3.2.3 \r\nasv 0.4.1 \r\nasync-timeout 3.0.1 \r\natomicwrites 1.3.0 \r\nattrs 19.1.0 \r\naws-sam-translator 1.15.1 \r\naws-xray-sdk 0.95 \r\nBabel 2.7.0 \r\nbackcall 0.1.0 \r\nbinaryornot 0.4.4 \r\nblack 19.10b0 \r\nbleach 3.1.0 \r\nblurb 1.0.7 \r\nbokeh 1.3.4 \r\nboto 2.49.0 \r\nboto3 1.7.84 \r\nbotocore 1.10.84 \r\nbqplot 0.12.12 \r\nbranca 0.3.1 \r\ncachetools 4.1.0 \r\ncertifi 2019.9.11 \r\ncffi 1.13.2 \r\ncfgv 2.0.1 \r\ncfn-lint 0.25.0 \r\ncftime 1.0.4.2 \r\nchardet 3.0.4 \r\nClick 7.0 \r\nclick-plugins 1.1.1 \r\ncligj 0.5.0 \r\ncloudpickle 1.2.2 \r\ncolorama 0.4.3 \r\ncolorcet 2.0.2 \r\ncoloredlogs 14.0 \r\ncookiecutter 1.7.2 \r\ncookies 2.2.1 \r\ncoverage 4.5.4 \r\ncryptography 2.8 \r\ncycler 0.10.0 \r\nCython 3.0a5 \r\ncytoolz 0.10.1 \r\ndask 2.4.0 /Users/taugspurger/Envs/pandas-dev/lib/python3.7/site-packages\r\nDateTime 4.3 \r\ndecorator 4.4.0 \r\ndefusedxml 0.6.0 \r\nDeprecated 1.2.7 \r\ndistributed 2.4.0 \r\ndocker 4.1.0 \r\ndocutils 0.15.2 \r\necdsa 0.14.1 \r\nentrypoints 0.3 \r\net-xmlfile 1.0.1 \r\nexecnet 1.7.1 \r\nfastparquet 0.3.3 /Users/taugspurger/sandbox/fastparquet \r\nfeedparser 5.2.1 \r\nFiona 1.8.8 \r\nflake8 3.7.9 \r\nflake8-rst 0.7.1 \r\nfletcher 0.3.1 \r\nflit 2.1.0 \r\nflit-core 2.1.0 \r\nfsspec 0.7.4 \r\nfuture 0.18.2 \r\ngcsfs 0.6.2 \r\ngeopandas 0.6.0+1.g95b8e1a.dirty /Users/taugspurger/sandbox/geopandas \r\ngitdb2 2.0.5 \r\nGitPython 3.0.2 \r\ngoogle-auth 1.16.1 \r\ngoogle-auth-oauthlib 0.4.1 \r\ngraphviz 0.13 \r\nh5py 2.10.0 \r\nHeapDict 1.0.1 \r\nholoviews 1.12.6 \r\nhumanfriendly 8.1 \r\nhunter 3.1.3 \r\nhvplot 0.5.2 \r\nhypothesis 4.36.2 \r\nidentify 1.4.7 \r\nidna 2.8 
\r\nimagesize 1.1.0 \r\nimportlib-metadata 0.23 \r\nimportlib-resources 1.0.2 \r\niniconfig 1.0.0 \r\nintake 0.5.3 \r\nipydatawidgets 4.0.1 \r\nipykernel 5.1.2 \r\nipyleaflet 0.13.0 \r\nipympl 0.5.6 \r\nipython 7.11.1 \r\nipython-genutils 0.2.0 \r\nipyvolume 0.5.2 \r\nipyvue 1.3.2 \r\nipyvuetify 1.4.0 \r\nipywebrtc 0.5.0 \r\nipywidgets 7.5.1 \r\nisort 4.3.21 \r\njdcal 1.4.1 \r\njedi 0.16.0 \r\nJinja2 2.11.2 \r\njinja2-time 0.2.0 \r\njmespath 0.9.4 \r\njoblib 0.14.1 \r\njson5 0.9.4 \r\njsondiff 1.1.1 \r\njsonpatch 1.24 \r\njsonpickle 1.2 \r\njsonpointer 2.0 \r\njsonschema 3.0.2 \r\njupyter 1.0.0 \r\njupyter-client 5.3.3 \r\njupyter-console 6.0.0 \r\njupyter-core 4.5.0 \r\njupyterlab 2.1.2 \r\njupyterlab-server 1.1.4 \r\nkiwisolver 1.1.0 \r\nline-profiler 2.1.1 \r\nllvmlite 0.33.0 \r\nlocket 0.2.0 /Users/taugspurger/sandbox/locket.py \r\nlxml 4.5.0 \r\nmanhole 1.6.0 \r\nMarkdown 3.1.1 \r\nMarkupSafe 1.1.1 \r\nmatplotlib 3.2.2 \r\nmccabe 0.6.1 \r\nmemory-profiler 0.55.0 \r\nmistune 0.8.4 \r\nmock 3.0.5 \r\nmore-itertools 7.2.0 \r\nmoto 1.3.6 \r\nmsgpack 0.6.2 \r\nmultidict 4.5.2 \r\nmunch 2.3.2 \r\nmypy 0.730 \r\nmypy-extensions 0.4.1 \r\nnbconvert 5.6.0 \r\nnbformat 4.4.0 \r\nnbsphinx 0.4.2 \r\nnest-asyncio 1.3.3 \r\nnodeenv 1.3.3 \r\nnotebook 6.0.1 \r\nnumexpr 2.7.1 \r\nnumpy 1.19.0 \r\nnumpydoc 1.0.0.dev0 \r\noauthlib 3.1.0 \r\nodfpy 1.4.0 \r\nopenpyxl 3.0.3 \r\npackaging 20.4 \r\npandas 1.1.0.dev0+1758.g035e1fe831 /Users/taugspurger/sandbox/pandas \r\npandas-sphinx-theme 0.0.1.dev0 /Users/taugspurger/sandbox/pandas-sphinx-theme \r\npandocfilters 1.4.2 \r\nparam 1.9.2 \r\nparfive 1.0.0 \r\nparso 0.6.0 \r\npartd 1.0.0 \r\npathspec 0.8.0 \r\npatsy 0.5.1 \r\npexpect 4.7.0 \r\npickleshare 0.7.5 \r\nPillow 6.1.0 \r\npip 20.0.2 \r\npluggy 0.13.0 \r\npoyo 0.5.0 \r\npre-commit 1.18.3 \r\nprogressbar2 3.51.3 \r\nprometheus-client 0.7.1 \r\nprompt-toolkit 2.0.9 \r\npsutil 5.6.3 \r\nptyprocess 0.6.0 \r\npy 1.9.0 \r\npyaml 20.4.0 \r\npyarrow 0.16.0 \r\npyasn1 0.4.7 
\r\npyasn1-modules 0.2.8 \r\npycodestyle 2.5.0 \r\npycparser 2.19 \r\npycryptodome 3.9.8 \r\npyct 0.4.6 \r\npydata-sphinx-theme 0.1.1 \r\npydeps 1.9.0 \r\npyflakes 2.1.1 \r\nPyGithub 1.44.1 \r\nPygments 2.4.2 \r\nPyJWT 1.7.1 \r\npyparsing 2.4.2 \r\npyproj 2.4.0 \r\npyrsistent 0.15.4 \r\npytest 5.4.3 \r\npytest-asyncio 0.10.0 \r\npytest-cov 2.8.1 \r\npytest-cover 3.0.0 \r\npytest-forked 1.0.2 \r\npytest-repeat 0.8.0 \r\npytest-xdist 1.29.0 \r\npython-boilerplate 0.1.0 \r\npython-dateutil 2.8.0 \r\npython-jose 2.0.2 \r\npython-jsonrpc-server 0.3.2 \r\npython-language-server 0.31.4 \r\npython-slugify 4.0.1 \r\npython-utils 2.4.0 \r\npythreejs 2.2.0 \r\npytoml 0.1.21 \r\npytz 2019.2 \r\npyviz-comms 0.7.2 \r\nPyYAML 5.1.2 \r\npyzmq 18.1.0 \r\nqtconsole 4.5.5 \r\nregex 2020.6.8 \r\nrequests 2.24.0 \r\nrequests-oauthlib 1.3.0 \r\nresponses 0.10.6 \r\nrsa 4.0 \r\nrstcheck 3.3.1 \r\ns3fs 0.4.2 \r\ns3transfer 0.1.13 \r\nscikit-learn 0.22.2.post1 \r\nscipy 1.3.1 \r\nseaborn 0.9.0 \r\nSend2Trash 1.5.0 \r\nsetuptools 49.2.0 \r\nShapely 1.6.4.post2 \r\nsix 1.12.0 \r\nsmmap2 2.0.5 \r\nsnakeviz 2.0.1 \r\nsnowballstemmer 1.9.1 \r\nsortedcontainers 2.1.0 \r\nsparse 0.10.0 \r\nSphinx 3.1.1 \r\nsphinxcontrib-applehelp 1.0.2 \r\nsphinxcontrib-devhelp 1.0.2 \r\nsphinxcontrib-htmlhelp 1.0.3 \r\nsphinxcontrib-jsmath 1.0.1 \r\nsphinxcontrib-qthelp 1.0.3 \r\nsphinxcontrib-serializinghtml 1.1.4 \r\nsphinxcontrib-websupport 1.1.2 \r\nsphinxcontrib.youtube 0.1.2 \r\nSQLAlchemy 1.3.11 \r\nsshpubkeys 3.1.0 \r\nstatsmodels 0.10.2 \r\nstdlib-list 0.6.0 \r\nsunpy 1.1.dev518+gcad2d473f.d20191103 /Users/taugspurger/sandbox/sunpy \r\ntables 3.6.1 \r\ntabulate 0.8.6 \r\ntblib 1.4.0 \r\nterminado 0.8.2 \r\ntest 1.0.0 \r\ntestpath 0.4.2 \r\ntext-unidecode 1.3 \r\nthrift 0.13.0 \r\ntoml 0.10.0 \r\ntoolz 0.10.0 \r\ntornado 6.0.3 \r\ntqdm 4.37.0 \r\ntraitlets 4.3.2 \r\ntraittypes 0.2.1 \r\ntyped-ast 1.4.0 \r\ntyping-extensions 3.7.4 \r\nujson 1.35 \r\nurllib3 1.25.5 \r\nvaex 3.0.0 \r\nvaex-arrow 0.5.1 
\r\nvaex-astro 0.7.0 \r\nvaex-core 2.0.2 \r\nvaex-hdf5 0.6.0 \r\nvaex-jupyter 0.5.1.post0 \r\nvaex-ml 0.9.0 \r\nvaex-server 0.3.1 \r\nvaex-viz 0.4.0 \r\nvirtualenv 16.7.5 \r\nwcwidth 0.1.7 \r\nwebencodings 0.5.1 \r\nwebsocket-client 0.56.0 \r\nWerkzeug 0.16.0 \r\nwheel 0.34.2 \r\nwidgetsnbextension 3.5.1 \r\nwrapt 1.11.2 \r\nxarray 0.14.1+36.gb3d3b448 /Users/taugspurger/sandbox/xarray \r\nxlwt 1.3.0 \r\nxmltodict 0.12.0 \r\nyarl 1.3.0 \r\nzict 1.0.0 \r\nzipp 0.6.0 \r\nzope.interface 4.7.1 \r\n```\r\n\r\n
\r\n\r\n- [ ] pytest and operating system versions\r\n\r\nPytest 6.0.1rc0 and MacOS 10.14.5\r\n\r\n```python\r\n# file: test_foo.py\r\nimport pytest\r\n\r\n\r\ndef test_xfail_test(request):\r\n mark = pytest.mark.xfail(reason=\"xfail\")\r\n request.node.add_marker(mark)\r\n assert 0\r\n```\r\n\r\nWith 5.4.3\r\n\r\n```\r\n\r\n$ pytest -rsx test_foo.py\r\n=============================================================================== test session starts ================================================================================\r\nplatform darwin -- Python 3.7.6, pytest-5.4.3, py-1.9.0, pluggy-0.13.0\r\nhypothesis profile 'default' -> database=DirectoryBasedExampleDatabase('/Users/taugspurger/sandbox/.hypothesis/examples')\r\nrootdir: /Users/taugspurger/sandbox\r\nplugins: xdist-1.29.0, hypothesis-4.36.2, forked-1.0.2, repeat-0.8.0, asyncio-0.10.0, cov-2.8.1\r\ncollected 1 item\r\n\r\ntest_foo.py x [100%]\r\n\r\n============================================================================= short test summary info ==============================================================================\r\nXFAIL test_foo.py::test_xfail_test\r\n xfail\r\n================================================================================ 1 xfailed in 0.07s ================================================================================\r\n```\r\n\r\nWith 6.0.0rc0\r\n\r\n```\r\n$ pytest -rsx test_foo.py\r\n=============================================================================== test session starts ================================================================================\r\nplatform darwin -- Python 3.7.6, pytest-6.0.0rc1, py-1.9.0, pluggy-0.13.0\r\nhypothesis profile 'default' -> database=DirectoryBasedExampleDatabase('/Users/taugspurger/sandbox/.hypothesis/examples')\r\nrootdir: /Users/taugspurger/sandbox\r\nplugins: xdist-1.29.0, hypothesis-4.36.2, forked-1.0.2, repeat-0.8.0, asyncio-0.10.0, cov-2.8.1\r\ncollected 1 item\r\n\r\ntest_foo.py F 
[100%]\r\n\r\n===================================================================================== FAILURES =====================================================================================\r\n_________________________________________________________________________________ test_xfail_test __________________________________________________________________________________\r\n\r\nrequest = >\r\n\r\n def test_xfail_test(request):\r\n mark = pytest.mark.xfail(reason=\"xfail\")\r\n request.node.add_marker(mark)\r\n> assert 0\r\nE assert 0\r\n\r\ntest_foo.py:7: AssertionError\r\n```\r\n\n", + "hints_text": "Thanks for testing the release candidate! This is probably a regression in c9737ae914891027da5f0bd39494dd51a3b3f19f, will fix.", + "fail_to_pass": [ + "testing/test_skipping.py::TestXFail::test_dynamic_xfail_set_during_runtest_failed", + "testing/test_skipping.py::TestXFail::test_dynamic_xfail_set_during_runtest_passed_strict" + ], + "pass_to_pass": [ + "testing/test_skipping.py::test_importorskip", + "testing/test_skipping.py::TestEvaluation::test_no_marker", + "testing/test_skipping.py::TestEvaluation::test_marked_xfail_no_args", + "testing/test_skipping.py::TestEvaluation::test_marked_skipif_no_args", + "testing/test_skipping.py::TestEvaluation::test_marked_one_arg", + "testing/test_skipping.py::TestEvaluation::test_marked_one_arg_with_reason", + "testing/test_skipping.py::TestEvaluation::test_marked_one_arg_twice", + "testing/test_skipping.py::TestEvaluation::test_marked_one_arg_twice2", + "testing/test_skipping.py::TestEvaluation::test_marked_skipif_with_boolean_without_reason", + "testing/test_skipping.py::TestEvaluation::test_marked_skipif_with_invalid_boolean", + "testing/test_skipping.py::TestEvaluation::test_skipif_class", + "testing/test_skipping.py::TestXFail::test_xfail_simple[True]", + "testing/test_skipping.py::TestXFail::test_xfail_simple[False]", + "testing/test_skipping.py::TestXFail::test_xfail_xpassed", + 
"testing/test_skipping.py::TestXFail::test_xfail_using_platform", + "testing/test_skipping.py::TestXFail::test_xfail_xpassed_strict", + "testing/test_skipping.py::TestXFail::test_xfail_run_anyway", + "testing/test_skipping.py::TestXFail::test_xfail_run_with_skip_mark[test_input0-expected0]", + "testing/test_skipping.py::TestXFail::test_xfail_run_with_skip_mark[test_input1-expected1]", + "testing/test_skipping.py::TestXFail::test_xfail_evalfalse_but_fails", + "testing/test_skipping.py::TestXFail::test_xfail_not_report_default", + "testing/test_skipping.py::TestXFail::test_xfail_not_run_xfail_reporting", + "testing/test_skipping.py::TestXFail::test_xfail_not_run_no_setup_run", + "testing/test_skipping.py::TestXFail::test_xfail_xpass", + "testing/test_skipping.py::TestXFail::test_xfail_imperative", + "testing/test_skipping.py::TestXFail::test_xfail_imperative_in_setup_function", + "testing/test_skipping.py::TestXFail::test_dynamic_xfail_no_run", + "testing/test_skipping.py::TestXFail::test_dynamic_xfail_set_during_funcarg_setup", + "testing/test_skipping.py::TestXFail::test_xfail_raises[TypeError-TypeError-*1", + "testing/test_skipping.py::TestXFail::test_xfail_raises[(AttributeError,", + "testing/test_skipping.py::TestXFail::test_xfail_raises[TypeError-IndexError-*1", + "testing/test_skipping.py::TestXFail::test_strict_sanity", + "testing/test_skipping.py::TestXFail::test_strict_xfail[True]", + "testing/test_skipping.py::TestXFail::test_strict_xfail[False]", + "testing/test_skipping.py::TestXFail::test_strict_xfail_condition[True]", + "testing/test_skipping.py::TestXFail::test_strict_xfail_condition[False]", + "testing/test_skipping.py::TestXFail::test_xfail_condition_keyword[True]", + "testing/test_skipping.py::TestXFail::test_xfail_condition_keyword[False]", + "testing/test_skipping.py::TestXFail::test_strict_xfail_default_from_file[true]", + "testing/test_skipping.py::TestXFail::test_strict_xfail_default_from_file[false]", + 
"testing/test_skipping.py::TestXFailwithSetupTeardown::test_failing_setup_issue9", + "testing/test_skipping.py::TestXFailwithSetupTeardown::test_failing_teardown_issue9", + "testing/test_skipping.py::TestSkip::test_skip_class", + "testing/test_skipping.py::TestSkip::test_skips_on_false_string", + "testing/test_skipping.py::TestSkip::test_arg_as_reason", + "testing/test_skipping.py::TestSkip::test_skip_no_reason", + "testing/test_skipping.py::TestSkip::test_skip_with_reason", + "testing/test_skipping.py::TestSkip::test_only_skips_marked_test", + "testing/test_skipping.py::TestSkip::test_strict_and_skip", + "testing/test_skipping.py::TestSkipif::test_skipif_conditional", + "testing/test_skipping.py::TestSkipif::test_skipif_reporting[\"hasattr(sys,", + "testing/test_skipping.py::TestSkipif::test_skipif_reporting[True,", + "testing/test_skipping.py::TestSkipif::test_skipif_using_platform", + "testing/test_skipping.py::TestSkipif::test_skipif_reporting_multiple[skipif-SKIP-skipped]", + "testing/test_skipping.py::TestSkipif::test_skipif_reporting_multiple[xfail-XPASS-xpassed]", + "testing/test_skipping.py::test_skip_not_report_default", + "testing/test_skipping.py::test_skipif_class", + "testing/test_skipping.py::test_skipped_reasons_functional", + "testing/test_skipping.py::test_skipped_folding", + "testing/test_skipping.py::test_reportchars", + "testing/test_skipping.py::test_reportchars_error", + "testing/test_skipping.py::test_reportchars_all", + "testing/test_skipping.py::test_reportchars_all_error", + "testing/test_skipping.py::test_errors_in_xfail_skip_expressions", + "testing/test_skipping.py::test_xfail_skipif_with_globals", + "testing/test_skipping.py::test_default_markers", + "testing/test_skipping.py::test_xfail_test_setup_exception", + "testing/test_skipping.py::test_imperativeskip_on_xfail_test", + "testing/test_skipping.py::TestBooleanCondition::test_skipif", + "testing/test_skipping.py::TestBooleanCondition::test_skipif_noreason", + 
"testing/test_skipping.py::TestBooleanCondition::test_xfail", + "testing/test_skipping.py::test_xfail_item", + "testing/test_skipping.py::test_module_level_skip_error", + "testing/test_skipping.py::test_module_level_skip_with_allow_module_level", + "testing/test_skipping.py::test_invalid_skip_keyword_parameter", + "testing/test_skipping.py::test_mark_xfail_item", + "testing/test_skipping.py::test_summary_list_after_errors", + "testing/test_skipping.py::test_relpath_rootdir" + ], + "fail_to_pass_count": 2, + "pass_to_pass_count": 78, + "official_patch_path": "eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7490.patch", + "official_patch_sha256": "88a7af7e123619306d887c6a9bd1f905acc4872b454df914094c038b71356e30", + "official_test_patch_path": "eval/benchmarks/swe_verified/smoke/fixtures/official_patches/pytest-dev__pytest-7490.test.patch", + "official_test_patch_sha256": "fe5323cfe9d6be9648be22ffb13a95079c6e17c13f09987f4c1b8589d27c690c", + "judge": { + "prediction_key": "pytest-dev__pytest-7490", + "success_requirements": [ + "all fail_to_pass tests must pass", + "all pass_to_pass tests must remain passing" + ], + "prediction_format": { + "instance_id": "pytest-dev__pytest-7490", + "model_name_or_path": "string", + "model_patch": "unified diff string" + } + }, + "runtime": { + "checkout_url": "https://github.com/pytest-dev/pytest.git", + "working_directory": "/workspace/pytest", + "language": "python" + }, + "selection_rank": 3 + } + ] +} diff --git a/eval/benchmarks/swe_verified/smoke/rpc/export_request.json b/eval/benchmarks/swe_verified/smoke/rpc/export_request.json new file mode 100644 index 000000000..9bbc1eb6a --- /dev/null +++ b/eval/benchmarks/swe_verified/smoke/rpc/export_request.json @@ -0,0 +1,10 @@ +{ + "jsonrpc": "2.0", + "id": "swe-verified-smoke-export-preview", + "method": "eval.previewExport", + "params": { + "contract_path": "eval/benchmarks/swe_verified/smoke/export_contract.json", + "source_slice_path": 
"eval/benchmarks/swe_verified/smoke/manifest.json", + "judge_profile": "swe_verified_pytest_smoke_gold_v1" + } +} diff --git a/eval/benchmarks/swe_verified/smoke/rpc/export_response.json b/eval/benchmarks/swe_verified/smoke/rpc/export_response.json new file mode 100644 index 000000000..c8ec4b96a --- /dev/null +++ b/eval/benchmarks/swe_verified/smoke/rpc/export_response.json @@ -0,0 +1,10 @@ +{ + "jsonrpc": "2.0", + "id": "swe-verified-smoke-export-preview", + "result": { + "status": "prepared", + "contract_id": "mycel.swe_verified.export.v1", + "golden_fixture_path": "eval/benchmarks/swe_verified/smoke/export_golden.json", + "blocked_by": "task-12 export API implementation" + } +} diff --git a/eval/benchmarks/swe_verified/smoke/rpc/judge_request.json b/eval/benchmarks/swe_verified/smoke/rpc/judge_request.json new file mode 100644 index 000000000..2044b2c9f --- /dev/null +++ b/eval/benchmarks/swe_verified/smoke/rpc/judge_request.json @@ -0,0 +1,10 @@ +{ + "jsonrpc": "2.0", + "id": "swe-verified-smoke-judge-prepare", + "method": "eval.prepareJudgeRun", + "params": { + "benchmark": "swe_verified", + "judge_config_path": "eval/benchmarks/swe_verified/smoke/judge_config.json", + "evaluator_input_path": "eval/benchmarks/swe_verified/smoke/sample_evaluator_input.json" + } +} diff --git a/eval/benchmarks/swe_verified/smoke/rpc/judge_response.json b/eval/benchmarks/swe_verified/smoke/rpc/judge_response.json new file mode 100644 index 000000000..b20067a7e --- /dev/null +++ b/eval/benchmarks/swe_verified/smoke/rpc/judge_response.json @@ -0,0 +1,11 @@ +{ + "jsonrpc": "2.0", + "id": "swe-verified-smoke-judge-prepare", + "result": { + "status": "prepared", + "slice_id": "swe_verified_pytest_smoke_v1", + "instance_count": 3, + "predictions_path": "eval/benchmarks/swe_verified/smoke/sample_predictions.jsonl", + "blocked_by": "task-12 judge bridge implementation" + } +} diff --git a/eval/benchmarks/swe_verified/smoke/sample_evaluator_input.json 
b/eval/benchmarks/swe_verified/smoke/sample_evaluator_input.json new file mode 100644 index 000000000..cb3ca81ee --- /dev/null +++ b/eval/benchmarks/swe_verified/smoke/sample_evaluator_input.json @@ -0,0 +1,18 @@ +{ + "judge_profile": "swe_verified_pytest_smoke_gold_v1", + "slice_id": "swe_verified_pytest_smoke_v1", + "run_id": "sample-swe-verified-pytest-gold", + "max_workers": 1, + "dataset_name": "SWE-bench/SWE-bench_Verified", + "dataset_split": "test", + "dataset_revision": "91aa3ed51b709be6457e12d00300a6a596d4c6a3", + "repo": "pytest-dev/pytest", + "environment_setup_commit": "634cde9506eb1f48dec3ec77974ee8dc952207c6", + "instance_ids": [ + "pytest-dev__pytest-7521", + "pytest-dev__pytest-7571", + "pytest-dev__pytest-7490" + ], + "predictions_path": "eval/benchmarks/swe_verified/smoke/sample_predictions.jsonl", + "official_patch_mode": "gold-patch-from-dataset" +} diff --git a/eval/benchmarks/swe_verified/smoke/sample_predictions.jsonl b/eval/benchmarks/swe_verified/smoke/sample_predictions.jsonl new file mode 100644 index 000000000..898b503f7 --- /dev/null +++ b/eval/benchmarks/swe_verified/smoke/sample_predictions.jsonl @@ -0,0 +1,3 @@ +{"instance_id": "pytest-dev__pytest-7521", "model_name_or_path": "gold", "model_patch": "diff --git a/src/_pytest/capture.py b/src/_pytest/capture.py\n--- a/src/_pytest/capture.py\n+++ b/src/_pytest/capture.py\n@@ -388,6 +388,7 @@ def __init__(self, targetfd: int) -> None:\n TemporaryFile(buffering=0), # type: ignore[arg-type]\n encoding=\"utf-8\",\n errors=\"replace\",\n+ newline=\"\",\n write_through=True,\n )\n if targetfd in patchsysdict:\n"} +{"instance_id": "pytest-dev__pytest-7571", "model_name_or_path": "gold", "model_patch": "diff --git a/src/_pytest/logging.py b/src/_pytest/logging.py\n--- a/src/_pytest/logging.py\n+++ b/src/_pytest/logging.py\n@@ -345,6 +345,7 @@ def __init__(self, item: nodes.Node) -> None:\n \"\"\"Creates a new funcarg.\"\"\"\n self._item = item\n # dict of log name -> log level\n+ 
self._initial_handler_level = None # type: Optional[int]\n self._initial_logger_levels = {} # type: Dict[Optional[str], int]\n \n def _finalize(self) -> None:\n@@ -353,6 +354,8 @@ def _finalize(self) -> None:\n This restores the log levels changed by :meth:`set_level`.\n \"\"\"\n # restore log levels\n+ if self._initial_handler_level is not None:\n+ self.handler.setLevel(self._initial_handler_level)\n for logger_name, level in self._initial_logger_levels.items():\n logger = logging.getLogger(logger_name)\n logger.setLevel(level)\n@@ -434,6 +437,7 @@ def set_level(self, level: Union[int, str], logger: Optional[str] = None) -> Non\n # save the original log-level to restore it during teardown\n self._initial_logger_levels.setdefault(logger, logger_obj.level)\n logger_obj.setLevel(level)\n+ self._initial_handler_level = self.handler.level\n self.handler.setLevel(level)\n \n @contextmanager\n"} +{"instance_id": "pytest-dev__pytest-7490", "model_name_or_path": "gold", "model_patch": "diff --git a/src/_pytest/skipping.py b/src/_pytest/skipping.py\n--- a/src/_pytest/skipping.py\n+++ b/src/_pytest/skipping.py\n@@ -231,17 +231,14 @@ def evaluate_xfail_marks(item: Item) -> Optional[Xfail]:\n \n @hookimpl(tryfirst=True)\n def pytest_runtest_setup(item: Item) -> None:\n- item._store[skipped_by_mark_key] = False\n-\n skipped = evaluate_skip_marks(item)\n+ item._store[skipped_by_mark_key] = skipped is not None\n if skipped:\n- item._store[skipped_by_mark_key] = True\n skip(skipped.reason)\n \n- if not item.config.option.runxfail:\n- item._store[xfailed_key] = xfailed = evaluate_xfail_marks(item)\n- if xfailed and not xfailed.run:\n- xfail(\"[NOTRUN] \" + xfailed.reason)\n+ item._store[xfailed_key] = xfailed = evaluate_xfail_marks(item)\n+ if xfailed and not item.config.option.runxfail and not xfailed.run:\n+ xfail(\"[NOTRUN] \" + xfailed.reason)\n \n \n @hookimpl(hookwrapper=True)\n@@ -250,12 +247,16 @@ def pytest_runtest_call(item: Item) -> Generator[None, None, None]:\n if 
xfailed is None:\n item._store[xfailed_key] = xfailed = evaluate_xfail_marks(item)\n \n- if not item.config.option.runxfail:\n- if xfailed and not xfailed.run:\n- xfail(\"[NOTRUN] \" + xfailed.reason)\n+ if xfailed and not item.config.option.runxfail and not xfailed.run:\n+ xfail(\"[NOTRUN] \" + xfailed.reason)\n \n yield\n \n+ # The test run may have added an xfail mark dynamically.\n+ xfailed = item._store.get(xfailed_key, None)\n+ if xfailed is None:\n+ item._store[xfailed_key] = xfailed = evaluate_xfail_marks(item)\n+\n \n @hookimpl(hookwrapper=True)\n def pytest_runtest_makereport(item: Item, call: CallInfo[None]):\n"} diff --git a/eval/benchmarks/swe_verified/smoke/swe_verified_pytest_7490.yaml b/eval/benchmarks/swe_verified/smoke/swe_verified_pytest_7490.yaml new file mode 100644 index 000000000..7a0dfaeb2 --- /dev/null +++ b/eval/benchmarks/swe_verified/smoke/swe_verified_pytest_7490.yaml @@ -0,0 +1,47 @@ +id: swe_verified_pytest_7490 +name: "SWE-bench Verified smoke: pytest 7490" +category: swe +timeout_seconds: 30 +sandbox: local +messages: + - content: | + [[SWE_SMOKE::pytest-dev__pytest-7490::pass]] + Apply the smoke patch for pytest-dev__pytest-7490 and summarize the focused pytest result. 
+benchmark: + family: "SWE-bench Verified" + name: "SWE-bench/SWE-bench_Verified" + split: test + variant: smoke + instance_id: "pytest-dev__pytest-7490" + dataset_version: "91aa3ed51b709be6457e12d00300a6a596d4c6a3" + source_uri: "https://huggingface.co/datasets/SWE-bench/SWE-bench_Verified" +workspace: + cwd: "/workspace/pytest" + repo: "pytest-dev/pytest" + base_commit: "7f7a36478abe7dd1fa993b115d22606aa0e35e88" + env: + PYTHONUNBUFFERED: "1" +judge_config: + type: command + config: + command: + - ./.venv/bin/python + - -m + - eval.benchmarks.swe_verified.acceptance + - judge + - --instance-id + - pytest-dev__pytest-7490 + - --profile-id + - swe_verified_pytest_smoke_gold_v1 +artifact_policy: + include_final_response: true + include_benchmark_metadata: true + include_workspace_metadata: true + requested_artifacts: + - model_patch.diff + - test_output.log +export: + format: predictions_json + key: predictions_path + config: + profile: swe_verified_pytest_smoke_gold_v1 diff --git a/eval/benchmarks/swe_verified/smoke/swe_verified_pytest_7521.yaml b/eval/benchmarks/swe_verified/smoke/swe_verified_pytest_7521.yaml new file mode 100644 index 000000000..21a0540cc --- /dev/null +++ b/eval/benchmarks/swe_verified/smoke/swe_verified_pytest_7521.yaml @@ -0,0 +1,47 @@ +id: swe_verified_pytest_7521 +name: "SWE-bench Verified smoke: pytest 7521" +category: swe +timeout_seconds: 30 +sandbox: local +messages: + - content: | + [[SWE_SMOKE::pytest-dev__pytest-7521::pass]] + Apply the smoke patch for pytest-dev__pytest-7521 and summarize the focused pytest result. 
+benchmark: + family: "SWE-bench Verified" + name: "SWE-bench/SWE-bench_Verified" + split: test + variant: smoke + instance_id: "pytest-dev__pytest-7521" + dataset_version: "91aa3ed51b709be6457e12d00300a6a596d4c6a3" + source_uri: "https://huggingface.co/datasets/SWE-bench/SWE-bench_Verified" +workspace: + cwd: "/workspace/pytest" + repo: "pytest-dev/pytest" + base_commit: "41d211c24a6781843b174379d6d6538f5c17adb9" + env: + PYTHONUNBUFFERED: "1" +judge_config: + type: command + config: + command: + - ./.venv/bin/python + - -m + - eval.benchmarks.swe_verified.acceptance + - judge + - --instance-id + - pytest-dev__pytest-7521 + - --profile-id + - swe_verified_pytest_smoke_gold_v1 +artifact_policy: + include_final_response: true + include_benchmark_metadata: true + include_workspace_metadata: true + requested_artifacts: + - model_patch.diff + - test_output.log +export: + format: predictions_json + key: predictions_path + config: + profile: swe_verified_pytest_smoke_gold_v1 diff --git a/eval/benchmarks/swe_verified/smoke/swe_verified_pytest_7571.yaml b/eval/benchmarks/swe_verified/smoke/swe_verified_pytest_7571.yaml new file mode 100644 index 000000000..24976f20f --- /dev/null +++ b/eval/benchmarks/swe_verified/smoke/swe_verified_pytest_7571.yaml @@ -0,0 +1,47 @@ +id: swe_verified_pytest_7571 +name: "SWE-bench Verified smoke: pytest 7571" +category: swe +timeout_seconds: 30 +sandbox: local +messages: + - content: | + [[SWE_SMOKE::pytest-dev__pytest-7571::fail]] + Apply the smoke patch for pytest-dev__pytest-7571 and summarize the focused pytest result. 
+benchmark: + family: "SWE-bench Verified" + name: "SWE-bench/SWE-bench_Verified" + split: test + variant: smoke + instance_id: "pytest-dev__pytest-7571" + dataset_version: "91aa3ed51b709be6457e12d00300a6a596d4c6a3" + source_uri: "https://huggingface.co/datasets/SWE-bench/SWE-bench_Verified" +workspace: + cwd: "/workspace/pytest" + repo: "pytest-dev/pytest" + base_commit: "422685d0bdc110547535036c1ff398b5e1c44145" + env: + PYTHONUNBUFFERED: "1" +judge_config: + type: command + config: + command: + - ./.venv/bin/python + - -m + - eval.benchmarks.swe_verified.acceptance + - judge + - --instance-id + - pytest-dev__pytest-7571 + - --profile-id + - swe_verified_pytest_smoke_gold_v1 +artifact_policy: + include_final_response: true + include_benchmark_metadata: true + include_workspace_metadata: true + requested_artifacts: + - model_patch.diff + - test_output.log +export: + format: predictions_json + key: predictions_path + config: + profile: swe_verified_pytest_smoke_gold_v1 diff --git a/eval/benchmarks/swe_verified/verify_smoke_assets.py b/eval/benchmarks/swe_verified/verify_smoke_assets.py new file mode 100644 index 000000000..bdb460d6d --- /dev/null +++ b/eval/benchmarks/swe_verified/verify_smoke_assets.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +import argparse +import logging +import sys + +from eval.benchmarks.swe_verified.assets import ( + load_smoke_asset_bundle, + validate_official_dataset_alignment, + validate_smoke_assets, +) + +logger = logging.getLogger(__name__) + + +def main() -> int: + parser = argparse.ArgumentParser(description="Validate SWE-bench Verified smoke assets.") + parser.add_argument( + "--skip-official-dataset", + action="store_true", + help="Skip alignment checks against the upstream SWE-bench Verified dataset.", + ) + args = parser.parse_args() + + logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + bundle = load_smoke_asset_bundle() + logger.info("Loaded smoke asset bundle for slice %s", 
bundle.manifest.slice_id) + issues = validate_smoke_assets(bundle) + if not args.skip_official_dataset: + try: + issues.extend(validate_official_dataset_alignment(bundle)) + except ModuleNotFoundError as exc: + logger.error( + "Official dataset alignment requires the optional 'datasets' dependency. " + "Re-run with --skip-official-dataset or use an environment where 'datasets' is installed." + ) + logger.error("Original import failure: %s", exc) + return 2 + + if issues: + for issue in issues: + logger.error(issue) + return 1 + + logger.info( + "Validated SWE-bench Verified smoke assets for %s instances in slice %s.", + len(bundle.manifest.instances), + bundle.manifest.slice_id, + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/eval/exporter.py b/eval/exporter.py new file mode 100644 index 000000000..3e06540e6 --- /dev/null +++ b/eval/exporter.py @@ -0,0 +1,68 @@ +"""Evaluation export serializers.""" + +from __future__ import annotations + +from typing import Any + + +def build_batch_export( + *, + batch: dict[str, Any], + aggregate: dict[str, Any], + run_records: list[dict[str, Any]], + export_format: str, +) -> dict[str, Any]: + if export_format in {"predictions_json", "swe-bench-predictions"}: + return _build_predictions_export(batch=batch, aggregate=aggregate, run_records=run_records, export_format=export_format) + return _build_generic_export(batch=batch, aggregate=aggregate, run_records=run_records, export_format=export_format) + + +def _build_generic_export( + *, + batch: dict[str, Any], + aggregate: dict[str, Any], + run_records: list[dict[str, Any]], + export_format: str, +) -> dict[str, Any]: + return { + "schema_version": "1", + "format": export_format, + "batch": { + "batch_id": batch.get("batch_id"), + "kind": batch.get("kind"), + "status": batch.get("status"), + "config": batch.get("config_json") or {}, + }, + "aggregate": aggregate, + "runs": run_records, + } + + +def _build_predictions_export( + *, + batch: dict[str, 
Any], + aggregate: dict[str, Any], + run_records: list[dict[str, Any]], + export_format: str, +) -> dict[str, Any]: + predictions = [] + for record in run_records: + benchmark = record.get("benchmark") or {} + run = record.get("run") or {} + predictions.append( + { + "instance_id": benchmark.get("instance_id") or record.get("scenario_id"), + "prediction": run.get("final_response") or "", + "run_id": run.get("run_id"), + "scenario_id": record.get("scenario_id"), + "judge_verdict": (record.get("judge_result") or {}).get("verdict"), + "artifacts": record.get("artifacts") or [], + } + ) + return { + "schema_version": "1", + "format": export_format, + "batch_id": batch.get("batch_id"), + "aggregate": aggregate, + "predictions": predictions, + } diff --git a/eval/harness/runner.py b/eval/harness/runner.py index 518395f4f..9bd05a6ae 100644 --- a/eval/harness/runner.py +++ b/eval/harness/runner.py @@ -3,17 +3,21 @@ from __future__ import annotations import asyncio +import logging from datetime import UTC, datetime from typing import TYPE_CHECKING from eval.collector import MetricsCollector from eval.harness.client import EvalClient -from eval.models import EvalResult, EvalScenario, TrajectoryCapture +from eval.judge import build_judge +from eval.models import ArtifactPolicy, ArtifactRecord, EvalResult, EvalScenario, JudgeResult, TrajectoryCapture from eval.storage import TrajectoryStore if TYPE_CHECKING: from eval.models import RunTrajectory +logger = logging.getLogger(__name__) + class EvalRunner: """Run eval scenarios against a Mycel backend instance.""" @@ -32,7 +36,15 @@ def __init__( async def run_scenario(self, scenario: EvalScenario) -> EvalResult: """Execute a single scenario end-to-end.""" - thread_id = await self.client.create_thread(agent_user_id=self.agent_user_id, sandbox=scenario.sandbox) + cwd = scenario.workspace.cwd if scenario.workspace and scenario.workspace.cwd else None + logger.info( + "Starting eval scenario %s (benchmark=%s, instance=%s, cwd=%s)", + 
scenario.id, + scenario.benchmark.family if scenario.benchmark else "", + scenario.benchmark.instance_id if scenario.benchmark else "", + cwd, + ) + thread_id = await self.client.create_thread(agent_user_id=self.agent_user_id, sandbox=scenario.sandbox, cwd=cwd) captures: list[TrajectoryCapture] = [] started_at = datetime.now(UTC) primary_error: BaseException | None = None @@ -66,21 +78,48 @@ async def run_scenario(self, scenario: EvalScenario) -> EvalResult: # Compute metrics sys_metrics, obj_metrics = self.collector.compute_all(trajectory, runtime_status) + artifacts = self._build_artifacts(scenario, trajectory.final_response) + partial_result = EvalResult( + scenario_id=scenario.id, + trajectory=trajectory, + system_metrics=sys_metrics, + objective_metrics=obj_metrics, + benchmark=scenario.benchmark, + artifacts=artifacts, + export_config=scenario.export, + ) # Persist if store available if self.store: self.store.save_trajectory(trajectory) self.store.save_metrics(trajectory.id, "system", sys_metrics) self.store.save_metrics(trajectory.id, "objective", obj_metrics) + self.store.save_artifacts(trajectory.id, artifacts) + if scenario.benchmark is not None: + self.store.save_benchmark_info(trajectory.id, scenario.benchmark) - return EvalResult( - scenario_id=scenario.id, - trajectory=trajectory, - system_metrics=sys_metrics, - objective_metrics=obj_metrics, - ) + try: + judge_result = await self._evaluate_scenario(scenario, partial_result) + except Exception as exc: + judge_error = JudgeResult( + judge_type=scenario.judge_config.type if scenario.judge_config else "noop", + status="error", + verdict="error", + rationale=str(exc), + metadata={"scenario_id": scenario.id}, + ) + if self.store: + self.store.save_judge_result(trajectory.id, judge_error) + logger.exception("Judge evaluation failed for scenario %s", scenario.id) + raise RuntimeError(f"Judge evaluation failed for scenario {scenario.id}: {exc}") from exc + + if self.store: + 
self.store.save_judge_result(trajectory.id, judge_result) + + return partial_result.model_copy(update={"judge_result": judge_result}) except BaseException as exc: primary_error = exc + logger.exception("Eval scenario %s failed", scenario.id) raise finally: try: @@ -89,8 +128,64 @@ async def run_scenario(self, scenario: EvalScenario) -> EvalResult: if primary_error is not None: primary_error.add_note(f"Thread cleanup failed after primary eval error: {cleanup_exc}") else: + logger.exception("Eval scenario %s failed during thread cleanup", scenario.id) raise + async def _evaluate_scenario(self, scenario: EvalScenario, result: EvalResult) -> JudgeResult: + judge = build_judge(scenario.judge_config) + return await judge.evaluate(scenario, result) + + @staticmethod + def _build_artifacts(scenario: EvalScenario, final_response: str) -> list[ArtifactRecord]: + policy = scenario.artifact_policy or ArtifactPolicy() + artifacts: list[ArtifactRecord] = [] + captured_names: set[str] = set() + + if policy.include_final_response: + artifacts.append( + ArtifactRecord( + name="final-response", + kind="submission", + content=final_response, + mime_type="text/plain", + metadata={"scenario_id": scenario.id}, + ) + ) + captured_names.add("final-response") + if policy.include_benchmark_metadata and scenario.benchmark is not None: + artifacts.append( + ArtifactRecord( + name="benchmark-instance", + kind="benchmark-metadata", + metadata=scenario.benchmark.model_dump(mode="json"), + ) + ) + captured_names.add("benchmark-instance") + if policy.include_workspace_metadata and scenario.workspace is not None: + artifacts.append( + ArtifactRecord( + name="workspace", + kind="workspace-metadata", + metadata=scenario.workspace.model_dump(mode="json"), + ) + ) + captured_names.add("workspace") + for requested_artifact in policy.requested_artifacts: + if requested_artifact in captured_names: + continue + artifacts.append( + ArtifactRecord( + name=requested_artifact, + kind="requested-artifact", + 
metadata={ + "captured": False, + "status": "not_captured", + "reason": "core benchmark runner has no benchmark-specific artifact adapter yet", + }, + ) + ) + return artifacts + async def run_all( self, scenarios: list[EvalScenario], diff --git a/eval/harness/scenario.py b/eval/harness/scenario.py index eae88a27f..c9a0e2b7d 100644 --- a/eval/harness/scenario.py +++ b/eval/harness/scenario.py @@ -2,17 +2,18 @@ from __future__ import annotations +import os from pathlib import Path import yaml -from eval.models import EvalScenario, ScenarioMessage +from eval.models import ArtifactPolicy, BenchmarkInfo, EvalScenario, ExportConfig, JudgeConfig, ScenarioMessage, WorkspaceSpec def load_scenario(path: str | Path) -> EvalScenario: """Load a single scenario from a YAML file.""" path = Path(path) - with path.open() as f: + with path.open(encoding="utf-8") as f: raw = yaml.safe_load(f) messages = [ @@ -32,6 +33,15 @@ def load_scenario(path: str | Path) -> EvalScenario: messages=messages, expected_behaviors=raw.get("expected_behaviors", []), evaluation_criteria=raw.get("evaluation_criteria", []), + benchmark=BenchmarkInfo.model_validate(raw["benchmark"]) if raw.get("benchmark") else None, + workspace=WorkspaceSpec.model_validate(raw["workspace"]) if raw.get("workspace") else None, + judge_config=JudgeConfig.model_validate(raw.get("judge_config") or raw.get("judge")) + if raw.get("judge_config") or raw.get("judge") + else None, + artifact_policy=ArtifactPolicy.model_validate(raw.get("artifact_policy") or raw.get("artifacts")) + if raw.get("artifact_policy") or raw.get("artifacts") + else None, + export=ExportConfig.model_validate(raw["export"]) if raw.get("export") else None, ) @@ -42,3 +52,28 @@ def load_scenarios_from_dir(dir_path: str | Path) -> list[EvalScenario]: for yaml_file in sorted(dir_path.glob("*.yaml")): scenarios.append(load_scenario(yaml_file)) return scenarios + + +def load_scenarios_from_dirs(dir_paths: list[str | Path]) -> list[EvalScenario]: + """Load 
scenarios from multiple directories, preserving stable order and unique ids.""" + scenarios: list[EvalScenario] = [] + seen_ids: dict[str, Path] = {} + for raw_dir in dir_paths: + dir_path = Path(raw_dir) + if not dir_path.exists(): + continue + for yaml_file in sorted(dir_path.rglob("*.yaml")): + scenario = load_scenario(yaml_file) + existing = seen_ids.get(scenario.id) + if existing is not None: + raise ValueError(f"Duplicate evaluation scenario id {scenario.id!r} in {yaml_file} and {existing}") + seen_ids[scenario.id] = yaml_file + scenarios.append(scenario) + return scenarios + + +def parse_scenario_dirs(raw_value: str | None, *, default_dirs: list[Path]) -> list[Path]: + """Parse LEON_EVAL_SCENARIO_DIRS style path lists.""" + if not raw_value: + return list(default_dirs) + return [Path(part).expanduser() for part in raw_value.split(os.pathsep) if part.strip()] diff --git a/eval/judge.py b/eval/judge.py new file mode 100644 index 000000000..5cd1fb741 --- /dev/null +++ b/eval/judge.py @@ -0,0 +1,135 @@ +"""Judge registry and implementations for benchmark-aware evaluation.""" + +from __future__ import annotations + +import asyncio +import json +import logging +import os +import shlex +import subprocess +from collections.abc import Sequence + +from eval.models import EvalResult, EvalScenario, JudgeConfig, JudgeResult + +logger = logging.getLogger(__name__) + + +class NoopJudge: + async def evaluate(self, scenario: EvalScenario, result: EvalResult) -> JudgeResult: + verdict = "passed" if result.trajectory.status == "completed" else "failed" + return JudgeResult( + judge_type="noop", + status="completed", + verdict=verdict, + rationale="No judge configured; falling back to runtime completion status.", + scores={"completion": 1.0 if verdict == "passed" else 0.0}, + metadata={"scenario_id": scenario.id}, + ) + + +class HeuristicJudge: + def __init__(self, config: dict[str, object]) -> None: + self._config = config + + async def evaluate(self, scenario: EvalScenario, 
result: EvalResult) -> JudgeResult: + response = result.trajectory.final_response or "" + case_sensitive = bool(self._config.get("case_sensitive", False)) + threshold = float(self._config.get("pass_threshold", 1.0)) + required = list(self._config.get("required_substrings") or []) + if not required: + required = [*scenario.expected_behaviors, *scenario.evaluation_criteria] + + if not case_sensitive: + haystack = response.lower() + required = [str(item).lower() for item in required] + else: + haystack = response + required = [str(item) for item in required] + + if not required: + matched = 1 + total = 1 + else: + matched = sum(1 for item in required if item and item in haystack) + total = len(required) + score = matched / total if total else 0.0 + verdict = "passed" if score >= threshold else "failed" + return JudgeResult( + judge_type="heuristic", + status="completed", + verdict=verdict, + rationale=f"Matched {matched}/{total} required checks.", + scores={"pass_rate": score, "resolved": 1.0 if verdict == "passed" else 0.0}, + metadata={"required_checks": required, "pass_threshold": threshold}, + ) + + +class CommandJudge: + def __init__(self, config: dict[str, object]) -> None: + command = config.get("command") + if isinstance(command, str): + self._command = shlex.split(command) + elif isinstance(command, Sequence): + self._command = [str(item) for item in command] + else: + raise ValueError("command judge requires a non-empty command") + if not self._command: + raise ValueError("command judge requires a non-empty command") + self._cwd = str(config.get("cwd") or "").strip() or None + self._timeout_seconds = float(config.get("timeout_seconds") or 60) + self._env = {str(key): str(value) for key, value in dict(config.get("env") or {}).items()} + + async def evaluate(self, scenario: EvalScenario, result: EvalResult) -> JudgeResult: + payload = { + "scenario": { + "id": scenario.id, + "name": scenario.name, + "benchmark": scenario.benchmark.model_dump(mode="json") if 
scenario.benchmark else None, + "workspace": scenario.workspace.model_dump(mode="json") if scenario.workspace else None, + }, + "result": { + "run_id": result.trajectory.id, + "thread_id": result.trajectory.thread_id, + "status": result.trajectory.status, + "final_response": result.trajectory.final_response, + "artifacts": [artifact.model_dump(mode="json") for artifact in result.artifacts], + }, + } + completed = await asyncio.to_thread( + subprocess.run, + self._command, + input=json.dumps(payload), + text=True, + capture_output=True, + cwd=self._cwd, + env={**os.environ, **self._env}, + timeout=self._timeout_seconds, + check=False, + ) + if completed.returncode != 0: + logger.error("Command judge failed for scenario %s with exit code %s", scenario.id, completed.returncode) + raise RuntimeError(f"Judge command failed with exit={completed.returncode}: {(completed.stderr or completed.stdout).strip()}") + + stdout = (completed.stdout or "").strip() + parsed = json.loads(stdout) if stdout else {} + if parsed and not isinstance(parsed, dict): + raise RuntimeError("Judge command must emit a JSON object on stdout") + return JudgeResult( + judge_type="command", + status=str(parsed.get("status") or "completed"), + verdict=str(parsed.get("verdict") or "unknown"), + rationale=str(parsed.get("rationale") or ""), + scores={str(key): float(value) for key, value in dict(parsed.get("scores") or {}).items()}, + metadata=dict(parsed.get("metadata") or {}), + ) + + +def build_judge(judge_config: JudgeConfig | None) -> NoopJudge | HeuristicJudge | CommandJudge: + if judge_config is None or judge_config.type == "noop": + return NoopJudge() + if judge_config.type == "heuristic": + return HeuristicJudge(judge_config.config) + if judge_config.type == "command": + return CommandJudge(judge_config.config) + raise ValueError(f"Unsupported evaluation judge type: {judge_config.type}") diff --git a/eval/models.py b/eval/models.py index 91e49021a..9067564a6 100644 --- a/eval/models.py +++ 
b/eval/models.py @@ -1,5 +1,6 @@ from __future__ import annotations +from typing import Any from uuid import uuid4 from pydantic import BaseModel, Field @@ -95,6 +96,43 @@ class ScenarioMessage(BaseModel): delay_seconds: float = 0.0 +class BenchmarkInfo(BaseModel): + family: str = "" + name: str = "" + split: str = "" + variant: str = "" + instance_id: str = "" + dataset_version: str = "" + tags: list[str] = Field(default_factory=list) + source_uri: str = "" + + +class WorkspaceSpec(BaseModel): + cwd: str | None = None + repo: str | None = None + base_commit: str | None = None + env: dict[str, str] = Field(default_factory=dict) + setup_commands: list[str] = Field(default_factory=list) + + +class JudgeConfig(BaseModel): + type: str = "noop" + config: dict[str, Any] = Field(default_factory=dict) + + +class ArtifactPolicy(BaseModel): + include_final_response: bool = True + include_benchmark_metadata: bool = True + include_workspace_metadata: bool = True + requested_artifacts: list[str] = Field(default_factory=list) + + +class ExportConfig(BaseModel): + format: str = "generic_json" + key: str = "" + config: dict[str, Any] = Field(default_factory=dict) + + class EvalScenario(BaseModel): id: str name: str @@ -104,6 +142,11 @@ class EvalScenario(BaseModel): messages: list[ScenarioMessage] = Field(default_factory=list) expected_behaviors: list[str] = Field(default_factory=list) evaluation_criteria: list[str] = Field(default_factory=list) + benchmark: BenchmarkInfo | None = None + workspace: WorkspaceSpec | None = None + judge_config: JudgeConfig | None = None + artifact_policy: ArtifactPolicy | None = None + export: ExportConfig | None = None # --- SSE stream capture --- @@ -121,8 +164,30 @@ class TrajectoryCapture(BaseModel): # --- Eval result --- +class ArtifactRecord(BaseModel): + name: str + kind: str + content: str | None = None + path: str | None = None + mime_type: str | None = None + metadata: dict[str, Any] = Field(default_factory=dict) + + +class 
JudgeResult(BaseModel): + judge_type: str = "noop" + status: str = "not_run" + verdict: str = "unknown" + rationale: str = "" + scores: dict[str, float] = Field(default_factory=dict) + metadata: dict[str, Any] = Field(default_factory=dict) + + class EvalResult(BaseModel): scenario_id: str trajectory: RunTrajectory system_metrics: SystemMetrics = Field(default_factory=SystemMetrics) objective_metrics: ObjectiveMetrics = Field(default_factory=ObjectiveMetrics) + benchmark: BenchmarkInfo | None = None + judge_result: JudgeResult | None = None + artifacts: list[ArtifactRecord] = Field(default_factory=list) + export_config: ExportConfig | None = None diff --git a/eval/storage.py b/eval/storage.py index 0721be254..5fcd1fcd8 100644 --- a/eval/storage.py +++ b/eval/storage.py @@ -4,8 +4,14 @@ import json from datetime import UTC +from typing import Any + +from pydantic import BaseModel from eval.models import ( + ArtifactRecord, + BenchmarkInfo, + JudgeResult, ObjectiveMetrics, RunTrajectory, SystemMetrics, @@ -69,16 +75,17 @@ def save_metrics( self, run_id: str, tier: str, - metrics: SystemMetrics | ObjectiveMetrics, + metrics: SystemMetrics | ObjectiveMetrics | BaseModel | dict[str, Any] | list[Any], ) -> None: """Save computed metrics for a run.""" from datetime import datetime + metrics_json = metrics.model_dump_json() if isinstance(metrics, BaseModel) else json.dumps(metrics) self._repo.save_metrics( run_id=run_id, tier=tier, timestamp=datetime.now(UTC).isoformat(), - metrics_json=metrics.model_dump_json(), + metrics_json=metrics_json, ) def get_trajectory(self, run_id: str) -> RunTrajectory | None: @@ -106,3 +113,49 @@ def get_metrics(self, run_id: str, tier: str | None = None) -> list[dict]: del d["metrics_json"] result.append(d) return result + + def save_artifacts(self, run_id: str, artifacts: list[ArtifactRecord] | list[dict[str, Any]]) -> None: + payload = [ + artifact.model_dump(mode="json") + if isinstance(artifact, ArtifactRecord) + else 
ArtifactRecord.model_validate(artifact).model_dump(mode="json") + for artifact in artifacts + ] + self.save_metrics(run_id, "artifacts", payload) + + def get_artifacts(self, run_id: str) -> list[ArtifactRecord]: + payload = self.get_latest_payload(run_id, "artifacts") + if not payload: + return [] + if not isinstance(payload, list): + raise RuntimeError("Expected artifacts payload to be a list") + return [ArtifactRecord.model_validate(item) for item in payload] + + def save_judge_result(self, run_id: str, judge_result: JudgeResult | dict[str, Any]) -> None: + payload = judge_result if isinstance(judge_result, JudgeResult) else JudgeResult.model_validate(judge_result) + self.save_metrics(run_id, "judge", payload) + + def get_judge_result(self, run_id: str) -> JudgeResult | None: + payload = self.get_latest_payload(run_id, "judge") + if payload is None: + return None + if not isinstance(payload, dict): + raise RuntimeError("Expected judge payload to be an object") + return JudgeResult.model_validate(payload) + + def save_benchmark_info(self, run_id: str, benchmark: BenchmarkInfo) -> None: + self.save_metrics(run_id, "benchmark", benchmark) + + def get_benchmark_info(self, run_id: str) -> BenchmarkInfo | None: + payload = self.get_latest_payload(run_id, "benchmark") + if payload is None: + return None + if not isinstance(payload, dict): + raise RuntimeError("Expected benchmark payload to be an object") + return BenchmarkInfo.model_validate(payload) + + def get_latest_payload(self, run_id: str, tier: str) -> Any: + rows = self.get_metrics(run_id, tier=tier) + if not rows: + return None + return rows[-1].get("metrics") diff --git a/frontend/monitor/.gitignore b/frontend/monitor/.gitignore index 4f3a35af7..36b52bfc2 100644 --- a/frontend/monitor/.gitignore +++ b/frontend/monitor/.gitignore @@ -1,4 +1,5 @@ node_modules dist .DS_Store - +playwright-report +test-results diff --git a/frontend/monitor/package-lock.json b/frontend/monitor/package-lock.json index 
f578ff02d..a36cdd07e 100644 --- a/frontend/monitor/package-lock.json +++ b/frontend/monitor/package-lock.json @@ -17,6 +17,7 @@ "@types/react": "^19.2.5", "@types/react-dom": "^19.2.3", "@vitejs/plugin-react": "^5.1.1", + "playwright": "^1.59.1", "typescript": "~5.9.3", "vite": "^7.2.4", "vitest": "^3.2.4" @@ -1814,6 +1815,53 @@ "url": "https://github.com/sponsors/jonschlinkert" } }, + "node_modules/playwright": { + "version": "1.59.1", + "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.59.1.tgz", + "integrity": "sha512-C8oWjPR3F81yljW9o5OxcWzfh6avkVwDD2VYdwIGqTkl+OGFISgypqzfu7dOe4QNLL2aqcWBmI3PMtLIK233lw==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "playwright-core": "1.59.1" + }, + "bin": { + "playwright": "cli.js" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "fsevents": "2.3.2" + } + }, + "node_modules/playwright-core": { + "version": "1.59.1", + "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.59.1.tgz", + "integrity": "sha512-HBV/RJg81z5BiiZ9yPzIiClYV/QMsDCKUyogwH9p3MCP6IYjUFu/MActgYAvK0oWyV9NlwM3GLBjADyWgydVyg==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "playwright-core": "cli.js" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/playwright/node_modules/fsevents": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", + "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, "node_modules/postcss": { "version": "8.5.6", "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.6.tgz", diff --git a/frontend/monitor/package.json b/frontend/monitor/package.json index 04c70a665..5110835cd 100644 --- a/frontend/monitor/package.json +++ b/frontend/monitor/package.json @@ 
-7,7 +7,8 @@ "dev": "vite", "build": "tsc --noEmit && vite build", "preview": "vite preview", - "test": "vitest" + "test": "vitest", + "test:e2e": "playwright test" }, "dependencies": { "lucide-react": "^0.562.0", @@ -19,6 +20,7 @@ "@types/react": "^19.2.5", "@types/react-dom": "^19.2.3", "@vitejs/plugin-react": "^5.1.1", + "playwright": "^1.59.1", "typescript": "~5.9.3", "vite": "^7.2.4", "vitest": "^3.2.4" diff --git a/frontend/monitor/playwright.acceptance.config.ts b/frontend/monitor/playwright.acceptance.config.ts new file mode 100644 index 000000000..8637eccce --- /dev/null +++ b/frontend/monitor/playwright.acceptance.config.ts @@ -0,0 +1,36 @@ +import path from "node:path"; +import { fileURLToPath } from "node:url"; + +import { defineConfig } from "playwright/test"; +const configDir = path.dirname(fileURLToPath(import.meta.url)); +const repoRoot = path.resolve(configDir, "../.."); +const harnessPort = Number(process.env.LEON_MONITOR_ACCEPTANCE_BACKEND_PORT || 8766); +const monitorPort = Number(process.env.LEON_MONITOR_ACCEPTANCE_FRONTEND_PORT || 4175); + +export default defineConfig({ + testDir: "./playwright", + testMatch: /.*acceptance\.e2e\.ts/, + timeout: 60_000, + workers: 1, + use: { + baseURL: `http://127.0.0.1:${monitorPort}`, + trace: "retain-on-failure", + screenshot: "only-on-failure", + }, + webServer: [ + { + command: `${path.join(repoRoot, ".venv", "bin", "python")} -m eval.benchmarks.swe_verified.acceptance serve --port ${harnessPort}`, + cwd: repoRoot, + port: harnessPort, + reuseExistingServer: true, + timeout: 30_000, + }, + { + command: `LEON_BACKEND_PORT=${harnessPort} LEON_MONITOR_BACKEND_PORT=${harnessPort} npm run dev -- --host 127.0.0.1 --port ${monitorPort}`, + cwd: path.join(repoRoot, "frontend", "monitor"), + port: monitorPort, + reuseExistingServer: true, + timeout: 30_000, + }, + ], +}); diff --git a/frontend/monitor/playwright.config.ts b/frontend/monitor/playwright.config.ts new file mode 100644 index 000000000..63c01628a --- 
/dev/null +++ b/frontend/monitor/playwright.config.ts @@ -0,0 +1,40 @@ +import path from "node:path"; +import { fileURLToPath } from "node:url"; + +import { defineConfig } from "playwright/test"; + +import { loadMonitorPorts } from "./dev-ports"; + +const monitorPorts = loadMonitorPorts(); +const configDir = path.dirname(fileURLToPath(import.meta.url)); +const repoRoot = path.resolve(configDir, "../.."); +const harnessPort = Number(process.env.LEON_MONITOR_BACKEND_PORT || process.env.LEON_BACKEND_PORT || 8001); +const monitorPort = monitorPorts.devPort; + +export default defineConfig({ + testDir: "./playwright", + testMatch: /.*\.e2e\.ts/, + timeout: 60_000, + workers: 1, + use: { + baseURL: `http://127.0.0.1:${monitorPort}`, + trace: "retain-on-failure", + screenshot: "only-on-failure", + }, + webServer: [ + { + command: `${path.join(repoRoot, ".venv", "bin", "python")} frontend/monitor/playwright/monitor_evaluation_harness.py --port ${harnessPort}`, + cwd: repoRoot, + port: harnessPort, + reuseExistingServer: true, + timeout: 30_000, + }, + { + command: `LEON_BACKEND_PORT=${harnessPort} LEON_MONITOR_BACKEND_PORT=${harnessPort} npm run dev -- --host 127.0.0.1 --port ${monitorPort}`, + cwd: path.join(repoRoot, "frontend", "monitor"), + port: monitorPort, + reuseExistingServer: true, + timeout: 30_000, + }, + ], +}); diff --git a/frontend/monitor/playwright/evaluation.acceptance.e2e.ts b/frontend/monitor/playwright/evaluation.acceptance.e2e.ts new file mode 100644 index 000000000..14f470272 --- /dev/null +++ b/frontend/monitor/playwright/evaluation.acceptance.e2e.ts @@ -0,0 +1,53 @@ +import { expect, test, type Page } from "playwright/test"; + +const MONITOR_TOKEN_KEY = "leon-monitor-token"; + +test.beforeEach(async ({ page }) => { + await page.addInitScript((tokenKey) => { + window.localStorage.setItem(tokenKey, "token-1"); + }, MONITOR_TOKEN_KEY); +}); + +function field(page: Page, label: string) { + return page.locator(".evaluation-create-form__field").filter({ 
hasText: label }); +} + +test("creates and starts a benchmark batch against the acceptance harness", async ({ page }) => { + await page.goto("/evaluation"); + + await expect(page.getByRole("heading", { level: 1, name: "Evaluation" })).toBeVisible(); + await field(page, "Agent user id").locator("input").fill("agent-1"); + await field(page, "Family").locator("select").selectOption("SWE-bench Verified"); + await field(page, "Judge profile").locator("select").selectOption("command"); + await field(page, "Export profile").locator("select").selectOption("predictions_json"); + await page.getByRole("button", { name: /swe_verified_pytest_7521/i }).click(); + await page.getByRole("button", { name: /swe_verified_pytest_7571/i }).click(); + await page.getByRole("button", { name: "Create batch" }).click(); + + await expect(page).toHaveURL(/\/evaluation\/batches\/eval-batch-/); + await expect(page.getByRole("heading", { name: /Evaluation Batch eval-batch-/ })).toBeVisible(); + + await page.getByRole("button", { name: "Start evaluation batch" }).click(); + await expect(page.getByText("Batch execution scheduled.")).toBeVisible(); + await page.waitForTimeout(1000); + await page.reload(); + + const runLink = page.locator("table tbody a").first(); + await expect(runLink).toBeVisible(); + await runLink.click(); + + await expect(page.getByRole("heading", { name: /Evaluation Run/ })).toBeVisible(); + await expect(page.getByRole("heading", { name: "Artifact Viewer" })).toBeVisible(); + await expect(page.getByRole("heading", { name: "Raw Trace" })).toBeVisible(); + await expect(page.getByText("Conversation", { exact: true })).toBeVisible(); + await expect(page.getByText("Events", { exact: true })).toBeVisible(); + await expect(page.getByRole("heading", { name: "model_patch.diff", exact: true })).toBeVisible(); + await expect(page.getByText("SWE-bench Verified", { exact: true })).toBeVisible(); +}); + +test("renders the backend 404 boundary for a missing evaluation run", async ({ page }) 
=> { + await page.goto("/evaluation/runs/missing-run"); + + await expect(page.getByRole("heading", { name: "Evaluation run missing-run: Request failed" })).toBeVisible(); + await expect(page.getByText(/Evaluation run not found: missing-run/)).toBeVisible(); +}); diff --git a/frontend/monitor/playwright/evaluation.e2e.ts b/frontend/monitor/playwright/evaluation.e2e.ts new file mode 100644 index 000000000..95e48f8f2 --- /dev/null +++ b/frontend/monitor/playwright/evaluation.e2e.ts @@ -0,0 +1,66 @@ +import { expect, test } from "playwright/test"; + +const MONITOR_TOKEN_KEY = "leon-monitor-token"; + +test.beforeEach(async ({ page }) => { + await page.addInitScript((tokenKey) => { + window.localStorage.setItem(tokenKey, "token-1"); + }, MONITOR_TOKEN_KEY); +}); + +test("creates a benchmark batch and opens trace-rich run detail", async ({ page }) => { + await page.goto("/evaluation"); + + await expect(page.getByRole("heading", { level: 1, name: "Evaluation" })).toBeVisible(); + await page.getByLabel("Agent user id").fill("agent-1"); + await page.getByLabel("Family").selectOption("SWE-bench Verified"); + await page.getByLabel("Judge profile").selectOption("swe_verified_smoke"); + await page.getByLabel("Export profile").selectOption("predictions_jsonl"); + await page.getByRole("button", { name: /swe_verified_pytest_7521/i }).click(); + await page.getByRole("button", { name: "Create batch" }).click(); + + await expect(page).toHaveURL(/\/evaluation\/batches\/eval-batch-created-\d+$/); + await expect(page.getByRole("heading", { name: /Evaluation Batch eval-batch-created-/ })).toBeVisible(); + await expect(page.getByText("SWE-bench Verified").first()).toBeVisible(); + + await page.getByRole("button", { name: "Start evaluation batch" }).click(); + await expect(page.getByText("Batch execution scheduled.")).toBeVisible(); + await expect(page.getByRole("link", { name: /created-run-\d+/ })).toBeVisible(); + + await page.getByRole("link", { name: /created-run-\d+/ 
}).first().click(); + + await expect(page.getByRole("heading", { name: /Evaluation Run created-run-/ })).toBeVisible(); + await expect(page.getByRole("heading", { name: "Artifact Viewer" })).toBeVisible(); + await expect(page.getByRole("heading", { name: "Raw Trace" })).toBeVisible(); + await expect(page.getByText("Smoke harness verdict generated by the monitor Playwright fixture.").first()).toBeVisible(); + await expect(page.getByText("Inspecting repository checkout")).toBeVisible(); + await expect(page.getByText("model_patch.diff")).toBeVisible(); +}); + +test("shows regression compare output and downloads a live export payload", async ({ page }) => { + await page.goto("/evaluation"); + + await page.getByLabel("Baseline batch").selectOption("batch-candidate"); + await page.getByLabel("Candidate batch").selectOption("batch-baseline"); + await page.getByRole("button", { name: "Compare batches" }).click(); + + await expect(page.getByText("Regression detected")).toBeVisible(); + await expect(page.getByRole("cell", { name: "Pass Rate" })).toBeVisible(); + await expect(page.getByRole("cell", { name: "-50.0%" }).first()).toBeVisible(); + + await page.goto("/evaluation/batches/batch-candidate"); + const downloadPromise = page.waitForEvent("download"); + await page.getByRole("button", { name: "Download export" }).click(); + const download = await downloadPromise; + + expect(download.suggestedFilename()).toBe("batch-candidate-predictions_jsonl.json"); + await expect(page.getByText("Downloaded batch-candidate-predictions_jsonl.json")).toBeVisible(); + await expect(page.getByText("\"run_records\"")).toBeVisible(); +}); + +test("renders backend failures instead of hanging when a run is missing", async ({ page }) => { + await page.goto("/evaluation/runs/missing-run"); + + await expect(page.getByRole("heading", { name: "Evaluation run missing-run: Request failed" })).toBeVisible(); + await expect(page.getByText("Evaluation run not found: missing-run")).toBeVisible(); +}); diff 
--git a/frontend/monitor/playwright/monitor_evaluation_harness.py b/frontend/monitor/playwright/monitor_evaluation_harness.py new file mode 100644 index 000000000..08584dfb9 --- /dev/null +++ b/frontend/monitor/playwright/monitor_evaluation_harness.py @@ -0,0 +1,593 @@ +from __future__ import annotations + +import copy +import sys +from pathlib import Path +from typing import Any + +import uvicorn +from fastapi import FastAPI + +REPO_ROOT = Path(__file__).resolve().parents[3] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from backend.monitor.api.http import router as monitor_router # noqa: E402 +from backend.monitor.infrastructure.web import gateway as monitor_gateway # noqa: E402 + + +def _scenario( + scenario_id: str, + *, + name: str, + instance_id: str, + base_commit: str, + rank: int, +) -> dict[str, Any]: + return { + "scenario_id": scenario_id, + "name": name, + "category": "swe", + "sandbox": "local", + "message_count": 1, + "timeout_seconds": 120, + "benchmark": { + "family": "SWE-bench Verified", + "name": "SWE-bench/SWE-bench_Verified", + "split": "test", + "variant": "smoke", + "instance_id": instance_id, + "dataset_version": "91aa3ed51b709be6457e12d00300a6a596d4c6a3", + "source_uri": "https://huggingface.co/datasets/SWE-bench/SWE-bench_Verified", + }, + "workspace": { + "cwd": "/workspace/pytest", + "repo": "pytest-dev/pytest", + "base_commit": base_commit, + "env": {"PYTHONUNBUFFERED": "1"}, + "setup_commands": ["python -m pip install -e ."], + }, + "judge_type": "swe_verified_smoke", + "export_format": "predictions_jsonl", + "_rank": rank, + } + + +class HarnessState: + def __init__(self) -> None: + self._batch_counter = 1 + self._run_counter = 1 + self.scenarios = [ + _scenario( + "swe_verified_pytest_7521", + name="pytest capfd carriage return regression", + instance_id="pytest-dev__pytest-7521", + base_commit="41d211c24a6781843b174379d6d6538f5c17adb9", + rank=1, + ), + _scenario( + "swe_verified_pytest_7571", + 
name="pytest caplog level restore regression", + instance_id="pytest-dev__pytest-7571", + base_commit="422685d0bdc110547535036c1ff398b5e1c44145", + rank=2, + ), + ] + self.batches: dict[str, dict[str, Any]] = {} + self.runs: dict[str, dict[str, Any]] = {} + self.threads: dict[str, dict[str, Any]] = {} + self.run_order: list[str] = [] + self._seed_completed_batches() + + def _seed_completed_batches(self) -> None: + self._create_seed_batch("batch-baseline", pass_count=1) + self._create_seed_batch("batch-candidate", pass_count=2) + + def _create_seed_batch(self, batch_id: str, *, pass_count: int) -> None: + scenario_refs = [self._scenario_ref(item) for item in self.scenarios] + runs: list[dict[str, Any]] = [] + for index, scenario in enumerate(self.scenarios, start=1): + verdict = "passed" if index <= pass_count else "failed" + status = "completed" if verdict == "passed" else "failed" + run_id = f"{batch_id}-run-{index}" + thread_id = f"{batch_id}-thread-{index}" + batch_run_id = f"{batch_id}-batch-run-{index}" + run_detail = self._build_run_detail( + batch_id=batch_id, + batch_run_id=batch_run_id, + run_id=run_id, + thread_id=thread_id, + scenario=scenario, + verdict=verdict, + status=status, + ) + self.runs[run_id] = run_detail + self.threads[thread_id] = self._build_thread_detail(thread_id, run_id, scenario, verdict) + self.run_order.insert(0, run_id) + runs.append( + { + "batch_run_id": batch_run_id, + "scenario_id": scenario["scenario_id"], + "status": status, + "thread_id": thread_id, + "eval_run_id": run_id, + "started_at": "2026-04-21T10:00:00Z", + "finished_at": "2026-04-21T10:02:00Z", + "summary_json": self._batch_run_summary(scenario, verdict), + } + ) + + summary = self._batch_summary(runs) + self.batches[batch_id] = { + "batch": { + "batch_id": batch_id, + "kind": "benchmark_batch", + "status": "completed", + "submitted_by_user_id": "owner-1", + "agent_user_id": "agent-1", + "created_at": "2026-04-21T09:58:00Z", + "updated_at": "2026-04-21T10:02:00Z", + 
"config_json": { + "scenario_ids": [item["scenario_id"] for item in self.scenarios], + "sandbox": "local", + "max_concurrent": 2, + "scenario_refs": scenario_refs, + }, + "summary_json": summary, + }, + "runs": runs, + } + + def _scenario_ref(self, scenario: dict[str, Any]) -> dict[str, Any]: + return { + "scenario_id": scenario["scenario_id"], + "name": scenario["name"], + "category": scenario["category"], + "sandbox": scenario["sandbox"], + "benchmark": scenario["benchmark"], + "workspace": scenario["workspace"], + "judge_config": {"type": scenario["judge_type"], "config": {"profile_id": "swe_verified_smoke"}}, + "export": { + "format": scenario["export_format"], + "key": "predictions_path", + "config": {"profile": "swe_verified_smoke"}, + }, + } + + def _batch_run_summary(self, scenario: dict[str, Any], verdict: str) -> dict[str, Any]: + return { + "instance_id": scenario["benchmark"]["instance_id"], + "benchmark_family": scenario["benchmark"]["family"], + "benchmark_split": scenario["benchmark"]["split"], + "judge_type": scenario["judge_type"], + "judge_verdict": verdict, + "export_format": scenario["export_format"], + "export_key": "predictions_path", + "artifact_count": 3, + "total_tokens": 1200 if verdict == "passed" else 1500, + } + + def _batch_summary(self, runs: list[dict[str, Any]]) -> dict[str, Any]: + total_runs = len(runs) + completed_runs = sum(1 for row in runs if row["status"] == "completed") + failed_runs = sum(1 for row in runs if row["status"] == "failed") + passed_runs = sum(1 for row in runs if row.get("summary_json", {}).get("judge_verdict") == "passed") + failed_judges = sum(1 for row in runs if row.get("summary_json", {}).get("judge_verdict") == "failed") + total_tokens = sum(int(row.get("summary_json", {}).get("total_tokens") or 0) for row in runs) + artifacts = sum(int(row.get("summary_json", {}).get("artifact_count") or 0) for row in runs) + return { + "total_runs": total_runs, + "running_runs": 0, + "completed_runs": completed_runs, + 
"failed_runs": failed_runs, + "judge_passed_runs": passed_runs, + "judge_failed_runs": failed_judges, + "pass_rate": passed_runs / total_runs if total_runs else 0.0, + "avg_total_tokens": total_tokens / total_runs if total_runs else 0.0, + "artifact_count_total": artifacts, + "avg_scores": {"resolved": passed_runs / total_runs if total_runs else 0.0}, + "benchmark_families": ["SWE-bench Verified"], + "benchmark_splits": ["test"], + } + + def _build_run_detail( + self, + *, + batch_id: str, + batch_run_id: str, + run_id: str, + thread_id: str, + scenario: dict[str, Any], + verdict: str, + status: str, + ) -> dict[str, Any]: + benchmark = copy.deepcopy(scenario["benchmark"]) + judge_result = { + "judge_type": scenario["judge_type"], + "status": "completed", + "verdict": verdict, + "rationale": "Smoke harness verdict generated by the monitor Playwright fixture.", + "scores": {"resolved": 1.0 if verdict == "passed" else 0.0}, + "metadata": {"fixture": True}, + } + artifacts = [ + { + "name": "model_patch.diff", + "kind": "patch", + "mime_type": "text/x-diff", + "content": "diff --git a/testing/test_capture.py b/testing/test_capture.py\n+ assert out.endswith('\\r')\n", + "metadata": {"instance_id": benchmark["instance_id"]}, + }, + { + "name": "test_output.log", + "kind": "test_log", + "mime_type": "text/plain", + "content": "pytest testing/test_capture.py -q\n1 passed\n" + if verdict == "passed" + else "pytest testing/test_capture.py -q\n1 failed\n", + "metadata": {"verdict": verdict}, + }, + { + "name": "judge_result.json", + "kind": "judge_result", + "mime_type": "application/json", + "content": None, + "metadata": judge_result, + }, + ] + facts = [ + {"label": "Metric Tiers", "value": "2"}, + {"label": "Total tokens", "value": "1200" if verdict == "passed" else "1500"}, + {"label": "LLM calls", "value": "3"}, + {"label": "Tool calls", "value": "2"}, + {"label": "Judge verdict", "value": verdict}, + {"label": "Artifacts", "value": str(len(artifacts))}, + ] + return { 
+ "run": { + "run_id": run_id, + "thread_id": thread_id, + "status": status, + "started_at": "2026-04-21T10:00:00Z", + "finished_at": "2026-04-21T10:02:00Z", + "user_message": f"Fix {benchmark['instance_id']}", + "final_response": "Prepared a patch, ran focused tests, and summarized the result.", + "artifact_count": len(artifacts), + "benchmark": benchmark, + "judge_result": judge_result, + }, + "facts": facts, + "batch_run": { + "batch_run_id": batch_run_id, + "batch_id": batch_id, + "scenario_id": scenario["scenario_id"], + }, + "limitations": [], + "judge_result": judge_result, + "artifacts": artifacts, + "benchmark": benchmark, + } + + def _build_thread_detail(self, thread_id: str, run_id: str, scenario: dict[str, Any], verdict: str) -> dict[str, Any]: + return { + "thread": {"thread_id": thread_id}, + "trajectory": { + "run_id": run_id, + "conversation": [ + {"role": "user", "content": f"Investigate {scenario['benchmark']['instance_id']}"}, + {"role": "assistant", "content": "Inspecting pytest capture behavior and preparing a minimal patch."}, + ], + "events": [ + { + "seq": 1, + "run_id": run_id, + "event_type": "assistant_text", + "actor": "assistant", + "summary": "Inspecting repository checkout", + "payload": {"content": "Opened testing/test_capture.py"}, + }, + { + "seq": 2, + "run_id": run_id, + "event_type": "tool_call", + "actor": "tool", + "summary": "exec_command", + "payload": {"cmd": "pytest testing/test_capture.py -q"}, + }, + { + "seq": 3, + "run_id": run_id, + "event_type": "tool_result", + "actor": "tool", + "summary": "pytest result", + "payload": {"content": "1 passed" if verdict == "passed" else "1 failed"}, + }, + ], + }, + } + + def _run_row(self, run_id: str) -> dict[str, Any]: + detail = self.runs[run_id] + run = detail["run"] + return { + "run_id": run["run_id"], + "thread_id": run["thread_id"], + "status": run["status"], + "started_at": run["started_at"], + "finished_at": run["finished_at"], + "user_message": run["user_message"], + 
"facts": detail["facts"], + } + + def workbench(self) -> dict[str, Any]: + run_rows = [self._run_row(run_id) for run_id in self.run_order[:6]] + return { + "headline": "Evaluation Workbench", + "summary": "Harness-backed monitor evaluation workbench.", + "overview": { + "total_runs": len(run_rows), + "running_runs": 0, + "completed_runs": sum(1 for row in run_rows if row["status"] == "completed"), + "failed_runs": sum(1 for row in run_rows if row["status"] == "failed"), + }, + "runs": run_rows, + "selected_run": run_rows[0] if run_rows else None, + "limitations": [], + } + + def list_batches(self, limit: int = 50) -> dict[str, Any]: + items = [copy.deepcopy(self.batches[batch_id]["batch"]) for batch_id in sorted(self.batches.keys())] + items.sort(key=lambda item: str(item.get("created_at") or ""), reverse=True) + return {"items": items[:limit], "count": min(len(items), limit)} + + def batch_detail(self, batch_id: str) -> dict[str, Any]: + if batch_id not in self.batches: + raise KeyError(f"Evaluation batch not found: {batch_id}") + payload = copy.deepcopy(self.batches[batch_id]) + payload["aggregate"] = copy.deepcopy(payload["batch"]["summary_json"]) + return payload + + def batch_aggregate(self, batch_id: str) -> dict[str, Any]: + detail = self.batch_detail(batch_id) + return { + "batch_id": batch_id, + "status": detail["batch"]["status"], + "summary": copy.deepcopy(detail["batch"]["summary_json"]), + } + + def run_detail(self, run_id: str) -> dict[str, Any]: + if run_id not in self.runs: + raise KeyError(f"Evaluation run not found: {run_id}") + return copy.deepcopy(self.runs[run_id]) + + def run_artifacts(self, run_id: str) -> dict[str, Any]: + detail = self.run_detail(run_id) + return { + "run_id": run_id, + "artifacts": detail["artifacts"], + "judge_result": detail["judge_result"], + "benchmark": detail["benchmark"], + } + + def thread_detail(self, thread_id: str) -> dict[str, Any]: + if thread_id not in self.threads: + raise KeyError(f"Thread not found: 
{thread_id}") + return copy.deepcopy(self.threads[thread_id]) + + def create_batch( + self, *, submitted_by_user_id: str, agent_user_id: str, scenario_ids: list[str], sandbox: str, max_concurrent: int + ) -> dict[str, Any]: + catalog = {item["scenario_id"]: item for item in self.scenarios} + scenario_refs = [] + for scenario_id in scenario_ids: + if scenario_id not in catalog: + raise KeyError(f"Evaluation scenarios not found: {scenario_id}") + scenario_refs.append(self._scenario_ref(catalog[scenario_id])) + batch_id = f"eval-batch-created-{self._batch_counter:04d}" + self._batch_counter += 1 + runs = [] + for index, scenario_id in enumerate(scenario_ids, start=1): + scenario = catalog[scenario_id] + runs.append( + { + "batch_run_id": f"{batch_id}-batch-run-{index}", + "scenario_id": scenario_id, + "status": "pending", + "thread_id": None, + "eval_run_id": None, + "started_at": None, + "finished_at": None, + "summary_json": { + "instance_id": scenario["benchmark"]["instance_id"], + "benchmark_family": scenario["benchmark"]["family"], + "benchmark_split": scenario["benchmark"]["split"], + "judge_type": scenario["judge_type"], + "export_format": scenario["export_format"], + "artifact_count": 0, + }, + } + ) + summary = { + "total_runs": len(runs), + "running_runs": 0, + "completed_runs": 0, + "failed_runs": 0, + "judge_passed_runs": 0, + "judge_failed_runs": 0, + "pass_rate": 0.0, + "avg_total_tokens": 0.0, + "artifact_count_total": 0, + "avg_scores": {}, + "benchmark_families": ["SWE-bench Verified"], + "benchmark_splits": ["test"], + } + self.batches[batch_id] = { + "batch": { + "batch_id": batch_id, + "kind": "benchmark_batch", + "status": "pending", + "submitted_by_user_id": submitted_by_user_id, + "agent_user_id": agent_user_id, + "created_at": "2026-04-21T11:00:00Z", + "updated_at": "2026-04-21T11:00:00Z", + "config_json": { + "scenario_ids": scenario_ids, + "sandbox": sandbox, + "max_concurrent": max_concurrent, + "scenario_refs": scenario_refs, + }, + 
"summary_json": summary, + }, + "runs": runs, + } + return {"batch": {"batch_id": batch_id}} + + def start_batch(self, *, batch_id: str) -> dict[str, Any]: + if batch_id not in self.batches: + raise KeyError(f"Evaluation batch not found: {batch_id}") + detail = self.batches[batch_id] + if detail["batch"]["status"] != "pending": + return {"accepted": False, "batch": copy.deepcopy(detail["batch"])} + + for index, run in enumerate(detail["runs"], start=1): + scenario = next(item for item in self.scenarios if item["scenario_id"] == run["scenario_id"]) + verdict = "passed" if index == 1 else "failed" + status = "completed" if verdict == "passed" else "failed" + run_id = f"created-run-{self._run_counter:04d}" + thread_id = f"created-thread-{self._run_counter:04d}" + self._run_counter += 1 + run_detail = self._build_run_detail( + batch_id=batch_id, + batch_run_id=run["batch_run_id"], + run_id=run_id, + thread_id=thread_id, + scenario=scenario, + verdict=verdict, + status=status, + ) + self.runs[run_id] = run_detail + self.threads[thread_id] = self._build_thread_detail(thread_id, run_id, scenario, verdict) + self.run_order.insert(0, run_id) + run.update( + { + "status": status, + "thread_id": thread_id, + "eval_run_id": run_id, + "started_at": "2026-04-21T11:01:00Z", + "finished_at": "2026-04-21T11:03:00Z", + "summary_json": self._batch_run_summary(scenario, verdict), + } + ) + + detail["batch"]["status"] = "completed" + detail["batch"]["updated_at"] = "2026-04-21T11:03:00Z" + detail["batch"]["summary_json"] = self._batch_summary(detail["runs"]) + return {"accepted": True, "batch": copy.deepcopy(detail["batch"])} + + def compare_batches(self, baseline_batch_id: str, candidate_batch_id: str) -> dict[str, Any]: + baseline = self.batch_detail(baseline_batch_id)["batch"]["summary_json"] + candidate = self.batch_detail(candidate_batch_id)["batch"]["summary_json"] + + def _metric(key: str) -> dict[str, float]: + baseline_value = float(baseline.get(key) or 0.0) + candidate_value 
= float(candidate.get(key) or 0.0) + return { + "baseline": baseline_value, + "candidate": candidate_value, + "delta": candidate_value - baseline_value, + } + + return { + "baseline_batch_id": baseline_batch_id, + "candidate_batch_id": candidate_batch_id, + "baseline": copy.deepcopy(baseline), + "candidate": copy.deepcopy(candidate), + "delta": { + "pass_rate": _metric("pass_rate"), + "judge_passed_runs": _metric("judge_passed_runs"), + "judge_failed_runs": _metric("judge_failed_runs"), + "avg_total_tokens": _metric("avg_total_tokens"), + "artifact_count_total": _metric("artifact_count_total"), + "avg_scores": { + "resolved": { + "baseline": float((baseline.get("avg_scores") or {}).get("resolved") or 0.0), + "candidate": float((candidate.get("avg_scores") or {}).get("resolved") or 0.0), + "delta": float((candidate.get("avg_scores") or {}).get("resolved") or 0.0) + - float((baseline.get("avg_scores") or {}).get("resolved") or 0.0), + } + }, + }, + } + + def export_batch(self, batch_id: str, export_format: str | None = None) -> dict[str, Any]: + detail = self.batch_detail(batch_id) + resolved_format = export_format or "predictions_jsonl" + run_records = [] + for run in detail["runs"]: + run_id = run.get("eval_run_id") + if not run_id: + continue + run_detail = self.run_detail(run_id) + run_records.append( + { + "scenario_id": run["scenario_id"], + "run_id": run_id, + "benchmark": run_detail["benchmark"], + "judge_result": run_detail["judge_result"], + "artifacts": run_detail["artifacts"], + "final_response": run_detail["run"]["final_response"], + } + ) + return { + "batch_id": batch_id, + "format": resolved_format, + "aggregate": detail["batch"]["summary_json"], + "run_records": run_records, + } + + +STATE = HarnessState() + + +def _patch_gateway() -> None: + async def _thread_detail(_app, thread_id: str) -> dict[str, Any]: + return STATE.thread_detail(thread_id) + + monitor_gateway.get_evaluation_workbench = STATE.workbench + monitor_gateway.get_evaluation_batches = 
lambda limit=50: STATE.list_batches(limit=limit) + monitor_gateway.get_evaluation_scenarios = lambda: {"items": copy.deepcopy(STATE.scenarios), "count": len(STATE.scenarios)} + monitor_gateway.get_evaluation_batch_detail = STATE.batch_detail + monitor_gateway.get_evaluation_batch_aggregate = STATE.batch_aggregate + monitor_gateway.get_evaluation_run_detail = STATE.run_detail + monitor_gateway.get_evaluation_run_artifacts = STATE.run_artifacts + monitor_gateway.compare_evaluation_batches = lambda *, baseline_batch_id, candidate_batch_id: STATE.compare_batches( + baseline_batch_id, candidate_batch_id + ) + monitor_gateway.export_evaluation_batch = lambda *, batch_id, export_format=None: STATE.export_batch(batch_id, export_format) + monitor_gateway.create_evaluation_batch = lambda *, submitted_by_user_id, agent_user_id, scenario_ids, sandbox, max_concurrent: ( + STATE.create_batch( + submitted_by_user_id=submitted_by_user_id, + agent_user_id=agent_user_id, + scenario_ids=scenario_ids, + sandbox=sandbox, + max_concurrent=max_concurrent, + ) + ) + monitor_gateway.start_evaluation_batch = lambda *, batch_id, base_url, token, schedule_task: STATE.start_batch(batch_id=batch_id) + monitor_gateway.get_thread_detail = _thread_detail + + +def create_app() -> FastAPI: + _patch_gateway() + app = FastAPI(title="Monitor Evaluation Playwright Harness") + app.include_router(monitor_router.router) + app.dependency_overrides[monitor_router.get_current_user_id] = lambda: "owner-1" + return app + + +app = create_app() + + +if __name__ == "__main__": + port = int(sys.argv[sys.argv.index("--port") + 1]) if "--port" in sys.argv else 8001 + uvicorn.run(app, host="127.0.0.1", port=port, log_level="warning") diff --git a/frontend/monitor/src/app/fetch.ts b/frontend/monitor/src/app/fetch.ts index 962f9a362..64957bdb9 100644 --- a/frontend/monitor/src/app/fetch.ts +++ b/frontend/monitor/src/app/fetch.ts @@ -60,7 +60,21 @@ export async function fetchAPI(path: string, init?: RequestInit): Promise 
return payload as T; } -export function useMonitorData(path: string) { +export function buildMonitorPath( + path: string, + params?: Record, +): string { + if (!params) return path; + const search = new URLSearchParams(); + for (const [key, value] of Object.entries(params)) { + if (value == null) continue; + search.set(key, String(value)); + } + const query = search.toString(); + return query ? `${path}?${query}` : path; +} + +export function useMonitorData(path: string | null) { const [data, setData] = React.useState(null); const [error, setError] = React.useState(null); @@ -69,6 +83,12 @@ export function useMonitorData(path: string) { setData(null); setError(null); + if (!path) { + return () => { + cancelled = true; + }; + } + fetchAPI(path) .then((result) => { if (!cancelled) setData(result); diff --git a/frontend/monitor/src/pages/EvaluationBatchDetailPage.tsx b/frontend/monitor/src/pages/EvaluationBatchDetailPage.tsx index f152310fa..493ac45d3 100644 --- a/frontend/monitor/src/pages/EvaluationBatchDetailPage.tsx +++ b/frontend/monitor/src/pages/EvaluationBatchDetailPage.tsx @@ -1,8 +1,15 @@ import React from "react"; import { Link, useParams } from "react-router-dom"; -import { postMonitorData, useMonitorData, type MonitorFetchError } from "../app/fetch"; +import { buildMonitorPath, fetchAPI, postMonitorData, useMonitorData, type MonitorFetchError } from "../app/fetch"; import ErrorState from "../components/ErrorState"; +import { + listBatchExportFormats, + resolveBatchExportFormat, + summarizeSelectedScenarioContracts, + type EvaluationBatchSummary, + type EvaluationScenarioRef, +} from "./evaluation-model"; type EvaluationBatchDetailPayload = { batch?: { @@ -13,13 +20,9 @@ type EvaluationBatchDetailPayload = { sandbox?: string | null; max_concurrent?: number | null; scenario_ids?: string[] | null; + scenario_refs?: EvaluationScenarioRef[] | null; } | null; - summary_json?: { - total_runs?: number | null; - running_runs?: number | null; - completed_runs?: 
number | null; - failed_runs?: number | null; - } | null; + summary_json?: EvaluationBatchSummary | null; } | null; runs?: Array<{ batch_run_id?: string | null; @@ -29,7 +32,19 @@ type EvaluationBatchDetailPayload = { eval_run_id?: string | null; started_at?: string | null; finished_at?: string | null; + summary_json?: { + instance_id?: string | null; + benchmark_family?: string | null; + benchmark_split?: string | null; + judge_type?: string | null; + judge_verdict?: string | null; + export_format?: string | null; + export_key?: string | null; + artifact_count?: number | null; + error?: string | null; + } | null; }> | null; + aggregate?: EvaluationBatchSummary | null; }; type EvaluationBatchStartPayload = { @@ -37,31 +52,78 @@ type EvaluationBatchStartPayload = { batch?: EvaluationBatchDetailPayload["batch"]; }; +type EvaluationBatchAggregatePayload = { + batch_id?: string | null; + status?: string | null; + summary?: EvaluationBatchSummary | null; +}; + +function formatTimestamp(value: string | null | undefined): string { + if (!value) return "-"; + const date = new Date(value); + if (Number.isNaN(date.getTime())) return value; + return date.toLocaleString(); +} + +function formatPercent(value: number | null | undefined): string { + return typeof value === "number" && Number.isFinite(value) ? `${(value * 100).toFixed(1)}%` : "-"; +} + +function downloadJson(filename: string, payload: unknown) { + const blob = new Blob([JSON.stringify(payload, null, 2)], { type: "application/json" }); + const url = window.URL.createObjectURL(blob); + const anchor = document.createElement("a"); + anchor.href = url; + anchor.download = filename; + document.body.appendChild(anchor); + anchor.click(); + anchor.remove(); + window.URL.revokeObjectURL(url); +} + export default function EvaluationBatchDetailPage() { const params = useParams<{ batchId: string }>(); const batchId = params.batchId ?? 
""; const { data, error } = useMonitorData(`/evaluation/batches/${batchId}`); + const { data: aggregateData, error: aggregateError } = + useMonitorData(`/evaluation/batches/${batchId}/aggregate`); const [batchData, setBatchData] = React.useState(null); + const [aggregateSnapshot, setAggregateSnapshot] = React.useState(null); const [startMessage, setStartMessage] = React.useState(null); const [startError, setStartError] = React.useState(null); const [startPending, setStartPending] = React.useState(false); + const [exportFormat, setExportFormat] = React.useState("generic_json"); + const [exportPending, setExportPending] = React.useState(false); + const [exportError, setExportError] = React.useState(null); + const [exportMessage, setExportMessage] = React.useState(null); + const [exportPreview, setExportPreview] = React.useState | null>(null); React.useEffect(() => { - if (data) { - setBatchData(data); - setStartMessage(null); - setStartError(null); - setStartPending(false); - } + if (!data) return; + setBatchData(data); + setStartMessage(null); + setStartError(null); + setStartPending(false); + const scenarioRefs = data.batch?.config_json?.scenario_refs ?? []; + setExportFormat(resolveBatchExportFormat(scenarioRefs)); }, [data]); + React.useEffect(() => { + if (aggregateData) { + setAggregateSnapshot(aggregateData); + } + }, [aggregateData]); + if (error) return ; if (!batchData) return
Loading...
; const batch = batchData.batch ?? {}; const config = batch.config_json ?? {}; - const summary = batch.summary_json ?? {}; + const summary = aggregateSnapshot?.summary ?? batchData.aggregate ?? batch.summary_json ?? {}; const runs = batchData.runs ?? []; + const scenarioRefs = config.scenario_refs ?? []; + const contractSummary = summarizeSelectedScenarioContracts(scenarioRefs); + const exportFormats = listBatchExportFormats(scenarioRefs); const progressSummary = `${summary.completed_runs ?? 0} completed / ${summary.failed_runs ?? 0} failed / ${ summary.running_runs ?? 0 } running`; @@ -72,13 +134,18 @@ export default function EvaluationBatchDetailPage() { setStartError(null); try { const result = await postMonitorData(`/evaluation/batches/${batchId}/start`); - setBatchData((current) => ({ - ...(current ?? {}), + const [detail, aggregate] = await Promise.all([ + fetchAPI(`/evaluation/batches/${batchId}`), + fetchAPI(`/evaluation/batches/${batchId}/aggregate`), + ]); + setBatchData({ + ...detail, batch: { - ...(current?.batch ?? {}), + ...(detail.batch ?? {}), ...(result.batch ?? {}), }, - })); + }); + setAggregateSnapshot(aggregate); setStartMessage(result.accepted ? "Batch execution scheduled." : "Batch execution was not accepted."); } catch (err: unknown) { const fetchError = err as MonitorFetchError; @@ -88,10 +155,34 @@ export default function EvaluationBatchDetailPage() { } } + async function exportBatch() { + setExportPending(true); + setExportError(null); + setExportMessage(null); + try { + const result = await fetchAPI>( + buildMonitorPath(`/evaluation/batches/${batchId}/export`, { + format: exportFormat || undefined, + }), + ); + downloadJson(`${batchId}-${exportFormat || "generic_json"}.json`, result); + setExportPreview(result); + setExportMessage( + `Downloaded ${batchId}-${exportFormat || "generic_json"}.json with ${Object.keys(result).length} top-level keys.`, + ); + } catch (err: unknown) { + setExportError(err instanceof Error ? 
err.message : String(err)); + setExportPreview(null); + } finally { + setExportPending(false); + } + } + return (

{`Evaluation Batch ${batch.batch_id ?? batchId}`}

-

Scenario batch state, run linkage, and thread drilldown.

+

Scenario batch state, benchmark contract echo, aggregate summary, and export controls.

+

Batch State

@@ -125,6 +216,7 @@ export default function EvaluationBatchDetailPage() {
+ {batch.status === "pending" || startMessage || startError ? (

Execution

@@ -139,16 +231,178 @@ export default function EvaluationBatchDetailPage() { ) : null} {startMessage ?

{startMessage}

: null} - {startError ?

{startError}

: null} + {startError ?

{startError}

: null}
) : null} + +
+

Benchmark Contract

+
+
+ Families + {contractSummary.families.join(", ") || "-"} +
+
+ Instances + {contractSummary.instances.join(", ") || "-"} +
+
+ Judge profiles + {contractSummary.judgeTypes.join(", ") || "-"} +
+
+ Export profiles + {contractSummary.exportFormats.join(", ") || "-"} +
+
+ Repos + {contractSummary.repos.join(", ") || "-"} +
+
+ Base commits + {contractSummary.baseCommits.join(", ") || "-"} +
+
+ {contractSummary.missingBenchmarkMetadataCount > 0 ? ( +

+ {contractSummary.missingBenchmarkMetadataCount} scenario refs were created without benchmark metadata. This + is a real backend data gap, not a frontend omission. +

+ ) : null} + {scenarioRefs.length > 0 ? ( + + + + + + + + + + + + + + {scenarioRefs.map((scenario) => ( + + + + + + + + + + ))} + +
ScenarioFamilyInstanceJudgeExportRepoBase Commit
{scenario.scenario_id ?? scenario.name ?? "-"}{scenario.benchmark?.family ?? "-"}{scenario.benchmark?.instance_id ?? "-"}{scenario.judge_config?.type ?? "-"}{scenario.export?.format ?? "-"}{scenario.workspace?.repo ?? "-"}{scenario.workspace?.base_commit ?? "-"}
+ ) : ( +
+

No scenario refs were persisted for this batch.

+
+ )} +
+ +
+

Aggregate Summary

+ {aggregateError ?

{aggregateError.message}

: null} +
+
+

Pass Rate

+

{formatPercent(summary.pass_rate)}

+
+
+

Judge Passed

+

{summary.judge_passed_runs ?? 0}

+
+
+

Judge Failed

+

{summary.judge_failed_runs ?? 0}

+
+
+

Artifacts

+

{summary.artifact_count_total ?? 0}

+
+
+
+
+ Benchmark families + {summary.benchmark_families?.join(", ") || "-"} +
+
+ Benchmark splits + {summary.benchmark_splits?.join(", ") || "-"} +
+
+ Avg total tokens + {summary.avg_total_tokens ?? "-"} +
+
+ Avg scores + + {summary.avg_scores && Object.keys(summary.avg_scores).length > 0 + ? Object.entries(summary.avg_scores) + .map(([key, value]) => `${key}:${value}`) + .join(", ") + : "-"} + +
+
+
+ +
+

Export

+
+
+ +
+
+ {exportError ? ( + {exportError} + ) : ( + + Download hits `/api/monitor/evaluation/batches/{batchId}/export` and saves the live JSON payload locally. + + )} + +
+ {exportMessage ?

{exportMessage}

: null} + {exportPreview ? ( +
+ Last export preview +
{JSON.stringify(exportPreview, null, 2)}
+
+ ) : null} +
+
+

Batch Runs

+ + + + @@ -156,16 +410,23 @@ export default function EvaluationBatchDetailPage() { - {runs.map((run) => ( - - - - - - - - - ))} + {runs.map((run) => { + const runSummary = run.summary_json ?? {}; + return ( + + + + + + + + + + + + + ); + })}
ScenarioInstance StatusJudgeExportArtifacts Thread Eval Run Started
{run.scenario_id ?? "-"}{run.status ?? "-"}{run.thread_id || "-"}{run.eval_run_id ? {run.eval_run_id} : "-"}{run.started_at ?? "-"}{run.finished_at ?? "-"}
{run.scenario_id ?? "-"}{runSummary.instance_id ?? "-"}{run.status ?? "-"}{runSummary.judge_verdict ?? runSummary.error ?? "-"}{runSummary.export_format ?? "-"}{runSummary.artifact_count ?? "-"}{run.thread_id || "-"}{run.eval_run_id ? {run.eval_run_id} : "-"}{formatTimestamp(run.started_at)}{formatTimestamp(run.finished_at)}
diff --git a/frontend/monitor/src/pages/EvaluationPage.tsx b/frontend/monitor/src/pages/EvaluationPage.tsx index 2e0ad8094..2b2fd2dd6 100644 --- a/frontend/monitor/src/pages/EvaluationPage.tsx +++ b/frontend/monitor/src/pages/EvaluationPage.tsx @@ -2,7 +2,19 @@ import React from "react"; import { Link, useNavigate } from "react-router-dom"; import ErrorState from "../components/ErrorState"; -import { postMonitorData, useMonitorData } from "../app/fetch"; +import { buildMonitorPath, fetchAPI, postMonitorData, useMonitorData } from "../app/fetch"; +import { + buildCompareMetricRows, + buildLeaderboardRows, + buildScenarioFacetOptions, + filterScenariosByBenchmark, + listBatchExportFormats, + summarizeSelectedScenarioContracts, + type ComparePayload, + type EvaluationBatchListItem, + type EvaluationScenarioCatalogItem, + type EvaluationScenarioRef, +} from "./evaluation-model"; type EvaluationPayload = { headline?: string | null; @@ -34,37 +46,24 @@ type EvaluationPayload = { }; type EvaluationBatchIndexPayload = { - items?: Array<{ - batch_id?: string | null; - kind?: string | null; - status?: string | null; - submitted_by_user_id?: string | null; - agent_user_id?: string | null; - created_at?: string | null; - config_json?: { - sandbox?: string | null; - max_concurrent?: number | null; - scenario_ids?: string[] | null; - } | null; - summary_json?: { - total_runs?: number | null; - running_runs?: number | null; - completed_runs?: number | null; - failed_runs?: number | null; - } | null; - }> | null; + items?: Array< + EvaluationBatchListItem & { + kind?: string | null; + submitted_by_user_id?: string | null; + agent_user_id?: string | null; + config_json?: { + sandbox?: string | null; + max_concurrent?: number | null; + scenario_ids?: string[] | null; + scenario_refs?: EvaluationScenarioRef[] | null; + } | null; + } + > | null; count?: number | null; }; type EvaluationScenarioCatalogPayload = { - items?: Array<{ - scenario_id?: string | null; - name?: string | null; 
- category?: string | null; - sandbox?: string | null; - message_count?: number | null; - timeout_seconds?: number | null; - }> | null; + items?: EvaluationScenarioCatalogItem[] | null; count?: number | null; }; @@ -74,6 +73,28 @@ type EvaluationBatchCreatePayload = { } | null; }; +type EvaluationCompareResponse = ComparePayload & { + baseline_batch_id?: string | null; + candidate_batch_id?: string | null; + baseline?: Record | null; + candidate?: Record | null; +}; + +type BatchMetrics = { + totalRuns: number; + runningRuns: number; + completedRuns: number; + failedRuns: number; + finishedRuns: number; + progressPercent: number; + scenarioCount: number; + sandbox: string; + maxConcurrent: number; + passRate: number | null; + benchmarkFamilies: string[]; + exportFormats: string[]; +}; + const PAGE_SIZE = 8; const BATCH_STATUS_LABELS: Record = { @@ -105,6 +126,10 @@ function formatTimestamp(value: string | null | undefined): string { return date.toLocaleString(); } +function formatPercent(value: number | null | undefined): string { + return typeof value === "number" && Number.isFinite(value) ? `${(value * 100).toFixed(1)}%` : "-"; +} + function statusLabel(status: string | null | undefined): string { if (!status) return "未知"; return BATCH_STATUS_LABELS[status] ?? status; @@ -115,7 +140,7 @@ function statusTone(status: string | null | undefined): "pending" | "running" | return BATCH_STATUS_TONES[status] ?? "pending"; } -function summarizeBatch(batch: NonNullable[number]) { +function summarizeBatch(batch: NonNullable[number]): BatchMetrics { const summary = batch.summary_json ?? {}; const config = batch.config_json ?? {}; const totalRuns = asNumber(summary.total_runs); @@ -125,6 +150,8 @@ function summarizeBatch(batch: NonNullable const finishedRuns = completedRuns + failedRuns; const progressPercent = totalRuns > 0 ? Math.min(100, Math.round((finishedRuns / totalRuns) * 100)) : 0; const scenarioCount = Array.isArray(config.scenario_ids) ? 
config.scenario_ids.length : 0; + const contract = summarizeSelectedScenarioContracts(config.scenario_refs ?? []); + return { totalRuns, runningRuns, @@ -135,9 +162,20 @@ function summarizeBatch(batch: NonNullable scenarioCount, sandbox: config.sandbox ?? "-", maxConcurrent: asNumber(config.max_concurrent), + passRate: typeof summary.pass_rate === "number" ? summary.pass_rate : null, + benchmarkFamilies: + Array.isArray(summary.benchmark_families) && summary.benchmark_families.length > 0 + ? [...summary.benchmark_families] + : contract.families, + exportFormats: contract.exportFormats.length > 0 ? contract.exportFormats : listBatchExportFormats(config.scenario_refs ?? []), }; } +function formatCompareValue(key: string, value: number): string { + if (key === "pass_rate" || key.startsWith("avg_scores.")) return formatPercent(value); + return Number.isInteger(value) ? String(value) : value.toFixed(2); +} + export default function EvaluationPage() { const navigate = useNavigate(); const { data, error } = useMonitorData("/evaluation"); @@ -148,9 +186,18 @@ export default function EvaluationPage() { const [sandbox, setSandbox] = React.useState("local"); const [maxConcurrent, setMaxConcurrent] = React.useState(1); const [selectedScenarioIds, setSelectedScenarioIds] = React.useState([]); + const [selectedFamily, setSelectedFamily] = React.useState(""); + const [selectedInstanceId, setSelectedInstanceId] = React.useState(""); + const [selectedJudgeType, setSelectedJudgeType] = React.useState(""); + const [selectedExportFormat, setSelectedExportFormat] = React.useState(""); const [createError, setCreateError] = React.useState(null); const [createPending, setCreatePending] = React.useState(false); const [page, setPage] = React.useState(1); + const [baselineBatchId, setBaselineBatchId] = React.useState(""); + const [candidateBatchId, setCandidateBatchId] = React.useState(""); + const [comparePending, setComparePending] = React.useState(false); + const [compareError, 
setCompareError] = React.useState(null); + const [compareData, setCompareData] = React.useState(null); const overview = data?.overview ?? {}; const runs = data?.runs ?? []; @@ -159,12 +206,58 @@ export default function EvaluationPage() { const limitations = data?.limitations ?? []; const batches = batchesData?.items ?? []; const scenarios = scenariosData?.items ?? []; + const scenarioFacetOptions = buildScenarioFacetOptions(scenarios, selectedFamily); + const filteredScenarios = filterScenariosByBenchmark(scenarios, { + family: selectedFamily, + instanceId: selectedInstanceId, + judgeType: selectedJudgeType, + exportFormat: selectedExportFormat, + }); + const selectedScenarios = scenarios.filter((scenario) => selectedScenarioIds.includes(scenario.scenario_id ?? "")); + const selectedContracts = summarizeSelectedScenarioContracts(selectedScenarios); + const leaderboardRows = buildLeaderboardRows(batches); + const compareRows = buildCompareMetricRows(compareData); const totalPages = Math.max(1, Math.ceil(batches.length / PAGE_SIZE)); + React.useEffect(() => { setPage((current) => Math.min(current, totalPages)); }, [totalPages]); + React.useEffect(() => { + if (selectedFamily && !scenarioFacetOptions.families.includes(selectedFamily)) { + setSelectedFamily(""); + } + if (selectedInstanceId && !scenarioFacetOptions.instanceIds.includes(selectedInstanceId)) { + setSelectedInstanceId(""); + } + if (selectedJudgeType && !scenarioFacetOptions.judgeTypes.includes(selectedJudgeType)) { + setSelectedJudgeType(""); + } + if (selectedExportFormat && !scenarioFacetOptions.exportFormats.includes(selectedExportFormat)) { + setSelectedExportFormat(""); + } + }, [ + scenarioFacetOptions.exportFormats, + scenarioFacetOptions.families, + scenarioFacetOptions.instanceIds, + scenarioFacetOptions.judgeTypes, + selectedExportFormat, + selectedFamily, + selectedInstanceId, + selectedJudgeType, + ]); + + React.useEffect(() => { + if (baselineBatchId || candidateBatchId || batches.length < 
2) return; + const rankedBatchIds = [...leaderboardRows, ...batches.map((batch) => ({ batchId: batch.batch_id ?? "" }))] + .map((row) => row.batchId) + .filter(Boolean); + if (rankedBatchIds.length < 2) return; + setBaselineBatchId(rankedBatchIds[0]); + setCandidateBatchId(rankedBatchIds.find((batchId) => batchId !== rankedBatchIds[0]) ?? ""); + }, [baselineBatchId, batches, candidateBatchId, leaderboardRows]); + const visibleBatches = React.useMemo(() => { const start = (page - 1) * PAGE_SIZE; return batches.slice(start, start + PAGE_SIZE); @@ -202,8 +295,40 @@ export default function EvaluationPage() { } } + async function loadComparison() { + if (!baselineBatchId || !candidateBatchId) { + setCompareError("Choose both baseline and candidate batches before running compare."); + setCompareData(null); + return; + } + if (baselineBatchId === candidateBatchId) { + setCompareError("Baseline and candidate must be different batches."); + setCompareData(null); + return; + } + + setComparePending(true); + setCompareError(null); + try { + const result = await fetchAPI( + buildMonitorPath("/evaluation/compare", { + baseline_batch_id: baselineBatchId, + candidate_batch_id: candidateBatchId, + }), + ); + setCompareData(result); + } catch (err: unknown) { + setCompareError(err instanceof Error ? err.message : String(err)); + setCompareData(null); + } finally { + setComparePending(false); + } + } + const batchRangeStart = batches.length === 0 ? 0 : (page - 1) * PAGE_SIZE + 1; const batchRangeEnd = Math.min(page * PAGE_SIZE, batches.length); + const benchmarkSurfaceAvailable = scenarioFacetOptions.benchmarkScenarioCount > 0; + const hasCompareRegression = compareRows.some((row) => row.regression); return (
@@ -249,7 +374,9 @@ export default function EvaluationPage() {

Builder

Create Batch

- Scenarios are the actual workloads a batch will run. They no longer sit in a standalone catalog section because they only matter while assembling a batch. + Benchmark controls read directly from the monitor scenario catalog. If the backend does not publish + benchmark metadata yet, the builder falls back to raw scenario selection and calls that gap out instead of + inventing values.

void createBatch(event)}> @@ -272,14 +399,103 @@ export default function EvaluationPage() { /> + +
+
+ Benchmark Contract + + {benchmarkSurfaceAvailable + ? `${scenarioFacetOptions.benchmarkScenarioCount} scenario contracts expose benchmark metadata` + : "Current backend catalog exposes no benchmark metadata"} + +
+
+ + + + +
+ {!benchmarkSurfaceAvailable ? ( +

+ Blocked by current backend data: `/api/monitor/evaluation/scenarios` returns no benchmark family, + instance, judge, or export profile metadata yet. +

+ ) : null} +
+
Scenarios - {selectedScenarioIds.length} selected + + {selectedScenarioIds.length} selected · {filteredScenarios.length}/{scenarios.length} visible +
+

+ Scenario selection is still the source of truth for batch creation. Benchmark fields above only filter + what the backend already publishes in the scenario catalog. +

- {scenarios.length > 0 ? ( - scenarios.map((scenario) => { + {filteredScenarios.length > 0 ? ( + filteredScenarios.map((scenario) => { const scenarioId = scenario.scenario_id ?? ""; const selected = selectedScenarioIds.includes(scenarioId); return ( @@ -292,18 +508,66 @@ export default function EvaluationPage() { > {scenarioId || scenario.name || "-"} - {scenario.category ?? "uncategorized"} · {scenario.message_count ?? 0} msg · {scenario.timeout_seconds ?? "-"}s + {scenario.benchmark?.family ?? scenario.category ?? "uncategorized"} ·{" "} + {scenario.benchmark?.instance_id ?? `${scenario.message_count ?? 0} msg`} · judge{" "} + {scenario.judge_type ?? "-"} · export {scenario.export_format ?? "-"} ); }) ) : ( -

No evaluation scenarios found.

+

No evaluation scenarios match the selected benchmark filters.

)}
+ +
+
+ Resolved Contract Preview + {selectedContracts.totalCount} scenario refs +
+
+
+ Families + {selectedContracts.families.join(", ") || "-"} +
+
+ Instances + {selectedContracts.instances.join(", ") || "-"} +
+
+ Judge profiles + {selectedContracts.judgeTypes.join(", ") || "-"} +
+
+ Export profiles + {selectedContracts.exportFormats.join(", ") || "-"} +
+
+ Repos + {selectedContracts.repos.join(", ") || "-"} +
+
+ Base commits + {selectedContracts.baseCommits.join(", ") || "-"} +
+
+ {selectedContracts.missingBenchmarkMetadataCount > 0 ? ( +

+ {selectedContracts.missingBenchmarkMetadataCount} selected scenario refs do not publish benchmark + metadata yet, so batch creation can only persist raw scenario ids for them. +

+ ) : null} +
+
- {createError ? {createError} : Pending batches start from the batch detail page.} + {createError ? ( + {createError} + ) : ( + + Pending batches start from the batch detail page. + + )}
- Scenarios - {metrics.scenarioCount} + Pass rate + {formatPercent(metrics.passRate)}
Sandbox - {metrics.sandbox} + + {metrics.sandbox} +
Concurrency @@ -397,12 +663,23 @@ export default function EvaluationPage() {
- Agent {batch.agent_user_id ?? "-"} - Submitted {batch.submitted_by_user_id ?? "-"} - Created {formatTimestamp(batch.created_at)} + + Agent {batch.agent_user_id ?? "-"} + + + Families {metrics.benchmarkFamilies.join(", ") || "-"} + + + Export {metrics.exportFormats.join(", ") || "-"} + + + Created {formatTimestamp(batch.created_at)} +
- {metrics.finishedRuns}/{metrics.totalRuns || 0} finished + + {metrics.finishedRuns}/{metrics.totalRuns || 0} finished + {batch.batch_id ? ( Open batch @@ -420,6 +697,118 @@ export default function EvaluationPage() { )} +
+

Leaderboard

+ {leaderboardRows.length > 0 ? ( + + + + + + + + + + + + + + {leaderboardRows.slice(0, 6).map((row) => ( + + + + + + + + + + ))} + +
BatchStatusPass RateJudge PassedTotal RunsFamiliesCreated
+ {row.batchId} + {row.status ?? "-"}{formatPercent(row.passRate)}{row.judgePassedRuns}{row.totalRuns}{row.families.join(", ") || "-"}{formatTimestamp(row.createdAt)}
+ ) : ( +
+

Leaderboard populates once batches have aggregate judge metrics.

+
+ )} +
+ +
+

Compare

+
+
+ + +
+
+ {compareError ? ( + {compareError} + ) : ( + + Compare hits `/api/monitor/evaluation/compare` directly and reports regressions when pass rate drops or + judge failures rise. + + )} + +
+ {compareData ? ( +
+
+ Comparison Result + {hasCompareRegression ? "Regression detected" : "No regression signal detected"} +
+ + + + + + + + + + + {compareRows.map((row) => ( + + + + + + + ))} + +
MetricBaselineCandidateDelta
{row.label}{formatCompareValue(row.key, row.baseline)}{formatCompareValue(row.key, row.candidate)} + {row.delta > 0 ? "+" : ""} + {formatCompareValue(row.key, row.delta)} +
+
+ ) : null} +
+
+

Recent Runs

diff --git a/frontend/monitor/src/pages/EvaluationRunDetailPage.tsx b/frontend/monitor/src/pages/EvaluationRunDetailPage.tsx index 68af18960..d70f1fef2 100644 --- a/frontend/monitor/src/pages/EvaluationRunDetailPage.tsx +++ b/frontend/monitor/src/pages/EvaluationRunDetailPage.tsx @@ -1,7 +1,33 @@ +import React from "react"; import { Link, useParams } from "react-router-dom"; import { useMonitorData } from "../app/fetch"; import ErrorState from "../components/ErrorState"; +import { + buildArtifactDownloadPayload, + summarizeTrajectory, + type EvaluationArtifactRecord, + type ThreadTrajectoryPayload, +} from "./evaluation-run-detail-model"; + +type BenchmarkInfo = { + family?: string | null; + name?: string | null; + split?: string | null; + variant?: string | null; + instance_id?: string | null; + dataset_version?: string | null; + source_uri?: string | null; +}; + +type JudgeResult = { + judge_type?: string | null; + status?: string | null; + verdict?: string | null; + rationale?: string | null; + scores?: Record | null; + metadata?: Record | null; +}; type EvaluationRunDetailPayload = { run?: { @@ -11,6 +37,10 @@ type EvaluationRunDetailPayload = { started_at?: string | null; finished_at?: string | null; user_message?: string | null; + final_response?: string | null; + artifact_count?: number | null; + benchmark?: BenchmarkInfo | null; + judge_result?: JudgeResult | null; } | null; facts?: Array<{ label?: string | null; value?: string | null }> | null; batch_run?: { @@ -19,12 +49,65 @@ type EvaluationRunDetailPayload = { scenario_id?: string | null; } | null; limitations?: string[] | null; + judge_result?: JudgeResult | null; + artifacts?: EvaluationArtifactRecord[] | null; + benchmark?: BenchmarkInfo | null; }; +type EvaluationRunArtifactsPayload = { + run_id?: string | null; + artifacts?: EvaluationArtifactRecord[] | null; + judge_result?: JudgeResult | null; + benchmark?: BenchmarkInfo | null; +}; + +type ThreadDetailPayload = { + trajectory?: 
ThreadTrajectoryPayload | null; +}; + +function formatTimestamp(value: string | null | undefined): string { + if (!value) return "-"; + const date = new Date(value); + if (Number.isNaN(date.getTime())) return value; + return date.toLocaleString(); +} + +function stringifyMessage(message: Record): string { + if (typeof message.content === "string") return message.content; + if (typeof message.text === "string") return message.text; + return JSON.stringify(message, null, 2); +} + +function describeMessageRole(message: Record): string { + if (typeof message.role === "string") return message.role; + if (typeof message.actor === "string") return message.actor; + if (typeof message.type === "string") return message.type; + return "message"; +} + +function downloadArtifact(runId: string, artifact: EvaluationArtifactRecord) { + const payload = buildArtifactDownloadPayload(runId, artifact); + const blob = new Blob([payload.text], { type: payload.mimeType }); + const url = window.URL.createObjectURL(blob); + const anchor = document.createElement("a"); + anchor.href = url; + anchor.download = payload.filename; + document.body.appendChild(anchor); + anchor.click(); + anchor.remove(); + window.URL.revokeObjectURL(url); +} + export default function EvaluationRunDetailPage() { const params = useParams<{ runId: string }>(); const runId = params.runId ?? ""; const { data, error } = useMonitorData(`/evaluation/runs/${runId}`); + const { data: artifactsData, error: artifactsError } = + useMonitorData(`/evaluation/runs/${runId}/artifacts`); + const threadId = data?.run?.thread_id ?? ""; + const { data: threadData, error: threadError } = useMonitorData( + threadId ? `/threads/${threadId}` : null, + ); if (error) return ; if (!data) return
Loading...
; @@ -33,11 +116,17 @@ export default function EvaluationRunDetailPage() { const facts = data.facts ?? []; const batchRun = data.batch_run ?? {}; const limitations = data.limitations ?? []; + const judgeResult = artifactsData?.judge_result ?? data.judge_result ?? run.judge_result ?? null; + const benchmark = artifactsData?.benchmark ?? data.benchmark ?? run.benchmark ?? null; + const artifacts = artifactsData?.artifacts ?? data.artifacts ?? []; + const trajectory = threadData?.trajectory ?? null; + const traceSummary = summarizeTrajectory(trajectory); return (

{`Evaluation Run ${run.run_id ?? runId}`}

-

Persisted evaluation run state and thread linkage.

+

Persisted evaluation run state, raw thread trace, and artifact viewer.

+

Run State

@@ -51,11 +140,11 @@ export default function EvaluationRunDetailPage() {
Started At - {run.started_at ?? "-"} + {formatTimestamp(run.started_at)}
Finished At - {run.finished_at ?? "-"} + {formatTimestamp(run.finished_at)}
User Message @@ -75,6 +164,62 @@ export default function EvaluationRunDetailPage() {
+ +
+

Benchmark & Judge

+
+
+ Family + {benchmark?.family ?? "-"} +
+
+ Name + {benchmark?.name ?? "-"} +
+
+ Split + {benchmark?.split ?? "-"} +
+
+ Instance + {benchmark?.instance_id ?? "-"} +
+
+ Judge Type + {judgeResult?.judge_type ?? "-"} +
+
+ Judge Status + {judgeResult?.status ?? "-"} +
+
+ Judge Verdict + {judgeResult?.verdict ?? "-"} +
+
+ Artifact Count + {artifacts.length} +
+
+ {judgeResult?.rationale ?

{judgeResult.rationale}

: null} + {judgeResult?.scores && Object.keys(judgeResult.scores).length > 0 ? ( +
+ {Object.entries(judgeResult.scores).map(([key, value]) => ( +
+ {key} + {value} +
+ ))} +
+ ) : null} + {run.final_response ? ( +
+ Final response +
{run.final_response}
+
+ ) : null} +
+

Run Facts

@@ -86,6 +231,99 @@ export default function EvaluationRunDetailPage() { ))}
+ +
+

Artifact Viewer

+ {artifactsError ?

{artifactsError.message}

: null} + {artifacts.length > 0 ? ( +
+ {artifacts.map((artifact, index) => ( +
+
+
+

{artifact.kind ?? "artifact"}

+

{artifact.name ?? artifact.path ?? "-"}

+
+ +
+
+
+ Mime + {artifact.mime_type ?? "-"} +
+
+ Path + {artifact.path ?? "-"} +
+
+
+ Metadata +
{JSON.stringify(artifact.metadata ?? {}, null, 2)}
+
+
+ Payload preview +
{artifact.content ?? JSON.stringify(artifact, null, 2)}
+
+
+ ))} +
+ ) : ( +
+

Artifacts API returned no artifacts for this run.

+
+ )} +
+ +
+

Raw Trace

+ {threadError ?

{threadError.message}

: null} + {!threadId ?

This evaluation run is not linked to a persisted thread yet.

: null} + {threadId && !traceSummary.hasTrace && !threadError ? ( +

Thread detail loaded, but it exposed no conversation or event trace.

+ ) : null} + {traceSummary.hasTrace ? ( +
+
+
+ Conversation + {traceSummary.messageCount} messages +
+ {trajectory?.conversation?.map((message, index) => ( +
+ + {index + 1}. {describeMessageRole(message)} + +
{stringifyMessage(message)}
+
+ ))} +
+
+
+ Events + {traceSummary.eventCount} events +
+ {trajectory?.events?.map((event, index) => ( +
+ + {event.seq ?? index + 1}. {event.actor ?? "-"} / {event.event_type ?? "-"} / {event.summary ?? "-"} + +
{JSON.stringify(event.payload ?? {}, null, 2)}
+
+ ))} +
+
+ ) : null} +
+ {limitations.length > 0 ? (

Notes

diff --git a/frontend/monitor/src/pages/evaluation-model.test.ts b/frontend/monitor/src/pages/evaluation-model.test.ts new file mode 100644 index 000000000..121c7b5f9 --- /dev/null +++ b/frontend/monitor/src/pages/evaluation-model.test.ts @@ -0,0 +1,148 @@ +import { describe, expect, it } from "vitest"; + +import { + buildCompareMetricRows, + buildLeaderboardRows, + buildScenarioFacetOptions, + filterScenariosByBenchmark, + resolveBatchExportFormat, + summarizeSelectedScenarioContracts, + type EvaluationBatchListItem, + type EvaluationScenarioCatalogItem, + type EvaluationScenarioRef, +} from "./evaluation-model"; + +const benchmarkScenarios: EvaluationScenarioCatalogItem[] = [ + { + scenario_id: "swe-1", + benchmark: { + family: "SWE-bench Verified", + instance_id: "pytest-dev__pytest-7521", + }, + judge_type: "swe_verified_smoke", + export_format: "predictions_jsonl", + workspace: { + repo: "pytest-dev/pytest", + base_commit: "abc123", + }, + }, + { + scenario_id: "swe-2", + benchmark: { + family: "SWE-bench Verified", + instance_id: "pytest-dev__pytest-7571", + }, + judge_type: "swe_verified_smoke", + export_format: "predictions_jsonl", + workspace: { + repo: "pytest-dev/pytest", + base_commit: "def456", + }, + }, + { + scenario_id: "terminal-1", + benchmark: { + family: "Terminal-Bench", + instance_id: "terminal-001", + }, + judge_type: "terminal_smoke", + export_format: "generic_json", + }, + { + scenario_id: "legacy-scenario", + judge_type: null, + export_format: null, + benchmark: null, + }, +]; + +describe("evaluation model helpers", () => { + it("builds benchmark facet options from the live scenario surface", () => { + expect(buildScenarioFacetOptions(benchmarkScenarios, "SWE-bench Verified")).toEqual({ + benchmarkScenarioCount: 3, + families: ["SWE-bench Verified", "Terminal-Bench"], + instanceIds: ["pytest-dev__pytest-7521", "pytest-dev__pytest-7571"], + judgeTypes: ["swe_verified_smoke"], + exportFormats: ["predictions_jsonl"], + }); + }); + + 
it("filters scenarios only by benchmark fields that are actually selected", () => { + const result = filterScenariosByBenchmark(benchmarkScenarios, { + family: "SWE-bench Verified", + instanceId: "", + judgeType: "swe_verified_smoke", + exportFormat: "predictions_jsonl", + }); + + expect(result.map((item) => item.scenario_id)).toEqual(["swe-1", "swe-2"]); + }); + + it("summarizes selected scenario contracts without hiding missing metadata", () => { + expect(summarizeSelectedScenarioContracts(benchmarkScenarios)).toEqual({ + totalCount: 4, + missingBenchmarkMetadataCount: 1, + families: ["SWE-bench Verified", "Terminal-Bench"], + instances: ["pytest-dev__pytest-7521", "pytest-dev__pytest-7571", "terminal-001"], + judgeTypes: ["swe_verified_smoke", "terminal_smoke"], + exportFormats: ["generic_json", "predictions_jsonl"], + repos: ["pytest-dev/pytest"], + baseCommits: ["abc123", "def456"], + }); + }); + + it("orders leaderboard rows by pass rate first and recency second", () => { + const rows = buildLeaderboardRows([ + { + batch_id: "older-high", + created_at: "2026-04-18T10:00:00Z", + summary_json: { pass_rate: 1, total_runs: 2, judge_passed_runs: 2 }, + }, + { + batch_id: "newer-low", + created_at: "2026-04-20T10:00:00Z", + summary_json: { pass_rate: 0.5, total_runs: 2, judge_passed_runs: 1 }, + }, + { + batch_id: "newer-high", + created_at: "2026-04-21T10:00:00Z", + summary_json: { pass_rate: 1, total_runs: 3, judge_passed_runs: 3 }, + }, + ] satisfies EvaluationBatchListItem[]); + + expect(rows.map((row) => row.batchId)).toEqual(["newer-high", "older-high", "newer-low"]); + }); + + it("resolves export format from scenario refs and falls back when missing", () => { + expect( + resolveBatchExportFormat([ + { export: { format: "predictions_jsonl" } }, + { export: { format: "generic_json" } }, + ] satisfies EvaluationScenarioRef[]), + ).toBe("generic_json"); + + expect(resolveBatchExportFormat([])).toBe("generic_json"); + }); + + it("marks comparison regressions when 
pass rate drops or judge failures rise", () => { + const rows = buildCompareMetricRows({ + delta: { + pass_rate: { baseline: 1, candidate: 0.5, delta: -0.5 }, + judge_passed_runs: { baseline: 3, candidate: 2, delta: -1 }, + judge_failed_runs: { baseline: 0, candidate: 1, delta: 1 }, + avg_total_tokens: { baseline: 100, candidate: 120, delta: 20 }, + artifact_count_total: { baseline: 4, candidate: 5, delta: 1 }, + avg_scores: { + resolved: { baseline: 1, candidate: 0, delta: -1 }, + }, + }, + }); + + expect(rows.filter((row) => row.regression).map((row) => row.key)).toEqual([ + "pass_rate", + "judge_passed_runs", + "judge_failed_runs", + "avg_scores.resolved", + ]); + }); +}); diff --git a/frontend/monitor/src/pages/evaluation-model.ts b/frontend/monitor/src/pages/evaluation-model.ts new file mode 100644 index 000000000..14cd4f768 --- /dev/null +++ b/frontend/monitor/src/pages/evaluation-model.ts @@ -0,0 +1,299 @@ +export type BenchmarkInfo = { + family?: string | null; + name?: string | null; + split?: string | null; + variant?: string | null; + instance_id?: string | null; + dataset_version?: string | null; + tags?: string[] | null; + source_uri?: string | null; +}; + +export type WorkspaceInfo = { + cwd?: string | null; + repo?: string | null; + base_commit?: string | null; + env?: Record | null; + setup_commands?: string[] | null; +}; + +export type JudgeConfigInfo = { + type?: string | null; + config?: Record | null; +}; + +export type ExportConfigInfo = { + format?: string | null; + key?: string | null; + config?: Record | null; +}; + +export type EvaluationScenarioCatalogItem = { + scenario_id?: string | null; + name?: string | null; + category?: string | null; + sandbox?: string | null; + message_count?: number | null; + timeout_seconds?: number | null; + benchmark?: BenchmarkInfo | null; + workspace?: WorkspaceInfo | null; + judge_type?: string | null; + export_format?: string | null; +}; + +export type EvaluationScenarioRef = { + scenario_id?: string | 
null; + name?: string | null; + category?: string | null; + sandbox?: string | null; + benchmark?: BenchmarkInfo | null; + workspace?: WorkspaceInfo | null; + judge_config?: JudgeConfigInfo | null; + export?: ExportConfigInfo | null; +}; + +export type EvaluationBatchSummary = { + total_runs?: number | null; + running_runs?: number | null; + completed_runs?: number | null; + failed_runs?: number | null; + judge_passed_runs?: number | null; + judge_failed_runs?: number | null; + pass_rate?: number | null; + avg_total_tokens?: number | null; + artifact_count_total?: number | null; + avg_scores?: Record | null; + benchmark_families?: string[] | null; + benchmark_splits?: string[] | null; +}; + +export type EvaluationBatchListItem = { + batch_id?: string | null; + status?: string | null; + created_at?: string | null; + summary_json?: EvaluationBatchSummary | null; +}; + +export type BenchmarkFilters = { + family: string; + instanceId: string; + judgeType: string; + exportFormat: string; +}; + +export type ScenarioFacetOptions = { + benchmarkScenarioCount: number; + families: string[]; + instanceIds: string[]; + judgeTypes: string[]; + exportFormats: string[]; +}; + +export type SelectedScenarioContractSummary = { + totalCount: number; + missingBenchmarkMetadataCount: number; + families: string[]; + instances: string[]; + judgeTypes: string[]; + exportFormats: string[]; + repos: string[]; + baseCommits: string[]; +}; + +export type LeaderboardRow = { + batchId: string; + createdAt: string | null; + status: string | null; + passRate: number | null; + judgePassedRuns: number; + totalRuns: number; + families: string[]; + splits: string[]; +}; + +export type CompareDeltaMetric = { + baseline?: number | null; + candidate?: number | null; + delta?: number | null; +}; + +export type ComparePayload = { + delta?: Record | null> | null; +}; + +export type CompareMetricRow = { + key: string; + label: string; + baseline: number; + candidate: number; + delta: number; + regression: 
boolean; +}; + +function trimText(value: string | null | undefined): string { + return typeof value === "string" ? value.trim() : ""; +} + +function uniqueSorted(values: Array): string[] { + return [...new Set(values.map(trimText).filter(Boolean))].sort((left, right) => left.localeCompare(right)); +} + +function isScenarioCatalogItem( + item: EvaluationScenarioCatalogItem | EvaluationScenarioRef, +): item is EvaluationScenarioCatalogItem { + return "judge_type" in item || "export_format" in item; +} + +function getJudgeType(item: EvaluationScenarioCatalogItem | EvaluationScenarioRef): string { + return isScenarioCatalogItem(item) ? trimText(item.judge_type) : trimText(item.judge_config?.type); +} + +function getExportFormat(item: EvaluationScenarioCatalogItem | EvaluationScenarioRef): string { + return isScenarioCatalogItem(item) ? trimText(item.export_format) : trimText(item.export?.format); +} + +export function hasScenarioBenchmarkSurface(item: EvaluationScenarioCatalogItem | EvaluationScenarioRef): boolean { + return Boolean( + trimText(item.benchmark?.family) || + trimText(item.benchmark?.instance_id) || + getJudgeType(item) || + getExportFormat(item), + ); +} + +export function buildScenarioFacetOptions( + scenarios: EvaluationScenarioCatalogItem[], + selectedFamily = "", +): ScenarioFacetOptions { + const benchmarkScenarioCount = scenarios.filter((scenario) => hasScenarioBenchmarkSurface(scenario)).length; + const family = trimText(selectedFamily); + const matchingFamilyScenarios = family + ? 
scenarios.filter((scenario) => trimText(scenario.benchmark?.family) === family) + : scenarios; + + return { + benchmarkScenarioCount, + families: uniqueSorted(scenarios.map((scenario) => scenario.benchmark?.family)), + instanceIds: uniqueSorted(matchingFamilyScenarios.map((scenario) => scenario.benchmark?.instance_id)), + judgeTypes: uniqueSorted(matchingFamilyScenarios.map((scenario) => scenario.judge_type)), + exportFormats: uniqueSorted(matchingFamilyScenarios.map((scenario) => scenario.export_format)), + }; +} + +export function filterScenariosByBenchmark( + scenarios: EvaluationScenarioCatalogItem[], + filters: BenchmarkFilters, +): EvaluationScenarioCatalogItem[] { + return scenarios.filter((scenario) => { + if (filters.family && trimText(scenario.benchmark?.family) !== trimText(filters.family)) return false; + if (filters.instanceId && trimText(scenario.benchmark?.instance_id) !== trimText(filters.instanceId)) return false; + if (filters.judgeType && trimText(scenario.judge_type) !== trimText(filters.judgeType)) return false; + if (filters.exportFormat && trimText(scenario.export_format) !== trimText(filters.exportFormat)) return false; + return true; + }); +} + +export function summarizeSelectedScenarioContracts( + scenarios: Array, +): SelectedScenarioContractSummary { + return { + totalCount: scenarios.length, + missingBenchmarkMetadataCount: scenarios.filter((scenario) => !hasScenarioBenchmarkSurface(scenario)).length, + families: uniqueSorted(scenarios.map((scenario) => scenario.benchmark?.family)), + instances: uniqueSorted(scenarios.map((scenario) => scenario.benchmark?.instance_id)), + judgeTypes: uniqueSorted(scenarios.map((scenario) => getJudgeType(scenario))), + exportFormats: uniqueSorted(scenarios.map((scenario) => getExportFormat(scenario))), + repos: uniqueSorted(scenarios.map((scenario) => scenario.workspace?.repo)), + baseCommits: uniqueSorted(scenarios.map((scenario) => scenario.workspace?.base_commit)), + }; +} + +function asNumber(value: 
number | null | undefined): number | null { + return typeof value === "number" && Number.isFinite(value) ? value : null; +} + +function compareTimestampDesc(left: string | null | undefined, right: string | null | undefined): number { + const leftTime = left ? new Date(left).getTime() : 0; + const rightTime = right ? new Date(right).getTime() : 0; + return rightTime - leftTime; +} + +export function buildLeaderboardRows(batches: EvaluationBatchListItem[]): LeaderboardRow[] { + return batches + .map((batch) => { + const summary = batch.summary_json ?? {}; + return { + batchId: trimText(batch.batch_id), + createdAt: batch.created_at ?? null, + status: batch.status ?? null, + passRate: asNumber(summary.pass_rate), + judgePassedRuns: typeof summary.judge_passed_runs === "number" ? summary.judge_passed_runs : 0, + totalRuns: typeof summary.total_runs === "number" ? summary.total_runs : 0, + families: uniqueSorted(summary.benchmark_families ?? []), + splits: uniqueSorted(summary.benchmark_splits ?? []), + }; + }) + .filter((row) => row.batchId) + .sort((left, right) => { + const passRateDelta = (right.passRate ?? -1) - (left.passRate ?? -1); + if (passRateDelta !== 0) return passRateDelta; + return compareTimestampDesc(left.createdAt, right.createdAt); + }); +} + +export function listBatchExportFormats(scenarioRefs: EvaluationScenarioRef[]): string[] { + return uniqueSorted(scenarioRefs.map((scenario) => scenario.export?.format)); +} + +export function resolveBatchExportFormat(scenarioRefs: EvaluationScenarioRef[], fallback = "generic_json"): string { + return listBatchExportFormats(scenarioRefs)[0] ?? 
fallback; +} + +const COMPARE_LABELS: Record = { + pass_rate: "Pass Rate", + judge_passed_runs: "Judge Passed Runs", + judge_failed_runs: "Judge Failed Runs", + avg_total_tokens: "Avg Total Tokens", + artifact_count_total: "Artifact Count", +}; + +function toCompareRow(key: string, metric: CompareDeltaMetric | null | undefined): CompareMetricRow { + const baseline = typeof metric?.baseline === "number" ? metric.baseline : 0; + const candidate = typeof metric?.candidate === "number" ? metric.candidate : 0; + const delta = typeof metric?.delta === "number" ? metric.delta : candidate - baseline; + const regression = + (key === "pass_rate" || key === "judge_passed_runs") && delta < 0 + ? true + : key === "judge_failed_runs" && delta > 0; + return { + key, + label: COMPARE_LABELS[key] ?? key, + baseline, + candidate, + delta, + regression, + }; +} + +export function buildCompareMetricRows(payload: ComparePayload | null | undefined): CompareMetricRow[] { + const delta = payload?.delta ?? {}; + const rows: CompareMetricRow[] = []; + + for (const key of ["pass_rate", "judge_passed_runs", "judge_failed_runs", "avg_total_tokens", "artifact_count_total"]) { + rows.push(toCompareRow(key, delta[key] as CompareDeltaMetric | null | undefined)); + } + + const scoreDelta = delta.avg_scores; + if (scoreDelta && typeof scoreDelta === "object" && !Array.isArray(scoreDelta)) { + const scoreMetrics = scoreDelta as Record; + for (const key of Object.keys(scoreMetrics).sort((left, right) => left.localeCompare(right))) { + const metric = scoreMetrics[key]; + const row = toCompareRow(`avg_scores.${key}`, metric); + row.label = `Score: ${key}`; + row.regression = row.delta < 0; + rows.push(row); + } + } + + return rows; +} diff --git a/frontend/monitor/src/pages/evaluation-run-detail-model.test.ts b/frontend/monitor/src/pages/evaluation-run-detail-model.test.ts new file mode 100644 index 000000000..5089948c0 --- /dev/null +++ b/frontend/monitor/src/pages/evaluation-run-detail-model.test.ts 
@@ -0,0 +1,46 @@ +import { describe, expect, it } from "vitest"; + +import { buildArtifactDownloadPayload, summarizeTrajectory } from "./evaluation-run-detail-model"; + +describe("evaluation run detail helpers", () => { + it("downloads artifact content as text when inline content exists", () => { + expect( + buildArtifactDownloadPayload("run-1", { + name: "patch.diff", + mime_type: "text/x-diff", + content: "diff --git a b", + }), + ).toEqual({ + filename: "run-1-patch.diff.txt", + mimeType: "text/x-diff", + text: "diff --git a b", + }); + }); + + it("falls back to artifact json when no inline content exists", () => { + const payload = buildArtifactDownloadPayload("run-1", { + name: "judge", + kind: "judge_result", + metadata: { verdict: "passed" }, + }); + + expect(payload.filename).toBe("run-1-judge.json"); + expect(payload.mimeType).toBe("application/json"); + expect(payload.text).toContain("\"judge_result\""); + expect(payload.text).toContain("\"passed\""); + }); + + it("summarizes trajectory availability without assuming either channel exists", () => { + expect(summarizeTrajectory({ conversation: [{ role: "user" }], events: [] })).toEqual({ + messageCount: 1, + eventCount: 0, + hasTrace: true, + }); + + expect(summarizeTrajectory({ conversation: [], events: [] })).toEqual({ + messageCount: 0, + eventCount: 0, + hasTrace: false, + }); + }); +}); diff --git a/frontend/monitor/src/pages/evaluation-run-detail-model.ts b/frontend/monitor/src/pages/evaluation-run-detail-model.ts new file mode 100644 index 000000000..a1a7dd75e --- /dev/null +++ b/frontend/monitor/src/pages/evaluation-run-detail-model.ts @@ -0,0 +1,65 @@ +export type EvaluationArtifactRecord = { + name?: string | null; + kind?: string | null; + content?: string | null; + path?: string | null; + mime_type?: string | null; + metadata?: Record | null; +}; + +export type ThreadTrajectoryPayload = { + run_id?: string | null; + conversation?: Array> | null; + events?: Array<{ + seq?: number | null; + 
run_id?: string | null; + event_type?: string | null; + actor?: string | null; + summary?: string | null; + payload?: Record | null; + }> | null; +}; + +export type DownloadArtifactPayload = { + filename: string; + mimeType: string; + text: string; +}; + +function sanitizeFilePart(value: string | null | undefined, fallback: string): string { + const normalized = typeof value === "string" ? value.trim() : ""; + if (!normalized) return fallback; + const safe = normalized.replace(/[^a-zA-Z0-9._-]+/g, "-").replace(/-+/g, "-"); + return safe.replace(/^-|-$/g, "") || fallback; +} + +export function buildArtifactDownloadPayload( + runId: string, + artifact: EvaluationArtifactRecord, +): DownloadArtifactPayload { + const baseName = sanitizeFilePart(artifact.name || artifact.path, "artifact"); + if (artifact.content) { + return { + filename: `${sanitizeFilePart(runId, "run")}-${baseName}.txt`, + mimeType: artifact.mime_type || "text/plain", + text: artifact.content, + }; + } + + return { + filename: `${sanitizeFilePart(runId, "run")}-${baseName}.json`, + mimeType: "application/json", + text: JSON.stringify(artifact, null, 2), + }; +} + +export function summarizeTrajectory(trajectory: ThreadTrajectoryPayload | null | undefined) { + return { + messageCount: Array.isArray(trajectory?.conversation) ? trajectory?.conversation.length : 0, + eventCount: Array.isArray(trajectory?.events) ? 
trajectory?.events.length : 0, + hasTrace: Boolean( + (Array.isArray(trajectory?.conversation) && trajectory.conversation.length > 0) || + (Array.isArray(trajectory?.events) && trajectory.events.length > 0), + ), + }; +} diff --git a/frontend/monitor/src/styles.css b/frontend/monitor/src/styles.css index 643f0f884..7a2b9628a 100644 --- a/frontend/monitor/src/styles.css +++ b/frontend/monitor/src/styles.css @@ -1870,7 +1870,8 @@ section li { text-transform: uppercase; } -.evaluation-create-form__field input { +.evaluation-create-form__field input, +.evaluation-create-form__field select { width: 100%; padding: 0.65rem 0.75rem; border: 1px solid rgba(81, 59, 29, 0.14); @@ -1879,6 +1880,59 @@ section li { color: #241d14; } +.evaluation-benchmark-filters, +.evaluation-contract-preview, +.evaluation-compare-panel, +.evaluation-compare-result { + display: grid; + gap: 0.75rem; + padding: 0.95rem 1rem; + border: 1px solid rgba(81, 59, 29, 0.12); + border-radius: 16px; + background: rgba(255, 252, 247, 0.78); +} + +.evaluation-benchmark-filters__header, +.evaluation-contract-preview__header, +.evaluation-compare-result__header { + display: flex; + justify-content: space-between; + gap: 0.75rem; + align-items: center; + color: #4a3a29; + font-size: 0.82rem; +} + +.evaluation-benchmark-filters__header span, +.evaluation-contract-preview__header span, +.evaluation-compare-result__header span { + color: #776a5b; + font-size: 0.76rem; +} + +.evaluation-contract-preview__grid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); + gap: 0.75rem; +} + +.evaluation-contract-preview__grid div { + display: grid; + gap: 0.2rem; +} + +.evaluation-contract-preview__grid strong { + color: #4a3a29; + font-size: 0.76rem; + letter-spacing: 0.04em; + text-transform: uppercase; +} + +.evaluation-contract-preview__grid span { + color: #241d14; + font-size: 0.88rem; +} + .evaluation-scenario-picker { display: grid; gap: 0.55rem; @@ -1995,6 +2049,51 @@ section li { 
justify-content: flex-end; } +.evaluation-json-panel { + border: 1px solid rgba(81, 59, 29, 0.12); + border-radius: 14px; + background: rgba(255, 252, 247, 0.9); + overflow: hidden; +} + +.evaluation-json-panel summary { + cursor: pointer; + padding: 0.85rem 1rem; + color: #4a3a29; + font-weight: 700; +} + +.evaluation-json-panel pre { + margin: 0; + padding: 0 1rem 1rem; + overflow: auto; + font-family: "SF Mono", Monaco, monospace; + font-size: 0.78rem; + line-height: 1.5; + color: #2f261d; +} + +.evaluation-artifact-grid, +.evaluation-trace-grid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(280px, 1fr)); + gap: 1rem; +} + +.evaluation-artifact-card__header { + display: flex; + justify-content: space-between; + gap: 0.75rem; + align-items: flex-start; + margin-bottom: 0.85rem; +} + +.evaluation-artifact-card__title { + margin-top: 0.25rem; + color: #241d14; + font-size: 0.96rem; +} + @media (max-width: 840px) { .evaluation-pagination--inline { justify-content: flex-start; diff --git a/sandbox/interfaces/executor.py b/sandbox/interfaces/executor.py index 8a9d9c0ad..c7583d11b 100644 --- a/sandbox/interfaces/executor.py +++ b/sandbox/interfaces/executor.py @@ -48,6 +48,7 @@ class AsyncCommand: exit_code: int | None = None done: bool = False cancelled: bool = False + monitor_task: Any = None class BaseExecutor(ABC): diff --git a/sandbox/providers/local.py b/sandbox/providers/local.py index 0d5822937..b4faf70f8 100644 --- a/sandbox/providers/local.py +++ b/sandbox/providers/local.py @@ -349,7 +349,7 @@ def __init__( self, terminal, lease, - shell_command: tuple[str, ...] = ("/bin/bash",), + shell_command: tuple[str, ...] 
= ("/bin/bash", "--noprofile", "--norc"), ): super().__init__(terminal, lease) self.shell_command = shell_command diff --git a/sandbox/runtime.py b/sandbox/runtime.py index 6d8437b64..cb6d884e7 100644 --- a/sandbox/runtime.py +++ b/sandbox/runtime.py @@ -3,6 +3,7 @@ from __future__ import annotations import asyncio +import logging import os import platform import re @@ -27,6 +28,8 @@ from storage.providers.sqlite.kernel import SQLiteDBRole, connect_sqlite, resolve_role_db_path from storage.runtime import uses_supabase_runtime_defaults +logger = logging.getLogger(__name__) + if platform.system() == "Windows": pty = None select = None @@ -793,9 +796,13 @@ async def cancel_command(self, command_id: str) -> bool: stopped = await self._cancel_running_command() if stopped: try: - await asyncio.wait_for(asyncio.shield(task), timeout=1.0) + await asyncio.wait_for(asyncio.shield(task), timeout=5.0) except (TimeoutError, asyncio.CancelledError): - task.cancel() + logger.warning( + "Timed out waiting for cancelled command %s to drain after runtime stop", + command_id, + ) + return False else: task.cancel() cmd.done = True diff --git a/storage/providers/supabase/eval_repo.py b/storage/providers/supabase/eval_repo.py index d26ea1d8c..d71798d5c 100644 --- a/storage/providers/supabase/eval_repo.py +++ b/storage/providers/supabase/eval_repo.py @@ -176,7 +176,7 @@ def get_trajectory_json(self, run_id: str) -> str | None: def get_run(self, run_id: str) -> dict | None: query = q.limit( - self._t("eval_runs").select("id,thread_id,started_at,finished_at,status,user_message").eq("id", run_id), + self._t("eval_runs").select("id,thread_id,started_at,finished_at,status,user_message,final_response").eq("id", run_id), 1, _REPO, "get_run", @@ -192,6 +192,7 @@ def get_run(self, run_id: str) -> dict | None: "finished_at": row.get("finished_at"), "status": str(row.get("status") or ""), "user_message": str(row.get("user_message") or ""), + "final_response": str(row.get("final_response") or ""), } 
def list_runs(self, thread_id: str | None = None, limit: int = 50) -> list[dict]: diff --git a/tests/Integration/test_monitor_resources_route.py b/tests/Integration/test_monitor_resources_route.py index e4a62b7e3..9431fc797 100644 --- a/tests/Integration/test_monitor_resources_route.py +++ b/tests/Integration/test_monitor_resources_route.py @@ -8,7 +8,6 @@ from backend.monitor.infrastructure.web import gateway as monitor_gateway_impl from backend.web.core.dependencies import get_current_user_id from backend.web.routers import monitor_threads as monitor_threads_router -from backend.web.routers import resources def _app(*, include_product_resources: bool = False) -> FastAPI: @@ -16,6 +15,10 @@ def _app(*, include_product_resources: bool = False) -> FastAPI: app.include_router(global_router.router, prefix="/api/monitor") app.include_router(monitor_threads_router.router, prefix="/api/monitor") if include_product_resources: + try: + from backend.web.routers import resources + except ImportError as exc: # pragma: no cover - environment guard + pytest.skip(f"product resource routes unavailable in lightweight test env: {exc}") app.include_router(resources.router) app.dependency_overrides[get_current_user_id] = lambda: "owner-1" return app @@ -186,7 +189,19 @@ def test_global_monitor_router_accepts_evaluation_batch_create(monkeypatch): ("get", "/api/monitor/evaluation/batches", "get_monitor_evaluation_batches", {"items": [], "count": 0}), ("get", "/api/monitor/evaluation/scenarios", "get_monitor_evaluation_scenarios", {"items": [], "count": 0}), ("get", "/api/monitor/evaluation/batches/batch-1", "get_monitor_evaluation_batch_detail", {"batch": {"batch_id": "batch-1"}}), + ( + "get", + "/api/monitor/evaluation/batches/batch-1/aggregate", + "get_monitor_evaluation_batch_aggregate", + {"summary": {"pass_rate": 1.0}}, + ), ("get", "/api/monitor/evaluation/runs/run-1", "get_monitor_evaluation_run_detail", {"run": {"run_id": "run-1"}}), + ( + "get", + 
"/api/monitor/evaluation/runs/run-1/artifacts", + "get_monitor_evaluation_run_artifacts", + {"run_id": "run-1", "artifacts": []}, + ), ], ) def test_monitor_routes_delegate_to_service(monkeypatch, method, path, service_name, payload): @@ -210,7 +225,9 @@ def _sync(*args, **kwargs): "get_monitor_evaluation_batches": "get_evaluation_batches", "get_monitor_evaluation_scenarios": "get_evaluation_scenarios", "get_monitor_evaluation_batch_detail": "get_evaluation_batch_detail", + "get_monitor_evaluation_batch_aggregate": "get_evaluation_batch_aggregate", "get_monitor_evaluation_run_detail": "get_evaluation_run_detail", + "get_monitor_evaluation_run_artifacts": "get_evaluation_run_artifacts", }[service_name] monkeypatch.setattr(monitor_gateway_impl, gateway_name, _sync) @@ -260,7 +277,13 @@ async def _detail(thread_id, *, load_thread_base, trace_reader): ("/api/monitor/operations/missing", "get_monitor_operation_detail", "Operation not found: missing"), ("/api/monitor/runtimes/missing", "get_monitor_runtime_detail", "Runtime not found: missing"), ("/api/monitor/evaluation/batches/missing", "get_monitor_evaluation_batch_detail", "Evaluation batch not found: missing"), + ( + "/api/monitor/evaluation/batches/missing/aggregate", + "get_monitor_evaluation_batch_aggregate", + "Evaluation batch not found: missing", + ), ("/api/monitor/evaluation/runs/missing", "get_monitor_evaluation_run_detail", "Evaluation run not found: missing"), + ("/api/monitor/evaluation/runs/missing/artifacts", "get_monitor_evaluation_run_artifacts", "Evaluation run not found: missing"), ], ) def test_monitor_detail_routes_map_missing_rows_to_404(monkeypatch, path, service_name, message): @@ -273,7 +296,9 @@ def _raise(*_args, **_kwargs): "get_monitor_operation_detail": "get_operation_detail", "get_monitor_runtime_detail": "get_runtime_detail", "get_monitor_evaluation_batch_detail": "get_evaluation_batch_detail", + "get_monitor_evaluation_batch_aggregate": "get_evaluation_batch_aggregate", 
"get_monitor_evaluation_run_detail": "get_evaluation_run_detail", + "get_monitor_evaluation_run_artifacts": "get_evaluation_run_artifacts", }[service_name] monkeypatch.setattr(monitor_gateway_impl, gateway_name, _raise) @@ -332,6 +357,30 @@ def test_monitor_evaluation_batch_create_and_start_pass_request_context(monkeypa assert start_calls[0]["token"] == "token-1" +def test_monitor_evaluation_compare_and_export_routes_pass_query_context(monkeypatch): + compare_calls = [] + export_calls = [] + monkeypatch.setattr( + monitor_gateway_impl, + "compare_evaluation_batches", + lambda **kwargs: compare_calls.append(kwargs) or {"delta": {"pass_rate": {"delta": 1.0}}}, + ) + monkeypatch.setattr( + monitor_gateway_impl, + "export_evaluation_batch", + lambda **kwargs: export_calls.append(kwargs) or {"format": "predictions_json"}, + ) + + with TestClient(_app()) as client: + compare = client.get("/api/monitor/evaluation/compare?baseline_batch_id=batch-a&candidate_batch_id=batch-b") + export = client.get("/api/monitor/evaluation/batches/batch-b/export?format=predictions_json") + + assert compare.status_code == 200 + assert export.status_code == 200 + assert compare_calls == [{"baseline_batch_id": "batch-a", "candidate_batch_id": "batch-b"}] + assert export_calls == [{"batch_id": "batch-b", "export_format": "predictions_json"}] + + @pytest.mark.parametrize( ("verb", "path", "service_name"), [ diff --git a/tests/Unit/core/test_capability_async.py b/tests/Unit/core/test_capability_async.py index 39f97720b..23226281b 100644 --- a/tests/Unit/core/test_capability_async.py +++ b/tests/Unit/core/test_capability_async.py @@ -100,7 +100,7 @@ def test_local_sandbox_rebuilds_stale_closed_capability_before_execute_async(tmp async def run(): async_cmd = await sandbox.shell().execute_async("echo hi") - result = await sandbox.shell().wait_for(async_cmd.command_id, timeout=1.0) + result = await sandbox.shell().wait_for(async_cmd.command_id, timeout=2.0) return async_cmd, result async_cmd, result 
= asyncio.run(run()) diff --git a/tests/Unit/core/test_command_service.py b/tests/Unit/core/test_command_service.py index fee768cf0..285d2281e 100644 --- a/tests/Unit/core/test_command_service.py +++ b/tests/Unit/core/test_command_service.py @@ -1,6 +1,7 @@ """Tests for command executors, hooks, and CommandService.""" import asyncio +from types import SimpleNamespace from unittest.mock import MagicMock import pytest @@ -95,6 +96,46 @@ async def test_get_status(self): assert status is not None assert status.done + @pytest.mark.asyncio + async def test_get_status_syncs_completed_monitor_task(self): + executor = get_executor() + command_id = "cmd_status_sync" + async_cmd = AsyncCommand( + command_id=command_id, + command_line="echo done", + cwd="/tmp", + process=SimpleNamespace(returncode=0), + ) + + async def _finish() -> None: + await asyncio.sleep(0) + async_cmd.stdout_buffer.append("done\n") + async_cmd.exit_code = 0 + async_cmd.done = True + + async_cmd.monitor_task = asyncio.create_task(_finish()) + + if executor.shell_name == "powershell": + from core.tools.command.powershell import executor as powershell_executor + + powershell_executor._RUNNING_COMMANDS[command_id] = async_cmd + + def cleanup() -> None: + powershell_executor._RUNNING_COMMANDS.pop(command_id, None) + else: + type(executor)._running_commands[command_id] = async_cmd + + def cleanup() -> None: + type(executor)._running_commands.pop(command_id, None) + + try: + status = await executor.get_status(command_id) + assert status is async_cmd + assert status.done is True + assert status.stdout_buffer == ["done\n"] + finally: + cleanup() + class TestDangerousCommandsHook: def test_block_rm_rf(self): diff --git a/tests/Unit/eval/test_batch_executor.py b/tests/Unit/eval/test_batch_executor.py index bb8445954..d9451ff9d 100644 --- a/tests/Unit/eval/test_batch_executor.py +++ b/tests/Unit/eval/test_batch_executor.py @@ -1,3 +1,5 @@ +import asyncio + import pytest from eval.batch_executor import 
EvaluationBatchExecutor @@ -59,10 +61,10 @@ async def test_batch_executor_records_failed_scenario_before_reraising(): ) executor = EvaluationBatchExecutor(runner=_FailingRunner(), batch_service=batch_service) - with pytest.raises(RuntimeError, match="runner exploded"): - await executor.run_batch(batch["batch_id"], [EvalScenario(id="scenario-1", name="Scenario 1")]) + results = await executor.run_batch(batch["batch_id"], [EvalScenario(id="scenario-1", name="Scenario 1")]) batch_run = repo.list_batch_runs(batch["batch_id"])[0] + assert results == [] assert batch_run["status"] == "failed" assert batch_run["summary_json"] == {"error": "runner exploded"} assert repo.get_batch(batch["batch_id"])["status"] == "failed" @@ -101,6 +103,54 @@ async def run_scenario(self, scenario: EvalScenario) -> EvalResult: await executor.run_batch(batch["batch_id"], [EvalScenario(id="scenario-1", name="Scenario 1")]) +@pytest.mark.asyncio +async def test_batch_executor_respects_max_concurrent(): + active = 0 + max_seen = 0 + release = asyncio.Event() + + class _ConcurrencyRunner: + async def run_scenario(self, scenario: EvalScenario) -> EvalResult: + nonlocal active, max_seen + active += 1 + max_seen = max(max_seen, active) + if scenario.id == "scenario-1": + await asyncio.sleep(0) + assert active == 2 + release.set() + else: + await release.wait() + active -= 1 + return EvalResult( + scenario_id=scenario.id, + trajectory=RunTrajectory( + id=f"eval-run-{scenario.id}", + thread_id=f"thread-{scenario.id}", + user_message="", + status="completed", + ), + ) + + repo = _FakeBatchRepo() + batch_service = EvaluationBatchService(batch_repo=repo) + batch = batch_service.create_batch( + submitted_by_user_id="user-1", + agent_user_id="agent-1", + scenario_ids=["scenario-1", "scenario-2"], + sandbox="local", + max_concurrent=2, + ) + executor = EvaluationBatchExecutor(runner=_ConcurrencyRunner(), batch_service=batch_service) + + await executor.run_batch( + batch["batch_id"], + 
[EvalScenario(id="scenario-1", name="Scenario 1"), EvalScenario(id="scenario-2", name="Scenario 2")], + max_concurrent=2, + ) + + assert max_seen == 2 + + @pytest.mark.asyncio async def test_batch_executor_marks_current_batch_run_running_before_scenario(): class InspectingRunner: diff --git a/tests/Unit/eval/test_batch_service.py b/tests/Unit/eval/test_batch_service.py index e5b60bd27..f60f3da8d 100644 --- a/tests/Unit/eval/test_batch_service.py +++ b/tests/Unit/eval/test_batch_service.py @@ -1,5 +1,5 @@ from eval.batch_service import EvaluationBatchService -from eval.models import EvalResult, RunTrajectory, SystemMetrics +from eval.models import BenchmarkInfo, EvalResult, EvalScenario, RunTrajectory, SystemMetrics def _batch_row(*, batch_id: str = "batch-1", status: str = "pending") -> dict: @@ -126,6 +126,31 @@ def test_batch_service_creates_batch_and_runs(): assert len(repo.list_batch_runs(batch["batch_id"])) == 2 +def test_batch_service_persists_structured_scenario_refs(): + repo = _FakeBatchRepo() + service = EvaluationBatchService(batch_repo=repo) + + batch = service.create_batch( + submitted_by_user_id="user-1", + agent_user_id="agent-1", + scenario_ids=["s1"], + sandbox="local", + max_concurrent=1, + scenario_refs=[ + EvalScenario( + id="s1", + name="Scenario 1", + benchmark=BenchmarkInfo(family="swe-bench", instance_id="repo__1"), + workspace={"cwd": "/workspace/repo", "repo": "repo/name", "base_commit": "abc123"}, + ) + ], + ) + + assert batch["kind"] == "benchmark_batch" + assert batch["config_json"]["scenario_refs"][0]["benchmark"]["family"] == "swe-bench" + assert batch["config_json"]["scenario_refs"][0]["workspace"]["cwd"] == "/workspace/repo" + + def test_batch_service_recomputes_summary(): repo = _FakeBatchRepo() service = EvaluationBatchService(batch_repo=repo) @@ -203,6 +228,9 @@ def test_batch_service_records_eval_result_for_matching_scenario(): status="completed", ), system_metrics=SystemMetrics(total_tokens=42, tool_call_count=3), + 
benchmark=BenchmarkInfo(family="swe-bench", split="smoke", instance_id="repo__1"), + judge_result={"judge_type": "heuristic", "status": "completed", "verdict": "passed", "scores": {"resolved": 1.0}}, + artifacts=[{"name": "final-response", "kind": "submission"}], ) updated = service.record_eval_result(batch["batch_id"], result) @@ -210,7 +238,11 @@ def test_batch_service_records_eval_result_for_matching_scenario(): assert updated["eval_run_id"] == "eval-run-1" assert updated["thread_id"] == "thread-1" assert updated["status"] == "completed" - assert updated["summary_json"] == {"total_tokens": 42, "tool_call_count": 3} + assert updated["summary_json"]["total_tokens"] == 42 + assert updated["summary_json"]["tool_call_count"] == 3 + assert updated["summary_json"]["judge_verdict"] == "passed" + assert updated["summary_json"]["benchmark_family"] == "swe-bench" + assert updated["summary_json"]["artifact_count"] == 1 def test_batch_service_records_eval_error_for_matching_scenario(): @@ -285,3 +317,41 @@ def test_batch_service_lists_batch_runs_for_thread(): found = service.list_batch_runs_for_thread("thread-1") assert [row["eval_run_id"] for row in found] == ["eval-run-1"] + + +def test_batch_service_computes_benchmark_aggregate_and_compare(): + repo = _FakeBatchRepo() + service = EvaluationBatchService(batch_repo=repo) + baseline = service.create_batch( + submitted_by_user_id="user-1", + agent_user_id="agent-1", + scenario_ids=["scenario-1"], + sandbox="local", + max_concurrent=1, + ) + candidate = service.create_batch( + submitted_by_user_id="user-1", + agent_user_id="agent-1", + scenario_ids=["scenario-2"], + sandbox="local", + max_concurrent=1, + ) + baseline_run = repo.list_batch_runs(baseline["batch_id"])[0] + candidate_run = repo.list_batch_runs(candidate["batch_id"])[0] + repo.update_batch_run( + baseline_run["batch_run_id"], + status="completed", + summary_json={"judge_verdict": "failed", "scores": {"resolved": 0.0}, "artifact_count": 1}, + ) + 
repo.update_batch_run( + candidate_run["batch_run_id"], + status="completed", + summary_json={"judge_verdict": "passed", "scores": {"resolved": 1.0}, "artifact_count": 2}, + ) + + baseline_summary = service.get_batch_summary(baseline["batch_id"]) + comparison = service.compare_batches(baseline["batch_id"], candidate["batch_id"]) + + assert baseline_summary["summary"]["pass_rate"] == 0.0 + assert comparison["delta"]["pass_rate"]["delta"] == 1.0 + assert comparison["delta"]["artifact_count_total"]["candidate"] == 2.0 diff --git a/tests/Unit/eval/test_exporter.py b/tests/Unit/eval/test_exporter.py new file mode 100644 index 000000000..e35067e7f --- /dev/null +++ b/tests/Unit/eval/test_exporter.py @@ -0,0 +1,23 @@ +from eval.exporter import build_batch_export + + +def test_build_batch_export_emits_predictions_bundle(): + payload = build_batch_export( + batch={"batch_id": "batch-1", "config_json": {}, "kind": "benchmark_batch", "status": "completed"}, + aggregate={"pass_rate": 1.0}, + run_records=[ + { + "run_id": "run-1", + "scenario_id": "scenario-1", + "run": {"run_id": "run-1", "final_response": "patch-body"}, + "benchmark": {"instance_id": "repo__1"}, + "judge_result": {"verdict": "passed"}, + "artifacts": [{"name": "final-response"}], + } + ], + export_format="predictions_json", + ) + + assert payload["format"] == "predictions_json" + assert payload["predictions"][0]["instance_id"] == "repo__1" + assert payload["predictions"][0]["prediction"] == "patch-body" diff --git a/tests/Unit/eval/test_harness_runner.py b/tests/Unit/eval/test_harness_runner.py index 6bdbec44e..b5ba2d31e 100644 --- a/tests/Unit/eval/test_harness_runner.py +++ b/tests/Unit/eval/test_harness_runner.py @@ -5,7 +5,7 @@ class _RuntimeFailingClient: - async def create_thread(self, *, agent_user_id: str, sandbox: str) -> str: + async def create_thread(self, *, agent_user_id: str, sandbox: str, cwd: str | None = None) -> str: return "thread-1" async def run_message(self, _thread_id: str, _message: 
str, enable_trajectory: bool = True) -> TrajectoryCapture: @@ -19,7 +19,7 @@ async def delete_thread(self, _thread_id: str) -> None: class _DeleteFailingClient: - async def create_thread(self, *, agent_user_id: str, sandbox: str) -> str: + async def create_thread(self, *, agent_user_id: str, sandbox: str, cwd: str | None = None) -> str: return "thread-1" async def run_message(self, _thread_id: str, _message: str, enable_trajectory: bool = True) -> TrajectoryCapture: @@ -37,7 +37,7 @@ def __init__(self) -> None: self.messages: list[str] = [] self.deleted = False - async def create_thread(self, *, agent_user_id: str, sandbox: str) -> str: + async def create_thread(self, *, agent_user_id: str, sandbox: str, cwd: str | None = None) -> str: return "thread-1" async def run_message(self, _thread_id: str, message: str, enable_trajectory: bool = True) -> TrajectoryCapture: @@ -83,3 +83,27 @@ async def test_eval_runner_stops_on_terminal_error_before_next_message(): assert client.messages == ["first"] assert client.deleted is True + + +@pytest.mark.asyncio +async def test_eval_runner_passes_workspace_cwd_to_thread_creation(): + captured: dict[str, str | None] = {"cwd": None} + + class _InspectingClient(_DeleteFailingClient): + async def create_thread(self, *, agent_user_id: str, sandbox: str, cwd: str | None = None) -> str: + captured["cwd"] = cwd + return "thread-1" + + runner = EvalRunner(client=_InspectingClient(), agent_user_id="agent-1") + + with pytest.raises(RuntimeError, match="delete failed"): + await runner.run_scenario( + EvalScenario( + id="scenario-1", + name="Scenario 1", + workspace={"cwd": "/workspace/project"}, + messages=[ScenarioMessage(content="hello")], + ) + ) + + assert captured["cwd"] == "/workspace/project" diff --git a/tests/Unit/eval/test_judge.py b/tests/Unit/eval/test_judge.py new file mode 100644 index 000000000..13f74911a --- /dev/null +++ b/tests/Unit/eval/test_judge.py @@ -0,0 +1,67 @@ +import pytest + +from eval.judge import build_judge +from 
eval.models import EvalResult, EvalScenario, JudgeConfig, RunTrajectory + + +@pytest.mark.asyncio +async def test_heuristic_judge_scores_expected_behaviors(): + judge = build_judge(JudgeConfig(type="heuristic", config={})) + scenario = EvalScenario( + id="scenario-1", + name="Scenario 1", + expected_behaviors=["alpha", "beta"], + ) + result = EvalResult( + scenario_id="scenario-1", + trajectory=RunTrajectory(thread_id="thread-1", user_message="hello", final_response="alpha beta done"), + ) + + judged = await judge.evaluate(scenario, result) + + assert judged.verdict == "passed" + assert judged.scores["resolved"] == 1.0 + + +@pytest.mark.asyncio +async def test_command_judge_parses_json_stdout(tmp_path): + judge = build_judge( + JudgeConfig( + type="command", + config={ + "command": [ + "python", + "-c", + "import json,sys; payload=json.load(sys.stdin); print(json.dumps({'status':'completed','verdict':'passed','scores':{'resolved':1.0},'metadata':{'scenario_id':payload['scenario']['id']}}))", + ] + }, + ) + ) + scenario = EvalScenario(id="scenario-1", name="Scenario 1") + result = EvalResult( + scenario_id="scenario-1", + trajectory=RunTrajectory(thread_id="thread-1", user_message="hello", final_response="done"), + ) + + judged = await judge.evaluate(scenario, result) + + assert judged.verdict == "passed" + assert judged.metadata == {"scenario_id": "scenario-1"} + + +@pytest.mark.asyncio +async def test_command_judge_raises_on_non_zero_exit(): + judge = build_judge( + JudgeConfig( + type="command", + config={"command": ["python", "-c", "import sys; sys.stderr.write('nope'); sys.exit(2)"]}, + ) + ) + scenario = EvalScenario(id="scenario-1", name="Scenario 1") + result = EvalResult( + scenario_id="scenario-1", + trajectory=RunTrajectory(thread_id="thread-1", user_message="hello", final_response="done"), + ) + + with pytest.raises(RuntimeError, match="Judge command failed"): + await judge.evaluate(scenario, result) diff --git a/tests/Unit/eval/test_scenario_loader.py 
b/tests/Unit/eval/test_scenario_loader.py new file mode 100644 index 000000000..d05f83a0c --- /dev/null +++ b/tests/Unit/eval/test_scenario_loader.py @@ -0,0 +1,75 @@ +from pathlib import Path + +import pytest + +from eval.harness import scenario as scenario_loader + + +def test_load_scenario_parses_benchmark_contract(tmp_path: Path) -> None: + yaml_path = tmp_path / "scenario.yaml" + yaml_path.write_text( + """ +id: benchmark-scenario +name: Benchmark Scenario +category: swe +sandbox: local +messages: + - content: "hello" +benchmark: + family: swe-bench + name: SWE-bench Verified + split: smoke + instance_id: astropy__astropy-12907 +workspace: + cwd: /workspace/project + repo: astropy/astropy + base_commit: abc123 +judge_config: + type: command + config: + command: ["python", "judge.py"] +artifacts: + requested_artifacts: ["patch", "test_log"] +export: + format: swe-bench-predictions + key: smoke-export +""".strip() + ) + + loaded = scenario_loader.load_scenario(yaml_path) + + assert loaded.benchmark is not None + assert loaded.benchmark.family == "swe-bench" + assert loaded.benchmark.instance_id == "astropy__astropy-12907" + assert loaded.workspace is not None + assert loaded.workspace.cwd == "/workspace/project" + assert loaded.judge_config is not None + assert loaded.judge_config.type == "command" + assert loaded.artifact_policy is not None + assert loaded.artifact_policy.requested_artifacts == ["patch", "test_log"] + assert loaded.export is not None + assert loaded.export.format == "swe-bench-predictions" + + +def test_load_scenarios_from_dirs_rejects_duplicate_ids(tmp_path: Path) -> None: + first = tmp_path / "one" + second = tmp_path / "two" + first.mkdir() + second.mkdir() + (first / "a.yaml").write_text("id: duplicated\nname: One\nmessages: []\n") + (second / "b.yaml").write_text("id: duplicated\nname: Two\nmessages: []\n") + + with pytest.raises(ValueError, match="Duplicate evaluation scenario id"): + scenario_loader.load_scenarios_from_dirs([first, 
second]) + + +def test_parse_scenario_dirs_uses_env_style_separator(tmp_path: Path) -> None: + first = tmp_path / "a" + second = tmp_path / "b" + + parsed = scenario_loader.parse_scenario_dirs( + f"{first}{scenario_loader.os.pathsep}{second}", + default_dirs=[], + ) + + assert parsed == [first, second] diff --git a/tests/Unit/eval/test_storage.py b/tests/Unit/eval/test_storage.py index 598e58138..1d0c28aff 100644 --- a/tests/Unit/eval/test_storage.py +++ b/tests/Unit/eval/test_storage.py @@ -5,6 +5,8 @@ class _FakeEvalRepo: def __init__(self) -> None: self.header_calls: list[dict] = [] self.finalize_calls: list[dict] = [] + self.metrics_calls: list[dict] = [] + self.metrics_rows: list[dict] = [] def upsert_run_header(self, **payload): self.header_calls.append(payload) @@ -12,6 +14,23 @@ def upsert_run_header(self, **payload): def finalize_run(self, **payload): self.finalize_calls.append(payload) + def save_metrics(self, **payload): + self.metrics_calls.append(payload) + self.metrics_rows.append( + { + "id": payload["run_id"], + "tier": payload["tier"], + "timestamp": payload["timestamp"], + "metrics_json": payload["metrics_json"], + } + ) + + def get_metrics(self, run_id: str, tier: str | None = None): + rows = [row for row in self.metrics_rows if row["id"] == run_id] + if tier is not None: + rows = [row for row in rows if row["tier"] == tier] + return rows + def test_trajectory_store_exposes_upsert_run_header() -> None: repo = _FakeEvalRepo() @@ -59,3 +78,37 @@ def test_trajectory_store_exposes_finalize_run() -> None: "trajectory_json": '{"id":"run-1"}', } ] + + +def test_trajectory_store_round_trips_artifacts_and_judge_result() -> None: + repo = _FakeEvalRepo() + store = TrajectoryStore(eval_repo=repo) + + store.save_artifacts( + "run-1", + [ + { + "name": "final-response", + "kind": "submission", + "content": "patch text", + "metadata": {"captured": True}, + } + ], + ) + store.save_judge_result( + "run-1", + { + "judge_type": "heuristic", + "status": 
"completed", + "verdict": "passed", + "scores": {"resolved": 1.0}, + }, + ) + + artifacts = store.get_artifacts("run-1") + judge_result = store.get_judge_result("run-1") + + assert artifacts[0].name == "final-response" + assert artifacts[0].content == "patch text" + assert judge_result is not None + assert judge_result.verdict == "passed" diff --git a/tests/Unit/eval/test_swe_verified_acceptance.py b/tests/Unit/eval/test_swe_verified_acceptance.py new file mode 100644 index 000000000..c40afa350 --- /dev/null +++ b/tests/Unit/eval/test_swe_verified_acceptance.py @@ -0,0 +1,71 @@ +import json +from pathlib import Path + +from fastapi.testclient import TestClient + +from eval.benchmarks.swe_verified.acceptance import create_acceptance_app, evaluate_smoke_judge_payload, simulate_jsonrpc_request +from eval.harness.scenario import load_scenarios_from_dirs + + +def test_swe_verified_smoke_yaml_scenarios_load_with_benchmark_contracts(): + scenarios = load_scenarios_from_dirs([Path("eval/benchmarks")]) + scenario_ids = {scenario.id for scenario in scenarios} + + assert {"swe_verified_pytest_7521", "swe_verified_pytest_7571", "swe_verified_pytest_7490"} <= scenario_ids + benchmark_scenarios = {scenario.id: scenario for scenario in scenarios if scenario.id.startswith("swe_verified_pytest_")} + assert benchmark_scenarios["swe_verified_pytest_7521"].benchmark.family == "SWE-bench Verified" + assert benchmark_scenarios["swe_verified_pytest_7521"].judge_config.type == "command" + assert benchmark_scenarios["swe_verified_pytest_7521"].export.format == "predictions_json" + + +def test_swe_verified_acceptance_judge_fails_without_patch_marker(): + judged = evaluate_smoke_judge_payload( + instance_id="pytest-dev__pytest-7571", + payload={ + "result": { + "final_response": "Unable to confirm the requested fix.", + "artifacts": [ + {"name": "final-response"}, + {"name": "benchmark-instance"}, + {"name": "workspace"}, + ], + } + }, + ) + + assert judged["verdict"] == "failed" + assert 
judged["scores"]["resolved"] == 0.0 + + +def test_swe_verified_acceptance_jsonrpc_matches_fixtures(): + for request_name, response_name in ( + ("judge_request.json", "judge_response.json"), + ("export_request.json", "export_response.json"), + ): + request_path = Path("eval/benchmarks/swe_verified/smoke/rpc") / request_name + response_path = Path("eval/benchmarks/swe_verified/smoke/rpc") / response_name + request = json.loads(request_path.read_text(encoding="utf-8")) + expected = json.loads(response_path.read_text(encoding="utf-8")) + + assert simulate_jsonrpc_request(request) == expected + + +def test_swe_verified_acceptance_app_can_create_benchmark_batch(): + app = create_acceptance_app() + + with TestClient(app) as client: + response = client.post( + "/api/monitor/evaluation/batches", + headers={"Authorization": "Bearer token-1"}, + json={ + "agent_user_id": "agent-1", + "scenario_ids": ["swe_verified_pytest_7521", "swe_verified_pytest_7571"], + "sandbox": "local", + "max_concurrent": 2, + }, + ) + + assert response.status_code == 200 + payload = response.json()["batch"] + assert payload["kind"] == "benchmark_batch" + assert payload["config_json"]["scenario_ids"] == ["swe_verified_pytest_7521", "swe_verified_pytest_7571"] diff --git a/tests/Unit/eval/test_swe_verified_assets.py b/tests/Unit/eval/test_swe_verified_assets.py new file mode 100644 index 000000000..7f1bc1c32 --- /dev/null +++ b/tests/Unit/eval/test_swe_verified_assets.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +import json + +from eval.benchmarks.swe_verified.assets import ( + clone_bundle, + load_smoke_asset_bundle, + resolve_repo_path, + validate_official_dataset_alignment, + validate_smoke_assets, +) + + +def _dataset_row(bundle, instance): + return { + "instance_id": instance.instance_id, + "repo": instance.repo, + "base_commit": instance.base_commit, + "environment_setup_commit": instance.environment_setup_commit, + "difficulty": instance.difficulty, + "problem_statement": 
instance.problem_statement, + "hints_text": instance.hints_text, + "FAIL_TO_PASS": json.dumps(instance.fail_to_pass), + "PASS_TO_PASS": json.dumps(instance.pass_to_pass), + "patch": resolve_repo_path(instance.official_patch_path).read_text(encoding="utf-8"), + "test_patch": resolve_repo_path(instance.official_test_patch_path).read_text(encoding="utf-8"), + } + + +def test_swe_verified_smoke_assets_validate_cleanly() -> None: + bundle = load_smoke_asset_bundle() + + assert validate_smoke_assets(bundle) == [] + + +def test_swe_verified_smoke_assets_detect_prediction_mismatch() -> None: + bundle = clone_bundle(load_smoke_asset_bundle()) + bundle.predictions[0]["instance_id"] = "wrong-instance" + + issues = validate_smoke_assets(bundle) + + assert any("sample predictions instance_ids do not match manifest order" in issue for issue in issues) + + +def test_swe_verified_smoke_assets_detect_missing_export_field() -> None: + bundle = clone_bundle(load_smoke_asset_bundle()) + bundle.export_golden["instances"][0].pop("judge_result") + + issues = validate_smoke_assets(bundle) + + assert any("judge_result" in issue for issue in issues) + + +def test_swe_verified_smoke_assets_detect_patch_hash_drift() -> None: + bundle = clone_bundle(load_smoke_asset_bundle()) + bundle.manifest.instances[0].official_patch_sha256 = "deadbeef" + + issues = validate_smoke_assets(bundle) + + assert any("patch fixture sha256 mismatch" in issue for issue in issues) + + +def test_swe_verified_smoke_assets_match_supplied_official_rows() -> None: + bundle = load_smoke_asset_bundle() + rows = [_dataset_row(bundle, instance) for instance in bundle.manifest.instances] + + assert validate_official_dataset_alignment(bundle, dataset_rows=rows) == [] + + +def test_swe_verified_smoke_assets_detect_official_dataset_drift() -> None: + bundle = load_smoke_asset_bundle() + rows = [_dataset_row(bundle, instance) for instance in bundle.manifest.instances] + rows[0]["base_commit"] = "0" * 40 + + issues = 
validate_official_dataset_alignment(bundle, dataset_rows=rows) + + assert any("base_commit does not match official dataset" in issue for issue in issues) diff --git a/tests/Unit/monitor/test_monitor_detail_contracts.py b/tests/Unit/monitor/test_monitor_detail_contracts.py index 217380827..a72e9fe9f 100644 --- a/tests/Unit/monitor/test_monitor_detail_contracts.py +++ b/tests/Unit/monitor/test_monitor_detail_contracts.py @@ -342,7 +342,7 @@ def test_monitor_evaluation_scenario_catalog_reads_yaml_scenarios(tmp_path, monk ] ) ) - monkeypatch.setattr(monitor_evaluation_execution_service, "EVAL_SCENARIO_DIR", scenario_dir) + monkeypatch.setenv("LEON_EVAL_SCENARIO_DIRS", str(scenario_dir)) payload = monitor_evaluation_service.get_monitor_evaluation_scenarios() @@ -355,6 +355,10 @@ def test_monitor_evaluation_scenario_catalog_reads_yaml_scenarios(tmp_path, monk "sandbox": "local", "message_count": 1, "timeout_seconds": 120, + "benchmark": None, + "workspace": None, + "judge_type": None, + "export_format": None, } ], "count": 1, @@ -363,6 +367,7 @@ def test_monitor_evaluation_scenario_catalog_reads_yaml_scenarios(tmp_path, monk def test_create_monitor_evaluation_batch_uses_batch_service(monkeypatch): calls = [] + scenario_refs = [{"scenario_id": "scenario-1", "sandbox": "local"}] class FakeBatchService: def create_batch(self, **kwargs): @@ -370,6 +375,11 @@ def create_batch(self, **kwargs): return {"batch_id": "batch-created", "status": "pending"} monkeypatch.setattr(monitor_evaluation_read_service, "make_eval_batch_service", lambda: FakeBatchService()) + monkeypatch.setattr( + monitor_evaluation_execution_service, + "select_monitor_eval_scenarios", + lambda scenario_ids, *, sandbox: scenario_refs, + ) payload = monitor_evaluation_service.create_monitor_evaluation_batch( submitted_by_user_id="owner-1", @@ -387,6 +397,7 @@ def create_batch(self, **kwargs): "scenario_ids": ["scenario-1"], "sandbox": "local", "max_concurrent": 1, + "scenario_refs": scenario_refs, } ] @@ -424,7 
+435,7 @@ def get_batch_detail(self, batch_id): def update_batch_status(self, batch_id, status): return {"batch_id": batch_id, "status": status} - monkeypatch.setattr(monitor_evaluation_execution_service, "EVAL_SCENARIO_DIR", scenario_dir) + monkeypatch.setenv("LEON_EVAL_SCENARIO_DIRS", str(scenario_dir)) monkeypatch.setattr(monitor_evaluation_read_service, "make_eval_batch_service", lambda: FakeBatchService()) class _Scheduler: diff --git a/tests/Unit/monitor/test_monitor_evaluation_scheduler_boundary.py b/tests/Unit/monitor/test_monitor_evaluation_scheduler_boundary.py index 275bc35a4..dccf50cf7 100644 --- a/tests/Unit/monitor/test_monitor_evaluation_scheduler_boundary.py +++ b/tests/Unit/monitor/test_monitor_evaluation_scheduler_boundary.py @@ -18,6 +18,7 @@ def get_batch_detail(self, _batch_id): "config_json": { "scenario_ids": ["scenario-1"], "sandbox": "local", + "max_concurrent": 3, }, }, "runs": [], @@ -49,4 +50,5 @@ def test_start_batch_submits_typed_spec_to_scheduler(monkeypatch): assert spec.batch_id == "batch-1" assert spec.execution_base_url == "http://api" assert spec.agent_user_id == "agent-1" + assert spec.max_concurrent == 3 assert spec.scenarios == [("scenario-stub", "scenario-1", "local")]