OpenDCAI · shaluoyan523 · Apr 21, 2026 · Apr 21, 2026 · Apr 21, 2026 · Apr 21, 2026
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1,6 @@
+# Keep smoke patch fixtures byte-stable across Windows checkouts.
+eval/benchmarks/swe_verified/smoke/fixtures/official_patches/*.patch text eol=lf
+
+# Scenario YAML files contain UTF-8 content and should not be rewritten to CRLF.
+eval/scenarios/*.yaml text eol=lf
+eval/benchmarks/**/*.yaml text eol=lf
diff --git a/backend/monitor/api/http/global_router.py b/backend/monitor/api/http/global_router.py
@@ -161,11 +161,43 @@ def evaluation_batch_detail_snapshot(batch_id: str):
     return _or_404(monitor_gateway.get_evaluation_batch_detail, batch_id)
 
 
+@router.get("/evaluation/batches/{batch_id}/aggregate")
+def evaluation_batch_aggregate_snapshot(batch_id: str):
+    return _or_404(monitor_gateway.get_evaluation_batch_aggregate, batch_id)
+
+
 @router.get("/evaluation/runs/{run_id}")
 def evaluation_run_detail_snapshot(run_id: str):
     return _or_404(monitor_gateway.get_evaluation_run_detail, run_id)
 
 
+@router.get("/evaluation/runs/{run_id}/artifacts")
+def evaluation_run_artifacts_snapshot(run_id: str):
+    return _or_404(monitor_gateway.get_evaluation_run_artifacts, run_id)
+
+
+@router.get("/evaluation/compare")
+def evaluation_compare_snapshot(
+    baseline_batch_id: str = Query(..., min_length=1),
+    candidate_batch_id: str = Query(..., min_length=1),
+):
+    try:
+        return monitor_gateway.compare_evaluation_batches(
+            baseline_batch_id=baseline_batch_id,
+            candidate_batch_id=candidate_batch_id,
+        )
+    except KeyError as exc:
+        raise HTTPException(status_code=404, detail=str(exc)) from exc
+
+
+@router.get("/evaluation/batches/{batch_id}/export")
+def evaluation_batch_export_snapshot(batch_id: str, format: str | None = Query(default=None)):
+    try:
+        return monitor_gateway.export_evaluation_batch(batch_id=batch_id, export_format=format)
+    except KeyError as exc:
+        raise HTTPException(status_code=404, detail=str(exc)) from exc
+
+
 @router.get("/resources")
 def resources_overview():
     return monitor_gateway.get_resource_overview()

diff --git a/backend/monitor/application/use_cases/evaluation.py b/backend/monitor/application/use_cases/evaluation.py
@@ -2,13 +2,22 @@
 
 from __future__ import annotations
 
+import logging
 from typing import Any
 
 from backend.monitor.infrastructure.evaluation import evaluation_execution_service, evaluation_read_service
 from backend.monitor.infrastructure.evaluation.evaluation_scheduler import EvaluationJobScheduler, EvaluationJobSpec
+from eval.exporter import build_batch_export
 
+logger = logging.getLogger(__name__)
 
-def _build_monitor_evaluation_run_fact_rows(metrics_rows: list[dict[str, Any]]) -> list[dict[str, str]]:
+
+def _build_monitor_evaluation_run_fact_rows(
+    metrics_rows: list[dict[str, Any]],
+    *,
+    judge_result: dict[str, Any] | None = None,
+    artifacts: list[dict[str, Any]] | None = None,
+) -> list[dict[str, str]]:
     metrics_by_tier = {str(row.get("tier") or "").strip().lower(): row.get("metrics") or {} for row in metrics_rows}
     system_metrics = metrics_by_tier.get("system") or {}
     objective_metrics = metrics_by_tier.get("objective") or {}
@@ -25,18 +34,41 @@ def _build_monitor_evaluation_run_fact_rows(metrics_rows: list[dict[str, Any]])
     if total_duration_ms is not None:
         duration_value = int(total_duration_ms) if float(total_duration_ms).is_integer() else total_duration_ms
         facts.append({"label": "Duration (ms)", "value": str(duration_value)})
+    if judge_result:
+        facts.append({"label": "Judge verdict", "value": str(judge_result.get("verdict") or "-")})
+        if judge_result.get("scores"):
+            first_score_key = sorted(dict(judge_result["scores"]).keys())[0]
+            facts.append(
+                {
+                    "label": f"Judge {first_score_key}",
+                    "value": str(dict(judge_result["scores"]).get(first_score_key)),
+                }
+            )
+    if artifacts is not None:
+        facts.append({"label": "Artifacts", "value": str(len(artifacts))})
     return facts
 
 
-def _build_monitor_evaluation_run_row(run: dict[str, Any], metrics_rows: list[dict[str, Any]]) -> dict[str, Any]:
+def _build_monitor_evaluation_run_row(
+    run: dict[str, Any],
+    metrics_rows: list[dict[str, Any]],
+    *,
+    judge_result: dict[str, Any] | None = None,
+    artifacts: list[dict[str, Any]] | None = None,
+    benchmark: dict[str, Any] | None = None,
+) -> dict[str, Any]:
     return {
         "run_id": str(run.get("id") or "") or None,
         "thread_id": str(run.get("thread_id") or "") or None,
         "status": str(run.get("status") or "") or None,
         "started_at": str(run.get("started_at") or "") or None,
         "finished_at": str(run.get("finished_at") or "") or None,
         "user_message": str(run.get("user_message") or "") or None,
-        "facts": _build_monitor_evaluation_run_fact_rows(metrics_rows),
+        "final_response": str(run.get("final_response") or "") or None,
+        "facts": _build_monitor_evaluation_run_fact_rows(metrics_rows, judge_result=judge_result, artifacts=artifacts),
+        "judge_result": judge_result,
+        "artifact_count": len(artifacts or []),
+        "benchmark": benchmark,
     }
 
 
@@ -70,7 +102,16 @@ def get_monitor_evaluation_workbench() -> dict[str, Any]:
             completed_runs += 1
         elif status in {"error", "failed", "cancelled"}:
             failed_runs += 1
-        run_rows.append(_build_monitor_evaluation_run_row(run, store.get_metrics(str(run.get("id") or ""))))
+        run_id = str(run.get("id") or "")
+        run_rows.append(
+            _build_monitor_evaluation_run_row(
+                run,
+                store.get_metrics(run_id),
+                judge_result=_dump_model(store.get_judge_result(run_id)),
+                artifacts=[artifact.model_dump(mode="json") for artifact in store.get_artifacts(run_id)],
+                benchmark=_dump_model(store.get_benchmark_info(run_id)),
+            )
+        )
 
     return {
         "headline": "Evaluation Workbench",
@@ -92,9 +133,21 @@ def get_monitor_evaluation_run_detail(run_id: str) -> dict[str, Any]:
     run = store.get_run(run_id)
     if run is None:
         raise KeyError(f"Evaluation run not found: {run_id}")
-    run_row = _build_monitor_evaluation_run_row(run, store.get_metrics(run_id))
+    artifacts = [artifact.model_dump(mode="json") for artifact in store.get_artifacts(run_id)]
+    judge_result = _dump_model(store.get_judge_result(run_id))
+    benchmark = _dump_model(store.get_benchmark_info(run_id))
+    run_row = _build_monitor_evaluation_run_row(
+        run,
+        store.get_metrics(run_id),
+        judge_result=judge_result,
+        artifacts=artifacts,
+        benchmark=benchmark,
+    )
     detail = {"run": run_row, "facts": run_row["facts"], "limitations": []}
     detail["batch_run"] = evaluation_read_service.make_eval_batch_service().get_batch_run_for_eval_run(run_id)
+    detail["judge_result"] = judge_result
+    detail["artifacts"] = artifacts
+    detail["benchmark"] = benchmark
     return detail
 
 
@@ -115,6 +168,10 @@ def get_monitor_evaluation_scenarios() -> dict[str, Any]:
             "sandbox": scenario.sandbox,
             "message_count": len(scenario.messages),
             "timeout_seconds": scenario.timeout_seconds,
+            "benchmark": scenario.benchmark.model_dump(mode="json") if scenario.benchmark else None,
+            "workspace": scenario.workspace.model_dump(mode="json") if scenario.workspace else None,
+            "judge_type": scenario.judge_config.type if scenario.judge_config else None,
+            "export_format": scenario.export.format if scenario.export else None,
         }
         for scenario in evaluation_execution_service.load_monitor_eval_scenarios()
     ]
@@ -129,12 +186,14 @@ def create_monitor_evaluation_batch(
     sandbox: str,
     max_concurrent: int,
 ) -> dict[str, Any]:
+    scenarios = evaluation_execution_service.select_monitor_eval_scenarios(list(scenario_ids), sandbox=sandbox)
     batch = evaluation_read_service.make_eval_batch_service().create_batch(
         submitted_by_user_id=submitted_by_user_id,
         agent_user_id=agent_user_id,
         scenario_ids=scenario_ids,
         sandbox=sandbox,
         max_concurrent=max_concurrent,
+        scenario_refs=scenarios,
     )
     return {"batch": batch}
 
@@ -152,6 +211,7 @@ def start_monitor_evaluation_batch(
     config = batch.get("config_json") or {}
     scenario_ids = config.get("scenario_ids")
     sandbox = config.get("sandbox")
+    max_concurrent = int(config.get("max_concurrent") or 1)
     agent_user_id = batch.get("agent_user_id")
     if not scenario_ids:
         raise ValueError("Evaluation batch is missing scenario_ids")
@@ -171,10 +231,86 @@ def start_monitor_evaluation_batch(
             execution_base_url=execution_base_url.rstrip("/"),
             token=token,
             agent_user_id=str(agent_user_id),
+            max_concurrent=max_concurrent,
         )
     )
     return {"accepted": True, "batch": updated}
 
 
 def get_monitor_evaluation_batch_detail(batch_id: str) -> dict[str, Any]:
-    return evaluation_read_service.make_eval_batch_service().get_batch_detail(batch_id)
+    batch_service = evaluation_read_service.make_eval_batch_service()
+    detail = batch_service.get_batch_detail(batch_id)
+    detail["aggregate"] = batch_service.get_batch_summary(batch_id)["summary"]
+    return detail
+
+
+def get_monitor_evaluation_batch_aggregate(batch_id: str) -> dict[str, Any]:
+    return evaluation_read_service.make_eval_batch_service().get_batch_summary(batch_id)
+
+
+def compare_monitor_evaluation_batches(*, baseline_batch_id: str, candidate_batch_id: str) -> dict[str, Any]:
+    return evaluation_read_service.make_eval_batch_service().compare_batches(baseline_batch_id, candidate_batch_id)
+
+
+def get_monitor_evaluation_run_artifacts(run_id: str) -> dict[str, Any]:
+    store = evaluation_read_service.make_trajectory_store()
+    if store.get_run(run_id) is None:
+        raise KeyError(f"Evaluation run not found: {run_id}")
+    return {
+        "run_id": run_id,
+        "artifacts": [artifact.model_dump(mode="json") for artifact in store.get_artifacts(run_id)],
+        "judge_result": _dump_model(store.get_judge_result(run_id)),
+        "benchmark": _dump_model(store.get_benchmark_info(run_id)),
+    }
+
+
+def export_monitor_evaluation_batch(batch_id: str, *, export_format: str | None = None) -> dict[str, Any]:
+    batch_service = evaluation_read_service.make_eval_batch_service()
+    store = evaluation_read_service.make_trajectory_store()
+    detail = batch_service.get_batch_detail(batch_id)
+    batch = detail["batch"]
+    aggregate = batch_service.get_batch_summary(batch_id)["summary"]
+    resolved_format = export_format or _resolve_batch_export_format(batch)
+    run_records: list[dict[str, Any]] = []
+    for batch_run in detail["runs"]:
+        run_id = str(batch_run.get("eval_run_id") or "")
+        if not run_id:
+            continue
+        run = store.get_run(run_id)
+        if run is None:
+            logger.warning("Skipping export for missing evaluation run %s in batch %s", run_id, batch_id)
+            continue
+        run_records.append(
+            {
+                "run_id": run_id,
+                "scenario_id": batch_run.get("scenario_id"),
+                "batch_run": batch_run,
+                "run": {
+                    "run_id": run_id,
+                    "thread_id": run.get("thread_id"),
+                    "status": run.get("status"),
+                    "final_response": run.get("final_response"),
+                },
+                "judge_result": _dump_model(store.get_judge_result(run_id)),
+                "artifacts": [artifact.model_dump(mode="json") for artifact in store.get_artifacts(run_id)],
+                "benchmark": _dump_model(store.get_benchmark_info(run_id)),
+            }
+        )
+    return build_batch_export(batch=batch, aggregate=aggregate, run_records=run_records, export_format=resolved_format)
+
+
+def _resolve_batch_export_format(batch: dict[str, Any]) -> str:
+    config = batch.get("config_json") or {}
+    scenario_refs = list(config.get("scenario_refs") or [])
+    for scenario_ref in scenario_refs:
+        export_config = dict(scenario_ref.get("export") or {})
+        export_format = str(export_config.get("format") or "").strip()
+        if export_format:
+            return export_format
+    return "generic_json"
+
+
+def _dump_model(value: Any) -> dict[str, Any] | None:
+    if value is None:
+        return None
+    return value.model_dump(mode="json") if hasattr(value, "model_dump") else dict(value)
diff --git a/backend/monitor/infrastructure/evaluation/background_task_scheduler.py b/backend/monitor/infrastructure/evaluation/background_task_scheduler.py
@@ -19,5 +19,6 @@ def submit(self, spec: EvaluationJobSpec) -> None:
             execution_base_url=spec.execution_base_url,
             token=spec.token,
             agent_user_id=spec.agent_user_id,
+            max_concurrent=spec.max_concurrent,
             batch_service=make_eval_batch_service(),
         )
diff --git a/backend/monitor/infrastructure/evaluation/evaluation_execution_service.py b/backend/monitor/infrastructure/evaluation/evaluation_execution_service.py
@@ -2,21 +2,32 @@
 
 from __future__ import annotations
 
+import logging
+import os
 from pathlib import Path
 
 from backend.monitor.infrastructure.evaluation import evaluation_storage_service
 from eval.batch_executor import EvaluationBatchExecutor
 from eval.batch_service import EvaluationBatchService
 from eval.harness.client import EvalClient
 from eval.harness.runner import EvalRunner
-from eval.harness.scenario import load_scenarios_from_dir
+from eval.harness.scenario import load_scenarios_from_dirs, parse_scenario_dirs
 from eval.models import EvalScenario
 
-EVAL_SCENARIO_DIR = Path(__file__).resolve().parents[4] / "eval" / "scenarios"
+logger = logging.getLogger(__name__)
+
+_EVAL_ROOT = Path(__file__).resolve().parents[4] / "eval"
+EVAL_SCENARIO_DIRS = [_EVAL_ROOT / "scenarios", _EVAL_ROOT / "benchmarks"]
+
+
+def resolve_monitor_eval_scenario_dirs() -> list[Path]:
+    return parse_scenario_dirs(os.getenv("LEON_EVAL_SCENARIO_DIRS"), default_dirs=EVAL_SCENARIO_DIRS)
 
 
 def load_monitor_eval_scenarios() -> list[EvalScenario]:
-    return load_scenarios_from_dir(EVAL_SCENARIO_DIR)
+    scenario_dirs = resolve_monitor_eval_scenario_dirs()
+    logger.info("Loading monitor evaluation scenarios from %s", ", ".join(str(path) for path in scenario_dirs))
+    return load_scenarios_from_dirs(scenario_dirs)
 
 
 def select_monitor_eval_scenarios(scenario_ids: list[str], *, sandbox: str) -> list[EvalScenario]:
@@ -34,6 +45,7 @@ async def run_monitor_evaluation_batch(
     execution_base_url: str,
     token: str,
     agent_user_id: str,
+    max_concurrent: int,
     batch_service: EvaluationBatchService,
 ) -> None:
     client = EvalClient(base_url=execution_base_url, token=token)
@@ -44,6 +56,6 @@ async def run_monitor_evaluation_batch(
             store=evaluation_storage_service.make_trajectory_store(),
         )
         executor = EvaluationBatchExecutor(runner=runner, batch_service=batch_service)
-        await executor.run_batch(batch_id, scenarios)
+        await executor.run_batch(batch_id, scenarios, max_concurrent=max_concurrent)
     finally:
         await client.close()
diff --git a/backend/monitor/infrastructure/evaluation/evaluation_scheduler.py b/backend/monitor/infrastructure/evaluation/evaluation_scheduler.py
@@ -15,6 +15,7 @@ class EvaluationJobSpec:
     execution_base_url: str
     token: str
     agent_user_id: str
+    max_concurrent: int = 1
 
 
 class EvaluationJobScheduler(Protocol):

diff --git a/backend/monitor/infrastructure/web/gateway.py b/backend/monitor/infrastructure/web/gateway.py
@@ -138,6 +138,25 @@ def get_evaluation_run_detail(run_id: str) -> dict[str, Any]:
     return monitor_evaluation.get_monitor_evaluation_run_detail(run_id)
 
 
+def get_evaluation_batch_aggregate(batch_id: str) -> dict[str, Any]:
+    return monitor_evaluation.get_monitor_evaluation_batch_aggregate(batch_id)
+
+
+def compare_evaluation_batches(*, baseline_batch_id: str, candidate_batch_id: str) -> dict[str, Any]:
+    return monitor_evaluation.compare_monitor_evaluation_batches(
+        baseline_batch_id=baseline_batch_id,
+        candidate_batch_id=candidate_batch_id,
+    )
+
+
+def get_evaluation_run_artifacts(run_id: str) -> dict[str, Any]:
+    return monitor_evaluation.get_monitor_evaluation_run_artifacts(run_id)
+
+
+def export_evaluation_batch(*, batch_id: str, export_format: str | None = None) -> dict[str, Any]:
+    return monitor_evaluation.export_monitor_evaluation_batch(batch_id, export_format=export_format)
+
+
 def get_resource_overview() -> dict[str, Any]:
     return monitor_resources.get_monitor_resource_overview()