diff --git a/soar-eval/agents/bw-op-subgoal-base.soar b/soar-eval/agents/bw-op-subgoal-base.soar new file mode 100644 index 0000000000..d47234401b --- /dev/null +++ b/soar-eval/agents/bw-op-subgoal-base.soar @@ -0,0 +1,21 @@ +# BW-Op-Subgoal agent WITHOUT the hardcoded initialization. +# The eval harness sources a generated task initialization separately. +# +# Path is relative to the repo root. Run from the repo root or set +# the working directory before sourcing this file. + +pushd "UnitTests/SoarTestAgents/Chunking/tests/BW-Op-Subgoal/BW-Op-Subgoal" +source _firstload.soar +pushd all +source all_source.soar +popd +pushd elaborations +source elaborations_source.soar +popd +source _readme.soar +# NOTE: initialize-blocks-world.soar is NOT sourced — task generator provides it +source move-block.soar +pushd move-block +source move-block_source.soar +popd +popd diff --git a/soar-eval/module_eval.py b/soar-eval/module_eval.py new file mode 100644 index 0000000000..b7401f796e --- /dev/null +++ b/soar-eval/module_eval.py @@ -0,0 +1,467 @@ +#!/usr/bin/env python3 +"""Per-module eval harness for Soar. + +Wraps existing test agents, captures quantitative stats per test, +dumps to JSON, and diffs across builds. + +Usage: + python module_eval.py run --soar BUILD/SoarCLI/soar --suite ChunkingTests --out results.json + python module_eval.py compare --base upstream.json --candidate pr.json +""" + +import argparse +import json +import os +import re +import subprocess +import sys +from pathlib import Path + +SCRIPT_DIR = Path(__file__).resolve().parent +REPO_ROOT = SCRIPT_DIR.parent + +SOAR_CLI_DEFAULT = REPO_ROOT / "build" / "SoarCLI" / "soar" +TEST_AGENTS_DIR = REPO_ROOT / "UnitTests" / "SoarTestAgents" + +# Suite definitions: directory glob patterns for test agents +SUITES = { + "ChunkingTests": { + "glob": "Chunking/tests/ChunkingDemoTests_*.soar", + "description": "Chunking demo agents (BW, arithmetic, etc.)", + }, + "FunctionalTests": { + "glob": "FunctionalTests_*.soar", + "description": "Core functional test agents", + }, + "SMemFunctionalTests": { + "glob": "smem/SMemFunctionalTests_*.soar", + "description": "Semantic memory functional tests", + }, + "EpMemFunctionalTests": { + "glob": "epmem/EpMem*.soar", + "description": "Episodic memory functional tests", + }, + "PerformanceTests": { + "glob": "*.soar", + "root": REPO_ROOT / "PerformanceTests" / "TestAgents", + "description": "Performance benchmark agents", + }, +} + +# Metrics we always capture (deterministic, comparable across builds) +DETERMINISTIC_METRICS = [ + "decisions", "elaboration_cycles", "production_firings", + "wm_current", "wm_mean", "wm_max", + "productions_user", "productions_chunks", +] + +# Metrics that vary (need median of N runs) +TIMING_METRICS = ["kernel_cpu_sec", "total_cpu_sec"] + + +def parse_stats(output): + """Parse Soar stats output into a dict of metrics.""" + stats = {"status": "error", "halted": False} + + if "halted" in output.lower(): + stats["halted"] = True + stats["status"] = "success" + elif "Run stopped" not in output and "interrupt" not in output.lower(): + stats["status"] = "timeout" + + m = re.search(r"(\d+) decisions?", output) + if m: + stats["decisions"] = int(m.group(1)) + stats["status"] = "success" + + m = re.search(r"(\d+) elaboration cycles?", output) + if m: + stats["elaboration_cycles"] = int(m.group(1)) + + m = re.search(r"(\d+) production firings?", output) + if m: + stats["production_firings"] = int(m.group(1)) + + m = re.search(r"WM size:\s*(\d+) current,\s*([\d.]+) mean,\s*(\d+) 
maximum", output) + if m: + stats["wm_current"] = int(m.group(1)) + stats["wm_mean"] = float(m.group(2)) + stats["wm_max"] = int(m.group(3)) + + m = re.search(r"(\d+) productions? \((\d+) default, (\d+) user, (\d+) chunks?\)", output) + if m: + stats["productions_total"] = int(m.group(1)) + stats["productions_default"] = int(m.group(2)) + stats["productions_user"] = int(m.group(3)) + stats["productions_chunks"] = int(m.group(4)) + + m = re.search(r"Kernel CPU Time:\s*([\d.]+) sec", output) + if m: + stats["kernel_cpu_sec"] = float(m.group(1)) + + m = re.search(r"Total\s+CPU Time:\s*([\d.]+) sec", output) + if m: + stats["total_cpu_sec"] = float(m.group(1)) + + return stats + + +def run_test(soar_cli, agent_path, max_decisions=10000): + """Run a single test agent and return parsed stats.""" + try: + proc = subprocess.run( + [str(soar_cli), "-s", str(agent_path), f"run {max_decisions}; stats"], + capture_output=True, text=True, timeout=30, cwd=str(REPO_ROOT) + ) + output = proc.stdout + proc.stderr + stats = parse_stats(output) + stats["agent"] = agent_path.name + stats["exit_code"] = proc.returncode + return stats + except subprocess.TimeoutExpired: + return {"agent": agent_path.name, "status": "timeout", "exit_code": -1} + except Exception as e: + return {"agent": agent_path.name, "status": "error", "error": str(e)} + + +def discover_tests(suite_name): + """Find test agent files for a suite.""" + if suite_name not in SUITES: + print(f"Unknown suite: {suite_name}. Available: {list(SUITES.keys())}") + sys.exit(1) + + suite = SUITES[suite_name] + pattern = suite["glob"] + root = suite.get("root", TEST_AGENTS_DIR) + agents = sorted(root.glob(pattern)) + + if not agents: + agents = sorted(root.glob(f"**/{pattern}")) + + return agents + + +def run_suite(soar_cli, suite_name, max_decisions=10000): + """Run all tests in a suite, return list of stats dicts.""" + agents = discover_tests(suite_name) + print(f" Found {len(agents)} agents in {suite_name}", file=sys.stderr) + + results = [] + for agent in agents: + stats = run_test(soar_cli, agent, max_decisions) + results.append(stats) + status_char = "." if stats["status"] == "success" else "X" + print(status_char, end="", flush=True, file=sys.stderr) + print(file=sys.stderr) + + return results + + +## --------------------------------------------------------------------------- +## VISIBILITY: raw diff — what changed, by how much, no judgment +## --------------------------------------------------------------------------- + +def diff_results(base, candidate): + """Produce a raw diff of two result sets. Facts only, no judgment. + + Returns a list of per-agent diffs. Each metric reports base value, + candidate value, and delta. No classification, no direction, no + pass/fail. The consumer decides what matters. 
+ """ + base_by_agent = {r["agent"]: r for r in base} + cand_by_agent = {r["agent"]: r for r in candidate} + all_agents = sorted(set(base_by_agent.keys()) | set(cand_by_agent.keys())) + + diffs = [] + for agent in all_agents: + b = base_by_agent.get(agent) + c = cand_by_agent.get(agent) + + entry = {"agent": agent} + + if not b: + entry["presence"] = "new_in_candidate" + if c: + entry["candidate"] = {m: c[m] for m in DETERMINISTIC_METRICS if m in c} + diffs.append(entry) + continue + if not c: + entry["presence"] = "missing_in_candidate" + diffs.append(entry) + continue + + entry["presence"] = "both" + entry["base_status"] = b.get("status") + entry["candidate_status"] = c.get("status") + entry["metrics"] = {} + + for metric in DETERMINISTIC_METRICS + TIMING_METRICS: + bv = b.get(metric) + cv = c.get(metric) + if bv is None and cv is None: + continue + m = {"base": bv, "candidate": cv} + if bv is not None and cv is not None: + m["delta"] = cv - bv + if bv != 0: + m["pct_change"] = round((cv - bv) / abs(bv) * 100, 1) + entry["metrics"][metric] = m + + diffs.append(entry) + + return {"agents": diffs} + + +def print_diff(diff): + """Print raw diff as a readable table. No judgment words.""" + changed_agents = [] + for entry in diff["agents"]: + if entry.get("presence") == "new_in_candidate": + changed_agents.append((entry["agent"], "NEW", {})) + continue + if entry.get("presence") == "missing_in_candidate": + changed_agents.append((entry["agent"], "MISSING", {})) + continue + + status_changed = entry["base_status"] != entry["candidate_status"] + metric_changes = {} + for m, info in entry.get("metrics", {}).items(): + if info.get("delta", 0) != 0: + metric_changes[m] = info + + if status_changed or metric_changes: + status = f"{entry['base_status']} -> {entry['candidate_status']}" if status_changed else "" + changed_agents.append((entry["agent"], status, metric_changes)) + + if not changed_agents: + print("\nNo differences detected.") + return + + print(f"\n{'Agent':<60} {'Metric':<25} {'Base':>10} {'Candidate':>10} {'Delta':>10} {'%':>8}") + print("-" * 125) + for agent, status, metrics in changed_agents: + if status == "NEW": + print(f"{agent:<60} {'(new agent)':<25}") + continue + if status == "MISSING": + print(f"{agent:<60} {'(missing)':<25}") + continue + first_line = True + if status: + print(f"{agent:<60} {'status':<25} {status}") + first_line = False + for m, info in sorted(metrics.items()): + bv = info.get("base", "—") + cv = info.get("candidate", "—") + delta = info.get("delta", "") + pct = info.get("pct_change", "") + delta_str = f"{delta:+}" if isinstance(delta, (int, float)) else "" + pct_str = f"{pct:+.1f}%" if isinstance(pct, (int, float)) else "" + label = agent if first_line else "" + print(f"{label:<60} {m:<25} {str(bv):>10} {str(cv):>10} {delta_str:>10} {pct_str:>8}") + first_line = False + + +## --------------------------------------------------------------------------- +## DECISION: policy layer — maintainer configures what counts as pass/fail +## --------------------------------------------------------------------------- + +# Default policy: lower is better for resource metrics, chunk count is neutral. +# Maintainers can override by providing a policy JSON file. +DEFAULT_POLICY = { + "lower_is_better": [ + "decisions", "elaboration_cycles", "production_firings", + "wm_max", "kernel_cpu_sec", "total_cpu_sec", + ], + "neutral": [ + "productions_chunks", "productions_user", + "wm_current", "wm_mean", + ], + # Metrics not listed are ignored for pass/fail. 
+ # Status regressions (success -> timeout/error) always count. +} + + +def apply_policy(diff, policy=None): + """Apply a policy to a raw diff. Returns judgment summary. + + This is the ONLY place that says 'regression' or 'improvement'. + Separated from visibility so the maintainer controls the criteria. + """ + if policy is None: + policy = DEFAULT_POLICY + + lower_is_better = set(policy.get("lower_is_better", [])) + neutral = set(policy.get("neutral", [])) + timing_floor = policy.get("timing_noise_floor", 0) + timing_rel = policy.get("timing_relative_threshold", 0) + timing_metrics = {"kernel_cpu_sec", "total_cpu_sec"} + + regressions = [] + improvements = [] + neutral_changes = [] + + for entry in diff["agents"]: + agent = entry["agent"] + + if entry.get("presence") == "missing_in_candidate": + regressions.append({"agent": agent, "reason": "agent missing in candidate"}) + continue + + # Status regression + bs = entry.get("base_status") + cs = entry.get("candidate_status") + if bs == "success" and cs != "success": + regressions.append({"agent": agent, "reason": f"status {bs} -> {cs}"}) + + for metric, info in entry.get("metrics", {}).items(): + delta = info.get("delta", 0) + if delta == 0: + continue + + # Apply timing noise thresholds + if metric in timing_metrics: + abs_delta = abs(delta) + pct = abs(info.get("pct_change", 0)) + if abs_delta < timing_floor and pct < timing_rel: + continue # below both thresholds, treat as noise + + if metric in neutral: + neutral_changes.append({ + "agent": agent, "metric": metric, + "base": info["base"], "candidate": info["candidate"], + }) + elif metric in lower_is_better: + if delta > 0: + regressions.append({ + "agent": agent, "metric": metric, + "base": info["base"], "candidate": info["candidate"], + }) + else: + improvements.append({ + "agent": agent, "metric": metric, + "base": info["base"], "candidate": info["candidate"], + }) + + return { + "regressions": regressions, + "improvements": improvements, + "neutral_changes": neutral_changes, + "pareto_pass": len(regressions) == 0 and len(improvements) > 0, + } + + +def print_judgment(judgment): + """Print policy-based judgment. 
Clearly labeled as policy output.""" + print("\n" + "=" * 60) + print("POLICY JUDGMENT (maintainer-configured, not harness opinion)") + print("=" * 60) + + if judgment["regressions"]: + print(f"\nRegressions ({len(judgment['regressions'])}):") + for r in judgment["regressions"]: + if "metric" in r: + print(f" {r['agent']}: {r['metric']} {r['base']} -> {r['candidate']}") + else: + print(f" {r['agent']}: {r['reason']}") + + if judgment["improvements"]: + print(f"\nImprovements ({len(judgment['improvements'])}):") + for r in judgment["improvements"]: + print(f" {r['agent']}: {r['metric']} {r['base']} -> {r['candidate']}") + + if judgment["neutral_changes"]: + print(f"\nNeutral changes ({len(judgment['neutral_changes'])}):") + for r in judgment["neutral_changes"]: + print(f" {r['agent']}: {r['metric']} {r['base']} -> {r['candidate']}") + + print(f"\nPareto check: {'PASS' if judgment['pareto_pass'] else 'FAIL'}") + return 0 if not judgment["regressions"] else 1 + + +def cmd_run(args): + soar_cli = Path(args.soar).expanduser().resolve() + suites = args.suite if args.suite else list(SUITES.keys()) + + all_results = {} + for suite in suites: + print(f"Running {suite}...", file=sys.stderr) + results = run_suite(soar_cli, suite) + all_results[suite] = results + + output = { + "soar_binary": str(soar_cli), + "suites": all_results, + } + + out_path = Path(args.out) + out_path.parent.mkdir(parents=True, exist_ok=True) + with open(out_path, "w") as f: + json.dump(output, f, indent=2) + print(f"Results written to {out_path}", file=sys.stderr) + + +def cmd_compare(args): + with open(args.base) as f: + base = json.load(f) + with open(args.candidate) as f: + candidate = json.load(f) + + policy = DEFAULT_POLICY + if args.policy: + with open(args.policy) as f: + policy = json.load(f) + + exit_code = 0 + for suite in sorted(set(list(base["suites"].keys()) + list(candidate["suites"].keys()))): + b = base["suites"].get(suite, []) + c = candidate["suites"].get(suite, []) + if not b or not c: + print(f"\n{suite}: missing from one side, skipping") + continue + + print(f"\n{'='*60}") + print(f"Suite: {suite}") + print(f"{'='*60}") + + # Layer 1: VISIBILITY — raw facts + diff = diff_results(b, c) + print_diff(diff) + + # Layer 2: DECISION — policy judgment (optional) + if not args.facts_only: + judgment = apply_policy(diff, policy) + code = print_judgment(judgment) + if code != 0: + exit_code = 1 + + sys.exit(exit_code) + + +def main(): + parser = argparse.ArgumentParser(description="Soar per-module eval harness") + sub = parser.add_subparsers(dest="command") + + run_p = sub.add_parser("run", help="Run test suites and capture stats") + run_p.add_argument("--soar", default=str(SOAR_CLI_DEFAULT), help="Path to soar CLI") + run_p.add_argument("--suite", nargs="*", help="Suite(s) to run (default: all)") + run_p.add_argument("--out", default="results/module_eval.json", help="Output JSON") + + cmp_p = sub.add_parser("compare", help="Compare two result sets") + cmp_p.add_argument("--base", required=True, help="Baseline JSON") + cmp_p.add_argument("--candidate", required=True, help="Candidate JSON") + cmp_p.add_argument("--policy", default=None, help="Policy JSON (default: built-in)") + cmp_p.add_argument("--facts-only", action="store_true", + help="Show raw diff only, no policy judgment") + + args = parser.parse_args() + if args.command == "run": + cmd_run(args) + elif args.command == "compare": + cmd_compare(args) + else: + parser.print_help() + + +if __name__ == "__main__": + main() diff --git a/soar-eval/policy.json 
b/soar-eval/policy.json
new file mode 100644
index 0000000000..43ebbde371
--- /dev/null
+++ b/soar-eval/policy.json
@@ -0,0 +1,18 @@
+{
+    "lower_is_better": [
+        "decisions",
+        "elaboration_cycles",
+        "production_firings",
+        "wm_max",
+        "kernel_cpu_sec",
+        "total_cpu_sec"
+    ],
+    "neutral": [
+        "productions_chunks",
+        "productions_user",
+        "wm_current",
+        "wm_mean"
+    ],
+    "timing_noise_floor": 0.005,
+    "timing_relative_threshold": 1.0
+}
diff --git a/soar-eval/policy_README.md b/soar-eval/policy_README.md
new file mode 100644
index 0000000000..6a66ffc0a9
--- /dev/null
+++ b/soar-eval/policy_README.md
@@ -0,0 +1,39 @@
+# Soar Eval Policy
+
+`policy.json` controls Layer 2 of the eval harness: the judgment layer that decides whether raw metric differences count as improvements, regressions, or noise.
+
+Layer 1 reports facts only: baseline value, candidate value, delta, and percent change. This policy file defines how Layer 2 interprets those facts.
+
+## Fields
+
+### `lower_is_better`
+
+Metrics where a lower candidate value is an improvement and a higher value is a regression.
+
+- `decisions`, `elaboration_cycles`, `production_firings`, `wm_max` — resource-use metrics
+- `kernel_cpu_sec`, `total_cpu_sec` — timing metrics (subject to the noise thresholds below)
+
+### `neutral`
+
+Metrics reported but not judged directionally. A change is flagged as "changed" rather than "improved" or "regressed."
+
+- `productions_chunks`, `productions_user` — chunk count direction depends on context
+- `wm_current`, `wm_mean` — informational
+
+### `timing_noise_floor`
+
+Absolute timing delta (seconds) below which a change can be dismissed as noise. Default: `0.005` (5ms).
+
+### `timing_relative_threshold`
+
+Relative timing delta (percent) below which a change can be dismissed as noise. Default: `1.0` (1%).
+
+A timing change is treated as noise only when it falls below **both** thresholds; exceeding either one flags it. This biases toward review rather than automatic pass.
+
+## Customization
+
+Stricter (flag more): lower the thresholds.
+Less sensitive (flag less): raise the thresholds.
+Change a metric's direction: move its name between `lower_is_better` and `neutral`.
+
+The maintainer owns this file. Contributors see the raw numbers (Layer 1). The maintainer decides what matters (Layer 2).
diff --git a/soar-eval/tasks/__init__.py b/soar-eval/tasks/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/soar-eval/tasks/blocks_world.py b/soar-eval/tasks/blocks_world.py
new file mode 100644
index 0000000000..6b9da65b20
--- /dev/null
+++ b/soar-eval/tasks/blocks_world.py
@@ -0,0 +1,185 @@
+"""Generate random 3-block Blocks World tasks as Soar production rules.
+
+A state is a set of ontop relations over {A, B, C, table}.
+For 3 blocks there are 13 distinct legal states.
+"""
+
+import random
+from itertools import permutations
+
+BLOCKS = ["A", "B", "C"]
+TABLE = "table"
+
+
+def enumerate_states():
+    """Return all legal 3-block states as frozensets of (top, bottom) pairs.
+
+    Each block is on exactly one support: the table or another block.
+    At most one block may rest directly on any given block (any number
+    may sit on the table), and placements may not form a cycle.
+ """ + states = [] + # Each block is on something: another block or table + # Constraint: no two blocks on the same block (but multiple on table is fine) + options = BLOCKS + [TABLE] + for a_on in options: + if a_on == "A": + continue + for b_on in options: + if b_on == "B": + continue + for c_on in options: + if c_on == "C": + continue + # Check: no two blocks on the same non-table thing + supports = [a_on, b_on, c_on] + block_supports = [s for s in supports if s != TABLE] + if len(block_supports) != len(set(block_supports)): + continue + # Check: no cycles (A on B on A) + placement = {"A": a_on, "B": b_on, "C": c_on} + if has_cycle(placement): + continue + state = frozenset([("A", a_on), ("B", b_on), ("C", c_on)]) + states.append(state) + return states + + +def has_cycle(placement): + """Check if a placement dict has a cycle among blocks.""" + for start in BLOCKS: + visited = set() + current = start + while current in placement and current != TABLE: + if current in visited: + return True + visited.add(current) + current = placement[current] + return False + + +ALL_STATES = enumerate_states() + + +def generate_task_pair(rng): + """Return (initial_state, goal_state) as two different states.""" + pair = rng.sample(ALL_STATES, 2) + return pair[0], pair[1] + + +def generate_tasks(n_train, n_transfer, seed): + """Generate training and transfer task pairs.""" + rng = random.Random(seed) + train = [generate_task_pair(rng) for _ in range(n_train)] + transfer = [generate_task_pair(rng) for _ in range(n_transfer)] + return train, transfer + + +def state_to_soar(state, var_prefix="s"): + """Convert a state (frozenset of (top, bottom)) to Soar WME creation actions.""" + lines = [] + ontop_vars = [] + block_vars = {} + + # Create block variables + for block in BLOCKS: + var = f"" + block_vars[block] = var + lines.append(f' ({var} ^name {block} ^type block)') + block_vars[TABLE] = "" + lines.append(f' (
^name table ^type table)') + + # Create ontop relations + for i, (top, bottom) in enumerate(sorted(state)): + var = f"" + ontop_vars.append(var) + lines.append(f' ({var} ^top-block {block_vars[top]} ^bottom-block {block_vars[bottom]})') + + return ontop_vars, block_vars, lines + + +def task_to_soar_file(initial, goal, task_name): + """Generate a Soar production that sets up a specific BW task. + + Returns the text of a .soar file with a single initialization production. + """ + block_vars = {b: f"" for b in BLOCKS} + block_vars[TABLE] = "
" + + # Block/table definitions (shared between initial and goal) + block_defs = [] + for b in BLOCKS: + block_defs.append(f' ({block_vars[b]} ^name {b} ^type block)') + block_defs.append(f' (
^name table ^type table)') + + # Initial ontop relations + init_ontop_vars = [] + init_ontop_lines = [] + for i, (top, bottom) in enumerate(sorted(initial)): + var = f"" + init_ontop_vars.append(var) + init_ontop_lines.append(f' ({var} ^top-block {block_vars[top]} ^bottom-block {block_vars[bottom]})') + + # Goal ontop relations + goal_ontop_vars = [] + goal_ontop_lines = [] + for i, (top, bottom) in enumerate(sorted(goal)): + var = f"" + goal_ontop_vars.append(var) + goal_ontop_lines.append(f' ({var} ^top-block {block_vars[top]} ^bottom-block {block_vars[bottom]})') + + ontop_refs = " ".join(init_ontop_vars) + object_refs = " ".join(block_vars[b] for b in BLOCKS) + "
" + dontop_refs = " ".join(goal_ontop_vars) + + # Use a fixed name so the harness can excise it between tasks + propose_name = "eval*propose*initialize-blocks-world" + apply_name = "eval*apply*initialize-blocks-world" + + prod = f"""sp {{{propose_name} + (state ^superstate nil + -^name) +--> + ( ^operator +) + ( ^name initialize-blocks-world) +}} + +sp {{{apply_name} + (state ^operator.name initialize-blocks-world) +--> + ( ^name blocks-world + ^ontop {ontop_refs} + ^object {object_refs} + ^desired ) +{chr(10).join(block_defs)} +{chr(10).join(init_ontop_lines)} + ( ^ontop {dontop_refs}) +{chr(10).join(goal_ontop_lines)} +}} +""" + return prod + + +def write_task_file(filepath, initial, goal, task_name): + """Write a single task's initialization production to a .soar file.""" + content = task_to_soar_file(initial, goal, task_name) + with open(filepath, "w") as f: + f.write(content) + return filepath + + +if __name__ == "__main__": + # Verify state enumeration + states = enumerate_states() + print(f"Found {len(states)} legal 3-block states") + for s in states: + print(f" {sorted(s)}") + + # Generate a sample task + rng = random.Random(42) + init, goal = generate_task_pair(rng) + print(f"\nSample task:") + print(f" Initial: {sorted(init)}") + print(f" Goal: {sorted(goal)}") + print(f"\nSoar production:") + print(task_to_soar_file(init, goal, "eval*task*seed42*train01*initialize")) diff --git a/soar-eval/transfer_eval.py b/soar-eval/transfer_eval.py new file mode 100644 index 0000000000..37ff518ca3 --- /dev/null +++ b/soar-eval/transfer_eval.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python3 +"""Soar transfer eval harness. + +Runs a chunking agent on generated Blocks World tasks and measures +whether learning on training tasks speeds up transfer tasks. + +Usage: + python eval.py --seeds 5 --train 10 --transfer 10 +""" + +import argparse +import json +import os +import re +import subprocess +import sys +import tempfile +from pathlib import Path + +from tasks.blocks_world import generate_tasks, write_task_file + +SCRIPT_DIR = Path(__file__).resolve().parent +REPO_ROOT = SCRIPT_DIR.parent + +SOAR_CLI = REPO_ROOT / "build" / "SoarCLI" / "soar" +BASE_AGENT = SCRIPT_DIR / "agents" / "bw-op-subgoal-base.soar" +RESULTS_DIR = SCRIPT_DIR / "results" +MAX_DECISIONS = 500 + + +def parse_run_line(output): + """Extract per-run metrics from '--> N decision cycles' line.""" + decisions = 0 + chunks_learned = 0 + + m = re.search(r"--> (\d+) decision cycles? executed", output) + if m: + decisions = int(m.group(1)) + + m = re.search(r"(\d+) new rules? learned", output) + if m: + chunks_learned = int(m.group(1)) + + return {"decisions": decisions, "chunks_learned": chunks_learned} + + +def parse_stats(output): + """Extract cumulative metrics from Soar stats output.""" + chunks = 0 + m = re.search(r"(\d+) chunks?\)", output) + if m: + chunks = int(m.group(1)) + return {"chunks_total": chunks} + + +def run_single_task(soar_cli, base_agent, task_file, chunking=True, + extra_sources=None, max_decisions=MAX_DECISIONS): + """Run agent on a single task in a fresh Soar process. + + extra_sources: list of .soar files to source before the task (e.g., saved chunks) + Returns stats dict. 
+ """ + cmd_parts = [] + if not chunking: + cmd_parts.append("chunk never") + if extra_sources: + for s in extra_sources: + cmd_parts.append(f'source "{s}"') + cmd_parts.append(f'source "{task_file}"') + cmd_parts.append(f"run {max_decisions}") + cmd_parts.append("stats") + + cmd_str = "; ".join(cmd_parts) + + proc = subprocess.run( + [str(soar_cli), "-s", str(base_agent), cmd_str], + capture_output=True, text=True, timeout=30, cwd=str(REPO_ROOT) + ) + + output = proc.stdout + proc.stderr + run_metrics = parse_run_line(output) + cumulative = parse_stats(output) + halted = "halted" in output.lower() + + return { + "decisions": run_metrics["decisions"], + "chunks_learned": run_metrics["chunks_learned"], + "chunks_total": cumulative["chunks_total"], + "halted": halted, + "status": "success" if halted else "timeout", + } + + +def save_chunks(soar_cli, base_agent, task_files, chunking=True, + max_decisions=MAX_DECISIONS): + """Run tasks sequentially in one process, save learned chunks to a file. + + Uses the CLI's semicolon-joined command approach which works for + source+run+save sequences. + Returns (list of per-task stats, path to saved chunks file). + """ + chunks_file = tempfile.mktemp(suffix=".soar") + + # Build command: source each task, run, excise, source next, init-soar, run... + # Then save chunks at the end. + cmd_parts = [] + if not chunking: + cmd_parts.append("chunk never") + + for i, task_file in enumerate(task_files): + cmd_parts.append(f'source "{task_file}"') + if i > 0: + cmd_parts.append("init-soar") + cmd_parts.append(f"run {max_decisions}") + cmd_parts.append("excise eval*propose*initialize-blocks-world") + cmd_parts.append("excise eval*apply*initialize-blocks-world") + + # Save all chunks to file + cmd_parts.append(f'command-to-file "{chunks_file}" print --chunks --full') + cmd_parts.append("stats") + + cmd_str = "; ".join(cmd_parts) + + proc = subprocess.run( + [str(soar_cli), "-s", str(base_agent), cmd_str], + capture_output=True, text=True, timeout=120, cwd=str(REPO_ROOT) + ) + + output = proc.stdout + proc.stderr + cumulative = parse_stats(output) + + # Check if chunks file was created and has content + if not os.path.exists(chunks_file) or os.path.getsize(chunks_file) == 0: + chunks_file = None + + return cumulative, chunks_file + + +def run_condition(soar_cli, base_agent, task_files, phase_labels, + chunking=True, max_decisions=MAX_DECISIONS, + chunks_file=None): + """Run each task as a separate subprocess for reliable stats. + + If chunks_file is provided, it's sourced before each task to simulate + having learned from prior training. 
+ """ + extra = [chunks_file] if chunks_file else None + results = [] + for task_file, label in zip(task_files, phase_labels): + stats = run_single_task( + soar_cli, base_agent, task_file, + chunking=chunking, extra_sources=extra, + max_decisions=max_decisions + ) + stats["id"] = label + results.append(stats) + return results + + +def run_eval(seed, n_train, n_transfer, soar_cli=SOAR_CLI, + base_agent=BASE_AGENT): + """Run the full three-condition eval for one seed.""" + train_tasks, transfer_tasks = generate_tasks(n_train, n_transfer, seed) + + # Write task files + tmpdir = tempfile.mkdtemp(prefix="soar_eval_") + train_files = [] + transfer_files = [] + train_labels = [] + transfer_labels = [] + + for i, (init, goal) in enumerate(train_tasks): + name = f"eval*task*seed{seed}*train{i:02d}" + path = os.path.join(tmpdir, f"train_{i:02d}.soar") + write_task_file(path, init, goal, name) + train_files.append(path) + train_labels.append(f"train_{i:02d}") + + for i, (init, goal) in enumerate(transfer_tasks): + name = f"eval*task*seed{seed}*transfer{i:02d}" + path = os.path.join(tmpdir, f"transfer_{i:02d}.soar") + write_task_file(path, init, goal, name) + transfer_files.append(path) + transfer_labels.append(f"transfer_{i:02d}") + + # Phase 1: Train — run all training tasks in one process, save chunks + train_cumulative, chunks_file = save_chunks( + soar_cli, base_agent, train_files, chunking=True + ) + + # Condition 1: trained-transfer (transfer tasks WITH learned chunks) + trained_transfer = run_condition( + soar_cli, base_agent, transfer_files, transfer_labels, + chunking=True, chunks_file=chunks_file + ) + + # Condition 2: fresh baseline (transfer tasks WITHOUT learned chunks) + baseline_results = run_condition( + soar_cli, base_agent, transfer_files, transfer_labels, chunking=True + ) + + # Condition 3: no-learning control (no chunking at all) + nolearn_transfer = run_condition( + soar_cli, base_agent, transfer_files, transfer_labels, chunking=False + ) + + # Clean up chunks file + if chunks_file and os.path.exists(chunks_file): + os.unlink(chunks_file) + + # Compute transfer ratio + trained_transfer_dcs = sum(r["decisions"] for r in trained_transfer) + baseline_dcs = sum(r["decisions"] for r in baseline_results) + trained_train_dcs = 0 # not tracked per-task in this approach + trained_chunks = train_cumulative.get("chunks_total", 0) + + if baseline_dcs > 0: + transfer_ratio = (baseline_dcs - trained_transfer_dcs) / baseline_dcs + else: + transfer_ratio = 0.0 + + trained_success = sum(1 for r in trained_transfer if r["status"] == "success") + baseline_success = sum(1 for r in baseline_results if r["status"] == "success") + nolearn_success = sum(1 for r in nolearn_transfer if r["status"] == "success") + nolearn_dcs = sum(r["decisions"] for r in nolearn_transfer) + + return { + "seed": seed, + "n_train": n_train, + "n_transfer": n_transfer, + "trained_transfer": { + "success": f"{trained_success}/{n_transfer}", + "transfer_decisions": trained_transfer_dcs, + "chunks": trained_chunks, + "transfer_ratio": round(transfer_ratio, 3), + }, + "fresh_baseline": { + "success": f"{baseline_success}/{n_transfer}", + "transfer_decisions": baseline_dcs, + }, + "no_learning": { + "success": f"{nolearn_success}/{n_transfer}", + "transfer_decisions": nolearn_dcs, + }, + "per_task": { + "trained_transfer": trained_transfer, + "baseline": baseline_results, + "no_learning": nolearn_transfer, + } + } + + +def print_summary(results): + """Print summary table.""" + header = f"{'Seed':>6} {'Condition':<20} 
{'Success':<10} {'Transfer DCs':>13} {'Chunks':>7} {'Transfer Ratio':>15}" + print(header) + print("-" * len(header)) + for r in results: + s = r["seed"] + t = r["trained_transfer"] + b = r["fresh_baseline"] + n = r["no_learning"] + print(f"{s:>6} {'trained-transfer':<20} {t['success']:<10} {t['transfer_decisions']:>13} {t['chunks']:>7} {t['transfer_ratio']:>+15.3f}") + print(f"{s:>6} {'fresh-baseline':<20} {b['success']:<10} {b['transfer_decisions']:>13} {'-':>7} {'-':>15}") + print(f"{s:>6} {'no-learning':<20} {n['success']:<10} {n['transfer_decisions']:>13} {'-':>7} {'-':>15}") + print() + + +def main(): + parser = argparse.ArgumentParser(description="Soar transfer eval harness") + parser.add_argument("--seeds", type=int, default=5, help="Number of seeds") + parser.add_argument("--train", type=int, default=6, help="Training tasks per seed") + parser.add_argument("--transfer", type=int, default=6, help="Transfer tasks per seed") + parser.add_argument("--soar", type=str, default=str(SOAR_CLI), help="Path to soar CLI") + parser.add_argument("--agent", type=str, default=str(BASE_AGENT), help="Path to base agent") + parser.add_argument("--output", type=str, default=None, help="JSON output file") + args = parser.parse_args() + + soar_cli = Path(args.soar) + base_agent = Path(args.agent) + + if not soar_cli.exists(): + print(f"Error: Soar CLI not found at {soar_cli}", file=sys.stderr) + sys.exit(1) + if not base_agent.exists(): + print(f"Error: Base agent not found at {base_agent}", file=sys.stderr) + sys.exit(1) + + all_results = [] + for seed in range(args.seeds): + print(f"Running seed {seed}...", file=sys.stderr) + result = run_eval(seed, args.train, args.transfer, soar_cli, base_agent) + all_results.append(result) + + print_summary(all_results) + + if args.output: + output_path = Path(args.output) + else: + RESULTS_DIR.mkdir(parents=True, exist_ok=True) + output_path = RESULTS_DIR / "latest.json" + + with open(output_path, "w") as f: + json.dump(all_results, f, indent=2) + print(f"\nResults written to {output_path}", file=sys.stderr) + + +if __name__ == "__main__": + main()
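
For reference, here is a minimal sketch of how the two comparison layers in module_eval.py compose: diff_results (Layer 1, facts only) feeding apply_policy (Layer 2, maintainer policy). It is illustrative only, not harness output; it assumes module_eval.py is importable, e.g. when the snippet is run from soar-eval/, and the hand-built result dicts stand in for what run_suite() would capture.

# Minimal sketch (illustrative only): compose Layer 1 (facts) with Layer 2 (judgment).
# Assumes this runs from soar-eval/ so module_eval.py is importable.
from module_eval import DEFAULT_POLICY, apply_policy, diff_results

# Hand-built result dicts in the same shape run_suite() produces.
base = [{"agent": "demo.soar", "status": "success",
         "decisions": 120, "productions_chunks": 3}]
cand = [{"agent": "demo.soar", "status": "success",
         "decisions": 100, "productions_chunks": 5}]

facts = diff_results(base, cand)               # Layer 1: values, delta, pct_change only
verdict = apply_policy(facts, DEFAULT_POLICY)  # Layer 2: regressions/improvements per policy

# decisions fell 120 -> 100 (lower_is_better) -> improvement;
# productions_chunks 3 -> 5 is neutral -> reported but not judged.
print(verdict["improvements"])  # one entry: agent demo.soar, metric decisions
print(verdict["pareto_pass"])   # True: no regressions and at least one improvement

The same separation is what `--facts-only` toggles on the command line: it skips Layer 2 and prints only the raw diff.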