diff --git a/soar-eval/agents/bw-op-subgoal-base.soar b/soar-eval/agents/bw-op-subgoal-base.soar new file mode 100644 index 0000000000..d47234401b --- /dev/null +++ b/soar-eval/agents/bw-op-subgoal-base.soar @@ -0,0 +1,21 @@ +# BW-Op-Subgoal agent WITHOUT the hardcoded initialization. +# The eval harness sources a generated task initialization separately. +# +# Path is relative to the repo root. Run from the repo root or set +# the working directory before sourcing this file. + +pushd "UnitTests/SoarTestAgents/Chunking/tests/BW-Op-Subgoal/BW-Op-Subgoal" +source _firstload.soar +pushd all +source all_source.soar +popd +pushd elaborations +source elaborations_source.soar +popd +source _readme.soar +# NOTE: initialize-blocks-world.soar is NOT sourced — task generator provides it +source move-block.soar +pushd move-block +source move-block_source.soar +popd +popd diff --git a/soar-eval/module_eval.py b/soar-eval/module_eval.py new file mode 100644 index 0000000000..b7401f796e --- /dev/null +++ b/soar-eval/module_eval.py @@ -0,0 +1,467 @@ +#!/usr/bin/env python3 +"""Per-module eval harness for Soar. + +Wraps existing test agents, captures quantitative stats per test, +dumps to JSON, and diffs across builds. + +Usage: + python module_eval.py run --soar BUILD/SoarCLI/soar --suite ChunkingTests --out results.json + python module_eval.py compare --base upstream.json --candidate pr.json +""" + +import argparse +import json +import os +import re +import subprocess +import sys +from pathlib import Path + +SCRIPT_DIR = Path(__file__).resolve().parent +REPO_ROOT = SCRIPT_DIR.parent + +SOAR_CLI_DEFAULT = REPO_ROOT / "build" / "SoarCLI" / "soar" +TEST_AGENTS_DIR = REPO_ROOT / "UnitTests" / "SoarTestAgents" + +# Suite definitions: directory glob patterns for test agents +SUITES = { + "ChunkingTests": { + "glob": "Chunking/tests/ChunkingDemoTests_*.soar", + "description": "Chunking demo agents (BW, arithmetic, etc.)", + }, + "FunctionalTests": { + "glob": "FunctionalTests_*.soar", + "description": "Core functional test agents", + }, + "SMemFunctionalTests": { + "glob": "smem/SMemFunctionalTests_*.soar", + "description": "Semantic memory functional tests", + }, + "EpMemFunctionalTests": { + "glob": "epmem/EpMem*.soar", + "description": "Episodic memory functional tests", + }, + "PerformanceTests": { + "glob": "*.soar", + "root": REPO_ROOT / "PerformanceTests" / "TestAgents", + "description": "Performance benchmark agents", + }, +} + +# Metrics we always capture (deterministic, comparable across builds) +DETERMINISTIC_METRICS = [ + "decisions", "elaboration_cycles", "production_firings", + "wm_current", "wm_mean", "wm_max", + "productions_user", "productions_chunks", +] + +# Metrics that vary (need median of N runs) +TIMING_METRICS = ["kernel_cpu_sec", "total_cpu_sec"] + + +def parse_stats(output): + """Parse Soar stats output into a dict of metrics.""" + stats = {"status": "error", "halted": False} + + if "halted" in output.lower(): + stats["halted"] = True + stats["status"] = "success" + elif "Run stopped" not in output and "interrupt" not in output.lower(): + stats["status"] = "timeout" + + m = re.search(r"(\d+) decisions?", output) + if m: + stats["decisions"] = int(m.group(1)) + stats["status"] = "success" + + m = re.search(r"(\d+) elaboration cycles?", output) + if m: + stats["elaboration_cycles"] = int(m.group(1)) + + m = re.search(r"(\d+) production firings?", output) + if m: + stats["production_firings"] = int(m.group(1)) + + m = re.search(r"WM size:\s*(\d+) current,\s*([\d.]+) mean,\s*(\d+) 
maximum", output) + if m: + stats["wm_current"] = int(m.group(1)) + stats["wm_mean"] = float(m.group(2)) + stats["wm_max"] = int(m.group(3)) + + m = re.search(r"(\d+) productions? \((\d+) default, (\d+) user, (\d+) chunks?\)", output) + if m: + stats["productions_total"] = int(m.group(1)) + stats["productions_default"] = int(m.group(2)) + stats["productions_user"] = int(m.group(3)) + stats["productions_chunks"] = int(m.group(4)) + + m = re.search(r"Kernel CPU Time:\s*([\d.]+) sec", output) + if m: + stats["kernel_cpu_sec"] = float(m.group(1)) + + m = re.search(r"Total\s+CPU Time:\s*([\d.]+) sec", output) + if m: + stats["total_cpu_sec"] = float(m.group(1)) + + return stats + + +def run_test(soar_cli, agent_path, max_decisions=10000): + """Run a single test agent and return parsed stats.""" + try: + proc = subprocess.run( + [str(soar_cli), "-s", str(agent_path), f"run {max_decisions}; stats"], + capture_output=True, text=True, timeout=30, cwd=str(REPO_ROOT) + ) + output = proc.stdout + proc.stderr + stats = parse_stats(output) + stats["agent"] = agent_path.name + stats["exit_code"] = proc.returncode + return stats + except subprocess.TimeoutExpired: + return {"agent": agent_path.name, "status": "timeout", "exit_code": -1} + except Exception as e: + return {"agent": agent_path.name, "status": "error", "error": str(e)} + + +def discover_tests(suite_name): + """Find test agent files for a suite.""" + if suite_name not in SUITES: + print(f"Unknown suite: {suite_name}. Available: {list(SUITES.keys())}") + sys.exit(1) + + suite = SUITES[suite_name] + pattern = suite["glob"] + root = suite.get("root", TEST_AGENTS_DIR) + agents = sorted(root.glob(pattern)) + + if not agents: + agents = sorted(root.glob(f"**/{pattern}")) + + return agents + + +def run_suite(soar_cli, suite_name, max_decisions=10000): + """Run all tests in a suite, return list of stats dicts.""" + agents = discover_tests(suite_name) + print(f" Found {len(agents)} agents in {suite_name}", file=sys.stderr) + + results = [] + for agent in agents: + stats = run_test(soar_cli, agent, max_decisions) + results.append(stats) + status_char = "." if stats["status"] == "success" else "X" + print(status_char, end="", flush=True, file=sys.stderr) + print(file=sys.stderr) + + return results + + +## --------------------------------------------------------------------------- +## VISIBILITY: raw diff — what changed, by how much, no judgment +## --------------------------------------------------------------------------- + +def diff_results(base, candidate): + """Produce a raw diff of two result sets. Facts only, no judgment. + + Returns a list of per-agent diffs. Each metric reports base value, + candidate value, and delta. No classification, no direction, no + pass/fail. The consumer decides what matters. 
+ """ + base_by_agent = {r["agent"]: r for r in base} + cand_by_agent = {r["agent"]: r for r in candidate} + all_agents = sorted(set(base_by_agent.keys()) | set(cand_by_agent.keys())) + + diffs = [] + for agent in all_agents: + b = base_by_agent.get(agent) + c = cand_by_agent.get(agent) + + entry = {"agent": agent} + + if not b: + entry["presence"] = "new_in_candidate" + if c: + entry["candidate"] = {m: c[m] for m in DETERMINISTIC_METRICS if m in c} + diffs.append(entry) + continue + if not c: + entry["presence"] = "missing_in_candidate" + diffs.append(entry) + continue + + entry["presence"] = "both" + entry["base_status"] = b.get("status") + entry["candidate_status"] = c.get("status") + entry["metrics"] = {} + + for metric in DETERMINISTIC_METRICS + TIMING_METRICS: + bv = b.get(metric) + cv = c.get(metric) + if bv is None and cv is None: + continue + m = {"base": bv, "candidate": cv} + if bv is not None and cv is not None: + m["delta"] = cv - bv + if bv != 0: + m["pct_change"] = round((cv - bv) / abs(bv) * 100, 1) + entry["metrics"][metric] = m + + diffs.append(entry) + + return {"agents": diffs} + + +def print_diff(diff): + """Print raw diff as a readable table. No judgment words.""" + changed_agents = [] + for entry in diff["agents"]: + if entry.get("presence") == "new_in_candidate": + changed_agents.append((entry["agent"], "NEW", {})) + continue + if entry.get("presence") == "missing_in_candidate": + changed_agents.append((entry["agent"], "MISSING", {})) + continue + + status_changed = entry["base_status"] != entry["candidate_status"] + metric_changes = {} + for m, info in entry.get("metrics", {}).items(): + if info.get("delta", 0) != 0: + metric_changes[m] = info + + if status_changed or metric_changes: + status = f"{entry['base_status']} -> {entry['candidate_status']}" if status_changed else "" + changed_agents.append((entry["agent"], status, metric_changes)) + + if not changed_agents: + print("\nNo differences detected.") + return + + print(f"\n{'Agent':<60} {'Metric':<25} {'Base':>10} {'Candidate':>10} {'Delta':>10} {'%':>8}") + print("-" * 125) + for agent, status, metrics in changed_agents: + if status == "NEW": + print(f"{agent:<60} {'(new agent)':<25}") + continue + if status == "MISSING": + print(f"{agent:<60} {'(missing)':<25}") + continue + first_line = True + if status: + print(f"{agent:<60} {'status':<25} {status}") + first_line = False + for m, info in sorted(metrics.items()): + bv = info.get("base", "—") + cv = info.get("candidate", "—") + delta = info.get("delta", "") + pct = info.get("pct_change", "") + delta_str = f"{delta:+}" if isinstance(delta, (int, float)) else "" + pct_str = f"{pct:+.1f}%" if isinstance(pct, (int, float)) else "" + label = agent if first_line else "" + print(f"{label:<60} {m:<25} {str(bv):>10} {str(cv):>10} {delta_str:>10} {pct_str:>8}") + first_line = False + + +## --------------------------------------------------------------------------- +## DECISION: policy layer — maintainer configures what counts as pass/fail +## --------------------------------------------------------------------------- + +# Default policy: lower is better for resource metrics, chunk count is neutral. +# Maintainers can override by providing a policy JSON file. +DEFAULT_POLICY = { + "lower_is_better": [ + "decisions", "elaboration_cycles", "production_firings", + "wm_max", "kernel_cpu_sec", "total_cpu_sec", + ], + "neutral": [ + "productions_chunks", "productions_user", + "wm_current", "wm_mean", + ], + # Metrics not listed are ignored for pass/fail. 
+ # Status regressions (success -> timeout/error) always count. +} + + +def apply_policy(diff, policy=None): + """Apply a policy to a raw diff. Returns judgment summary. + + This is the ONLY place that says 'regression' or 'improvement'. + Separated from visibility so the maintainer controls the criteria. + """ + if policy is None: + policy = DEFAULT_POLICY + + lower_is_better = set(policy.get("lower_is_better", [])) + neutral = set(policy.get("neutral", [])) + timing_floor = policy.get("timing_noise_floor", 0) + timing_rel = policy.get("timing_relative_threshold", 0) + timing_metrics = {"kernel_cpu_sec", "total_cpu_sec"} + + regressions = [] + improvements = [] + neutral_changes = [] + + for entry in diff["agents"]: + agent = entry["agent"] + + if entry.get("presence") == "missing_in_candidate": + regressions.append({"agent": agent, "reason": "agent missing in candidate"}) + continue + + # Status regression + bs = entry.get("base_status") + cs = entry.get("candidate_status") + if bs == "success" and cs != "success": + regressions.append({"agent": agent, "reason": f"status {bs} -> {cs}"}) + + for metric, info in entry.get("metrics", {}).items(): + delta = info.get("delta", 0) + if delta == 0: + continue + + # Apply timing noise thresholds + if metric in timing_metrics: + abs_delta = abs(delta) + pct = abs(info.get("pct_change", 0)) + if abs_delta < timing_floor and pct < timing_rel: + continue # below both thresholds, treat as noise + + if metric in neutral: + neutral_changes.append({ + "agent": agent, "metric": metric, + "base": info["base"], "candidate": info["candidate"], + }) + elif metric in lower_is_better: + if delta > 0: + regressions.append({ + "agent": agent, "metric": metric, + "base": info["base"], "candidate": info["candidate"], + }) + else: + improvements.append({ + "agent": agent, "metric": metric, + "base": info["base"], "candidate": info["candidate"], + }) + + return { + "regressions": regressions, + "improvements": improvements, + "neutral_changes": neutral_changes, + "pareto_pass": len(regressions) == 0 and len(improvements) > 0, + } + + +def print_judgment(judgment): + """Print policy-based judgment. 
Clearly labeled as policy output.""" + print("\n" + "=" * 60) + print("POLICY JUDGMENT (maintainer-configured, not harness opinion)") + print("=" * 60) + + if judgment["regressions"]: + print(f"\nRegressions ({len(judgment['regressions'])}):") + for r in judgment["regressions"]: + if "metric" in r: + print(f" {r['agent']}: {r['metric']} {r['base']} -> {r['candidate']}") + else: + print(f" {r['agent']}: {r['reason']}") + + if judgment["improvements"]: + print(f"\nImprovements ({len(judgment['improvements'])}):") + for r in judgment["improvements"]: + print(f" {r['agent']}: {r['metric']} {r['base']} -> {r['candidate']}") + + if judgment["neutral_changes"]: + print(f"\nNeutral changes ({len(judgment['neutral_changes'])}):") + for r in judgment["neutral_changes"]: + print(f" {r['agent']}: {r['metric']} {r['base']} -> {r['candidate']}") + + print(f"\nPareto check: {'PASS' if judgment['pareto_pass'] else 'FAIL'}") + return 0 if not judgment["regressions"] else 1 + + +def cmd_run(args): + soar_cli = Path(args.soar).expanduser().resolve() + suites = args.suite if args.suite else list(SUITES.keys()) + + all_results = {} + for suite in suites: + print(f"Running {suite}...", file=sys.stderr) + results = run_suite(soar_cli, suite) + all_results[suite] = results + + output = { + "soar_binary": str(soar_cli), + "suites": all_results, + } + + out_path = Path(args.out) + out_path.parent.mkdir(parents=True, exist_ok=True) + with open(out_path, "w") as f: + json.dump(output, f, indent=2) + print(f"Results written to {out_path}", file=sys.stderr) + + +def cmd_compare(args): + with open(args.base) as f: + base = json.load(f) + with open(args.candidate) as f: + candidate = json.load(f) + + policy = DEFAULT_POLICY + if args.policy: + with open(args.policy) as f: + policy = json.load(f) + + exit_code = 0 + for suite in sorted(set(list(base["suites"].keys()) + list(candidate["suites"].keys()))): + b = base["suites"].get(suite, []) + c = candidate["suites"].get(suite, []) + if not b or not c: + print(f"\n{suite}: missing from one side, skipping") + continue + + print(f"\n{'='*60}") + print(f"Suite: {suite}") + print(f"{'='*60}") + + # Layer 1: VISIBILITY — raw facts + diff = diff_results(b, c) + print_diff(diff) + + # Layer 2: DECISION — policy judgment (optional) + if not args.facts_only: + judgment = apply_policy(diff, policy) + code = print_judgment(judgment) + if code != 0: + exit_code = 1 + + sys.exit(exit_code) + + +def main(): + parser = argparse.ArgumentParser(description="Soar per-module eval harness") + sub = parser.add_subparsers(dest="command") + + run_p = sub.add_parser("run", help="Run test suites and capture stats") + run_p.add_argument("--soar", default=str(SOAR_CLI_DEFAULT), help="Path to soar CLI") + run_p.add_argument("--suite", nargs="*", help="Suite(s) to run (default: all)") + run_p.add_argument("--out", default="results/module_eval.json", help="Output JSON") + + cmp_p = sub.add_parser("compare", help="Compare two result sets") + cmp_p.add_argument("--base", required=True, help="Baseline JSON") + cmp_p.add_argument("--candidate", required=True, help="Candidate JSON") + cmp_p.add_argument("--policy", default=None, help="Policy JSON (default: built-in)") + cmp_p.add_argument("--facts-only", action="store_true", + help="Show raw diff only, no policy judgment") + + args = parser.parse_args() + if args.command == "run": + cmd_run(args) + elif args.command == "compare": + cmd_compare(args) + else: + parser.print_help() + + +if __name__ == "__main__": + main() diff --git a/soar-eval/policy.json 
b/soar-eval/policy.json
new file mode 100644
index 0000000000..43ebbde371
--- /dev/null
+++ b/soar-eval/policy.json
@@ -0,0 +1,18 @@
+{
+    "lower_is_better": [
+        "decisions",
+        "elaboration_cycles",
+        "production_firings",
+        "wm_max",
+        "kernel_cpu_sec",
+        "total_cpu_sec"
+    ],
+    "neutral": [
+        "productions_chunks",
+        "productions_user",
+        "wm_current",
+        "wm_mean"
+    ],
+    "timing_noise_floor": 0.005,
+    "timing_relative_threshold": 1.0
+}
diff --git a/soar-eval/policy_README.md b/soar-eval/policy_README.md
new file mode 100644
index 0000000000..6a66ffc0a9
--- /dev/null
+++ b/soar-eval/policy_README.md
@@ -0,0 +1,39 @@
+# Soar Eval Policy
+
+`policy.json` controls Layer 2 of the eval harness: the judgment layer that decides whether raw metric differences count as improvements, regressions, or noise.
+
+Layer 1 reports facts only: baseline value, candidate value, delta, and percent change. This policy file defines how Layer 2 interprets those facts.
+
+## Fields
+
+### `lower_is_better`
+
+Metrics where a lower candidate value is an improvement and a higher value is a regression.
+
+- `decisions`, `elaboration_cycles`, `production_firings`, `wm_max` — resource-use metrics
+- `kernel_cpu_sec`, `total_cpu_sec` — timing metrics (subject to the noise thresholds below)
+
+### `neutral`
+
+Metrics reported but not judged directionally. A change is flagged as "changed" rather than "improved" or "regressed."
+
+- `productions_chunks`, `productions_user` — chunk count direction depends on context
+- `wm_current`, `wm_mean` — informational
+
+### `timing_noise_floor`
+
+Absolute timing delta (seconds) below which a change can be dismissed as noise. Default: `0.005` (5ms).
+
+### `timing_relative_threshold`
+
+Relative timing delta (percent) below which a change can be dismissed as noise. Default: `1.0` (1%).
+
+A timing change is treated as noise only when it falls below **both** thresholds; exceeding either one flags it. This biases toward review rather than automatic pass.
+
+## Customization
+
+Stricter (flag more): lower the thresholds.
+Less sensitive (flag less): raise the thresholds.
+Change a metric's direction: move its name between `lower_is_better` and `neutral`.
+
+The maintainer owns this file. Contributors see the raw numbers (Layer 1). The maintainer decides what matters (Layer 2).
diff --git a/soar-eval/tasks/__init__.py b/soar-eval/tasks/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/soar-eval/tasks/blocks_world.py b/soar-eval/tasks/blocks_world.py
new file mode 100644
index 0000000000..6b9da65b20
--- /dev/null
+++ b/soar-eval/tasks/blocks_world.py
@@ -0,0 +1,185 @@
+"""Generate random 3-block Blocks World tasks as Soar production rules.
+
+A state is a set of ontop relations over {A, B, C, table}.
+For 3 blocks there are 13 distinct legal states.
+"""
+
+import random
+from itertools import permutations
+
+BLOCKS = ["A", "B", "C"]
+TABLE = "table"
+
+
+def enumerate_states():
+    """Return all legal 3-block states as frozensets of (top, bottom) pairs.
+
+    Each block is on exactly one support: the table or another block.
+    At most one block may rest directly on any given block (any number
+    may sit on the table), and placements may not form a cycle.
+ """ + states = [] + # Each block is on something: another block or table + # Constraint: no two blocks on the same block (but multiple on table is fine) + options = BLOCKS + [TABLE] + for a_on in options: + if a_on == "A": + continue + for b_on in options: + if b_on == "B": + continue + for c_on in options: + if c_on == "C": + continue + # Check: no two blocks on the same non-table thing + supports = [a_on, b_on, c_on] + block_supports = [s for s in supports if s != TABLE] + if len(block_supports) != len(set(block_supports)): + continue + # Check: no cycles (A on B on A) + placement = {"A": a_on, "B": b_on, "C": c_on} + if has_cycle(placement): + continue + state = frozenset([("A", a_on), ("B", b_on), ("C", c_on)]) + states.append(state) + return states + + +def has_cycle(placement): + """Check if a placement dict has a cycle among blocks.""" + for start in BLOCKS: + visited = set() + current = start + while current in placement and current != TABLE: + if current in visited: + return True + visited.add(current) + current = placement[current] + return False + + +ALL_STATES = enumerate_states() + + +def generate_task_pair(rng): + """Return (initial_state, goal_state) as two different states.""" + pair = rng.sample(ALL_STATES, 2) + return pair[0], pair[1] + + +def generate_tasks(n_train, n_transfer, seed): + """Generate training and transfer task pairs.""" + rng = random.Random(seed) + train = [generate_task_pair(rng) for _ in range(n_train)] + transfer = [generate_task_pair(rng) for _ in range(n_transfer)] + return train, transfer + + +def state_to_soar(state, var_prefix="s"): + """Convert a state (frozenset of (top, bottom)) to Soar WME creation actions.""" + lines = [] + ontop_vars = [] + block_vars = {} + + # Create block variables + for block in BLOCKS: + var = f"" + block_vars[block] = var + lines.append(f' ({var} ^name {block} ^type block)') + block_vars[TABLE] = "" + lines.append(f' (
^name table ^type table)') + + # Create ontop relations + for i, (top, bottom) in enumerate(sorted(state)): + var = f"" + ontop_vars.append(var) + lines.append(f' ({var} ^top-block {block_vars[top]} ^bottom-block {block_vars[bottom]})') + + return ontop_vars, block_vars, lines + + +def task_to_soar_file(initial, goal, task_name): + """Generate a Soar production that sets up a specific BW task. + + Returns the text of a .soar file with a single initialization production. + """ + block_vars = {b: f"" for b in BLOCKS} + block_vars[TABLE] = "
" + + # Block/table definitions (shared between initial and goal) + block_defs = [] + for b in BLOCKS: + block_defs.append(f' ({block_vars[b]} ^name {b} ^type block)') + block_defs.append(f' (
^name table ^type table)') + + # Initial ontop relations + init_ontop_vars = [] + init_ontop_lines = [] + for i, (top, bottom) in enumerate(sorted(initial)): + var = f"" + init_ontop_vars.append(var) + init_ontop_lines.append(f' ({var} ^top-block {block_vars[top]} ^bottom-block {block_vars[bottom]})') + + # Goal ontop relations + goal_ontop_vars = [] + goal_ontop_lines = [] + for i, (top, bottom) in enumerate(sorted(goal)): + var = f"" + goal_ontop_vars.append(var) + goal_ontop_lines.append(f' ({var} ^top-block {block_vars[top]} ^bottom-block {block_vars[bottom]})') + + ontop_refs = " ".join(init_ontop_vars) + object_refs = " ".join(block_vars[b] for b in BLOCKS) + "
" + dontop_refs = " ".join(goal_ontop_vars) + + # Use a fixed name so the harness can excise it between tasks + propose_name = "eval*propose*initialize-blocks-world" + apply_name = "eval*apply*initialize-blocks-world" + + prod = f"""sp {{{propose_name} + (state ^superstate nil + -^name) +--> + ( ^operator +) + ( ^name initialize-blocks-world) +}} + +sp {{{apply_name} + (state ^operator.name initialize-blocks-world) +--> + ( ^name blocks-world + ^ontop {ontop_refs} + ^object {object_refs} + ^desired ) +{chr(10).join(block_defs)} +{chr(10).join(init_ontop_lines)} + ( ^ontop {dontop_refs}) +{chr(10).join(goal_ontop_lines)} +}} +""" + return prod + + +def write_task_file(filepath, initial, goal, task_name): + """Write a single task's initialization production to a .soar file.""" + content = task_to_soar_file(initial, goal, task_name) + with open(filepath, "w") as f: + f.write(content) + return filepath + + +if __name__ == "__main__": + # Verify state enumeration + states = enumerate_states() + print(f"Found {len(states)} legal 3-block states") + for s in states: + print(f" {sorted(s)}") + + # Generate a sample task + rng = random.Random(42) + init, goal = generate_task_pair(rng) + print(f"\nSample task:") + print(f" Initial: {sorted(init)}") + print(f" Goal: {sorted(goal)}") + print(f"\nSoar production:") + print(task_to_soar_file(init, goal, "eval*task*seed42*train01*initialize")) diff --git a/soar-eval/transfer_eval.py b/soar-eval/transfer_eval.py new file mode 100644 index 0000000000..37ff518ca3 --- /dev/null +++ b/soar-eval/transfer_eval.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python3 +"""Soar transfer eval harness. + +Runs a chunking agent on generated Blocks World tasks and measures +whether learning on training tasks speeds up transfer tasks. + +Usage: + python eval.py --seeds 5 --train 10 --transfer 10 +""" + +import argparse +import json +import os +import re +import subprocess +import sys +import tempfile +from pathlib import Path + +from tasks.blocks_world import generate_tasks, write_task_file + +SCRIPT_DIR = Path(__file__).resolve().parent +REPO_ROOT = SCRIPT_DIR.parent + +SOAR_CLI = REPO_ROOT / "build" / "SoarCLI" / "soar" +BASE_AGENT = SCRIPT_DIR / "agents" / "bw-op-subgoal-base.soar" +RESULTS_DIR = SCRIPT_DIR / "results" +MAX_DECISIONS = 500 + + +def parse_run_line(output): + """Extract per-run metrics from '--> N decision cycles' line.""" + decisions = 0 + chunks_learned = 0 + + m = re.search(r"--> (\d+) decision cycles? executed", output) + if m: + decisions = int(m.group(1)) + + m = re.search(r"(\d+) new rules? learned", output) + if m: + chunks_learned = int(m.group(1)) + + return {"decisions": decisions, "chunks_learned": chunks_learned} + + +def parse_stats(output): + """Extract cumulative metrics from Soar stats output.""" + chunks = 0 + m = re.search(r"(\d+) chunks?\)", output) + if m: + chunks = int(m.group(1)) + return {"chunks_total": chunks} + + +def run_single_task(soar_cli, base_agent, task_file, chunking=True, + extra_sources=None, max_decisions=MAX_DECISIONS): + """Run agent on a single task in a fresh Soar process. + + extra_sources: list of .soar files to source before the task (e.g., saved chunks) + Returns stats dict. 
+ """ + cmd_parts = [] + if not chunking: + cmd_parts.append("chunk never") + if extra_sources: + for s in extra_sources: + cmd_parts.append(f'source "{s}"') + cmd_parts.append(f'source "{task_file}"') + cmd_parts.append(f"run {max_decisions}") + cmd_parts.append("stats") + + cmd_str = "; ".join(cmd_parts) + + proc = subprocess.run( + [str(soar_cli), "-s", str(base_agent), cmd_str], + capture_output=True, text=True, timeout=30, cwd=str(REPO_ROOT) + ) + + output = proc.stdout + proc.stderr + run_metrics = parse_run_line(output) + cumulative = parse_stats(output) + halted = "halted" in output.lower() + + return { + "decisions": run_metrics["decisions"], + "chunks_learned": run_metrics["chunks_learned"], + "chunks_total": cumulative["chunks_total"], + "halted": halted, + "status": "success" if halted else "timeout", + } + + +def save_chunks(soar_cli, base_agent, task_files, chunking=True, + max_decisions=MAX_DECISIONS): + """Run tasks sequentially in one process, save learned chunks to a file. + + Uses the CLI's semicolon-joined command approach which works for + source+run+save sequences. + Returns (list of per-task stats, path to saved chunks file). + """ + chunks_file = tempfile.mktemp(suffix=".soar") + + # Build command: source each task, run, excise, source next, init-soar, run... + # Then save chunks at the end. + cmd_parts = [] + if not chunking: + cmd_parts.append("chunk never") + + for i, task_file in enumerate(task_files): + cmd_parts.append(f'source "{task_file}"') + if i > 0: + cmd_parts.append("init-soar") + cmd_parts.append(f"run {max_decisions}") + cmd_parts.append("excise eval*propose*initialize-blocks-world") + cmd_parts.append("excise eval*apply*initialize-blocks-world") + + # Save all chunks to file + cmd_parts.append(f'command-to-file "{chunks_file}" print --chunks --full') + cmd_parts.append("stats") + + cmd_str = "; ".join(cmd_parts) + + proc = subprocess.run( + [str(soar_cli), "-s", str(base_agent), cmd_str], + capture_output=True, text=True, timeout=120, cwd=str(REPO_ROOT) + ) + + output = proc.stdout + proc.stderr + cumulative = parse_stats(output) + + # Check if chunks file was created and has content + if not os.path.exists(chunks_file) or os.path.getsize(chunks_file) == 0: + chunks_file = None + + return cumulative, chunks_file + + +def run_condition(soar_cli, base_agent, task_files, phase_labels, + chunking=True, max_decisions=MAX_DECISIONS, + chunks_file=None): + """Run each task as a separate subprocess for reliable stats. + + If chunks_file is provided, it's sourced before each task to simulate + having learned from prior training. 
+ """ + extra = [chunks_file] if chunks_file else None + results = [] + for task_file, label in zip(task_files, phase_labels): + stats = run_single_task( + soar_cli, base_agent, task_file, + chunking=chunking, extra_sources=extra, + max_decisions=max_decisions + ) + stats["id"] = label + results.append(stats) + return results + + +def run_eval(seed, n_train, n_transfer, soar_cli=SOAR_CLI, + base_agent=BASE_AGENT): + """Run the full three-condition eval for one seed.""" + train_tasks, transfer_tasks = generate_tasks(n_train, n_transfer, seed) + + # Write task files + tmpdir = tempfile.mkdtemp(prefix="soar_eval_") + train_files = [] + transfer_files = [] + train_labels = [] + transfer_labels = [] + + for i, (init, goal) in enumerate(train_tasks): + name = f"eval*task*seed{seed}*train{i:02d}" + path = os.path.join(tmpdir, f"train_{i:02d}.soar") + write_task_file(path, init, goal, name) + train_files.append(path) + train_labels.append(f"train_{i:02d}") + + for i, (init, goal) in enumerate(transfer_tasks): + name = f"eval*task*seed{seed}*transfer{i:02d}" + path = os.path.join(tmpdir, f"transfer_{i:02d}.soar") + write_task_file(path, init, goal, name) + transfer_files.append(path) + transfer_labels.append(f"transfer_{i:02d}") + + # Phase 1: Train — run all training tasks in one process, save chunks + train_cumulative, chunks_file = save_chunks( + soar_cli, base_agent, train_files, chunking=True + ) + + # Condition 1: trained-transfer (transfer tasks WITH learned chunks) + trained_transfer = run_condition( + soar_cli, base_agent, transfer_files, transfer_labels, + chunking=True, chunks_file=chunks_file + ) + + # Condition 2: fresh baseline (transfer tasks WITHOUT learned chunks) + baseline_results = run_condition( + soar_cli, base_agent, transfer_files, transfer_labels, chunking=True + ) + + # Condition 3: no-learning control (no chunking at all) + nolearn_transfer = run_condition( + soar_cli, base_agent, transfer_files, transfer_labels, chunking=False + ) + + # Clean up chunks file + if chunks_file and os.path.exists(chunks_file): + os.unlink(chunks_file) + + # Compute transfer ratio + trained_transfer_dcs = sum(r["decisions"] for r in trained_transfer) + baseline_dcs = sum(r["decisions"] for r in baseline_results) + trained_train_dcs = 0 # not tracked per-task in this approach + trained_chunks = train_cumulative.get("chunks_total", 0) + + if baseline_dcs > 0: + transfer_ratio = (baseline_dcs - trained_transfer_dcs) / baseline_dcs + else: + transfer_ratio = 0.0 + + trained_success = sum(1 for r in trained_transfer if r["status"] == "success") + baseline_success = sum(1 for r in baseline_results if r["status"] == "success") + nolearn_success = sum(1 for r in nolearn_transfer if r["status"] == "success") + nolearn_dcs = sum(r["decisions"] for r in nolearn_transfer) + + return { + "seed": seed, + "n_train": n_train, + "n_transfer": n_transfer, + "trained_transfer": { + "success": f"{trained_success}/{n_transfer}", + "transfer_decisions": trained_transfer_dcs, + "chunks": trained_chunks, + "transfer_ratio": round(transfer_ratio, 3), + }, + "fresh_baseline": { + "success": f"{baseline_success}/{n_transfer}", + "transfer_decisions": baseline_dcs, + }, + "no_learning": { + "success": f"{nolearn_success}/{n_transfer}", + "transfer_decisions": nolearn_dcs, + }, + "per_task": { + "trained_transfer": trained_transfer, + "baseline": baseline_results, + "no_learning": nolearn_transfer, + } + } + + +def print_summary(results): + """Print summary table.""" + header = f"{'Seed':>6} {'Condition':<20} 
{'Success':<10} {'Transfer DCs':>13} {'Chunks':>7} {'Transfer Ratio':>15}" + print(header) + print("-" * len(header)) + for r in results: + s = r["seed"] + t = r["trained_transfer"] + b = r["fresh_baseline"] + n = r["no_learning"] + print(f"{s:>6} {'trained-transfer':<20} {t['success']:<10} {t['transfer_decisions']:>13} {t['chunks']:>7} {t['transfer_ratio']:>+15.3f}") + print(f"{s:>6} {'fresh-baseline':<20} {b['success']:<10} {b['transfer_decisions']:>13} {'-':>7} {'-':>15}") + print(f"{s:>6} {'no-learning':<20} {n['success']:<10} {n['transfer_decisions']:>13} {'-':>7} {'-':>15}") + print() + + +def main(): + parser = argparse.ArgumentParser(description="Soar transfer eval harness") + parser.add_argument("--seeds", type=int, default=5, help="Number of seeds") + parser.add_argument("--train", type=int, default=6, help="Training tasks per seed") + parser.add_argument("--transfer", type=int, default=6, help="Transfer tasks per seed") + parser.add_argument("--soar", type=str, default=str(SOAR_CLI), help="Path to soar CLI") + parser.add_argument("--agent", type=str, default=str(BASE_AGENT), help="Path to base agent") + parser.add_argument("--output", type=str, default=None, help="JSON output file") + args = parser.parse_args() + + soar_cli = Path(args.soar) + base_agent = Path(args.agent) + + if not soar_cli.exists(): + print(f"Error: Soar CLI not found at {soar_cli}", file=sys.stderr) + sys.exit(1) + if not base_agent.exists(): + print(f"Error: Base agent not found at {base_agent}", file=sys.stderr) + sys.exit(1) + + all_results = [] + for seed in range(args.seeds): + print(f"Running seed {seed}...", file=sys.stderr) + result = run_eval(seed, args.train, args.transfer, soar_cli, base_agent) + all_results.append(result) + + print_summary(all_results) + + if args.output: + output_path = Path(args.output) + else: + RESULTS_DIR.mkdir(parents=True, exist_ok=True) + output_path = RESULTS_DIR / "latest.json" + + with open(output_path, "w") as f: + json.dump(all_results, f, indent=2) + print(f"\nResults written to {output_path}", file=sys.stderr) + + +if __name__ == "__main__": + main()
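
For reference, here is a minimal sketch of how the two comparison layers in module_eval.py compose: diff_results (Layer 1, facts only) feeding apply_policy (Layer 2, maintainer policy). It is illustrative only, not harness output; it assumes module_eval.py is importable, e.g. when the snippet is run from soar-eval/, and the hand-built result dicts stand in for what run_suite() would capture.

# Minimal sketch (illustrative only): compose Layer 1 (facts) with Layer 2 (judgment).
# Assumes this runs from soar-eval/ so module_eval.py is importable.
from module_eval import DEFAULT_POLICY, apply_policy, diff_results

# Hand-built result dicts in the same shape run_suite() produces.
base = [{"agent": "demo.soar", "status": "success",
         "decisions": 120, "productions_chunks": 3}]
cand = [{"agent": "demo.soar", "status": "success",
         "decisions": 100, "productions_chunks": 5}]

facts = diff_results(base, cand)               # Layer 1: values, delta, pct_change only
verdict = apply_policy(facts, DEFAULT_POLICY)  # Layer 2: regressions/improvements per policy

# decisions fell 120 -> 100 (lower_is_better) -> improvement;
# productions_chunks 3 -> 5 is neutral -> reported but not judged.
print(verdict["improvements"])  # one entry: agent demo.soar, metric decisions
print(verdict["pareto_pass"])   # True: no regressions and at least one improvement

The same separation is what `--facts-only` toggles on the command line: it skips Layer 2 and prints only the raw diff.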