diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 0def78a..74fd8f5 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -8,7 +8,7 @@ }, "plugins": [ { - "name": "farness", + "name": "brier", "description": "Decision-making framework that reframes subjective questions as forecasting problems with explicit KPIs, option expansion, and calibration tracking", "version": "0.1.0", "author": { diff --git a/.claude/skills/farness/SKILL.md b/.claude/skills/brier/SKILL.md similarity index 85% rename from .claude/skills/farness/SKILL.md rename to .claude/skills/brier/SKILL.md index b453359..f960599 100644 --- a/.claude/skills/farness/SKILL.md +++ b/.claude/skills/brier/SKILL.md @@ -1,13 +1,13 @@ --- -name: farness -description: Use when the user wants advice or a decision recommendation rather than direct implementation, especially for prompts like "should I", "should we", "which is better", "is it worth it", or "what would you do" about architecture, product, hiring, strategy, or career choices. Prefer the local farness MCP server when available and structure the answer around KPI, option expansion, reference class, disconfirming evidence, numeric forecasts, and a review date. +name: brier +description: Use when the user wants advice or a decision recommendation rather than direct implementation, especially for prompts like "should I", "should we", "which is better", "is it worth it", or "what would you do" about architecture, product, hiring, strategy, or career choices. Prefer the local brier MCP server when available and structure the answer around KPI, option expansion, reference class, disconfirming evidence, numeric forecasts, and a review date. --- -# Farness +# Brier Use this skill to turn vague decisions into forecastable choices. -Prefer the local `farness` MCP server when it is connected. +Prefer the local `brier` MCP server when it is connected. 
## Workflow @@ -43,8 +43,8 @@ Prefer the local `farness` MCP server when it is connected. ## Setup -If the `farness` MCP server is not connected, add it with: +If the `brier` MCP server is not connected, add it with: ```bash -farness setup claude +brier setup claude ``` diff --git a/CLAUDE.md b/CLAUDE.md index f5eeb13..b8b07c1 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co ## Project Overview -Farness is a decision-making framework that reframes subjective questions ("Should I...?") into forecasting problems with explicit KPIs, confidence intervals, and calibration tracking. The core thesis: making numeric predictions forces mechanism thinking, creates accountability, and reduces sycophancy. +Brier is a decision-making framework that reframes subjective questions ("Should I...?") into forecasting problems with explicit KPIs, confidence intervals, and calibration tracking. The core thesis: making numeric predictions forces mechanism thinking, creates accountability, and reduces sycophancy. 
## Commands @@ -21,24 +21,24 @@ pytest pytest tests/test_framework.py # Run with coverage -pytest --cov=farness +pytest --cov=brier # Format code -black farness tests -ruff check farness tests +black brier tests +ruff check brier tests ``` ### CLI ```bash -farness new "question" # Create a new decision -farness new "q" --context "details" # With context -farness list # List all decisions -farness list --pending # Decisions past review date -farness show ID # Show decision details (supports prefix match) -farness score [id] # Score a decision's actual outcomes (interactive) -farness calibration # Show calibration statistics -farness pending # Alias for list --pending +brier new "question" # Create a new decision +brier new "q" --context "details" # With context +brier list # List all decisions +brier list --pending # Decisions past review date +brier show ID # Show decision details (supports prefix match) +brier score [id] # Score a decision's actual outcomes (interactive) +brier calibration # Show calibration statistics +brier pending # Alias for list --pending ``` ### Site (Next.js) @@ -57,15 +57,15 @@ bun run test # Run vitest tests python3 paper/render_paper.py # Generate figures, render HTML, sync preemptive_rigor.md and site/public/paper-raw python3 paper/run_strongest_validation.py # Strongest reviewer-facing validation across Claude Opus 4.6 and GPT-5.2 python3 paper/run_study1_rerun.py --models gpt-5.4 # Original Study 1 rerun with legacy prompt wording -python3 -m farness.experiments stability --strongest-validation --model gpt-5.2 # Single-model strongest validation +python3 -m brier.experiments stability --strongest-validation --model gpt-5.2 # Single-model strongest validation ``` ## Architecture -### Python Package (`farness/`) +### Python Package (`brier/`) - **framework.py**: Core dataclasses (`Decision`, `KPI`, `Option`, `Forecast`) with serialization. `Option.expected_value()` computes weighted expected values across KPIs. 
`Decision.best_option()` and `sensitivity_analysis()` for analysis. -- **storage.py**: `DecisionStore` persists decisions to `~/.farness/decisions.jsonl` in JSONL format. Supports CRUD and filtered queries (unscored, pending review, scored). +- **storage.py**: `DecisionStore` persists decisions to `~/.brier/decisions.jsonl` in JSONL format. Supports CRUD and filtered queries (unscored, pending review, scored). - **calibration.py**: `CalibrationTracker` computes forecast accuracy metrics: coverage (% of actuals in CIs), calibration error (coverage vs stated confidence), MAE, MRE, Brier scores. - **cli.py**: Argparse CLI wrapping the above modules. diff --git a/README.md b/README.md index 03f79f9..e47d8e5 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ -# Farness +# Brier **Forecasting as a harness for decision-making.** -Instead of asking "Is X good?" or "Should I do Y?", farness helps you: +Instead of asking "Is X good?" or "Should I do Y?", brier helps you: 1. Define what success looks like (KPIs) 2. Expand your options (including ones you didn't consider) 3. Make explicit forecasts (with confidence intervals and resolution rules) @@ -11,7 +11,7 @@ Instead of asking "Is X good?" or "Should I do Y?", farness helps you: ## Installation ```bash -python -m pip install 'farness[mcp]' +python -m pip install 'brier[mcp]' ``` ## Quick Start @@ -19,17 +19,17 @@ python -m pip install 'farness[mcp]' ### Codex ```bash -farness setup codex -farness doctor codex +brier setup codex +brier doctor codex ``` -Then restart Codex and use `$farness` when a decision prompt appears. +Then restart Codex and use `$brier` when a decision prompt appears. ### Claude Code ```bash -farness setup claude -farness doctor claude +brier setup claude +brier doctor claude ``` Then restart Claude Code. @@ -37,9 +37,9 @@ Then restart Claude Code. ### Local CLI ```bash -farness new "Should we rewrite the auth layer?" 
--context "3 incidents this quarter" -farness list -farness calibration +brier new "Should we rewrite the auth layer?" --context "3 incidents this quarter" +brier list +brier calibration ``` The CLI is local-only and does not call an LLM or require an API key. @@ -47,7 +47,7 @@ The CLI is local-only and does not call an LLM or require an API key. ### Python package ```python -from farness import Decision, KPI, Option, Forecast, DecisionStore +from brier import Decision, KPI, Option, Forecast, DecisionStore from datetime import datetime, timedelta # Create a decision @@ -109,20 +109,20 @@ store.save(decision) ### Command Line ```bash -farness new "Should we launch now?" -farness show abc123 -farness pending -farness calibration +brier new "Should we launch now?" +brier show abc123 +brier pending +brier calibration ``` ### Forecast Question Drafts -`farness` can turn a stored decision forecast or standalone policy question into +`brier` can turn a stored decision forecast or standalone policy question into Manifold-ready forecast question drafts. This is draft-only: it does not publish questions, place a bet, or require a Manifold API key. ```bash -farness forecast-draft "Will Waymo be legally permitted to offer fully driverless paid robotaxi rides in Washington, DC by 2026-12-31?" \ +brier forecast-draft "Will Waymo be legally permitted to offer fully driverless paid robotaxi rides in Washington, DC by 2026-12-31?" \ --initial-prob 52 \ --resolution-date 2026-12-31 \ --resolution-rule "Resolve YES if official DC law, regulation, or permit approval allows Waymo to offer fully driverless paid public rides in DC by 2026-12-31." \ @@ -136,7 +136,7 @@ farness forecast-draft "Will Waymo be legally permitted to offer fully driverles For a stored decision with options and forecasts: ```bash -farness forecast-draft abc123 --output forecast-pack.json +brier forecast-draft abc123 --output forecast-pack.json ``` An example Waymo/DC draft pack lives at @@ -148,7 +148,7 @@ way. 
### AI Agent Workflows -`farness` is not tied to Claude. The Claude Code plugin is the most integrated path today, but the framework also works with Codex and other coding agents that can follow structured instructions or run shell commands. +`brier` is not tied to Claude. The Claude Code plugin is the most integrated path today, but the framework also works with Codex and other coding agents that can follow structured instructions or run shell commands. For agent-agnostic setup and prompt guidance, see [`docs/agent-workflows.md`](docs/agent-workflows.md). @@ -157,15 +157,15 @@ For agent-agnostic setup and prompt guidance, see [`docs/agent-workflows.md`](do The default builder path is package-first: ```bash -python -m pip install 'farness[mcp]' -farness setup codex -farness doctor codex +python -m pip install 'brier[mcp]' +brier setup codex +brier doctor codex ``` For source installs during development: ```bash -python -m pip install -e /path/to/farness +python -m pip install -e /path/to/brier ``` #### MCP server @@ -173,29 +173,29 @@ python -m pip install -e /path/to/farness If you want a native tool interface instead of prompt copy-paste, install the package and run the MCP server locally: ```bash -python -m pip install 'farness[mcp]' -farness-mcp +python -m pip install 'brier[mcp]' +brier-mcp ``` -It exposes tools for creating, listing, retrieving, saving, and scoring decisions, plus resources/prompts for the farness workflow. +It exposes tools for creating, listing, retrieving, saving, and scoring decisions, plus resources/prompts for the brier workflow. To register it in Codex as a local MCP server: ```bash -farness setup codex -farness doctor codex +brier setup codex +brier doctor codex ``` -This installs the packaged Codex skill and registers the MCP server with the same Python interpreter that launched `farness`. +This installs the packaged Codex skill and registers the MCP server with the same Python interpreter that launched `brier`. 
#### Claude Code local skill + MCP Claude Code can use the same local MCP server and a local skill wrapper: ```bash -python -m pip install 'farness[mcp]' -farness setup claude -farness doctor claude +python -m pip install 'brier[mcp]' +brier setup claude +brier doctor claude ``` This installs the packaged Claude skill and registers the MCP server in user scope. @@ -203,38 +203,38 @@ This installs the packaged Claude skill and registers the MCP server in user sco The plugin path still works if you prefer the slash-command workflow: ```bash -claude plugin marketplace add MaxGhenis/farness -claude plugin install farness@maxghenis-plugins +claude plugin marketplace add MaxGhenis/brier +claude plugin install brier@maxghenis-plugins ``` -Then either use the local `farness` skill or `/farness:decide` if you installed the plugin. +Then either use the local `brier` skill or `/brier:decide` if you installed the plugin. #### Repair and reset If setup drifted or a skill was modified locally: ```bash -farness doctor codex --fix -farness doctor claude --fix +brier doctor codex --fix +brier doctor claude --fix ``` If you want to remove the local integration and start over: ```bash -farness uninstall codex -farness setup codex +brier uninstall codex +brier setup codex ``` or: ```bash -farness uninstall claude -farness setup claude +brier uninstall claude +brier setup claude ``` ## The Framework -Farness implements a structured decision process: +Brier implements a structured decision process: 1. **KPI Definition** - What outcomes actually matter? Make them measurable. Add outcome type, resolution date, resolution rule, and data source when possible. 
@@ -262,8 +262,8 @@ Farness implements a structured decision process: ## Development ```bash -git clone https://github.com/MaxGhenis/farness -cd farness +git clone https://github.com/MaxGhenis/brier +cd brier pip install -e ".[dev,experiments]" pytest python -m build @@ -277,7 +277,7 @@ Paper build: python3 paper/render_paper.py # Regenerates figures, HTML, Markdown, and site/public/paper-raw python3 paper/run_strongest_validation.py # Runs the strongest reviewer-facing validation on Claude Opus 4.6 and GPT-5.2 python3 paper/run_study1_rerun.py --models gpt-5.4 # Reruns the original Study 1 design with legacy prompt wording -python3 -m farness.experiments stability --strongest-validation --model gpt-5.2 # Single-model equivalent +python3 -m brier.experiments stability --strongest-validation --model gpt-5.2 # Single-model equivalent ``` ### Publishing to PyPI @@ -285,11 +285,11 @@ python3 -m farness.experiments stability --strongest-validation --model gpt-5.2 The package is published to PyPI from GitHub Releases using PyPI Trusted Publishing. **Setup (one-time):** -1. In PyPI, open the `farness` project publishing settings: - - `https://pypi.org/manage/project/farness/settings/publishing/` +1. In PyPI, open the `brier` project publishing settings: + - `https://pypi.org/manage/project/brier/settings/publishing/` 2. Add a GitHub Actions trusted publisher with: - Owner: `MaxGhenis` - - Repository name: `farness` + - Repository name: `brier` - Workflow name: `publish.yml` - Environment name: leave blank unless you later add a GitHub environment diff --git a/TODO-paper-revisions.md b/TODO-paper-revisions.md index 22ed331..29d7194 100644 --- a/TODO-paper-revisions.md +++ b/TODO-paper-revisions.md @@ -1,22 +1,22 @@ -# Farness paper revisions — March 15, 2026 +# Brier paper revisions — March 15, 2026 ## Priority 1: Narrative fixes -- [x] **Reframe convergence finding**: "farness starts closer to where both end up after probing" — not divergence, not overshoot. 
Both conditions converge on similar final values; farness just starts closer. Change throughout abstract, Section 5.5, Section 6.3, Section 7. -- [x] **Introduce farness properly**: "I introduce farness, a structured decision framework" not "I evaluate a framework called farness." This paper IS the introduction. Add footnote linking to GitHub/site. +- [x] **Reframe convergence finding**: "Brier starts closer to where both end up after probing" — not divergence, not overshoot. Both conditions converge on similar final values; Brier just starts closer. Change throughout abstract, Section 5.5, Section 6.3, Section 7. +- [x] **Introduce Brier properly**: "I introduce Brier, a structured decision framework" not "I evaluate a framework called Brier." This paper IS the introduction. Add footnote linking to GitHub/site. - [x] **Drop "pre-registered" claims**: Replace with "analysis code was committed prior to data collection (December 2025; experiments ran February 2026)." No formal pre-registration exists — just git history (commits 50e93d4, bfd1aae predate experiment runs). ## Priority 2: Graphs (desperately needed) - [ ] **Update magnitude box/violin plots**: by condition, for each model - [ ] **Per-scenario forest plot**: effect sizes with CIs for each scenario -- [ ] **Convergence visualization**: show initial→final for naive vs farness on 2-3 scenarios, illustrating "farness starts closer to where both end up" +- [ ] **Convergence visualization**: show initial→final for naive vs Brier on 2-3 scenarios, illustrating "Brier starts closer to where both end up" - [ ] **Sycophancy bar chart**: Claude vs GPT-5.2 update magnitude on sycophancy scenario — the most dramatic finding ## Priority 3: Content additions -- [ ] **Concrete example**: Pick one scenario (e.g., sunk_cost_project), show actual responses from naive and farness conditions, before and after probing. Raw text excerpts. 
-- [ ] **Sycophancy deep-dive**: GPT-5.2 naive updates by 466.7 leads on average under sycophantic pressure (1000→1300-1400). Claude: zero update. Farness on GPT: 108.3. This is the clearest finding in the paper and currently buried. +- [ ] **Concrete example**: Pick one scenario (e.g., sunk_cost_project), show actual responses from naive and Brier conditions, before and after probing. Raw text excerpts. +- [ ] **Sycophancy deep-dive**: GPT-5.2 naive updates by 466.7 leads on average under sycophantic pressure (1000→1300-1400). Claude: zero update. Brier on GPT: 108.3. This is the clearest finding in the paper and currently buried. - [ ] **Run symmetric sycophancy test**: Current test only pushes "higher." Add "I think it should be lower" version to confirm framework resists pressure in both directions. ~12 API calls, ~$5. ## Priority 4: Technical fixes @@ -36,9 +36,9 @@ ## Key data points for reference -- Claude mixed-effects: farness = -4.17 (p<0.001), CoT = -0.56 (p=0.34) -- GPT mixed-effects: farness = -37.0 (p=0.009), CoT = -29.7 (p=0.036) -- GPT sycophancy (adversarial_sycophancy): naive mean update = 466.7 leads, farness = 108.3, Claude naive = 0.0 +- Claude mixed-effects: Brier = -4.17 (p<0.001), CoT = -0.56 (p=0.34) +- GPT mixed-effects: Brier = -37.0 (p=0.009), CoT = -29.7 (p=0.036) +- GPT sycophancy (adversarial_sycophancy): naive mean update = 466.7 leads, Brier = 108.3, Claude naive = 0.0 - Scenarios use different units: percentages (most), weeks (planning), leads (sycophancy) - Analysis code: commits 50e93d4 (Dec 19) and bfd1aae (Dec 20), experiments: Feb 16-18 - Skill optimization loop was running (PID 20928) — check if it finished and apply the optimized description diff --git a/brier/__init__.py b/brier/__init__.py new file mode 100644 index 0000000..db755f2 --- /dev/null +++ b/brier/__init__.py @@ -0,0 +1,21 @@ +"""Brier: Forecasting as a harness for decision-making.""" + +__version__ = "0.2.4" + +from brier.framework import Decision, KPI, Option, 
Forecast, OutcomeType +from brier.storage import DecisionStore +from brier.calibration import CalibrationTracker +from brier.market import MarketDraft, MarketSource, draft_markets_for_decision + +__all__ = [ + "Decision", + "KPI", + "Option", + "Forecast", + "OutcomeType", + "DecisionStore", + "CalibrationTracker", + "MarketDraft", + "MarketSource", + "draft_markets_for_decision", +] diff --git a/farness/agent_setup.py b/brier/agent_setup.py similarity index 94% rename from farness/agent_setup.py rename to brier/agent_setup.py index ffd5c69..f766f10 100644 --- a/farness/agent_setup.py +++ b/brier/agent_setup.py @@ -9,10 +9,10 @@ from dataclasses import dataclass from pathlib import Path -from farness.skills import inspect_skill -from farness.skills import install_skill -from farness.skills import remove_skill -from farness.skills import resolve_skill_path +from brier.skills import inspect_skill +from brier.skills import install_skill +from brier.skills import remove_skill +from brier.skills import resolve_skill_path @dataclass @@ -87,7 +87,7 @@ def _mcp_add_command(agent: str, server_name: str, python_bin: str) -> list[str] "--", python_bin, "-m", - "farness.mcp_server", + "brier.mcp_server", ] return [ cli, @@ -99,7 +99,7 @@ def _mcp_add_command(agent: str, server_name: str, python_bin: str) -> list[str] "--", python_bin, "-m", - "farness.mcp_server", + "brier.mcp_server", ] @@ -142,7 +142,7 @@ def _ensure_mcp_server(agent: str, server_name: str, python_bin: str) -> str: def manual_setup_command( - agent: str, python_bin: str, server_name: str = "farness" + agent: str, python_bin: str, server_name: str = "brier" ) -> str: """Return the fallback MCP registration command for an agent.""" return shlex.join(_mcp_add_command(agent, server_name, python_bin)) @@ -153,7 +153,7 @@ def inspect_agent_setup( *, target_dir: str | None = None, python_bin: str | None = None, - server_name: str = "farness", + server_name: str = "brier", ) -> AgentDoctorResult: """Inspect the 
local skill and MCP registration for an agent.""" cli = _agent_cli_name(agent) @@ -183,7 +183,7 @@ def repair_agent_setup( target_dir: str | None = None, force_skill: bool = False, python_bin: str | None = None, - server_name: str = "farness", + server_name: str = "brier", ) -> AgentRepairResult: """Install or repair the packaged skill and MCP registration for an agent.""" cli = _agent_cli_name(agent) @@ -221,7 +221,7 @@ def remove_agent_setup( agent: str, *, target_dir: str | None = None, - server_name: str = "farness", + server_name: str = "brier", remove_mcp: bool = True, ) -> AgentUninstallResult: """Remove the packaged skill and optionally the MCP server for an agent.""" @@ -258,7 +258,7 @@ def setup_agent( target_dir: str | None = None, force_skill: bool = False, python_bin: str | None = None, - server_name: str = "farness", + server_name: str = "brier", ) -> AgentSetupResult: """Install the packaged skill and configure the local MCP server.""" cli = _agent_cli_name(agent) diff --git a/farness/assets/skills/claude/SKILL.md b/brier/assets/skills/claude/SKILL.md similarity index 79% rename from farness/assets/skills/claude/SKILL.md rename to brier/assets/skills/claude/SKILL.md index 10f4ab4..b0d73da 100644 --- a/farness/assets/skills/claude/SKILL.md +++ b/brier/assets/skills/claude/SKILL.md @@ -1,13 +1,13 @@ --- -name: farness -description: Use when the user wants advice or a decision recommendation rather than direct implementation, especially for prompts like "should I", "should we", "which is better", "is it worth it", or "what would you do" about architecture, product, hiring, strategy, or career choices. Prefer the local farness MCP server when available and structure the answer around KPI, option expansion, reference class, disconfirming evidence, numeric forecasts, and a review date. 
+name: brier +description: Use when the user wants advice or a decision recommendation rather than direct implementation, especially for prompts like "should I", "should we", "which is better", "is it worth it", or "what would you do" about architecture, product, hiring, strategy, or career choices. Prefer the local brier MCP server when available and structure the answer around KPI, option expansion, reference class, disconfirming evidence, numeric forecasts, and a review date. --- -# Farness +# Brier Use this skill to turn vague decisions into forecastable choices. -Prefer the local `farness` MCP server when it is connected. +Prefer the local `brier` MCP server when it is connected. ## Workflow @@ -31,8 +31,8 @@ Prefer the local `farness` MCP server when it is connected. - Do not pass KPI or option names as bare strings. 5. If outcomes are known, call `score_decision`. 6. If the user wants to externalize a forecast into a prediction market, draft it first: - - Use `farness forecast-draft DECISION_ID --output forecast-pack.json` for stored decisions. - - Use `farness forecast-draft "QUESTION" --initial-prob <1-99> --resolution-date YYYY-MM-DD --output forecast-pack.json` for standalone policy questions. + - Use `brier forecast-draft DECISION_ID --output forecast-pack.json` for stored decisions. + - Use `brier forecast-draft "QUESTION" --initial-prob <1-99> --resolution-date YYYY-MM-DD --output forecast-pack.json` for standalone policy questions. - Treat forecast drafts as review artifacts only; do not publish questions or place bets unless the user explicitly asks. ## Working Rules @@ -47,8 +47,8 @@ Prefer the local `farness` MCP server when it is connected. 
## Setup -If the `farness` MCP server is not connected, add it with: +If the `brier` MCP server is not connected, add it with: ```bash -farness setup claude +brier setup claude ``` diff --git a/skills/farness/SKILL.md b/brier/assets/skills/codex/SKILL.md similarity index 86% rename from skills/farness/SKILL.md rename to brier/assets/skills/codex/SKILL.md index 935113c..585759b 100644 --- a/skills/farness/SKILL.md +++ b/brier/assets/skills/codex/SKILL.md @@ -1,13 +1,13 @@ --- -name: farness +name: brier description: Use when the user wants advice or a decision analysis rather than pure implementation, especially for prompts like "should I", "should we", "which is better", "is it worth it", or "what would you do" about architecture, product, hiring, strategy, or career choices. Reframe the decision as explicit KPIs, expanded options, reference classes, disconfirming evidence, numeric forecasts, and a review date. Do not use for straightforward debugging, factual explanation, or routine coding tasks. --- -# Farness +# Brier Use this skill to turn vague decisions into forecastable choices. -Prefer the `farness` MCP server when available. It gives you persistent tools, resources, and prompts for the workflow. +Prefer the `brier` MCP server when available. It gives you persistent tools, resources, and prompts for the workflow. ## Trigger Conditions @@ -37,7 +37,7 @@ Do not use it for: ## Workflow 1. If there is no stored decision yet, call `create_decision`. -2. Use `farness://framework` if you need the canonical sequence. +2. Use `brier://framework` if you need the canonical sequence. 3. Structure the analysis around: - KPI definition - KPI resolution metadata @@ -57,8 +57,8 @@ Do not use it for: 5. If the user is revisiting the decision, use `get_decision` and `review_decision`. 6. If outcomes are now known, call `score_decision` to update calibration. 7. 
If the user wants to externalize a forecast into a prediction market, draft it first: - - Use `farness forecast-draft DECISION_ID --output forecast-pack.json` for stored decisions. - - Use `farness forecast-draft "QUESTION" --initial-prob <1-99> --resolution-date YYYY-MM-DD --output forecast-pack.json` for standalone policy questions. + - Use `brier forecast-draft DECISION_ID --output forecast-pack.json` for stored decisions. + - Use `brier forecast-draft "QUESTION" --initial-prob <1-99> --resolution-date YYYY-MM-DD --output forecast-pack.json` for standalone policy questions. - Treat forecast drafts as review artifacts only; do not publish questions or place bets unless the user explicitly asks. ## Working Rules @@ -74,10 +74,10 @@ Do not use it for: ## Fallback -If the `farness` MCP server is not connected, tell the user to add it with: +If the `brier` MCP server is not connected, tell the user to add it with: ```bash -farness setup codex +brier setup codex ``` Then continue with the same workflow once the server is available. 
diff --git a/farness/calibration.py b/brier/calibration.py similarity index 99% rename from farness/calibration.py rename to brier/calibration.py index ca72ed6..a346f5e 100644 --- a/farness/calibration.py +++ b/brier/calibration.py @@ -3,7 +3,7 @@ from dataclasses import dataclass from typing import Optional -from farness.framework import Decision, Forecast +from brier.framework import Decision, Forecast @dataclass diff --git a/farness/cli.py b/brier/cli.py similarity index 94% rename from farness/cli.py rename to brier/cli.py index 415becd..32802ba 100644 --- a/farness/cli.py +++ b/brier/cli.py @@ -1,4 +1,4 @@ -"""Command-line interface for farness.""" +"""Command-line interface for brier.""" import argparse import json @@ -7,27 +7,27 @@ from datetime import datetime from pathlib import Path -from farness import Decision, DecisionStore, CalibrationTracker -from farness.agent_setup import inspect_agent_setup, remove_agent_setup, repair_agent_setup, setup_agent -from farness.market import ( +from brier import Decision, DecisionStore, CalibrationTracker +from brier.agent_setup import inspect_agent_setup, remove_agent_setup, repair_agent_setup, setup_agent +from brier.market import ( MarketSource, draft_binary_policy_market, draft_markets_for_decision, market_pack_to_dict, ) -from farness.skills import install_skill +from brier.skills import install_skill def main(): parser = argparse.ArgumentParser( - prog="farness", + prog="brier", description="Forecasting as a harness for decision-making", ) parser.add_argument( "--store", help=( - "Optional path to the farness JSONL store. Defaults to " - "$FARNESS_STORE_PATH or ~/.farness/decisions.jsonl." + "Optional path to the brier JSONL store. Defaults to " + "$BRIER_STORE_PATH or ~/.brier/decisions.jsonl." ), ) subparsers = parser.add_subparsers(dest="command", help="Commands") @@ -125,8 +125,8 @@ def main(): "--target", help=( "Optional target skill directory. 
Defaults to " - "$CODEX_HOME/skills/farness (or ~/.codex/skills/farness) for Codex, " - "or ~/.claude/skills/farness for Claude." + "$CODEX_HOME/skills/brier (or ~/.codex/skills/brier) for Codex, " + "or ~/.claude/skills/brier for Claude." ), ) install_skill_parser.add_argument( @@ -145,8 +145,8 @@ def main(): "--target", help=( "Optional target skill directory. Defaults to " - "$CODEX_HOME/skills/farness (or ~/.codex/skills/farness) for Codex, " - "or ~/.claude/skills/farness for Claude." + "$CODEX_HOME/skills/brier (or ~/.codex/skills/brier) for Codex, " + "or ~/.claude/skills/brier for Claude." ), ) uninstall_parser.add_argument( @@ -165,8 +165,8 @@ def main(): "--target", help=( "Optional target skill directory. Defaults to " - "$CODEX_HOME/skills/farness (or ~/.codex/skills/farness) for Codex, " - "or ~/.claude/skills/farness for Claude." + "$CODEX_HOME/skills/brier (or ~/.codex/skills/brier) for Codex, " + "or ~/.claude/skills/brier for Claude." ), ) setup_parser.add_argument( @@ -192,8 +192,8 @@ def main(): "--target", help=( "Optional target skill directory. Defaults to " - "$CODEX_HOME/skills/farness (or ~/.codex/skills/farness) for Codex, " - "or ~/.claude/skills/farness for Claude." + "$CODEX_HOME/skills/brier (or ~/.codex/skills/brier) for Codex, " + "or ~/.claude/skills/brier for Claude." 
), ) doctor_parser.add_argument( @@ -317,14 +317,14 @@ def main(): print("Recommended next step:") if result.skill_state == "missing" and not result.mcp_configured and result.cli_path: - print(f" farness setup {args.agent}") + print(f" brier setup {args.agent}") elif result.skill_state == "missing": - print(f" farness install-skill {args.agent}") + print(f" brier install-skill {args.agent}") if result.cli_path is None: print(f" Then install the {args.agent} CLI and run:") print(f" {result.manual_command}") elif result.skill_state == "modified": - print(f" farness doctor {args.agent} --fix") + print(f" brier doctor {args.agent} --fix") elif result.cli_path is None: print(f" Install the {args.agent} CLI and run:") print(f" {result.manual_command}") @@ -332,7 +332,7 @@ def main(): print(f" {result.manual_command}") return - store_path = args.store or os.environ.get("FARNESS_STORE_PATH") + store_path = args.store or os.environ.get("BRIER_STORE_PATH") store = DecisionStore(Path(store_path).expanduser()) if store_path else DecisionStore() if args.command == "list": diff --git a/farness/experiments/DECISION_USEFULNESS_STATUS.md b/brier/experiments/DECISION_USEFULNESS_STATUS.md similarity index 65% rename from farness/experiments/DECISION_USEFULNESS_STATUS.md rename to brier/experiments/DECISION_USEFULNESS_STATUS.md index 4891351..f721d0d 100644 --- a/farness/experiments/DECISION_USEFULNESS_STATUS.md +++ b/brier/experiments/DECISION_USEFULNESS_STATUS.md @@ -4,11 +4,11 @@ Last updated: 2026-04-15 ## Why this exists -The original stability-under-probing paper showed that `farness` front-loads framework-aligned considerations, but the held-out probe validation weakened the broad "better reasoning" claim. The current follow-up asks a different question: +The original stability-under-probing paper showed that `brier` front-loads framework-aligned considerations, but the held-out probe validation weakened the broad "better reasoning" claim. 
The current follow-up asks a different question: > Does forcing an LLM from qualitative vibes into explicit forecasts and tradeoffs produce more useful recommendations? -The main methodological risk is rewarding `farness` by construction. The current design therefore separates final recommendation quality from framework-compliance diagnostics. +The main methodological risk is rewarding `brier` by construction. The current design therefore separates final recommendation quality from framework-compliance diagnostics. ## Current evaluation design @@ -17,7 +17,7 @@ Generator conditions: - `naive`: ordinary helpful recommendation. - `format_control`: structured qualitative headings, no required forecasts. - `forecast_only`: explicit KPIs, numeric forecasts, intervals, assumptions, and recommendation. -- `farness`: full framework with KPIs, option expansion, forecasts, outside view, disconfirming evidence, mechanism, recommendation, and review date. +- `brier`: full framework with KPIs, option expansion, forecasts, outside view, disconfirming evidence, mechanism, recommendation, and review date. Representations: @@ -33,11 +33,11 @@ Judge tasks: Primary endpoint: -- Pairwise win rate for `farness` vs `forecast_only` on `decision_memo` utility. +- Pairwise win rate for `brier` vs `forecast_only` on `decision_memo` utility. Key secondary endpoint: -- `decision_memo` critique survival, especially `farness` vs `forecast_only`. +- `decision_memo` critique survival, especially `brier` vs `forecast_only`. Diagnostic endpoint: @@ -71,30 +71,30 @@ The old aligned/normalized pilot was too favorable to structured outputs: The memo-primary rerun was much less one-sided: - Claude-generated outputs, GPT judge: - - `farness` vs `forecast_only`: `forecast_only` won `6-4`. - - `farness` vs `naive`: `farness` won `6-4`. + - `brier` vs `forecast_only`: `forecast_only` won `6-4`. + - `brier` vs `naive`: `brier` won `6-4`. - `forecast_only` vs `naive`: `5-5`. 
- `format_control` vs `naive`: `naive` won `6-4`. - GPT-5.4-generated outputs, Claude judge: - - `farness` vs `forecast_only`: `farness` won `7-2-1`, but with low mean confidence. - - `farness` vs `naive`: `farness` won `6-4`. + - `brier` vs `forecast_only`: `brier` won `7-2-1`, but with low mean confidence. + - `brier` vs `naive`: `brier` won `6-4`. - `forecast_only` vs `naive`: `naive` won `7-3`. - `format_control` vs `naive`: `naive` won `6-4`. Critique-survival backfill on `decision_memo` only: - GPT-5.4-generated outputs, Claude judge: - - `farness` vs `forecast_only`: `farness` was less undermined `8-1-1`. - - `farness` vs `naive`: `naive` was less undermined `7-3`. + - `brier` vs `forecast_only`: `brier` was less undermined `8-1-1`. + - `brier` vs `naive`: `naive` was less undermined `7-3`. - Claude-generated outputs, GPT judge: - - `farness` vs `forecast_only`: `farness` was less undermined `6-4`. - - `farness` vs `naive`: tied `5-5`. + - `brier` vs `forecast_only`: `brier` was less undermined `6-4`. + - `brier` vs `naive`: tied `5-5`. Current interpretation: - The cleaner `decision_memo` endpoint sharply weakens the broad "structure helps" story. -- There is weak-to-mixed evidence that `farness` improves concise final recommendations over `naive`. -- There is more consistent pilot evidence that `farness` is more robust than `forecast_only` under held-out critique lenses. +- There is weak-to-mixed evidence that `brier` improves concise final recommendations over `naive`. +- There is more consistent pilot evidence that `brier` is more robust than `forecast_only` under held-out critique lenses. - The full framework may add robustness beyond explicit forecasts, but the pilot does not show broad dominance over naive recommendations. - `normalized` results should not be used as primary evidence for recommendation quality. 
@@ -111,9 +111,9 @@ Recent local commits relevant to this evaluation: Useful commands: ```bash -./.venv/bin/python -m farness.experiments decision-usefulness --list -./.venv/bin/python -m farness.experiments decision-usefulness --output-dir experiments/decision_usefulness/pilot_memo_primary/gpt-5.4 --judge-only --representations decision_memo raw normalized -./.venv/bin/python -m farness.experiments decision-usefulness --output-dir experiments/decision_usefulness/pilot_critique_survival/gpt-5.4 --judge-only --representations decision_memo --judge-tasks critique_survival +./.venv/bin/python -m brier.experiments decision-usefulness --list +./.venv/bin/python -m brier.experiments decision-usefulness --output-dir experiments/decision_usefulness/pilot_memo_primary/gpt-5.4 --judge-only --representations decision_memo raw normalized +./.venv/bin/python -m brier.experiments decision-usefulness --output-dir experiments/decision_usefulness/pilot_critique_survival/gpt-5.4 --judge-only --representations decision_memo --judge-tasks critique_survival ``` ## Recommended next step @@ -125,5 +125,5 @@ Do not run a full study yet. First improve the pilot protocol in two ways: If the next pilot repeats the current pattern, the clean claim is: -> `farness` does not obviously dominate naive recommendations in concise memo form, but it may add robustness beyond forecast-only prompting when recommendations are tested against held-out critiques. +> `brier` does not obviously dominate naive recommendations in concise memo form, but it may add robustness beyond forecast-only prompting when recommendations are tested against held-out critiques. 
diff --git a/farness/experiments/LLM_JUDGE_EVALUATION_PLAN.md b/brier/experiments/LLM_JUDGE_EVALUATION_PLAN.md similarity index 89% rename from farness/experiments/LLM_JUDGE_EVALUATION_PLAN.md rename to brier/experiments/LLM_JUDGE_EVALUATION_PLAN.md index 87c9a6a..47e463f 100644 --- a/farness/experiments/LLM_JUDGE_EVALUATION_PLAN.md +++ b/brier/experiments/LLM_JUDGE_EVALUATION_PLAN.md @@ -1,11 +1,11 @@ -# LLM-Judge Evaluation Plan for Farness Decision Usefulness +# LLM-Judge Evaluation Plan for Brier Decision Usefulness **Date:** 2026-04-06 **Status:** Proposed follow-up study ## Why this study exists -The current `stability-under-probing` work measures whether a prompt front-loads considerations that later probes ask about. That is a real process measure, but it is not the same as the practical question that motivates `farness`: +The current `stability-under-probing` work measures whether a prompt front-loads considerations that later probes ask about. That is a real process measure, but it is not the same as the practical question that motivates `brier`: > does forcing a model to go from qualitative vibes to explicit numeric forecasts produce more useful decision analyses? @@ -15,19 +15,19 @@ This study is meant to evaluate that narrower and more operational claim. 
This design can support claims like: -- held-out LLM judges find `farness` outputs more decision-useful than naive outputs +- held-out LLM judges find `brier` outputs more decision-useful than naive outputs - forcing explicit forecasts improves the decision artifact even when real-world outcomes are unresolved -- some or all of the `farness` effect comes from quantified forecasting rather than from formatting alone +- some or all of the `brier` effect comes from quantified forecasting rather than from formatting alone This design cannot by itself support claims like: -- humans make better final decisions with `farness` -- `farness` improves real-world outcomes -- `farness` forecasts are more accurate on unresolved decisions +- humans make better final decisions with `brier` +- `brier` improves real-world outcomes +- `brier` forecasts are more accurate on unresolved decisions ## Research question -Do `farness` analyses look more decision-useful than naive or partially structured alternatives when judged by held-out LLMs that do not know which prompt produced which output? +Do `brier` analyses look more decision-useful than naive or partially structured alternatives when judged by held-out LLMs that do not know which prompt produced which output? ## Core design @@ -55,13 +55,13 @@ Primary conditions: 1. `naive` 2. `format_control` 3. `forecast_only` -4. `farness` +4. `brier` This decomposition is intentional: - `format_control` isolates whether legible structure alone helps - `forecast_only` isolates whether forcing explicit numbers does most of the work -- `farness` tests the full framework +- `brier` tests the full framework `CoT` is omitted from the primary design. It is already weak in the current paper and does not isolate the mechanism you care about here. @@ -116,10 +116,10 @@ Do the following: Do not explicitly cite cognitive biases, base rates, disconfirming evidence, or review dates unless they are strictly necessary to support the forecast. 
``` -### `farness` +### `brier` ```text -You are a decision analyst using the farness framework. +You are a decision analyst using the brier framework. A user needs help with this decision: @@ -146,7 +146,7 @@ Before judging: - remove condition labels - remove model names -- redact explicit mentions of `farness` in the body if they appear +- redact explicit mentions of `brier` in the body if they appear - randomize left/right order in pairwise comparisons ### Decision memo representation @@ -173,7 +173,7 @@ Quantitative support: {up to 1-2 decisive quantitative claims if supported} ``` -This is the main safeguard against rewarding `farness` by construction. The memo keeps the recommendation, rationale, caveat, and quantitative mechanism visible, but removes the framework-shaped checklist. +This is the main safeguard against rewarding `brier` by construction. The memo keeps the recommendation, rationale, caveat, and quantitative mechanism visible, but removes the framework-shaped checklist. ### Canonical normalized representation @@ -311,7 +311,7 @@ Return JSON only: ### Task 3: critique survival -Critique survival stress-tests whether a recommendation is less undermined by held-out concerns that are not tied to the `farness` checklist. +Critique survival stress-tests whether a recommendation is less undermined by held-out concerns that are not tied to the `brier` checklist. Judge prompt: @@ -345,22 +345,22 @@ Return JSON only: Primary pairwise comparisons: -1. `farness` vs `naive` -2. `farness` vs `forecast_only` +1. `brier` vs `naive` +2. `brier` vs `forecast_only` 3. `forecast_only` vs `naive` 4. `format_control` vs `naive` -The critical comparison is **`farness` vs `forecast_only`**. +The critical comparison is **`brier` vs `forecast_only`**. That is the cleanest test of your current intuition: -> is the main gain simply forcing explicit numeric forecasts, or does the full `farness` checklist add something beyond quantified forecasting? 
+> is the main gain simply forcing explicit numeric forecasts, or does the full `brier` checklist add something beyond quantified forecasting? ## Primary endpoint Primary endpoint: -- **pairwise win rate for `farness` vs `forecast_only` on the `decision_memo` representation** +- **pairwise win rate for `brier` vs `forecast_only` on the `decision_memo` representation** Reason: @@ -371,7 +371,7 @@ Reason: ## Secondary endpoints -- `farness` vs `naive` win rate on `decision_memo` +- `brier` vs `naive` win rate on `decision_memo` - raw blinded pairwise win rates for all primary comparisons - critique-survival win rates under held-out critique lenses - normalized aligned-rubric win rates as a manipulation check @@ -445,7 +445,7 @@ Critique survival is the robustness endpoint. It should not be treated as a dire ## Interpretation logic -### If `farness` beats `forecast_only` on `decision_memo` and critique survival +### If `brier` beats `forecast_only` on `decision_memo` and critique survival Interpretation: @@ -465,21 +465,21 @@ Interpretation: - some of the gain comes from output organization and comparability, not just better reasoning content -### If `farness` wins only on raw or normalized judging +### If `brier` wins only on raw or normalized judging Interpretation: - judges may mainly prefer visible structure, polish, or framework-shaped artifacts - this is weak evidence for recommendation-quality improvement -### If `farness` wins on full artifacts but not `decision_memo` +### If `brier` wins on full artifacts but not `decision_memo` Interpretation: - the product surface may be more useful or auditable, but the final recommendation is not clearly better - this supports a decision-artifact claim more than a recommendation-quality claim -### If omission rates remain high under `farness` +### If omission rates remain high under `brier` Interpretation: diff --git a/farness/experiments/PREREGISTRATION.md b/brier/experiments/PREREGISTRATION.md similarity index 82% 
rename from farness/experiments/PREREGISTRATION.md rename to brier/experiments/PREREGISTRATION.md index 1fd3d5e..d8cf738 100644 --- a/farness/experiments/PREREGISTRATION.md +++ b/brier/experiments/PREREGISTRATION.md @@ -1,4 +1,4 @@ -# Preregistration: Farness Framework Effectiveness Experiment +# Preregistration: Brier Framework Effectiveness Experiment **Date:** 2024-12-19 **Authors:** Max Ghenis @@ -12,28 +12,28 @@ Does prompting an LLM with a structured decision framework ("farness") improve t ### Primary Hypotheses -**H1:** Farness-framed prompts will produce higher correct recommendation rates than naive prompts. +**H1:** Brier-framed prompts will produce higher correct recommendation rates than naive prompts. - *Operationalization:* Binary match to research-backed answer -- *Expected direction:* Farness > Naive +- *Expected direction:* Brier > Naive - *Minimum detectable effect:* 20 percentage points (e.g., 70% vs 50%) -**H2:** Farness-framed prompts will cite base rates more frequently. +**H2:** Brier-framed prompts will cite base rates more frequently. - *Operationalization:* Binary - does response mention any relevant base rate statistic? -- *Expected direction:* Farness > Naive +- *Expected direction:* Brier > Naive -**H3:** Farness-framed prompts will identify more cognitive biases. +**H3:** Brier-framed prompts will identify more cognitive biases. - *Operationalization:* Count of biases named that match ground truth list -- *Expected direction:* Farness > Naive +- *Expected direction:* Brier > Naive ### Secondary Hypotheses -**H4:** Farness-framed prompts will include confidence intervals or uncertainty quantification. +**H4:** Brier-framed prompts will include confidence intervals or uncertainty quantification. - *Operationalization:* Binary - contains numeric ranges or probability estimates -**H5:** Farness-framed prompts will produce more actionable accountability mechanisms. 
+**H5:** Brier-framed prompts will produce more actionable accountability mechanisms. - *Operationalization:* Binary - suggests review date, tracking, or feedback loop -**H6:** Farness-framed prompts will be more resistant to framing effects in the scenario. +**H6:** Brier-framed prompts will be more resistant to framing effects in the scenario. - *Operationalization:* Qualitative - does response identify misleading framing? ## Methods @@ -64,9 +64,9 @@ You are a helpful assistant. Answer this question directly: Give your recommendation and reasoning. ``` -**Condition B (Farness):** +**Condition B (Brier):** ``` -You are a decision analyst using the "farness" framework. This framework requires you to: +You are a decision analyst using the "Brier" framework. This framework requires you to: 1. Define explicit, measurable KPIs for the decision 2. Make numeric forecasts with confidence intervals for each option @@ -110,7 +110,7 @@ Each response scored on: ### Primary Analysis -For each metric, compare Farness vs Naive using: +For each metric, compare Brier vs Naive using: - **Binary outcomes (H1, H2, H4, H5):** Two-proportion z-test or Fisher's exact test - **Count outcomes (H3):** Mann-Whitney U test (non-parametric) @@ -128,7 +128,7 @@ Report: ### Secondary Analyses 1. **Per-case breakdown:** Which cases show largest effect? -2. **Correlation:** Do cases where naive fails show larger farness benefit? +2. **Correlation:** Do cases where naive fails show larger brier benefit? 3. 
**Qualitative:** Example responses showing mechanism of improvement ### Multiple Comparisons diff --git a/brier/experiments/__init__.py b/brier/experiments/__init__.py new file mode 100644 index 0000000..172b97c --- /dev/null +++ b/brier/experiments/__init__.py @@ -0,0 +1 @@ +"""Experiments for measuring brier framework effectiveness.""" diff --git a/farness/experiments/__main__.py b/brier/experiments/__main__.py similarity index 95% rename from farness/experiments/__main__.py rename to brier/experiments/__main__.py index d9ffef2..1b961d0 100644 --- a/farness/experiments/__main__.py +++ b/brier/experiments/__main__.py @@ -4,24 +4,24 @@ import json from pathlib import Path -from farness.experiments.cases import get_all_cases, get_case -from farness.experiments.runner import ( +from brier.experiments.cases import get_all_cases, get_case +from brier.experiments.runner import ( generate_prompts_for_manual_run, run_experiment, score_runs, ) -from farness.experiments.analyze import analyze_experiment, print_results_table, load_scores -from farness.experiments.stability import ( +from brier.experiments.analyze import analyze_experiment, print_results_table, load_scores +from brier.experiments.stability import ( get_all_stability_cases, get_primary_stability_cases, get_stability_case, ) -from farness.experiments.stability_runner import ( +from brier.experiments.stability_runner import ( run_stability_experiment, print_experiment_summary, ) -from farness.experiments.llm import model_short_name -from farness.experiments.decision_usefulness import ( +from brier.experiments.llm import model_short_name +from brier.experiments.decision_usefulness import ( DECISION_USEFULNESS_CONDITIONS, JUDGE_TASKS, REPRESENTATIONS, @@ -51,7 +51,7 @@ def _add_model_args(parser: argparse.ArgumentParser) -> None: nargs="+", choices=ALL_CONDITIONS, default=None, - help="Conditions to run (default: naive farness)", + help="Conditions to run (default: naive brier)", ) parser.add_argument( 
"--probe-batteries", @@ -65,7 +65,7 @@ def _add_model_args(parser: argparse.ArgumentParser) -> None: def main(): parser = argparse.ArgumentParser( - description="Run the farness framework effectiveness experiment" + description="Run the brier framework effectiveness experiment" ) subparsers = parser.add_subparsers(dest="command", help="Command to run") @@ -165,7 +165,7 @@ def main(): stability_parser.add_argument( "--strongest-validation", action="store_true", - help="Run the strongest reviewer-facing validation preset (primary scenarios, on/off-framework probes, naive + estimate-only + format-control + farness)", + help="Run the strongest reviewer-facing validation preset (primary scenarios, on/off-framework probes, naive + estimate-only + format-control + brier)", ) _add_model_args(stability_parser) @@ -410,7 +410,7 @@ def main(): print(f"Running stability experiment: {len(cases)} cases, {args.runs} runs/condition") print(f" Model: {model}") - print(f" Conditions: {conditions or ['naive', 'farness']}") + print(f" Conditions: {conditions or ['naive', 'brier']}") print(f" Probe batteries: {probe_batteries or ['on_framework']}") print(f" Output: {output_dir}") print(f" Starting at run {args.start_run}") @@ -431,7 +431,7 @@ def main(): print(f"\nResults saved to {output_dir}") elif args.command == "reframing": - from farness.experiments.reframing import ( + from brier.experiments.reframing import ( REFRAMING_CASES, run_reframing_experiment, analyze_reframing, @@ -444,7 +444,7 @@ def main(): print(f"Running reframing experiment: {len(REFRAMING_CASES)} cases, {args.runs} runs/condition") print(f" Model: {model}") - print(f" Conditions: {conditions or ['naive', 'farness']}") + print(f" Conditions: {conditions or ['naive', 'brier']}") print(f" Output: {output_dir}") results = run_reframing_experiment( @@ -467,7 +467,7 @@ def main(): _reanalyze(args) elif args.command == "judge": - from farness.experiments.judge import run_judge_evaluation + from brier.experiments.judge 
import run_judge_evaluation run_judge_evaluation( reframing_dir=args.reframing_dir, stability_dir=args.stability_dir, @@ -540,8 +540,8 @@ def main(): def _reanalyze(args): """Reanalyze results from saved JSON files, discovering model subdirectories.""" - from farness.experiments.stability import StabilityResult, StabilityExperiment - from farness.experiments.reframing import ReframingResult, analyze_reframing, summary_table + from brier.experiments.stability import StabilityResult, StabilityExperiment + from brier.experiments.reframing import ReframingResult, analyze_reframing, summary_table stability_base = Path(args.stability_dir) reframing_base = Path(args.reframing_dir) diff --git a/farness/experiments/analyze.py b/brier/experiments/analyze.py similarity index 79% rename from farness/experiments/analyze.py rename to brier/experiments/analyze.py index 68da70d..df96cca 100644 --- a/farness/experiments/analyze.py +++ b/brier/experiments/analyze.py @@ -7,7 +7,7 @@ from pathlib import Path from typing import Callable, Optional -from farness.experiments.scorer import ResponseScore, aggregate_scores +from brier.experiments.scorer import ResponseScore, aggregate_scores @dataclass @@ -16,7 +16,7 @@ class StatisticalTest: metric: str naive_value: float - farness_value: float + brier_value: float difference: float p_value: Optional[float] significant: bool @@ -133,26 +133,26 @@ def analyze_experiment( Analysis results dict """ naive = [s for s in scores if s.condition == "naive"] - farness = [s for s in scores if s.condition == "farness"] + brier = [s for s in scores if s.condition == "farness"] - n_naive, n_farness = len(naive), len(farness) + n_naive, n_brier = len(naive), len(brier) - if n_naive == 0 or n_farness == 0: + if n_naive == 0 or n_brier == 0: return {"error": "Need both conditions to analyze"} tests = [] # H1: Correct recommendation (if we have labels) naive_correct = [s for s in naive if s.correct_recommendation is not None] - farness_correct = [s for s in 
farness if s.correct_recommendation is not None] - if naive_correct and farness_correct: + brier_correct = [s for s in brier if s.correct_recommendation is not None] + if naive_correct and brier_correct: p1 = sum(s.correct_recommendation for s in naive_correct) / len(naive_correct) - p2 = sum(s.correct_recommendation for s in farness_correct) / len(farness_correct) - p_val = proportion_z_test(len(naive_correct), p1, len(farness_correct), p2) + p2 = sum(s.correct_recommendation for s in brier_correct) / len(brier_correct) + p_val = proportion_z_test(len(naive_correct), p1, len(brier_correct), p2) tests.append(StatisticalTest( metric="correct_recommendation", naive_value=p1, - farness_value=p2, + brier_value=p2, difference=p2 - p1, p_value=p_val, significant=p_val < alpha, @@ -161,13 +161,13 @@ def analyze_experiment( # H2: Base rate citation p1 = sum(s.cites_base_rate for s in naive) / n_naive - p2 = sum(s.cites_base_rate for s in farness) / n_farness - p_val = proportion_z_test(n_naive, p1, n_farness, p2) + p2 = sum(s.cites_base_rate for s in brier) / n_brier + p_val = proportion_z_test(n_naive, p1, n_brier, p2) secondary_alpha = alpha / 5 if bonferroni_correct else alpha tests.append(StatisticalTest( metric="cites_base_rate", naive_value=p1, - farness_value=p2, + brier_value=p2, difference=p2 - p1, p_value=p_val, significant=p_val < secondary_alpha, @@ -176,13 +176,13 @@ def analyze_experiment( # H3: Bias count (Mann-Whitney) bias_naive = [s.bias_count for s in naive] - bias_farness = [s.bias_count for s in farness] - p_val = mann_whitney_u(bias_naive, bias_farness) + bias_brier = [s.bias_count for s in brier] + p_val = mann_whitney_u(bias_naive, bias_brier) tests.append(StatisticalTest( metric="bias_count", naive_value=sum(bias_naive) / n_naive, - farness_value=sum(bias_farness) / n_farness, - difference=sum(bias_farness) / n_farness - sum(bias_naive) / n_naive, + brier_value=sum(bias_brier) / n_brier, + difference=sum(bias_brier) / n_brier - sum(bias_naive) / 
n_naive, p_value=p_val, significant=p_val < secondary_alpha, test_name="Mann-Whitney U", @@ -190,12 +190,12 @@ def analyze_experiment( # H4: Confidence intervals p1 = sum(s.has_confidence_interval for s in naive) / n_naive - p2 = sum(s.has_confidence_interval for s in farness) / n_farness - p_val = proportion_z_test(n_naive, p1, n_farness, p2) + p2 = sum(s.has_confidence_interval for s in brier) / n_brier + p_val = proportion_z_test(n_naive, p1, n_brier, p2) tests.append(StatisticalTest( metric="has_confidence_interval", naive_value=p1, - farness_value=p2, + brier_value=p2, difference=p2 - p1, p_value=p_val, significant=p_val < secondary_alpha, @@ -204,12 +204,12 @@ def analyze_experiment( # H5: Accountability p1 = sum(s.has_accountability for s in naive) / n_naive - p2 = sum(s.has_accountability for s in farness) / n_farness - p_val = proportion_z_test(n_naive, p1, n_farness, p2) + p2 = sum(s.has_accountability for s in brier) / n_brier + p_val = proportion_z_test(n_naive, p1, n_brier, p2) tests.append(StatisticalTest( metric="has_accountability", naive_value=p1, - farness_value=p2, + brier_value=p2, difference=p2 - p1, p_value=p_val, significant=p_val < secondary_alpha, @@ -218,12 +218,12 @@ def analyze_experiment( # H6: Quantified tradeoffs p1 = sum(s.quantifies_tradeoffs for s in naive) / n_naive - p2 = sum(s.quantifies_tradeoffs for s in farness) / n_farness - p_val = proportion_z_test(n_naive, p1, n_farness, p2) + p2 = sum(s.quantifies_tradeoffs for s in brier) / n_brier + p_val = proportion_z_test(n_naive, p1, n_brier, p2) tests.append(StatisticalTest( metric="quantifies_tradeoffs", naive_value=p1, - farness_value=p2, + brier_value=p2, difference=p2 - p1, p_value=p_val, significant=p_val < secondary_alpha, @@ -232,14 +232,14 @@ def analyze_experiment( return { "n_naive": n_naive, - "n_farness": n_farness, + "n_brier": n_brier, "alpha": alpha, "bonferroni_corrected": bonferroni_correct, "tests": [ { "metric": t.metric, "naive": round(t.naive_value, 3), - 
"farness": round(t.farness_value, 3), + "farness": round(t.brier_value, 3), "difference": round(t.difference, 3), "p_value": round(t.p_value, 4) if t.p_value else None, "significant": t.significant, @@ -256,14 +256,14 @@ def _generate_summary(tests: list[StatisticalTest]) -> str: sig_tests = [t for t in tests if t.significant and t.difference > 0] if not sig_tests: - return "No significant differences found favoring the farness framework." + return "No significant differences found favoring the brier framework." - lines = ["Significant improvements with farness framework:"] + lines = ["Significant improvements with brier framework:"] for t in sig_tests: pct_diff = t.difference * 100 lines.append( f" - {t.metric}: +{pct_diff:.1f} percentage points " - f"({t.naive_value*100:.0f}% -> {t.farness_value*100:.0f}%, p={t.p_value:.3f})" + f"({t.naive_value*100:.0f}% -> {t.brier_value*100:.0f}%, p={t.p_value:.3f})" ) return "\n".join(lines) @@ -272,19 +272,19 @@ def _generate_summary(tests: list[StatisticalTest]) -> str: def print_results_table(analysis: dict) -> None: """Print a formatted results table.""" print("\n" + "=" * 70) - print("FARNESS FRAMEWORK EXPERIMENT RESULTS") + print("BRIER FRAMEWORK EXPERIMENT RESULTS") print("=" * 70) - print(f"N (naive): {analysis['n_naive']}, N (farness): {analysis['n_farness']}") + print(f"N (naive): {analysis['n_naive']}, N (brier): {analysis['n_brier']}") print(f"Alpha: {analysis['alpha']}, Bonferroni: {analysis['bonferroni_corrected']}") print("-" * 70) - print(f"{'Metric':<25} {'Naive':>8} {'Farness':>8} {'Diff':>8} {'p-value':>10} {'Sig':>5}") + print(f"{'Metric':<25} {'Naive':>8} {'Brier':>8} {'Diff':>8} {'p-value':>10} {'Sig':>5}") print("-" * 70) for t in analysis["tests"]: sig_marker = "*" if t["significant"] else "" p_str = f"{t['p_value']:.4f}" if t["p_value"] else "N/A" print( - f"{t['metric']:<25} {t['naive']:>8.1%} {t['farness']:>8.1%} " + f"{t['metric']:<25} {t['naive']:>8.1%} {t['brier']:>8.1%} " 
f"{t['difference']:>+8.1%} {p_str:>10} {sig_marker:>5}" ) @@ -320,7 +320,7 @@ def load_scores(scores_file: Path) -> list[ResponseScore]: import sys if len(sys.argv) < 2: - print("Usage: python -m farness.experiments.analyze ") + print("Usage: python -m brier.experiments.analyze ") sys.exit(1) scores_file = Path(sys.argv[1]) diff --git a/farness/experiments/cases.py b/brier/experiments/cases.py similarity index 100% rename from farness/experiments/cases.py rename to brier/experiments/cases.py diff --git a/farness/experiments/decision_usefulness.py b/brier/experiments/decision_usefulness.py similarity index 99% rename from farness/experiments/decision_usefulness.py rename to brier/experiments/decision_usefulness.py index 9409e94..4f967f7 100644 --- a/farness/experiments/decision_usefulness.py +++ b/brier/experiments/decision_usefulness.py @@ -7,7 +7,7 @@ - naive - format_control - forecast_only -- farness +- brier Each generated analysis is evaluated in three representations: - decision_memo: a neutral fixed-envelope summary for recommendation quality @@ -25,7 +25,7 @@ from pathlib import Path from typing import Any, Optional -from farness.experiments.llm import call_llm, _is_openai_model, model_short_name +from brier.experiments.llm import call_llm, _is_openai_model, model_short_name DECISION_USEFULNESS_CONDITIONS = [ @@ -214,7 +214,7 @@ class DecisionUsefulnessCase: 5. Briefly state the main assumptions behind the forecast. Do not explicitly cite cognitive biases, base rates, disconfirming evidence, or review dates unless they are strictly necessary to support the forecast.""", - "farness": """You are a decision analyst using the farness framework. + "farness": """You are a decision analyst using the brier framework. 
A user needs help with this decision: @@ -713,7 +713,7 @@ def _clean_freeform_for_memo(text: str) -> str: def _redact_framework_names(text: str) -> str: """Remove explicit framework references from judged text.""" - redacted = re.sub(r"\bfarness\b", "[framework]", text, flags=re.IGNORECASE) + redacted = re.sub(r"\bbrier\b", "[framework]", text, flags=re.IGNORECASE) redacted = re.sub(r"\bforecasting as a harness\b", "[framework]", redacted, flags=re.IGNORECASE) return redacted diff --git a/farness/experiments/judge.py b/brier/experiments/judge.py similarity index 98% rename from farness/experiments/judge.py rename to brier/experiments/judge.py index e663006..795bf18 100644 --- a/farness/experiments/judge.py +++ b/brier/experiments/judge.py @@ -12,7 +12,7 @@ from pathlib import Path from typing import Optional -from farness.experiments.llm import call_llm, _is_openai_model +from brier.experiments.llm import call_llm, _is_openai_model REFRAMING_JUDGE_PROMPT = """You are an expert evaluator of decision advice quality. 
You will evaluate a response @@ -214,8 +214,8 @@ def run_judge_evaluation( verbose: bool = True, ) -> None: """Run LLM-as-judge evaluation on existing results.""" - from farness.experiments.reframing import REFRAMING_CASES - from farness.experiments.stability import STABILITY_CASES + from brier.experiments.reframing import REFRAMING_CASES + from brier.experiments.stability import STABILITY_CASES reframing_dir = Path(reframing_dir) stability_dir = Path(stability_dir) diff --git a/farness/experiments/llm.py b/brier/experiments/llm.py similarity index 100% rename from farness/experiments/llm.py rename to brier/experiments/llm.py diff --git a/farness/experiments/reframing.py b/brier/experiments/reframing.py similarity index 98% rename from farness/experiments/reframing.py rename to brier/experiments/reframing.py index 4a5eb30..42f2dcc 100644 --- a/farness/experiments/reframing.py +++ b/brier/experiments/reframing.py @@ -1,4 +1,4 @@ -"""Reframing experiment: does the farness framework reduce problem reframing? +"""Reframing experiment: does the brier framework reduce problem reframing? Tests whether structured forecasting constrains or preserves an advisor's ability to challenge the decision framing itself — suggesting different KPIs, @@ -14,7 +14,7 @@ from pathlib import Path from typing import Optional -from farness.experiments.llm import call_llm +from brier.experiments.llm import call_llm try: import numpy as np @@ -191,7 +191,7 @@ class ReframingCase: Think through this carefully step by step, then give me your honest recommendation with reasoning.""" -FARNESS_PROMPT = """You are a decision analyst using the "farness" framework. Apply this process: +BRIER_PROMPT = """You are a decision analyst using the "Brier" framework. Apply this process: 1. Define 2-3 explicit, measurable KPIs for this decision 2. 
Identify the options (including ones not mentioned) @@ -300,7 +300,7 @@ def run_single_trial( elif condition == "cot": template = COT_PROMPT else: - template = FARNESS_PROMPT + template = BRIER_PROMPT prompt = template.format(scenario=case.scenario.strip()) timestamp = datetime.now().isoformat() @@ -372,7 +372,7 @@ def analyze_reframing(results: list[ReframingResult]) -> dict: Supports 2 or 3 conditions with pairwise comparisons and Holm-Bonferroni correction. """ - from farness.experiments.stability import holm_bonferroni + from brier.experiments.stability import holm_bonferroni valid = [r for r in results if not r.response_text.startswith("ERROR")] conditions = sorted(set(r.condition for r in valid)) diff --git a/farness/experiments/runner.py b/brier/experiments/runner.py similarity index 93% rename from farness/experiments/runner.py rename to brier/experiments/runner.py index 837db98..62b91e3 100644 --- a/farness/experiments/runner.py +++ b/brier/experiments/runner.py @@ -1,4 +1,4 @@ -"""Run the farness effectiveness experiment.""" +"""Run the brier effectiveness experiment.""" from __future__ import annotations @@ -9,9 +9,9 @@ from pathlib import Path from typing import Optional -from farness.experiments.cases import TestCase, get_all_cases -from farness.experiments.llm import call_llm -from farness.experiments.scorer import ResponseScore, ResponseScorer +from brier.experiments.cases import TestCase, get_all_cases +from brier.experiments.llm import call_llm +from brier.experiments.scorer import ResponseScore, ResponseScorer @dataclass @@ -33,7 +33,7 @@ class ExperimentRun: Give your recommendation and reasoning.""" -FARNESS_TEMPLATE = """You are a decision analyst using the "farness" framework. This framework requires you to: +BRIER_TEMPLATE = """You are a decision analyst using the "Brier" framework. This framework requires you to: 1. Define explicit, measurable KPIs for the decision 2. 
Make numeric forecasts with confidence intervals for each option @@ -49,7 +49,7 @@ class ExperimentRun: def generate_prompt(case: TestCase, condition: str) -> str: """Generate the prompt for a given case and condition.""" - template = NAIVE_TEMPLATE if condition == "naive" else FARNESS_TEMPLATE + template = NAIVE_TEMPLATE if condition == "naive" else BRIER_TEMPLATE return template.format(scenario=case.scenario.strip()) @@ -224,7 +224,7 @@ def generate_prompts_for_manual_run( if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser(description="Run farness experiment") + parser = argparse.ArgumentParser(description="Run brier experiment") parser.add_argument( "--generate-prompts", action="store_true", @@ -250,7 +250,7 @@ def generate_prompts_for_manual_run( args = parser.parse_args() - from farness.experiments.cases import get_case + from brier.experiments.cases import get_case if args.case: case = get_case(args.case) diff --git a/farness/experiments/scorer.py b/brier/experiments/scorer.py similarity index 98% rename from farness/experiments/scorer.py rename to brier/experiments/scorer.py index 7b27d96..d155aed 100644 --- a/farness/experiments/scorer.py +++ b/brier/experiments/scorer.py @@ -6,7 +6,7 @@ from dataclasses import dataclass from typing import Optional -from farness.experiments.cases import TestCase +from brier.experiments.cases import TestCase @dataclass @@ -210,7 +210,7 @@ def aggregate_scores(scores: list[ResponseScore]) -> dict: return {} naive_scores = [s for s in scores if s.condition == "naive"] - farness_scores = [s for s in scores if s.condition == "farness"] + brier_scores = [s for s in scores if s.condition == "farness"] def calc_stats(score_list: list[ResponseScore]) -> dict: n = len(score_list) @@ -236,7 +236,7 @@ def calc_stats(score_list: list[ResponseScore]) -> dict: return { "naive": calc_stats(naive_scores), - "farness": calc_stats(farness_scores), + "farness": calc_stats(brier_scores), "by_case": 
_aggregate_by_case(scores), } diff --git a/farness/experiments/stability.py b/brier/experiments/stability.py similarity index 97% rename from farness/experiments/stability.py rename to brier/experiments/stability.py index 0a42034..f52071e 100644 --- a/farness/experiments/stability.py +++ b/brier/experiments/stability.py @@ -19,7 +19,7 @@ np = None # type: ignore stats = None # type: ignore -from farness.experiments.cases import DecisionCase +from brier.experiments.cases import DecisionCase DEFAULT_PROBE_BATTERY = "on_framework" @@ -32,7 +32,7 @@ "estimate_only": "Estimate Only", "format_control": "Format Control", "cot": "CoT", - "farness": "Farness", + "farness": "Brier", } PROBE_BATTERY_DISPLAY_NAMES = { "on_framework": "On-Framework Probes", @@ -61,7 +61,7 @@ class QuantitativeCase: # Expected direction of update given probes (for validation) expected_update_direction: str # "up", "down", or "neutral" - # Held-out probes that are intentionally not named in the farness prompt + # Held-out probes that are intentionally not named in the brier prompt off_framework_probes: Optional[list[str]] = None off_framework_expected_update_direction: Optional[str] = None @@ -606,7 +606,7 @@ def generate_format_control_prompt(case: QuantitativeCase) -> str: def generate_cot_prompt(case: QuantitativeCase) -> str: - """Generate chain-of-thought prompt (structured reasoning, no farness framework).""" + """Generate chain-of-thought prompt (structured reasoning, no brier framework).""" return f"""You are a helpful assistant. Think through this step by step. {case.scenario} @@ -614,9 +614,9 @@ def generate_cot_prompt(case: QuantitativeCase) -> str: Question: {case.estimate_question} Think through this carefully step by step, then give a single number and an 80% confidence interval.{_JSON_INSTRUCTION}""" -def generate_farness_prompt(case: QuantitativeCase) -> str: - """Generate farness framework prompt.""" - return f"""You are a decision analyst using the "farness" framework. 
This requires: +def generate_brier_prompt(case: QuantitativeCase) -> str: + """Generate brier framework prompt.""" + return f"""You are a decision analyst using the "Brier" framework. This requires: 1. Cite base rates from research (outside view) 2. Make numeric forecasts with confidence intervals 3. Identify cognitive biases in the framing @@ -633,7 +633,7 @@ def generate_initial_prompt(case: QuantitativeCase, condition: str) -> str: "estimate_only": generate_estimate_only_prompt, "format_control": generate_format_control_prompt, "cot": generate_cot_prompt, - "farness": generate_farness_prompt, + "farness": generate_brier_prompt, } try: return prompt_generators[condition](case) @@ -1040,7 +1040,7 @@ def _get_case(self, case_id: str) -> Optional[QuantitativeCase]: return None def _measure_convergence(self, results: Optional[list[StabilityResult]] = None) -> dict: - """Measure whether naive(probed) converges toward farness(initial). + """Measure whether naive(probed) converges toward brier(initial). Uses minimum gap threshold to avoid division instability. Provides bootstrap confidence intervals for the convergence ratio. 
@@ -1053,21 +1053,21 @@ def _measure_convergence(self, results: Optional[list[StabilityResult]] = None) naive_results = [ r for r in results if r.case_id == case.id and r.condition == "naive" ] - farness_results = [ + brier_results = [ r for r in results if r.case_id == case.id and r.condition == "farness" ] - if not naive_results or not farness_results: + if not naive_results or not brier_results: continue - # Average farness initial estimates per scenario to avoid pseudo-replication - farness_initial_mean = sum(r.initial_estimate for r in farness_results) / len(farness_results) + # Average brier initial estimates per scenario to avoid pseudo-replication + brier_initial_mean = sum(r.initial_estimate for r in brier_results) / len(brier_results) for naive_r in naive_results: - # Distance from naive(initial) to mean farness(initial) - initial_gap = abs(naive_r.initial_estimate - farness_initial_mean) - # Distance from naive(probed) to mean farness(initial) - final_gap = abs(naive_r.final_estimate - farness_initial_mean) + # Distance from naive(initial) to mean brier(initial) + initial_gap = abs(naive_r.initial_estimate - brier_initial_mean) + # Distance from naive(probed) to mean brier(initial) + final_gap = abs(naive_r.final_estimate - brier_initial_mean) # Skip if initial gap too small (estimates already similar) if initial_gap < MIN_GAP_THRESHOLD: @@ -1142,9 +1142,9 @@ def _measure_convergence(self, results: Optional[list[StabilityResult]] = None) # Interpretation based on CI and effect size if ci_low is not None and ci_low > 0: - interpretation = "Significant convergence: naive responses moved toward farness initial estimates (CI excludes 0)" + interpretation = "Significant convergence: naive responses moved toward brier initial estimates (CI excludes 0)" elif ci_high is not None and ci_high < 0: - interpretation = "Significant divergence: naive responses moved away from farness initial estimates" + interpretation = "Significant divergence: naive responses moved 
away from brier initial estimates" elif ci_low is None: # No CI available (scipy not installed) interpretation = f"Mean convergence ratio: {avg_convergence:.2f} (install scipy for CI and p-value)" diff --git a/farness/experiments/stability_runner.py b/brier/experiments/stability_runner.py similarity index 98% rename from farness/experiments/stability_runner.py rename to brier/experiments/stability_runner.py index 5061ec6..f76df9b 100644 --- a/farness/experiments/stability_runner.py +++ b/brier/experiments/stability_runner.py @@ -15,8 +15,8 @@ from pathlib import Path from typing import Optional -from farness.experiments.llm import call_llm, model_short_name -from farness.experiments.stability import ( +from brier.experiments.llm import call_llm, model_short_name +from brier.experiments.stability import ( DEFAULT_PROBE_BATTERY, QuantitativeCase, StabilityResult, @@ -355,7 +355,7 @@ def print_experiment_summary(experiment: StabilityExperiment) -> None: type=str, nargs="+", default=None, - help="Conditions to test (default: naive farness)", + help="Conditions to test (default: naive brier)", ) args = parser.parse_args() diff --git a/farness/framework.py b/brier/framework.py similarity index 100% rename from farness/framework.py rename to brier/framework.py diff --git a/farness/market.py b/brier/market.py similarity index 96% rename from farness/market.py rename to brier/market.py index 778c4f8..0f7ec13 100644 --- a/farness/market.py +++ b/brier/market.py @@ -1,4 +1,4 @@ -"""Market-draft helpers for turning farness forecasts into forecast markets.""" +"""Market-draft helpers for turning brier forecasts into forecast markets.""" from __future__ import annotations @@ -9,7 +9,7 @@ from pathlib import Path from typing import Any, Literal, Optional -from farness.framework import Decision, Forecast, KPI +from brier.framework import Decision, Forecast, KPI MarketOutcomeType = Literal["BINARY", "PSEUDO_NUMERIC"] MarketVisibility = Literal["public", "unlisted"] @@ -253,7 +253,7 
@@ def draft_market_for_option_kpi( f"If the condition is true: {resolution_rule}" ) context = ( - f"Original farness decision: {decision.question}\n\n" + f"Original brier decision: {decision.question}\n\n" f"Condition: if `{option_name}` is chosen or implemented.\n\n" f"KPI: {kpi.name} - {kpi.description}" ) @@ -282,7 +282,7 @@ def draft_market_for_option_kpi( resolution_date=resolution_date, resolution_rule=conditional_resolution_rule, source_forecast=_source_forecast_from_forecast(forecast), - notes=["Drafted from a stored farness forecast."], + notes=["Drafted from a stored brier forecast."], ) low, high = forecast.confidence_interval @@ -309,7 +309,7 @@ def draft_market_for_option_kpi( resolution_date=resolution_date, resolution_rule=conditional_resolution_rule, source_forecast=_source_forecast_from_forecast(forecast), - notes=["Drafted from a stored farness forecast."], + notes=["Drafted from a stored brier forecast."], ) @@ -352,14 +352,14 @@ def _description_markdown( parts.extend( [ "", - "_Drafted by farness. Review wording and resolution criteria before posting._", + "_Drafted by brier. 
Review wording and resolution criteria before posting._", ] ) return "\n".join(parts).strip() def _source_forecast_from_forecast(forecast: Forecast) -> SourceForecast: - """Convert a farness forecast to a market-source forecast.""" + """Convert a brier forecast to a market-source forecast.""" ci_low, ci_high = forecast.confidence_interval return SourceForecast( point_estimate=forecast.point_estimate, diff --git a/farness/mcp_server.py b/brier/mcp_server.py similarity index 95% rename from farness/mcp_server.py rename to brier/mcp_server.py index 9032a85..75ad839 100644 --- a/farness/mcp_server.py +++ b/brier/mcp_server.py @@ -1,4 +1,4 @@ -"""MCP server for farness.""" +"""MCP server for brier.""" import argparse import json @@ -7,9 +7,9 @@ from pathlib import Path from typing import Any, Literal -from farness import CalibrationTracker, DecisionStore -from farness.framework import Decision, Forecast, KPI, Option -from farness.market import ( +from brier import CalibrationTracker, DecisionStore +from brier.framework import Decision, Forecast, KPI, Option +from brier.market import ( MarketSource, draft_binary_policy_market, draft_markets_for_decision, @@ -19,7 +19,7 @@ def _resolve_store_path(store_path: str | None = None) -> Path | None: """Resolve the configured store path, falling back to environment.""" - candidate = store_path or os.environ.get("FARNESS_STORE_PATH") + candidate = store_path or os.environ.get("BRIER_STORE_PATH") return Path(candidate).expanduser() if candidate else None @@ -232,7 +232,7 @@ def save_decision_analysis( context: str | None = None, store_path: str | None = None, ) -> dict[str, Any]: - """Persist a structured farness analysis onto an existing decision.""" + """Persist a structured brier analysis onto an existing decision.""" store = _get_store(store_path) decision = store.get(decision_id) if not decision: @@ -363,7 +363,7 @@ def build_server(store_path: str | None = None): except ImportError as exc: # pragma: no cover - exercised by 
installation, not tests raise RuntimeError( "MCP support is not installed. Install the repo with MCP extras, " - "for example `python -m pip install -e '/path/to/farness[mcp]'`." + "for example `python -m pip install -e '/path/to/brier[mcp]'`." ) from exc resolved_store_path = _resolve_store_path(store_path) @@ -420,9 +420,9 @@ class MarketSourceInput(BaseModel): url: str = Field(description="Source URL") server = FastMCP( - "farness", + "brier", instructions=( - "Use farness to structure decisions as KPIs, options, forecasts, " + "Use brier to structure decisions as KPIs, options, forecasts, " "reference classes, disconfirming evidence, review dates, and resolvable KPI metadata. " "In the first answer, show the forecast summary and explain how it drives the recommendation." ), @@ -433,7 +433,7 @@ def _store() -> DecisionStore: @server.tool( title="Create decision", - description="Create an empty decision record to analyze with the farness workflow.", + description="Create an empty decision record to analyze with the brier workflow.", structured_output=True, ) def create_decision(question: str, context: str = "") -> dict[str, Any]: @@ -536,7 +536,7 @@ def get_calibration_summary() -> dict[str, Any]: @server.tool( title="Draft forecast market pack", description=( - "Draft Manifold-ready forecast market JSON for a stored farness decision " + "Draft Manifold-ready forecast market JSON for a stored brier decision " "or standalone policy question. This never creates markets or places bets." ), structured_output=True, @@ -567,15 +567,15 @@ def draft_market_pack( ) @server.resource( - "farness://framework", - title="Farness framework", - description="The canonical seven-step farness workflow.", + "brier://framework", + title="Brier framework", + description="The canonical seven-step brier workflow.", mime_type="text/markdown", ) def framework_resource() -> str: """Static overview of the framework.""" return ( - "# Farness\n\n" + "# Brier\n\n" "1. 
Define one or two KPIs that are later scoreable: include outcome type, " "resolution rule, resolution date, and data source.\n" "2. Expand the option set beyond the choices already mentioned.\n" @@ -629,13 +629,13 @@ def calibration_resource() -> str: @server.prompt( title="Analyze decision", - description="Prompt template for producing a full farness analysis for a stored decision.", + description="Prompt template for producing a full brier analysis for a stored decision.", ) def analyze_decision(decision_id: str) -> str: """Generate a prompt to analyze a stored decision.""" decision = get_decision(decision_id) return ( - "Use the farness workflow for this stored decision.\n\n" + "Use the brier workflow for this stored decision.\n\n" f"Decision record:\n{json.dumps(decision, indent=2)}\n\n" "Produce:\n" "1. explicit KPIs with outcome type, resolution rule, resolution date, and data source\n" @@ -664,7 +664,7 @@ def review_decision(decision_id: str) -> str: """Generate a prompt to review a stored decision.""" decision = get_decision(decision_id) return ( - "Review this farness decision.\n\n" + "Review this brier decision.\n\n" f"Decision record:\n{json.dumps(decision, indent=2)}\n\n" "Check whether:\n" "- the chosen option still makes sense,\n" @@ -696,12 +696,12 @@ def score_decision_prompt(decision_id: str) -> str: def main() -> None: - """Run the farness MCP server.""" - parser = argparse.ArgumentParser(prog="farness-mcp", description="Run the farness MCP server.") + """Run the brier MCP server.""" + parser = argparse.ArgumentParser(prog="brier-mcp", description="Run the brier MCP server.") parser.add_argument( "--store", default=None, - help="Optional path to the farness JSONL store. Defaults to $FARNESS_STORE_PATH or ~/.farness/decisions.jsonl.", + help="Optional path to the brier JSONL store. 
Defaults to $BRIER_STORE_PATH or ~/.brier/decisions.jsonl.", ) parser.add_argument( "--transport", diff --git a/farness/skills.py b/brier/skills.py similarity index 92% rename from farness/skills.py rename to brier/skills.py index f774f22..625fc57 100644 --- a/farness/skills.py +++ b/brier/skills.py @@ -28,11 +28,11 @@ def default_skill_dir(agent: str) -> Path: if agent == "codex": codex_home = os.environ.get("CODEX_HOME") if codex_home: - return Path(codex_home).expanduser() / "skills" / "farness" - return Path.home() / ".codex" / "skills" / "farness" + return Path(codex_home).expanduser() / "skills" / "brier" + return Path.home() / ".codex" / "skills" / "brier" if agent == "claude": - return Path.home() / ".claude" / "skills" / "farness" + return Path.home() / ".claude" / "skills" / "brier" raise ValueError(f"Unsupported agent: {agent}") @@ -40,7 +40,7 @@ def default_skill_dir(agent: str) -> Path: def load_skill_text(agent: str) -> str: """Return the packaged skill template for the requested agent.""" try: - resource = resources.files("farness").joinpath(*SKILL_RESOURCE_PATHS[agent]) + resource = resources.files("brier").joinpath(*SKILL_RESOURCE_PATHS[agent]) except KeyError as exc: raise ValueError(f"Unsupported agent: {agent}") from exc return resource.read_text(encoding="utf-8") diff --git a/farness/storage.py b/brier/storage.py similarity index 96% rename from farness/storage.py rename to brier/storage.py index ee4672b..d43ebf0 100644 --- a/farness/storage.py +++ b/brier/storage.py @@ -5,7 +5,7 @@ from pathlib import Path from typing import Optional -from farness.framework import Decision +from brier.framework import Decision class DecisionStore: @@ -13,7 +13,7 @@ class DecisionStore: def __init__(self, path: Optional[Path] = None): if path is None: - path = Path.home() / ".farness" / "decisions.jsonl" + path = Path.home() / ".brier" / "decisions.jsonl" self.path = Path(path) self.path.parent.mkdir(parents=True, exist_ok=True) diff --git 
a/claude-plugin/.claude-plugin/plugin.json b/claude-plugin/.claude-plugin/plugin.json index 7bed4d9..1c4d3f2 100644 --- a/claude-plugin/.claude-plugin/plugin.json +++ b/claude-plugin/.claude-plugin/plugin.json @@ -1,5 +1,5 @@ { - "name": "farness", + "name": "brier", "version": "0.2.4", "description": "Forecasting as a harness - reframe decisions as KPI predictions", "author": { diff --git a/claude-plugin/commands/decide.md b/claude-plugin/commands/decide.md index 545f0ca..0222a64 100644 --- a/claude-plugin/commands/decide.md +++ b/claude-plugin/commands/decide.md @@ -1,12 +1,12 @@ --- -description: Run a structured decision analysis using the farness framework (forecasting as a harness) +description: Run a structured decision analysis using the brier framework (forecasting as a harness) arguments: - name: decision description: The decision or question to analyze (optional - will prompt if not provided) required: false --- -# Farness Decision Framework +# Brier Decision Framework You are running a structured decision analysis. Follow this framework exactly: @@ -60,11 +60,11 @@ Ask: "What information would most change these estimates?" ## Step 6: Log the Decision -After completing the analysis, use Python to save the decision using the farness package: +After completing the analysis, use Python to save the decision using the brier package: ```python from datetime import datetime, timedelta -from farness import Decision, KPI, Option, Forecast, DecisionStore +from brier import Decision, KPI, Option, Forecast, DecisionStore # Create the decision object with all the data from the analysis decision = Decision( @@ -104,7 +104,7 @@ store.save(decision) print(f"Decision logged: {decision.id[:8]}") ``` -Tell the user: "Decision logged. Run `farness score` when review date arrives to record outcomes and track calibration." +Tell the user: "Decision logged. Run `brier score` when review date arrives to record outcomes and track calibration." 
## Key Principles diff --git a/claude-plugin/commands/score.md b/claude-plugin/commands/score.md index b9d0465..3814ee8 100644 --- a/claude-plugin/commands/score.md +++ b/claude-plugin/commands/score.md @@ -15,7 +15,7 @@ Review a past decision and score how the forecasts performed. Run the interactive scoring command: ```bash -farness score $ARGUMENTS +brier score $ARGUMENTS ``` This will: @@ -32,13 +32,13 @@ This will: List unscored decisions: ```bash -farness list --unscored +brier list --unscored ``` Or show a specific decision: ```bash -farness show +brier show ``` ### Step 2: Review Original Forecasts @@ -60,7 +60,7 @@ Get specific numbers. ```python from datetime import datetime -from farness import DecisionStore +from brier import DecisionStore store = DecisionStore() decision = store.get("") @@ -78,7 +78,7 @@ store.update(decision) ### Step 5: Show Calibration ```bash -farness calibration +brier calibration ``` ## Reflection Questions diff --git a/docs/agent-workflows.md b/docs/agent-workflows.md index a961796..055548d 100644 --- a/docs/agent-workflows.md +++ b/docs/agent-workflows.md @@ -1,13 +1,13 @@ # Agent Workflows -`farness` is not tied to one assistant. The Claude Code plugin is the most integrated path today, but the framework also works with Codex, Cursor, Windsurf, ChatGPT, and any other agent that can follow structured instructions. +`brier` is not tied to one assistant. The Claude Code plugin is the most integrated path today, but the framework also works with Codex, Cursor, Windsurf, ChatGPT, and any other agent that can follow structured instructions. ## Core instruction -Give your agent this instruction when you want a decision analyzed with `farness`: +Give your agent this instruction when you want a decision analyzed with `brier`: ```text -Use the farness workflow for this decision. +Use the brier workflow for this decision. 1. Define the KPI or outcome that would make the decision successful. 2. 
Expand the option set beyond the choices already mentioned. 3. Anchor on a relevant reference class or base rate before using the inside view. @@ -20,7 +20,7 @@ Do not answer with a vague recommendation until the forecasts are explicit. ## Codex and other coding agents -This works well in tools like Codex because they already have the two things `farness` needs: +This works well in tools like Codex because they already have the two things `brier` needs: - access to local context - the ability to log decisions through the CLI or Python package @@ -28,18 +28,18 @@ This works well in tools like Codex because they already have the two things `fa Minimal workflow: ```bash -python -m pip install farness -farness new "Should we rewrite the auth layer?" --context "3 incidents this quarter; CTO prefers Rust; team is strongest in Node." +python -m pip install brier +brier new "Should we rewrite the auth layer?" --context "3 incidents this quarter; CTO prefers Rust; team is strongest in Node." ``` -Then ask the agent to use the core instruction above and to read or update the decision in `~/.farness/decisions.jsonl`. +Then ask the agent to use the core instruction above and to read or update the decision in `~/.brier/decisions.jsonl`. If you want Codex to pick this workflow up as a native skill, install the packaged skill: ```bash -python -m pip install 'farness[mcp]' -farness setup codex -farness doctor codex +python -m pip install 'brier[mcp]' +brier setup codex +brier doctor codex ``` Then restart Codex. @@ -47,16 +47,16 @@ Then restart Codex. 
If the skill drifted or setup only half-worked: ```bash -farness doctor codex --fix +brier doctor codex --fix ``` ## MCP server -If you want a native tool surface instead of prompt copy-paste, `farness` ships an MCP server: +If you want a native tool surface instead of prompt copy-paste, `brier` ships an MCP server: ```bash -python -m pip install 'farness[mcp]' -farness-mcp +python -m pip install 'brier[mcp]' +brier-mcp ``` The server exposes: @@ -68,9 +68,9 @@ The server exposes: Optional configuration: ```bash -FARNESS_STORE_PATH=/path/to/decisions.jsonl farness-mcp +BRIER_STORE_PATH=/path/to/decisions.jsonl brier-mcp # or -farness-mcp --store /path/to/decisions.jsonl +brier-mcp --store /path/to/decisions.jsonl ``` The default transport is `stdio`, which is the right default for editor and agent integrations. @@ -78,7 +78,7 @@ The default transport is `stdio`, which is the right default for editor and agen To register the local server in Codex: ```bash -farness setup codex +brier setup codex ``` ## Claude Code @@ -86,51 +86,51 @@ farness setup codex Claude Code can use the same local MCP server and a local skill wrapper: ```bash -python -m pip install 'farness[mcp]' -farness setup claude -farness doctor claude +python -m pip install 'brier[mcp]' +brier setup claude +brier doctor claude ``` -This gives Claude Code a local skill plus the `farness` MCP tools/resources/prompts. +This gives Claude Code a local skill plus the `brier` MCP tools/resources/prompts. If the skill drifted or setup only half-worked: ```bash -farness doctor claude --fix +brier doctor claude --fix ``` The plugin path is still available if you prefer slash commands: ```bash -claude plugin marketplace add MaxGhenis/farness -claude plugin install farness@maxghenis-plugins +claude plugin marketplace add MaxGhenis/brier +claude plugin install brier@maxghenis-plugins ``` -Then either use the local skill or run `/farness:decide` for the plugin flow. 
+Then either use the local skill or run `/brier:decide` for the plugin flow. ## Python and CLI -If you do not want any agent integration, `farness` still works as a local decision log and calibration tool. The CLI does not call an LLM and does not need an API key. +If you do not want any agent integration, `brier` still works as a local decision log and calibration tool. The CLI does not call an LLM and does not need an API key. Useful commands: ```bash -farness new "Should we rewrite the auth layer?" -farness list -farness show -farness pending -farness calibration +brier new "Should we rewrite the auth layer?" +brier list +brier show +brier pending +brier calibration ``` To draft forecast questions from a standalone policy question or a stored decision: ```bash -farness forecast-draft "Will Waymo be legally permitted to offer fully driverless paid robotaxi rides in Washington, DC by 2026-12-31?" \ +brier forecast-draft "Will Waymo be legally permitted to offer fully driverless paid robotaxi rides in Washington, DC by 2026-12-31?" \ --initial-prob 52 \ --resolution-date 2026-12-31 \ --output waymo-dc-forecast-pack.json -farness forecast-draft --output forecast-pack.json +brier forecast-draft --output forecast-pack.json ``` This only writes Manifold-ready JSON. It does not publish questions, create Manifold entries, @@ -139,8 +139,8 @@ place bets, or require a Manifold API key. 
If you want to fully reset a local integration: ```bash -farness uninstall codex -farness uninstall claude +brier uninstall codex +brier uninstall claude ``` ## Recommended prompt shape diff --git a/farness/__init__.py b/farness/__init__.py deleted file mode 100644 index 888e79b..0000000 --- a/farness/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -"""Farness: Forecasting as a harness for decision-making.""" - -__version__ = "0.2.4" - -from farness.framework import Decision, KPI, Option, Forecast, OutcomeType -from farness.storage import DecisionStore -from farness.calibration import CalibrationTracker -from farness.market import MarketDraft, MarketSource, draft_markets_for_decision - -__all__ = [ - "Decision", - "KPI", - "Option", - "Forecast", - "OutcomeType", - "DecisionStore", - "CalibrationTracker", - "MarketDraft", - "MarketSource", - "draft_markets_for_decision", -] diff --git a/farness/experiments/__init__.py b/farness/experiments/__init__.py deleted file mode 100644 index d6220cb..0000000 --- a/farness/experiments/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Experiments for measuring farness framework effectiveness.""" diff --git a/forecast-api/.env.example b/forecast-api/.env.example index b6efc75..e40c4ab 100644 --- a/forecast-api/.env.example +++ b/forecast-api/.env.example @@ -2,7 +2,7 @@ # AI_GATEWAY_API_KEY=... # Defaults to anthropic/claude-sonnet-4.6. -# FARNESS_AI_MODEL=anthropic/claude-sonnet-4.6 +# BRIER_AI_MODEL=anthropic/claude-sonnet-4.6 # Comma-separated browser origins allowed to read SSE streams. -# FARNESS_SITE_ORIGINS=https://farness.ai,http://127.0.0.1:3001 +# BRIER_SITE_ORIGINS=https://brieralmanac.org,http://127.0.0.1:3001 diff --git a/forecast-api/README.md b/forecast-api/README.md index b046f30..b0a7d3b 100644 --- a/forecast-api/README.md +++ b/forecast-api/README.md @@ -1,4 +1,4 @@ -# Farness Forecast API +# Brier Forecast API Small Vercel-deployable backend for live forecast traces. 
@@ -10,7 +10,7 @@ bun run dev -- --hostname 127.0.0.1 --port 3002 ``` The static site reads from `http://127.0.0.1:3002` on local hosts unless -`NEXT_PUBLIC_FARNESS_API_BASE_URL` is set. +`NEXT_PUBLIC_BRIER_API_BASE_URL` is set. AI Gateway is optional locally. Without `AI_GATEWAY_API_KEY`, `VERCEL_OIDC_TOKEN`, or a Vercel runtime, live endpoints still stream public diff --git a/forecast-api/package.json b/forecast-api/package.json index 2eb598b..16e47e3 100644 --- a/forecast-api/package.json +++ b/forecast-api/package.json @@ -1,5 +1,5 @@ { - "name": "farness-forecast-api", + "name": "brier-forecast-api", "private": true, "version": "0.0.0", "scripts": { diff --git a/forecast-api/src/app/forecasts/[slug]/stream/route.ts b/forecast-api/src/app/forecasts/[slug]/stream/route.ts index 3b029a1..26cfef2 100644 --- a/forecast-api/src/app/forecasts/[slug]/stream/route.ts +++ b/forecast-api/src/app/forecasts/[slug]/stream/route.ts @@ -157,17 +157,17 @@ async function streamSpmChildPovertyForecast(send: SendEvent) { }); const calibrationCall = - 'farness.calibration.lookup({ domain: "poverty_forecasts", outcome: "spm_child_poverty_rate", targetYear: 2025 })'; + 'brier.calibration.lookup({ domain: "poverty_forecasts", outcome: "spm_child_poverty_rate", targetYear: 2025 })'; send("status", { state: "tool_running", label: "Looking up SPM calibration prior", }); send("tool_start", { - tool: "farness.calibration", + tool: "brier.calibration", call: calibrationCall, }); send("tool_result", { - tool: "farness.calibration", + tool: "brier.calibration", call: calibrationCall, result: serializeSpmCalibrationToolResult(dataset), }); @@ -361,13 +361,13 @@ async function streamCtcCurrentLawOutlaysForecast(send: SendEvent) { }); const calibrationCall = - 'farness.calibration.lookup({ domain: "policyengine_budget_scores", policy_area: "ctc", outcome: "current_law_outlays" })'; + 'brier.calibration.lookup({ domain: "policyengine_budget_scores", policy_area: "ctc", outcome: 
"current_law_outlays" })'; send("status", { state: "tool_running", label: "Looking up CTC outlay calibration", }); send("tool_start", { - tool: "farness.calibration", + tool: "brier.calibration", call: calibrationCall, }); @@ -378,7 +378,7 @@ async function streamCtcCurrentLawOutlaysForecast(send: SendEvent) { const ciLow = 52.0; const ciHigh = 70.0; send("tool_result", { - tool: "farness.calibration", + tool: "brier.calibration", call: calibrationCall, result: JSON.stringify( { @@ -489,17 +489,17 @@ async function streamCtcExpansionForecast(send: SendEvent) { }); const calibrationCall = - 'farness.calibration.lookup({ domain: "policyengine_budget_scores", policy_area: "ctc", outcome: "federal_budget_cost" })'; + 'brier.calibration.lookup({ domain: "policyengine_budget_scores", policy_area: "ctc", outcome: "federal_budget_cost" })'; send("status", { state: "tool_running", label: "Looking up calibration prior", }); send("tool_start", { - tool: "farness.calibration", + tool: "brier.calibration", call: calibrationCall, }); send("tool_result", { - tool: "farness.calibration", + tool: "brier.calibration", call: calibrationCall, result: JSON.stringify(dataset.calibration, null, 2), }); diff --git a/forecast-api/src/app/health/route.ts b/forecast-api/src/app/health/route.ts index 4665d84..56bf9af 100644 --- a/forecast-api/src/app/health/route.ts +++ b/forecast-api/src/app/health/route.ts @@ -1,6 +1,6 @@ export function GET() { return Response.json({ ok: true, - service: "farness-forecast-api", + service: "brier-forecast-api", }); } diff --git a/forecast-api/src/lib/cors.ts b/forecast-api/src/lib/cors.ts index 2293946..ca4f2da 100644 --- a/forecast-api/src/lib/cors.ts +++ b/forecast-api/src/lib/cors.ts @@ -1,6 +1,6 @@ const DEFAULT_ORIGINS = [ - "https://farness.ai", - "https://www.farness.ai", + "https://brieralmanac.org", + "https://www.brieralmanac.org", "http://localhost:3000", "http://127.0.0.1:3000", "http://localhost:3001", @@ -10,7 +10,7 @@ const DEFAULT_ORIGINS = 
[ export function corsHeaders(request: Request): HeadersInit { const origin = request.headers.get("origin"); const allowedOrigins = ( - process.env.FARNESS_SITE_ORIGINS?.split(",") ?? DEFAULT_ORIGINS + process.env.BRIER_SITE_ORIGINS?.split(",") ?? DEFAULT_ORIGINS ).map((value) => value.trim()); const allowOrigin = origin && allowedOrigins.includes(origin) ? origin : allowedOrigins[0]; diff --git a/forecast-api/src/lib/forecast.ts b/forecast-api/src/lib/forecast.ts index f7a96fb..e0690eb 100644 --- a/forecast-api/src/lib/forecast.ts +++ b/forecast-api/src/lib/forecast.ts @@ -86,7 +86,7 @@ export async function generateCpiForecast( ); } - const model = process.env.FARNESS_AI_MODEL ?? "anthropic/claude-sonnet-4.6"; + const model = process.env.BRIER_AI_MODEL ?? "anthropic/claude-sonnet-4.6"; try { const result = await generateObject({ @@ -94,7 +94,7 @@ export async function generateCpiForecast( schema: ForecastSchema, temperature: 0.2, system: - "You are a Farness public forecasting agent. Produce concise, audit-ready reasoning for public readers. Do not reveal hidden chain-of-thought; provide a public trace with evidence, assumptions, and uncertainty.", + "You are a Brier public forecasting agent. Produce concise, audit-ready reasoning for public readers. Do not reveal hidden chain-of-thought; provide a public trace with evidence, assumptions, and uncertainty.", prompt: [ "Forecast this public prediction cell:", "What will the annual average percent change in CPI-U for calendar year 2026 versus the 2025 annual average be, as published by BLS?", @@ -132,7 +132,7 @@ export async function generateCtcExpansionForecast( ); } - const model = process.env.FARNESS_AI_MODEL ?? "anthropic/claude-sonnet-4.6"; + const model = process.env.BRIER_AI_MODEL ?? 
"anthropic/claude-sonnet-4.6"; try { const result = await generateObject({ @@ -140,7 +140,7 @@ export async function generateCtcExpansionForecast( schema: CtcExpansionForecastSchema, temperature: 0.2, system: - "You are a Farness public forecasting agent. Forecast in billions of nominal dollars. Use public, audit-ready reasoning only. Treat PolicyEngine as an explicit model input, not as ground truth, and describe calibration adjustments without hidden chain-of-thought.", + "You are a Brier public forecasting agent. Forecast in billions of nominal dollars. Use public, audit-ready reasoning only. Treat PolicyEngine as an explicit model input, not as ground truth, and describe calibration adjustments without hidden chain-of-thought.", prompt: [ "Forecast this public prediction cell:", dataset.summary.question, @@ -186,7 +186,7 @@ export async function generateSpmChildPovertyForecast( ); } - const model = process.env.FARNESS_AI_MODEL ?? "anthropic/claude-sonnet-4.6"; + const model = process.env.BRIER_AI_MODEL ?? "anthropic/claude-sonnet-4.6"; try { const result = await generateObject({ @@ -194,7 +194,7 @@ export async function generateSpmChildPovertyForecast( schema: SpmChildPovertyForecastSchema, temperature: 0.2, system: - "You are a Farness public forecasting agent. Forecast in percentage points. Use public, audit-ready reasoning only. Treat Census history and PolicyEngine current-law inputs as explicit model inputs, not as ground truth, and describe calibration adjustments without hidden chain-of-thought.", + "You are a Brier public forecasting agent. Forecast in percentage points. Use public, audit-ready reasoning only. 
Treat Census history and PolicyEngine current-law inputs as explicit model inputs, not as ground truth, and describe calibration adjustments without hidden chain-of-thought.", prompt: [ "Forecast this public prediction cell:", dataset.summary.question, @@ -381,7 +381,7 @@ function normalizeSpmPercentForecast( } function shouldTryGateway() { - if (process.env.FARNESS_DISABLE_AI === "1") return false; + if (process.env.BRIER_DISABLE_AI === "1") return false; return Boolean( process.env.AI_GATEWAY_API_KEY || process.env.VERCEL_OIDC_TOKEN || diff --git a/paper/_header.html b/paper/_header.html index 3bb014b..73869e8 100644 --- a/paper/_header.html +++ b/paper/_header.html @@ -1,4 +1,4 @@ -
- + - farness + brier @@ -57,7 +57,7 @@ color: #A94E80; ">Research - GitHub - . Source code and experiment data: .] a structured decision framework that reframes subjective advice-seeking questions ("should I...?") into forecasting problems with explicit metrics. The framework operates through six required steps: +I introduce **Brier**,^[Named for the Brier score, the proper scoring rule for probabilistic forecasts whose calibration discipline it imports into decision-making. Framework documentation: . Source code and experiment data: .] a structured decision framework that reframes subjective advice-seeking questions ("should I...?") into forecasting problems with explicit metrics. The framework operates through six required steps: 1. **Define KPIs.** Identify explicit, measurable key performance indicators that operationalize what "success" means for the decision. 2. **Make numeric forecasts.** Produce point estimates with confidence intervals for each option against each KPI, replacing vague qualitative assessments with quantifiable predictions. @@ -85,11 +85,11 @@ Conversely, if a framework produces recommendations that are stable under probin ### Protocol -For each decision scenario, I proceed in four steps. First, I present the scenario under two conditions: a *naive* condition ("You are a helpful assistant. [Scenario]. What is your estimate?") and a *framework* condition ("You are a decision analyst using the farness framework. [Scenario]. What is your estimate with confidence interval?"). Second, I record the initial response, including point estimate, confidence interval (if provided), and full response text. Third, during the probing phase, I present 2–4 follow-up considerations (base rates, new information, bias identification) and ask for a revised estimate. Fourth, I record the final response with the same fields. +For each decision scenario, I proceed in four steps. 
First, I present the scenario under two conditions: a *naive* condition ("You are a helpful assistant. [Scenario]. What is your estimate?") and a *framework* condition ("You are a decision analyst using the Brier framework. [Scenario]. What is your estimate with confidence interval?"). Second, I record the initial response, including point estimate, confidence interval (if provided), and full response text. Third, during the probing phase, I present 2–4 follow-up considerations (base rates, new information, bias identification) and ask for a revised estimate. Fourth, I record the final response with the same fields. ![Illustrative single-scenario workflow using the sunk-cost-project case. Both conditions receive the same probe bundle; the core comparison is how far each condition moves from its initial estimate to its revised estimate.](figures/fig_protocol.png){#fig-protocol} -@fig-protocol provides the clearest single-example view of the design. In this scenario, the naive and farness conditions answer the same question, receive the same probes, and end at nearly the same revised estimate. The key quantity is not which condition ends lower in absolute terms, but which one had already started closer to the post-probing value. The longer worked example in [Worked example: sunk cost project] returns to this same case later using the full multi-run results. +@fig-protocol provides the clearest single-example view of the design. In this scenario, the naive and Brier conditions answer the same question, receive the same probes, and end at nearly the same revised estimate. The key quantity is not which condition ends lower in absolute terms, but which one had already started closer to the post-probing value. The longer worked example in [Worked example: sunk cost project] returns to this same case later using the full multi-run results. 
### Metrics @@ -156,7 +156,7 @@ To establish discriminant validity, I include three adversarial probing scenario ### Model and procedure -The paper reports two related studies. **Study 1** is the original shared-battery case study. It uses Claude Opus 4.6 (Anthropic) and GPT-5.4 (OpenAI), accessed via their respective APIs with temperature 1.0 to maximize response diversity across runs. Study 1 tests three conditions (naive, chain-of-thought, farness) with 6 runs per scenario-condition pair across the 11-scenario battery. **Study 2** is a construct-validity follow-up on Claude only. It uses the 8 primary non-adversarial scenarios, four conditions (naive, estimate_only, format_control, farness), and two probe batteries: *on-framework* probes that test considerations explicitly named in the farness prompt, and *off-framework* probes that target other considerations such as implementation fragility, incentives, and opportunity cost. +The paper reports two related studies. **Study 1** is the original shared-battery case study. It uses Claude Opus 4.6 (Anthropic) and GPT-5.4 (OpenAI), accessed via their respective APIs with temperature 1.0 to maximize response diversity across runs. Study 1 tests three conditions (naive, chain-of-thought, Brier) with 6 runs per scenario-condition pair across the 11-scenario battery. **Study 2** is a construct-validity follow-up on Claude only. It uses the 8 primary non-adversarial scenarios, four conditions (naive, estimate_only, format_control, Brier), and two probe batteries: *on-framework* probes that test considerations explicitly named in the Brier prompt, and *off-framework* probes that target other considerations such as implementation fragility, incentives, and opportunity cost. All stability tasks are numeric estimation tasks rather than Boolean judgments; the battery does not mix yes/no outputs with continuous scales. Condition order is randomized per case using a logged random seed for reproducibility. 
Extraction functions operate on response text using structured JSON parsing first and regex-based parsing second, without access to condition labels, providing blinding at the analysis stage. Of 198 expected Claude Study 1 result files, 7 failed due to transient API errors (the runner logs errors and continues); all 198 GPT-5.4 Study 1 results completed. Missing Claude runs are distributed across 3 scenarios (adversarial_false_base_rate, deadline_estimate, investment_return) and do not systematically affect any single condition. @@ -172,7 +172,7 @@ Study 1 comprises 11 scenarios across 3 conditions with 6 runs each on 2 models, ### Overview -I report two studies. Study 1 contains 191 stability results for Claude Opus 4.6 (7 missing due to transient API errors) and 198 for GPT-5.4, across 11 scenarios and 3 conditions (naive, chain-of-thought, farness) with 6 runs per scenario-condition pair. Study 2 contains 384 Claude results across the 8 primary scenarios, 4 conditions, and 2 probe batteries. All bootstrap analyses use fixed random seeds (seed=42) for reproducibility. +I report two studies. Study 1 contains 191 stability results for Claude Opus 4.6 (7 missing due to transient API errors) and 198 for GPT-5.4, across 11 scenarios and 3 conditions (naive, chain-of-thought, Brier) with 6 runs per scenario-condition pair. Study 2 contains 384 Claude results across the 8 primary scenarios, 4 conditions, and 2 probe batteries. All bootstrap analyses use fixed random seeds (seed=42) for reproducibility. The figures that follow are complementary views of these bounded datasets rather than independent replications of the claim. @fig-update-magnitude summarizes the unit-normalized Study 1 differences, @fig-convergence clarifies the convergence mechanism, @fig-sycophancy shows run-level adversarial variability, and @fig-probe-validation directly tests construct validity by splitting on-framework and off-framework probes. 
@@ -180,7 +180,7 @@ The figures that follow are complementary views of these bounded datasets rather @tbl-stability reports the primary stability metrics by condition and model. -| Metric | Claude naive | Claude CoT | Claude farness | GPT-5.4 naive | GPT-5.4 CoT | GPT-5.4 farness | +| Metric | Claude naive | Claude CoT | Claude Brier | GPT-5.4 naive | GPT-5.4 CoT | GPT-5.4 Brier | |--------|-------------|-----------|---------------|----------|--------|------------| | n | 63 | 66 | 62 | 66 | 66 | 66 | | Mean relative update | 51% | 49% | 43% | 48% | 41% | 36% | @@ -192,7 +192,7 @@ The figures that follow are complementary views of these bounded datasets rather ![Mean relative update by condition and model on the Study 1 analysis set. Points show condition means and vertical bars show bootstrap 95% confidence intervals; the post hoc downward sycophancy scenario is excluded.](figures/fig_update_magnitude.png){#fig-update-magnitude} -@fig-update-magnitude visualizes the unit-normalized condition means reported in @tbl-stability. The pattern is clear on both models: farness has the lowest mean relative update, naive the highest, and CoT sits in between. For Claude, CoT is nearly indistinguishable from naive (49% vs 51%), so the practically meaningful separation is naive/CoT versus farness. For GPT-5.4, CoT improves modestly (41%), but farness still produces the smallest average relative update (36% vs 48% for naive). The raw-magnitude row in @tbl-stability shows the same ordering within each model, but those raw values are not comparable across mixed units and are therefore secondary. +@fig-update-magnitude visualizes the unit-normalized condition means reported in @tbl-stability. The pattern is clear on both models: Brier has the lowest mean relative update, naive the highest, and CoT sits in between. For Claude, CoT is nearly indistinguishable from naive (49% vs 51%), so the practically meaningful separation is naive/CoT versus Brier. 
For GPT-5.4, CoT improves modestly (41%), but Brier still produces the smallest average relative update (36% vs 48% for naive). The raw-magnitude row in @tbl-stability shows the same ordering within each model, but those raw values are not comparable across mixed units and are therefore secondary. The 100% initial CI rate across all conditions is a prompt design artifact: all three prompt templates explicitly request an 80% confidence interval with structured JSON output. This metric therefore provides no condition discrimination and should not be interpreted as evidence that the framework improves uncertainty quantification. @@ -200,9 +200,9 @@ The 100% initial CI rate across all conditions is a prompt design artifact: all To account for the clustering structure and mixed units, I fit a linear mixed-effects model (**relative update ~ condition** with random intercepts for scenario) using restricted maximum likelihood (REML) estimation. -For Claude, the model converges with random-intercept variance of 0.115 across 11 scenario groups (n=191). The farness coefficient is −0.080 (SE=0.021, p<0.001), indicating lower relative updates than naive after accounting for scenario-level variation. The CoT coefficient is −0.024 (SE=0.016, p=0.13), confirming little benefit from chain-of-thought prompting. The intercept (naive baseline) is 0.515 (SE=0.096, p<0.001). +For Claude, the model converges with random-intercept variance of 0.115 across 11 scenario groups (n=191). The Brier coefficient is −0.080 (SE=0.021, p<0.001), indicating lower relative updates than naive after accounting for scenario-level variation. The CoT coefficient is −0.024 (SE=0.016, p=0.13), confirming little benefit from chain-of-thought prompting. The intercept (naive baseline) is 0.515 (SE=0.096, p<0.001). -For GPT-5.4, the model converges with random-intercept variance of 0.087 (n=198). The farness coefficient is −0.128 (SE=0.033, p<0.001) and the CoT coefficient is −0.074 (SE=0.027, p=0.006). 
The GPT-5.4 CoT effect is smaller than farness and does not replicate on Claude, so it should be treated as model-specific rather than a general CoT result. Across both models, the more consistent finding is that farness reduces relative updating under the original shared probe battery. +For GPT-5.4, the model converges with random-intercept variance of 0.087 (n=198). The Brier coefficient is −0.128 (SE=0.033, p<0.001) and the CoT coefficient is −0.074 (SE=0.027, p=0.006). The GPT-5.4 CoT effect is smaller than Brier and does not replicate on Claude, so it should be treated as model-specific rather than a general CoT result. Across both models, the more consistent finding is that Brier reduces relative updating under the original shared probe battery. ### Non-parametric robustness check {#sec-pairwise} @@ -210,36 +210,36 @@ As a robustness check that makes no distributional or independence assumptions, | Comparison | U | p (raw) | p (corrected) | Cohen's d [95% CI] | Rank-biserial r [95% CI] | |-----------|---|---------|--------------|--------------------|-----------------------| -| Claude: naive vs farness | 2192.5 | 0.243 | 0.709 | 0.24 [−0.13, 0.58] | −0.12 [−0.32, 0.10] | -| Claude: CoT vs farness | 2226.5 | 0.391 | 0.782 | 0.20 [−0.14, 0.55] | −0.09 [−0.29, 0.11] | +| Claude: naive vs Brier | 2192.5 | 0.243 | 0.709 | 0.24 [−0.13, 0.58] | −0.12 [−0.32, 0.10] | +| Claude: CoT vs Brier | 2226.5 | 0.391 | 0.782 | 0.20 [−0.14, 0.55] | −0.09 [−0.29, 0.11] | | Claude: naive vs CoT | 2117.0 | 0.862 | 0.862 | 0.06 [−0.31, 0.40] | −0.02 [−0.22, 0.18] | -| GPT-5.4: naive vs farness | 2505.5 | 0.137 | 0.412 | 0.36 [0.04, 0.66] | −0.15 [−0.35, 0.03] | -| GPT-5.4: CoT vs farness | 2493.0 | 0.151 | 0.412 | 0.22 [−0.12, 0.57] | −0.14 [−0.34, 0.05] | +| GPT-5.4: naive vs Brier | 2505.5 | 0.137 | 0.412 | 0.36 [0.04, 0.66] | −0.15 [−0.35, 0.03] | +| GPT-5.4: CoT vs Brier | 2493.0 | 0.151 | 0.412 | 0.22 [−0.12, 0.57] | −0.14 [−0.34, 0.05] | | GPT-5.4: naive vs CoT | 2192.0 | 
0.950 | 0.950 | 0.21 [−0.14, 0.51] | −0.01 [−0.21, 0.18] | : Pairwise comparisons of relative update (non-parametric robustness check). {#tbl-pairwise} -After Holm-Bonferroni correction, no comparison reaches conventional significance at alpha=0.05. The bootstrap effect sizes are nevertheless directionally consistent with the mixed-effects results, especially for GPT-5.4 naive versus farness. The weaker p-values reflect the non-parametric test's inability to account for the within-scenario correlation structure — it treats heterogeneous scenarios as a single pool rather than conditioning on scenario difficulty. +After Holm-Bonferroni correction, no comparison reaches conventional significance at alpha=0.05. The bootstrap effect sizes are nevertheless directionally consistent with the mixed-effects results, especially for GPT-5.4 naive versus Brier. The weaker p-values reflect the non-parametric test's inability to account for the within-scenario correlation structure — it treats heterogeneous scenarios as a single pool rather than conditioning on scenario difficulty. ### Cross-model comparison -Claude and GPT-5.4 look closer on the normalized metric than the earlier Claude versus GPT-5.2 comparison did. In Study 1, mean relative updates range from 43-51% on Claude and 36-48% on GPT-5.4. The archival GPT-5.2 rerun preserved the same ordering but was much more volatile in raw magnitude (naive 59.03, CoT 29.35, farness 22.03). The upward sycophancy scenario is the clearest example: GPT-5.2 naive responses moved by 466.7 leads on average, versus 191.7 for GPT-5.4 and 0.0 for Claude. The qualitative ordering therefore appears more stable across model generations than the absolute scale of updating. +Claude and GPT-5.4 look closer on the normalized metric than the earlier Claude versus GPT-5.2 comparison did. In Study 1, mean relative updates range from 43-51% on Claude and 36-48% on GPT-5.4. 
The archival GPT-5.2 rerun preserved the same ordering but was much more volatile in raw magnitude (naive 59.03, CoT 29.35, Brier 22.03). The upward sycophancy scenario is the clearest example: GPT-5.2 naive responses moved by 466.7 leads on average, versus 191.7 for GPT-5.4 and 0.0 for Claude. The qualitative ordering therefore appears more stable across model generations than the absolute scale of updating. ### Convergence analysis -![Selected scenarios illustrating convergence behavior. Points show mean initial and final estimates with bootstrap 95% confidence intervals. Within each panel, naive and farness usually end near similar final values, but farness starts closer.](figures/fig_convergence.png){#fig-convergence} +![Selected scenarios illustrating convergence behavior. Points show mean initial and final estimates with bootstrap 95% confidence intervals. Within each panel, naive and Brier usually end near similar final values, but Brier starts closer.](figures/fig_convergence.png){#fig-convergence} -The convergence ratio (@eq-convergence) measures whether probed naive responses move toward the framework's initial estimates. For Claude, the mean convergence ratio is −1.48 (95% bootstrap CI [−2.08, −0.94], n=53 valid pairs). For GPT-5.4, it is −1.07 (95% CI [−1.65, −0.54], n=55 valid pairs). Negative values indicate that naive responses move past the framework's initial estimate rather than toward it. @fig-convergence makes the mechanism clearer than the scalar ratio alone. In the plotted scenarios, the two conditions typically end at similar final values within a model, but farness begins closer to that shared endpoint. The pattern therefore reflects a shared destination with different starting points: farness starts closer to where both conditions end up after probing. +The convergence ratio (@eq-convergence) measures whether probed naive responses move toward the framework's initial estimates. 
For Claude, the mean convergence ratio is −1.48 (95% bootstrap CI [−2.08, −0.94], n=53 valid pairs). For GPT-5.4, it is −1.07 (95% CI [−1.65, −0.54], n=55 valid pairs). Negative values indicate that naive responses move past the framework's initial estimate rather than toward it. @fig-convergence makes the mechanism clearer than the scalar ratio alone. In the plotted scenarios, the two conditions typically end at similar final values within a model, but Brier begins closer to that shared endpoint. The pattern therefore reflects a shared destination with different starting points: Brier starts closer to where both conditions end up after probing. ### Adversarial resistance Both models and all conditions demonstrate near-zero updates on adversarial probes. In the irrelevant anchoring scenario (adversarial_anchoring), update magnitude is exactly 0.0 across all runs for both models and all conditions — neither model changes its estimate when presented with phone numbers or weather forecasts. -![Run-level updates in the upward sycophancy scenario. Dots are individual runs; black bars show condition means with bootstrap 95% confidence intervals. Claude stays at zero across all runs, while GPT-5.4 shows upward shifts that farness reduces but does not eliminate.](figures/fig_sycophancy.png){#fig-sycophancy} +![Run-level updates in the upward sycophancy scenario. Dots are individual runs; black bars show condition means with bootstrap 95% confidence intervals. Claude stays at zero across all runs, while GPT-5.4 shows upward shifts that Brier reduces but does not eliminate.](figures/fig_sycophancy.png){#fig-sycophancy} -The sycophantic pressure scenario (adversarial_sycophancy) reveals a large model difference (@fig-sycophancy). Every Claude run stays at exactly zero, regardless of prompting condition. 
GPT-5.4 looks different: the naive condition contains several upward jumps, CoT reduces them modestly, and farness lowers the mean further while still leaving some non-zero runs. On average, GPT-5.4 naive responses update by 191.7 leads, compared with 158.3 for CoT and 48.3 for farness. For historical context, the archival GPT-5.2 rerun on the same prompt battery was more volatile still (466.7 leads naive, 133.3 CoT, 108.3 farness). The figure therefore shows that prompt structure matters, but model generation matters at least as much. +The sycophantic pressure scenario (adversarial_sycophancy) reveals a large model difference (@fig-sycophancy). Every Claude run stays at exactly zero, regardless of prompting condition. GPT-5.4 looks different: the naive condition contains several upward jumps, CoT reduces them modestly, and Brier lowers the mean further while still leaving some non-zero runs. On average, GPT-5.4 naive responses update by 191.7 leads, compared with 158.3 for CoT and 48.3 for Brier. For historical context, the archival GPT-5.2 rerun on the same prompt battery was more volatile still (466.7 leads naive, 133.3 CoT, 108.3 Brier). The figure therefore shows that prompt structure matters, but model generation matters at least as much. -The false base rate scenario (adversarial_false_base_rate) produces mixed results: both models update somewhat, with Claude farness updating less (mean 13.0) than Claude naive (mean 19.8). The adversarial probes in this scenario cite misleading but plausible statistics, making appropriate resistance harder to distinguish from rational conservatism. +The false base rate scenario (adversarial_false_base_rate) produces mixed results: both models update somewhat, with Claude Brier updating less (mean 13.0) than Claude naive (mean 19.8). The adversarial probes in this scenario cite misleading but plausible statistics, making appropriate resistance harder to distinguish from rational conservatism. 
### Correct direction rates @@ -247,11 +247,11 @@ Correct direction rates are uniformly high across all conditions (96–100%), in ### Per-scenario analysis -![Per-scenario effect sizes for farness versus naive on the non-adversarial scenarios. Positive values indicate less updating under farness; intervals crossing zero indicate uncertain scenario-specific effects.](figures/fig_forest_plot.png){#fig-forest-plot} +![Per-scenario effect sizes for Brier versus naive on the non-adversarial scenarios. Positive values indicate less updating under Brier; intervals crossing zero indicate uncertain scenario-specific effects.](figures/fig_forest_plot.png){#fig-forest-plot} -@fig-forest-plot and @tbl-per-scenario together show a mostly positive but clearly heterogeneous pattern. The forest plot standardizes across scenarios with different units, and most point estimates fall on the positive side of zero, indicating less updating under farness. But many intervals are wide with only six runs per scenario-condition cell, so the scenario-level evidence is better summarized as "usually positive, sometimes negligible, occasionally negative" than as a uniform gain. +@fig-forest-plot and @tbl-per-scenario together show a mostly positive but clearly heterogeneous pattern. The forest plot standardizes across scenarios with different units, and most point estimates fall on the positive side of zero, indicating less updating under Brier. But many intervals are wide with only six runs per scenario-condition cell, so the scenario-level evidence is better summarized as "usually positive, sometimes negligible, occasionally negative" than as a uniform gain. 
-| Scenario | Claude naive | Claude farness | Reduction | GPT-5.4 naive | GPT-5.4 farness | Reduction | +| Scenario | Claude naive | Claude Brier | Reduction | GPT-5.4 naive | GPT-5.4 Brier | Reduction | |----------|-------------|---------------|-----------|----------|------------|-----------| | Planning estimate | 4.2 | 3.4 | 18% | 4.7 | 2.3 | 51% | | Sunk cost project | 7.9 | 6.4 | 19% | 13.8 | 11.5 | 17% | @@ -264,43 +264,43 @@ Correct direction rates are uniformly high across all conditions (96–100%), in : Mean update magnitude by scenario and condition (non-adversarial scenarios only). {#tbl-per-scenario} -The raw-magnitude table shows where those scenario-level effects come from. The farness effect is largest for investment- and launch-like scenarios (acquisition synergies, product launch, investment return) and smallest for scenarios where estimates are already fairly anchored (startup success on Claude, hiring success on GPT-5.4). Notably, the effect also appears in the one upward-pushing scenario (planning estimate: 18% reduction for Claude, 51% for GPT-5.4), suggesting that the shared-battery Study 1 pattern is not limited to resisting downward pressure. This heterogeneity suggests that the framework interacts with scenario characteristics — especially how relevant base rates and bias identification are to the prompt — rather than providing a constant stability boost. Because scenarios mix weeks, percentages, and leads, the standardized effect sizes in @fig-forest-plot are the more comparable cross-scenario summary, while the table is more useful for seeing where the raw changes come from. +The raw-magnitude table shows where those scenario-level effects come from. The Brier effect is largest for investment- and launch-like scenarios (acquisition synergies, product launch, investment return) and smallest for scenarios where estimates are already fairly anchored (startup success on Claude, hiring success on GPT-5.4). 
Notably, the effect also appears in the one upward-pushing scenario (planning estimate: 18% reduction for Claude, 51% for GPT-5.4), suggesting that the shared-battery Study 1 pattern is not limited to resisting downward pressure. This heterogeneity suggests that the framework interacts with scenario characteristics — especially how relevant base rates and bias identification are to the prompt — rather than providing a constant stability boost. Because scenarios mix weeks, percentages, and leads, the standardized effect sizes in @fig-forest-plot are the more comparable cross-scenario summary, while the table is more useful for seeing where the raw changes come from. ### Worked example: sunk cost project {#sec-worked-example} To illustrate the stability-under-probing methodology concretely, consider the sunk_cost_project scenario — a troubled software project where leadership claims they are "almost there." The probing questions challenge with base rates (only 16% of troubled projects meet revised estimates), new information (senior engineers interviewing elsewhere), and bias identification (integration testing hasn't started). -Across 6 Claude runs, naive responses all start at exactly 12% success probability — notably invariant despite temperature 1.0 — and update to a mean of 4.1% (range 3.5–4.5%, mean update magnitude 7.9 percentage points). Farness responses start lower and with more variation — mean 10.5% (range 7–12%) — reflecting the framework's base-rate anchoring producing a wider range of initial estimates. Both conditions converge to nearly identical final estimates (~4%), but the framework starts closer (mean update magnitude 6.4 percentage points, a 19% reduction). +Across 6 Claude runs, naive responses all start at exactly 12% success probability — notably invariant despite temperature 1.0 — and update to a mean of 4.1% (range 3.5–4.5%, mean update magnitude 7.9 percentage points). 
Brier responses start lower and with more variation — mean 10.5% (range 7–12%) — reflecting the framework's base-rate anchoring producing a wider range of initial estimates. Both conditions converge to nearly identical final estimates (~4%), but the framework starts closer (mean update magnitude 6.4 percentage points, a 19% reduction). -GPT-5.4 tells a similar but slightly weaker version of the same story. Naive responses start higher (mean 25.0%, range 25–25%) and update to a mean of 11.2% (mean update magnitude 13.8 percentage points). Farness responses start somewhat lower (mean 22.5%, range 18–28%) and update to a mean of 11.0% (mean update magnitude 11.5 percentage points). Both conditions again converge to similar final estimates (~11%), and the framework still reduces the size of the revision, though by less than on Claude. This illustrates the heterogeneity visible in @tbl-per-scenario: the farness effect is not uniform across models or scenarios. +GPT-5.4 tells a similar but slightly weaker version of the same story. Naive responses start higher (mean 25.0%, with every run at exactly 25%) and update to a mean of 11.2% (mean update magnitude 13.8 percentage points). Brier responses start somewhat lower (mean 22.5%, range 18–28%) and update to a mean of 11.0% (mean update magnitude 11.5 percentage points). Both conditions again converge to similar final estimates (~11%), and the framework still reduces the size of the revision, though by less than on Claude. This illustrates the heterogeneity visible in @tbl-per-scenario: the Brier effect is not uniform across models or scenarios. This pattern — shared destination, different starting points — recurs across scenarios and illustrates the mechanism behind the aggregate update magnitude results: the framework's effect corresponds to better initial positioning, not to processing probe information differently. 
### Study 2: Construct-validity test with held-out probes -Study 2 directly tests the main interpretive risk from Study 1: prompt-probe alignment. The follow-up design keeps the same 8 primary scenarios but adds two control conditions (`estimate_only` and `format_control`) and splits probes into *on-framework* and *off-framework* batteries. The on-framework probes test considerations explicitly named in the farness prompt, such as base rates and bias prompts. The off-framework probes target considerations not named in the prompt, such as implementation fragility, incentives, and opportunity cost. +Study 2 directly tests the main interpretive risk from Study 1: prompt-probe alignment. The follow-up design keeps the same 8 primary scenarios but adds two control conditions (`estimate_only` and `format_control`) and splits probes into *on-framework* and *off-framework* batteries. The on-framework probes test considerations explicitly named in the Brier prompt, such as base rates and bias prompts. The off-framework probes target considerations not named in the prompt, such as implementation fragility, incentives, and opportunity cost. -![Claude construct-validity check using the Study 2 design. Points show mean relative updates and vertical bars show bootstrap 95% confidence intervals. Farness helps on framework-aligned probes, but its advantage disappears on held-out probes; the format-only control is descriptively the most stable off-framework condition.](figures/fig_probe_validation.png){#fig-probe-validation} +![Claude construct-validity check using the Study 2 design. Points show mean relative updates and vertical bars show bootstrap 95% confidence intervals. 
Brier helps on framework-aligned probes, but its advantage disappears on held-out probes; the format-only control is descriptively the most stable off-framework condition.](figures/fig_probe_validation.png){#fig-probe-validation} -@fig-probe-validation is the most important construct-validity result in the paper. On framework-aligned probes, farness still looks better than naive: mean relative update falls from 68% under naive prompting to 56% under farness, and the mixed-effects coefficient is −0.112 (SE=0.024, p<0.001). On held-out probes, however, that advantage disappears and reverses. Naive prompting averages 70% relative update, while farness rises to 83%, with a mixed-effects coefficient of +0.139 (SE=0.056, p=0.01). The best off-framework condition is descriptively the `format_control` prompt at 60%, suggesting that some structured presentation helps, but the specific farness checklist does not generalize to held-out probes in this follow-up. +@fig-probe-validation is the most important construct-validity result in the paper. On framework-aligned probes, Brier still looks better than naive: mean relative update falls from 68% under naive prompting to 56% under Brier, and the mixed-effects coefficient is −0.112 (SE=0.024, p<0.001). On held-out probes, however, that advantage disappears and reverses. Naive prompting averages 70% relative update, while Brier rises to 83%, with a mixed-effects coefficient of +0.139 (SE=0.056, p=0.01). The best off-framework condition is descriptively the `format_control` prompt at 60%, suggesting that some structured presentation helps, but the specific Brier checklist does not generalize to held-out probes in this follow-up. -This materially changes the interpretation of Study 1. The original shared-battery result is real, but the strongest validation currently supports a narrower explanation: farness helps most when the probes test the same dimensions the framework explicitly primes. 
Once probes move to considerations outside that checklist, the stability advantage is not just attenuated; on the normalized metric it reverses. That pattern is more consistent with targeted priming than with broad decision-quality improvement. +This materially changes the interpretation of Study 1. The original shared-battery result is real, but the strongest validation currently supports a narrower explanation: Brier helps most when the probes test the same dimensions the framework explicitly primes. Once probes move to considerations outside that checklist, the stability advantage is not just attenuated; on the normalized metric it reverses. That pattern is more consistent with targeted priming than with broad decision-quality improvement. ## Discussion {#sec-discussion} ### Stability under probing -The central finding is now methodological more than substantive. Study 1 shows that stability-under-probing can separate prompt structures under a shared probe battery. Study 2 shows that the same method can invalidate an over-broad interpretation of that separation. Farness lowers relative updates on the original shared battery and on framework-aligned probes, but not on held-out probes. The strongest supported claim is therefore not that farness broadly improves decisions, or even that it is generally more stable, but that it prepares models for the particular considerations it explicitly names. +The central finding is now methodological more than substantive. Study 1 shows that stability-under-probing can separate prompt structures under a shared probe battery. Study 2 shows that the same method can invalidate an over-broad interpretation of that separation. Brier lowers relative updates on the original shared battery and on framework-aligned probes, but not on held-out probes. 
The strongest supported claim is therefore not that Brier broadly improves decisions, or even that it is generally more stable, but that it prepares models for the particular considerations it explicitly names. -One of the main construct-validity tests proposed in earlier drafts has now been run. The held-out probe split materially weakens the broad framework-validation interpretation: on Claude, the farness advantage survives on on-framework probes but disappears and reverses on off-framework probes. The remaining substantive case for farness would therefore need either replicated held-out gains on other models or better performance on outcome-linked tasks with known resolutions. +One of the main construct-validity tests proposed in earlier drafts has now been run. The held-out probe split materially weakens the broad framework-validation interpretation: on Claude, the Brier advantage survives on on-framework probes but disappears and reverses on off-framework probes. The remaining substantive case for Brier would therefore need either replicated held-out gains on other models or better performance on outcome-linked tasks with known resolutions. ### Chain-of-thought offers smaller and less consistent benefits -A key secondary finding is that chain-of-thought prompting — simply asking the model to "think step by step" — is weaker and less consistent than framework-specific prompting. On Claude, CoT is nearly indistinguishable from naive prompting on the normalized metric. On GPT-5.4, CoT provides a modest reduction in relative update, but still less than farness. This is consistent with prior findings that CoT primarily improves performance on tasks with clear logical structure (arithmetic, multi-step reasoning) and may be less decisive for judgment under uncertainty, where the relevant skill is not just reasoning more carefully but foregrounding the right considerations. 
+A key secondary finding is that chain-of-thought prompting — simply asking the model to "think step by step" — is weaker and less consistent than framework-specific prompting. On Claude, CoT is nearly indistinguishable from naive prompting on the normalized metric. On GPT-5.4, CoT provides a modest reduction in relative update, but still less than Brier. This is consistent with prior findings that CoT primarily improves performance on tasks with clear logical structure (arithmetic, multi-step reasoning) and may be less decisive for judgment under uncertainty, where the relevant skill is not just reasoning more carefully but foregrounding the right considerations. -The shared-battery Study 1 result suggests that specific structure can matter more than generic "think carefully" prompting. But Study 2 narrows that further: the specific structure seems to help mostly on the considerations farness explicitly names. That is a more limited claim than saying the framework is a general reasoning enhancer. +The shared-battery Study 1 result suggests that specific structure can matter more than generic "think carefully" prompting. But Study 2 narrows that further: the specific structure seems to help mostly on the considerations Brier explicitly names. That is a more limited claim than saying the framework is a general reasoning enhancer. -One important caveat: recent frontier models may employ implicit chain-of-thought reasoning even without explicit CoT prompting, potentially narrowing the gap between naive and CoT conditions. If models already reason step-by-step internally, the explicit CoT prompt adds little — which would explain the null CoT result observed here. The farness framework's advantage would then derive not from encouraging reasoning per se but from directing it toward specific decision-relevant considerations (base rates, biases, uncertainty quantification) that implicit reasoning may not prioritize. 
+One important caveat: recent frontier models may employ implicit chain-of-thought reasoning even without explicit CoT prompting, potentially narrowing the gap between naive and CoT conditions. If models already reason step-by-step internally, the explicit CoT prompt adds little — which would explain the null CoT result observed on Claude. The Brier framework's advantage would then derive not from encouraging reasoning per se but from directing it toward specific decision-relevant considerations (base rates, biases, uncertainty quantification) that implicit reasoning may not prioritize. ### Both conditions converge — but the framework starts closer on the original battery @@ -308,7 +308,7 @@ Under the original shared probe battery, both conditions converge toward similar ### Model differences -GPT-5.4 is materially calmer than the archival GPT-5.2 rerun, especially in raw update magnitude, but it preserves the same qualitative Study 1 ordering: farness < CoT < naive. That suggests the shared-battery pattern is not unique to one OpenAI model generation, even if absolute volatility is model-sensitive. At the same time, I do not claim that the held-out-probe result is robust across architectures, because Study 2 has so far been completed only on Claude. +GPT-5.4 is materially calmer than the archival GPT-5.2 rerun, especially in raw update magnitude, but it preserves the same qualitative Study 1 ordering: Brier < CoT < naive. That suggests the shared-battery pattern is not unique to one OpenAI model generation, even if absolute volatility is model-sensitive. At the same time, I do not claim that the held-out-probe result is robust across architectures, because Study 2 has so far been completed only on Claude. ### Limitations @@ -320,15 +320,15 @@ Finally, all prompt conditions request structured JSON output for estimate extra ### Future work -Several directions remain for future work. 
The most important next step is to replicate the held-out-probe construct-validity test on GPT-5.4 and other frontier models. A decisive follow-up would combine that replication with an outcome-linked benchmark with known resolutions, such as historical project timelines, hiring outcomes, or resolved forecasting questions. If farness improved realized outcomes or reduced updates on held-out probes across multiple models, that would materially strengthen the substantive interpretation. If not, the framework should be understood more narrowly as a checklist that helps on the dimensions it names. +Several directions remain for future work. The most important next step is to replicate the held-out-probe construct-validity test on GPT-5.4 and other frontier models. A decisive follow-up would combine that replication with an outcome-linked benchmark with known resolutions, such as historical project timelines, hiring outcomes, or resolved forecasting questions. If Brier improved realized outcomes or reduced updates on held-out probes across multiple models, that would materially strengthen the substantive interpretation. If not, the framework should be understood more narrowly as a checklist that helps on the dimensions it names. -Removing CI requests from naive and CoT prompts would test whether the framework genuinely improves uncertainty quantification. Human studies could evaluate whether the framework improves decision-making when used as a scaffolding tool, rather than testing the LLM in isolation. Cross-framework comparison against other structured approaches (structured analytic techniques, red team/blue team, GRADE framework) would determine whether the observed effects are specific to farness or arise more generally from structured prompting. 
Finally, expanding the adversarial battery would test whether the framework provides differential protection against sycophantic pressure under newer model generations, where baseline susceptibility appears lower than in GPT-5.2 but remains non-zero. +Removing CI requests from naive and CoT prompts would test whether the framework genuinely improves uncertainty quantification. Human studies could evaluate whether the framework improves decision-making when used as a scaffolding tool, rather than testing the LLM in isolation. Cross-framework comparison against other structured approaches (structured analytic techniques, red team/blue team, GRADE framework) would determine whether the observed effects are specific to Brier or arise more generally from structured prompting. Finally, expanding the adversarial battery would test whether the framework provides differential protection against sycophantic pressure under newer model generations, where baseline susceptibility appears lower than in GPT-5.2 but remains non-zero. ## Conclusion {#sec-conclusion} -This paper introduces stability-under-probing as a process-level method for evaluating decision prompts in LLMs when ground-truth outcomes are unavailable. Study 1 shows that the method can detect a consistent separation between prompt structures under a shared probe battery: on Claude Opus 4.6 and GPT-5.4, farness is more stable than naive prompting, while CoT is weaker and less consistent. +This paper introduces stability-under-probing as a process-level method for evaluating decision prompts in LLMs when ground-truth outcomes are unavailable. Study 1 shows that the method can detect a consistent separation between prompt structures under a shared probe battery: on Claude Opus 4.6 and GPT-5.4, Brier is more stable than naive prompting, while CoT is weaker and less consistent. -The stronger claim is therefore about measurement, not framework validation. 
Study 2 shows why: once probes are split into framework-aligned and held-out batteries, the apparent farness advantage localizes to the aligned probes and disappears on the held-out ones. Stability-under-probing appears useful precisely because it can reveal both patterns: prompt differences under a given probe set, and the limits of those differences when construct-validity checks are added. Whether any structured prompt improves broader decision quality still requires held-out replication across models and outcome-linked benchmarks. +The stronger claim is therefore about measurement, not framework validation. Study 2 shows why: once probes are split into framework-aligned and held-out batteries, the apparent Brier advantage localizes to the aligned probes and disappears on the held-out ones. Stability-under-probing appears useful precisely because it can reveal both patterns: prompt differences under a given probe set, and the limits of those differences when construct-validity checks are added. Whether any structured prompt improves broader decision quality still requires held-out replication across models and outcome-linked benchmarks. ## References {.unnumbered} @@ -528,12 +528,12 @@ This appendix presents representative raw result records from the pilot experime } ``` -**Planning scenario, farness condition (run 1):** +**Planning scenario, Brier condition (run 1):** ```json { "case_id": "planning_estimate", - "condition": "farness", + "condition": "Brier", "initial_estimate": 3.5, "initial_ci": [2.5, 6.0], "final_estimate": 6.0, @@ -566,4 +566,4 @@ The full dataset comprising all 11 scenarios, 3 conditions, and 6 runs per condi ## Code availability {.unnumbered} -All code for running stability-under-probing experiments is available at under an open-source license. The repository includes the complete experiment infrastructure, analysis pipeline, and raw results. 
To reproduce the experiments, install the package with `pip install -e ".[dev]"` and run `python -m farness.experiments.stability_runner`. +All code for running stability-under-probing experiments is available at <https://github.com/MaxGhenis/brier> under an open-source license. The repository includes the complete experiment infrastructure, analysis pipeline, and raw results. To reproduce the experiments, install the package with `pip install -e ".[dev]"` and run `python -m brier.experiments.stability_runner`. diff --git a/paper/preemptive_rigor.md b/paper/preemptive_rigor.md index 9306359..f84be5e 100644 --- a/paper/preemptive_rigor.md +++ b/paper/preemptive_rigor.md @@ -6,7 +6,7 @@ decision prompts in LLMs *Max Ghenis*[^1] -**Disclosure:** The author created and maintains the farness framework +**Disclosure:** The author created and maintains the Brier framework and website introduced and evaluated in this paper. All code, data, and analysis are open source to enable independent verification. @@ -18,14 +18,14 @@ support when ground-truth outcomes are unavailable. The method compares how far different prompts move after a shared bundle of follow-up probes, and whether structured prompts begin closer to their post-probe values. Study 1 applies the method to a structured framework I introduce -here (“farness”), comparing it with naive and chain-of-thought (CoT) +here (“Brier”), comparing it with naive and chain-of-thought (CoT) prompting across 11 quantitative scenarios spanning planning, risk, investment, and adversarial domains on Claude Opus 4.6 (n=191) and GPT-5.4 (n=198), with 6 runs per scenario-condition pair. Because scenarios mix weeks, probabilities, and leads, pooled inference uses *relative update* rather than raw update magnitude. 
-In Study 1, farness produces smaller relative updates under the original +In Study 1, Brier produces smaller relative updates under the original shared probe battery than naive prompting (Claude: 43% vs 51%, mixed-effects coefficient = −0.080, p\<0.001; GPT-5.4: 36% vs 48%, coefficient = −0.128, p\<0.001). CoT provides little benefit on Claude @@ -37,7 +37,7 @@ closer and therefore moves less. Study 2 then tests construct validity on Claude only across the 8 primary scenarios, adding two control conditions and splitting probes into *on-framework* and *off-framework* batteries (n=384). On -framework-aligned probes, farness remains more stable than naive (56% vs +framework-aligned probes, Brier remains more stable than naive (56% vs 68%, coefficient = −0.112, p\<0.001). On held-out probes, that advantage disappears and reverses (83% vs 70%, coefficient = +0.139, p=0.01), while a format-only control is descriptively the most stable @@ -77,17 +77,17 @@ unavailable. This paper makes three contributions. First, it proposes stability-under-probing as a process-level evaluation method for decision prompts. Second, it demonstrates the method on a bounded case -study using farness, naive prompting, and CoT prompting. Third, it shows +study using Brier, naive prompting, and CoT prompting. Third, it shows why construct-validity checks matter: a follow-up probe split indicates -that the farness advantage localizes to framework-aligned probes rather +that the Brier advantage localizes to framework-aligned probes rather than general held-out robustness. The paper does **not** claim that -farness has been shown to improve real-world decision quality in +Brier has been shown to improve real-world decision quality in general; the current design is better suited to detecting systematic differences in prompt behavior than to validating outcome quality. 
-### Case study: the farness framework +### Case study: the Brier framework -I introduce farness (“forecasting as a harness”),[^2] a structured +I introduce Brier (“forecasting as a harness”),[^2] a structured decision framework that reframes subjective advice-seeking questions (“should I…?”) into forecasting problems with explicit metrics. The framework operates through six required steps: @@ -236,7 +236,7 @@ work. For each decision scenario, I proceed in four steps. First, I present the scenario under two conditions: a *naive* condition (“You are a helpful assistant. \[Scenario\]. What is your estimate?”) and a -*framework* condition (“You are a decision analyst using the farness +*framework* condition (“You are a decision analyst using the Brier framework. \[Scenario\]. What is your estimate with confidence interval?”). Second, I record the initial response, including point estimate, confidence interval (if provided), and full response text. @@ -257,7 +257,7 @@ estimate to its revised estimate. Figure 1 provides the clearest single-example view of the design. In this scenario, the naive -and farness conditions answer the same question, receive the same +and Brier conditions answer the same question, receive the same probes, and end at nearly the same revised estimate. The key quantity is not which condition ends lower in absolute terms, but which one had already started closer to the post-probing value. The longer worked @@ -375,12 +375,12 @@ The paper reports two related studies. **Study 1** is the original shared-battery case study. It uses Claude Opus 4.6 (Anthropic) and GPT-5.4 (OpenAI), accessed via their respective APIs with temperature 1.0 to maximize response diversity across runs. Study 1 tests three -conditions (naive, chain-of-thought, farness) with 6 runs per +conditions (naive, chain-of-thought, Brier) with 6 runs per scenario-condition pair across the 11-scenario battery. **Study 2** is a construct-validity follow-up on Claude only. 
It uses the 8 primary non-adversarial scenarios, four conditions (naive, estimate_only, -format_control, farness), and two probe batteries: *on-framework* probes -that test considerations explicitly named in the farness prompt, and +format_control, Brier), and two probe batteries: *on-framework* probes +that test considerations explicitly named in the Brier prompt, and *off-framework* probes that target other considerations such as implementation fragility, incentives, and opportunity cost. @@ -440,7 +440,7 @@ sample size for between-scenario generalization remains closer to 8 or I report two studies. Study 1 contains 191 stability results for Claude Opus 4.6 (7 missing due to transient API errors) and 198 for GPT-5.4, -across 11 scenarios and 3 conditions (naive, chain-of-thought, farness) +across 11 scenarios and 3 conditions (naive, chain-of-thought, Brier) with 6 runs per scenario-condition pair. Study 2 contains 384 Claude results across the 8 primary scenarios, 4 conditions, and 2 probe batteries. All bootstrap analyses use fixed random seeds (seed=42) for @@ -469,7 +469,7 @@ Table 2: Study 1 stability metrics by condition and model. Relative update is the pooled primary metric; raw update magnitude is included as descriptive within-model context. -| Metric | Claude naive | Claude CoT | Claude farness | GPT-5.4 naive | GPT-5.4 CoT | GPT-5.4 farness | +| Metric | Claude naive | Claude CoT | Claude Brier | GPT-5.4 naive | GPT-5.4 CoT | GPT-5.4 Brier | |----|----|----|----|----|----|----| | n | 63 | 66 | 62 | 66 | 66 | 66 | | Mean relative update | 51% | 49% | 43% | 48% | 41% | 36% | @@ -493,11 +493,11 @@ scenario is excluded. Figure 2 visualizes the unit-normalized condition means reported in Table 2. The pattern is -clear on both models: farness has the lowest mean relative update, naive +clear on both models: Brier has the lowest mean relative update, naive the highest, and CoT sits in between. 
For Claude, CoT is nearly indistinguishable from naive (49% vs 51%), so the practically meaningful -separation is naive/CoT versus farness. For GPT-5.4, CoT improves -modestly (41%), but farness still produces the smallest average relative +separation is naive/CoT versus Brier. For GPT-5.4, CoT improves +modestly (41%), but Brier still produces the smallest average relative update (36% vs 48% for naive). The raw-magnitude row in Table 2 shows the same ordering within each model, but those raw values are not comparable @@ -517,18 +517,18 @@ intercepts for scenario) using restricted maximum likelihood (REML) estimation. For Claude, the model converges with random-intercept variance of 0.115 -across 11 scenario groups (n=191). The farness coefficient is −0.080 +across 11 scenario groups (n=191). The Brier coefficient is −0.080 (SE=0.021, p\<0.001), indicating lower relative updates than naive after accounting for scenario-level variation. The CoT coefficient is −0.024 (SE=0.016, p=0.13), confirming little benefit from chain-of-thought prompting. The intercept (naive baseline) is 0.515 (SE=0.096, p\<0.001). For GPT-5.4, the model converges with random-intercept variance of 0.087 -(n=198). The farness coefficient is −0.128 (SE=0.033, p\<0.001) and the +(n=198). The Brier coefficient is −0.128 (SE=0.033, p\<0.001) and the CoT coefficient is −0.074 (SE=0.027, p=0.006). The GPT-5.4 CoT effect is -smaller than farness and does not replicate on Claude, so it should be +smaller than Brier and does not replicate on Claude, so it should be treated as model-specific rather than a general CoT result. Across both -models, the more consistent finding is that farness reduces relative +models, the more consistent finding is that Brier reduces relative updating under the original shared probe battery. ### Non-parametric robustness check @@ -545,11 +545,11 @@ robustness check). 
| Comparison | U | p (raw) | p (corrected) | Cohen’s d \[95% CI\] | Rank-biserial r \[95% CI\] | |----|----|----|----|----|----| -| Claude: naive vs farness | 2192.5 | 0.243 | 0.709 | 0.24 \[−0.13, 0.58\] | −0.12 \[−0.32, 0.10\] | -| Claude: CoT vs farness | 2226.5 | 0.391 | 0.782 | 0.20 \[−0.14, 0.55\] | −0.09 \[−0.29, 0.11\] | +| Claude: naive vs Brier | 2192.5 | 0.243 | 0.709 | 0.24 \[−0.13, 0.58\] | −0.12 \[−0.32, 0.10\] | +| Claude: CoT vs Brier | 2226.5 | 0.391 | 0.782 | 0.20 \[−0.14, 0.55\] | −0.09 \[−0.29, 0.11\] | | Claude: naive vs CoT | 2117.0 | 0.862 | 0.862 | 0.06 \[−0.31, 0.40\] | −0.02 \[−0.22, 0.18\] | -| GPT-5.4: naive vs farness | 2505.5 | 0.137 | 0.412 | 0.36 \[0.04, 0.66\] | −0.15 \[−0.35, 0.03\] | -| GPT-5.4: CoT vs farness | 2493.0 | 0.151 | 0.412 | 0.22 \[−0.12, 0.57\] | −0.14 \[−0.34, 0.05\] | +| GPT-5.4: naive vs Brier | 2505.5 | 0.137 | 0.412 | 0.36 \[0.04, 0.66\] | −0.15 \[−0.35, 0.03\] | +| GPT-5.4: CoT vs Brier | 2493.0 | 0.151 | 0.412 | 0.22 \[−0.12, 0.57\] | −0.14 \[−0.34, 0.05\] | | GPT-5.4: naive vs CoT | 2192.0 | 0.950 | 0.950 | 0.21 \[−0.14, 0.51\] | −0.01 \[−0.21, 0.18\] | @@ -557,7 +557,7 @@ robustness check). After Holm-Bonferroni correction, no comparison reaches conventional significance at alpha=0.05. The bootstrap effect sizes are nevertheless directionally consistent with the mixed-effects results, especially for -GPT-5.4 naive versus farness. The weaker p-values reflect the +GPT-5.4 naive versus Brier. The weaker p-values reflect the non-parametric test’s inability to account for the within-scenario correlation structure — it treats heterogeneous scenarios as a single pool rather than conditioning on scenario difficulty. @@ -568,7 +568,7 @@ Claude and GPT-5.4 look closer on the normalized metric than the earlier Claude versus GPT-5.2 comparison did. In Study 1, mean relative updates range from 43-51% on Claude and 36-48% on GPT-5.4. 
The archival GPT-5.2 rerun preserved the same ordering but was much more volatile in raw -magnitude (naive 59.03, CoT 29.35, farness 22.03). The upward sycophancy +magnitude (naive 59.03, CoT 29.35, Brier 22.03). The upward sycophancy scenario is the clearest example: GPT-5.2 naive responses moved by 466.7 leads on average, versus 191.7 for GPT-5.4 and 0.0 for Claude. The qualitative ordering therefore appears more stable across model @@ -582,8 +582,8 @@ generations than the absolute scale of updating. Figure 3: Selected scenarios illustrating convergence behavior. Points show mean initial and final estimates with bootstrap 95% confidence -intervals. Within each panel, naive and farness usually end near similar -final values, but farness starts closer. +intervals. Within each panel, naive and Brier usually end near similar +final values, but Brier starts closer. @@ -598,8 +598,8 @@ rather than toward it. Figure 3 makes the mechanism clearer than the scalar ratio alone. In the plotted scenarios, the two conditions typically end at similar final values within a model, -but farness begins closer to that shared endpoint. The pattern therefore -reflects a shared destination with different starting points: farness +but Brier begins closer to that shared endpoint. The pattern therefore +reflects a shared destination with different starting points: Brier starts closer to where both conditions end up after probing. ### Adversarial resistance @@ -617,7 +617,7 @@ when presented with phone numbers or weather forecasts. Figure 4: Run-level updates in the upward sycophancy scenario. Dots are individual runs; black bars show condition means with bootstrap 95% confidence intervals. Claude stays at zero across all runs, while -GPT-5.4 shows upward shifts that farness reduces but does not eliminate. +GPT-5.4 shows upward shifts that Brier reduces but does not eliminate. @@ -626,16 +626,16 @@ large model difference (Figure 4). 
Every Claude run stays at exactly zero, regardless of prompting condition. GPT-5.4 looks different: the naive condition contains several upward -jumps, CoT reduces them modestly, and farness lowers the mean further +jumps, CoT reduces them modestly, and Brier lowers the mean further while still leaving some non-zero runs. On average, GPT-5.4 naive responses update by 191.7 leads, compared with 158.3 for CoT and 48.3 -for farness. For historical context, the archival GPT-5.2 rerun on the +for Brier. For historical context, the archival GPT-5.2 rerun on the same prompt battery was more volatile still (466.7 leads naive, 133.3 -CoT, 108.3 farness). The figure therefore shows that prompt structure +CoT, 108.3 Brier). The figure therefore shows that prompt structure matters, but model generation matters at least as much. The false base rate scenario (adversarial_false_base_rate) produces -mixed results: both models update somewhat, with Claude farness updating +mixed results: both models update somewhat, with Claude Brier updating less (mean 13.0) than Claude naive (mean 19.8). The adversarial probes in this scenario cite misleading but plausible statistics, making appropriate resistance harder to distinguish from rational conservatism. @@ -654,9 +654,9 @@ the denominator. This was not a differentiating metric. ![](figures/fig_forest_plot.png) -Figure 5: Per-scenario effect sizes for farness versus naive on the +Figure 5: Per-scenario effect sizes for Brier versus naive on the non-adversarial scenarios. Positive values indicate less updating under -farness; intervals crossing zero indicate uncertain scenario-specific +Brier; intervals crossing zero indicate uncertain scenario-specific effects. @@ -666,7 +666,7 @@ effects. show a mostly positive but clearly heterogeneous pattern. The forest plot standardizes across scenarios with different units, and most point estimates fall on the positive side of zero, indicating less updating -under farness. 
But many intervals are wide with only six runs per +under Brier. But many intervals are wide with only six runs per scenario-condition cell, so the scenario-level evidence is better summarized as “usually positive, sometimes negligible, occasionally negative” than as a uniform gain. @@ -676,7 +676,7 @@ negative” than as a uniform gain. Table 4: Mean update magnitude by scenario and condition (non-adversarial scenarios only). -| Scenario | Claude naive | Claude farness | Reduction | GPT-5.4 naive | GPT-5.4 farness | Reduction | +| Scenario | Claude naive | Claude Brier | Reduction | GPT-5.4 naive | GPT-5.4 Brier | Reduction | |----|----|----|----|----|----|----| | Planning estimate | 4.2 | 3.4 | 18% | 4.7 | 2.3 | 51% | | Sunk cost project | 7.9 | 6.4 | 19% | 13.8 | 11.5 | 17% | @@ -690,7 +690,7 @@ Table 4: Mean update magnitude by scenario and condition The raw-magnitude table shows where those scenario-level effects come -from. The farness effect is largest for investment- and launch-like +from. The Brier effect is largest for investment- and launch-like scenarios (acquisition synergies, product launch, investment return) and smallest for scenarios where estimates are already fairly anchored (startup success on Claude, hiring success on GPT-5.4). Notably, the @@ -718,7 +718,7 @@ and bias identification (integration testing hasn’t started). Across 6 Claude runs, naive responses all start at exactly 12% success probability — notably invariant despite temperature 1.0 — and update to a mean of 4.1% (range 3.5–4.5%, mean update magnitude 7.9 percentage -points). Farness responses start lower and with more variation — mean +points). Brier responses start lower and with more variation — mean 10.5% (range 7–12%) — reflecting the framework’s base-rate anchoring producing a wider range of initial estimates. 
Both conditions converge to nearly identical final estimates (~4%), but the framework starts @@ -726,13 +726,13 @@ closer (mean update magnitude 6.4 percentage points, a 19% reduction). GPT-5.4 tells a similar but slightly weaker version of the same story. Naive responses start higher (mean 25.0%, range 25–25%) and update to a -mean of 11.2% (mean update magnitude 13.8 percentage points). Farness +mean of 11.2% (mean update magnitude 13.8 percentage points). Brier responses start somewhat lower (mean 22.5%, range 18–28%) and update to a mean of 11.0% (mean update magnitude 11.5 percentage points). Both conditions again converge to similar final estimates (~11%), and the framework still reduces the size of the revision, though by less than on Claude. This illustrates the heterogeneity visible in -Table 4: the farness +Table 4: the Brier effect is not uniform across models or scenarios. This pattern — shared destination, different starting points — recurs @@ -747,7 +747,7 @@ prompt-probe alignment. The follow-up design keeps the same 8 primary scenarios but adds two control conditions (`estimate_only` and `format_control`) and splits probes into *on-framework* and *off-framework* batteries. The on-framework probes test considerations -explicitly named in the farness prompt, such as base rates and bias +explicitly named in the Brier prompt, such as base rates and bias prompts. The off-framework probes target considerations not named in the prompt, such as implementation fragility, incentives, and opportunity cost. @@ -758,7 +758,7 @@ cost. Figure 6: Claude construct-validity check using the Study 2 design. Points show mean relative updates and vertical bars show bootstrap 95% -confidence intervals. Farness helps on framework-aligned probes, but its +confidence intervals. Brier helps on framework-aligned probes, but its advantage disappears on held-out probes; the format-only control is descriptively the most stable off-framework condition. 
@@ -766,20 +766,20 @@ descriptively the most stable off-framework condition. Figure 6 is the most important construct-validity result in the paper. On -framework-aligned probes, farness still looks better than naive: mean +framework-aligned probes, Brier still looks better than naive: mean relative update falls from 68% under naive prompting to 56% under -farness, and the mixed-effects coefficient is −0.112 (SE=0.024, +Brier, and the mixed-effects coefficient is −0.112 (SE=0.024, p\<0.001). On held-out probes, however, that advantage disappears and -reverses. Naive prompting averages 70% relative update, while farness +reverses. Naive prompting averages 70% relative update, while Brier rises to 83%, with a mixed-effects coefficient of +0.139 (SE=0.056, p=0.01). The best off-framework condition is descriptively the `format_control` prompt at 60%, suggesting that some structured -presentation helps, but the specific farness checklist does not +presentation helps, but the specific Brier checklist does not generalize to held-out probes in this follow-up. This materially changes the interpretation of Study 1. The original shared-battery result is real, but the strongest validation currently -supports a narrower explanation: farness helps most when the probes test +supports a narrower explanation: Brier helps most when the probes test the same dimensions the framework explicitly primes. Once probes move to considerations outside that checklist, the stability advantage is not just attenuated; on the normalized metric it reverses. That pattern is @@ -793,18 +793,18 @@ improvement. The central finding is now methodological more than substantive. Study 1 shows that stability-under-probing can separate prompt structures under a shared probe battery. Study 2 shows that the same method can -invalidate an over-broad interpretation of that separation. Farness +invalidate an over-broad interpretation of that separation. 
Brier lowers relative updates on the original shared battery and on framework-aligned probes, but not on held-out probes. The strongest -supported claim is therefore not that farness broadly improves +supported claim is therefore not that Brier broadly improves decisions, or even that it is generally more stable, but that it prepares models for the particular considerations it explicitly names. One of the main construct-validity tests proposed in earlier drafts has now been run. The held-out probe split materially weakens the broad -framework-validation interpretation: on Claude, the farness advantage +framework-validation interpretation: on Claude, the Brier advantage survives on on-framework probes but disappears and reverses on -off-framework probes. The remaining substantive case for farness would +off-framework probes. The remaining substantive case for Brier would therefore need either replicated held-out gains on other models or better performance on outcome-linked tasks with known resolutions. @@ -815,7 +815,7 @@ asking the model to “think step by step” — is weaker and less consistent than framework-specific prompting. On Claude, CoT is nearly indistinguishable from naive prompting on the normalized metric. On GPT-5.4, CoT provides a modest reduction in relative update, but still -less than farness. This is consistent with prior findings that CoT +less than Brier. This is consistent with prior findings that CoT primarily improves performance on tasks with clear logical structure (arithmetic, multi-step reasoning) and may be less decisive for judgment under uncertainty, where the relevant skill is not just reasoning more @@ -824,7 +824,7 @@ carefully but foregrounding the right considerations. The shared-battery Study 1 result suggests that specific structure can matter more than generic “think carefully” prompting. But Study 2 narrows that further: the specific structure seems to help mostly on the -considerations farness explicitly names. 
That is a more limited claim +considerations Brier explicitly names. That is a more limited claim than saying the framework is a general reasoning enhancer. One important caveat: recent frontier models may employ implicit @@ -832,7 +832,7 @@ chain-of-thought reasoning even without explicit CoT prompting, potentially narrowing the gap between naive and CoT conditions. If models already reason step-by-step internally, the explicit CoT prompt adds little — which would explain the null CoT result observed here. The -farness framework’s advantage would then derive not from encouraging +Brier framework’s advantage would then derive not from encouraging reasoning per se but from directing it toward specific decision-relevant considerations (base rates, biases, uncertainty quantification) that implicit reasoning may not prioritize. @@ -856,7 +856,7 @@ framework’s checklist. GPT-5.4 is materially calmer than the archival GPT-5.2 rerun, especially in raw update magnitude, but it preserves the same qualitative Study 1 -ordering: farness \< CoT \< naive. That suggests the shared-battery +ordering: Brier \< CoT \< naive. That suggests the shared-battery pattern is not unique to one OpenAI model generation, even if absolute volatility is model-sensitive. At the same time, I do not claim that the held-out-probe result is robust across architectures, because Study 2 @@ -914,7 +914,7 @@ is to replicate the held-out-probe construct-validity test on GPT-5.4 and other frontier models. A decisive follow-up would combine that replication with an outcome-linked benchmark with known resolutions, such as historical project timelines, hiring outcomes, or resolved -forecasting questions. If farness improved realized outcomes or reduced +forecasting questions. If Brier improved realized outcomes or reduced updates on held-out probes across multiple models, that would materially strengthen the substantive interpretation. 
If not, the framework should be understood more narrowly as a checklist that helps on the dimensions @@ -926,7 +926,7 @@ could evaluate whether the framework improves decision-making when used as a scaffolding tool, rather than testing the LLM in isolation. Cross-framework comparison against other structured approaches (structured analytic techniques, red team/blue team, GRADE framework) -would determine whether the observed effects are specific to farness or +would determine whether the observed effects are specific to Brier or arise more generally from structured prompting. Finally, expanding the adversarial battery would test whether the framework provides differential protection against sycophantic pressure under newer model @@ -939,12 +939,12 @@ This paper introduces stability-under-probing as a process-level method for evaluating decision prompts in LLMs when ground-truth outcomes are unavailable. Study 1 shows that the method can detect a consistent separation between prompt structures under a shared probe battery: on -Claude Opus 4.6 and GPT-5.4, farness is more stable than naive +Claude Opus 4.6 and GPT-5.4, Brier is more stable than naive prompting, while CoT is weaker and less consistent. The stronger claim is therefore about measurement, not framework validation. Study 2 shows why: once probes are split into -framework-aligned and held-out batteries, the apparent farness advantage +framework-aligned and held-out batteries, the apparent Brier advantage localizes to the aligned probes and disappears on the held-out ones. Stability-under-probing appears useful precisely because it can reveal both patterns: prompt differences under a given probe set, and the @@ -1387,12 +1387,12 @@ experiments to illustrate the data structure. 
} ``` -**Planning scenario, farness condition (run 1):** +**Planning scenario, Brier condition (run 1):** ``` json { "case_id": "planning_estimate", - "condition": "farness", + "condition": "Brier", "initial_estimate": 3.5, "initial_ci": [2.5, 6.0], "final_estimate": 6.0, @@ -1428,13 +1428,13 @@ repository at `experiments/stability_results/`. ## Code availability All code for running stability-under-probing experiments is available at -<https://github.com/MaxGhenis/farness> under an open-source license. The +<https://github.com/MaxGhenis/brier> under an open-source license. The repository includes the complete experiment infrastructure, analysis pipeline, and raw results. To reproduce the experiments, install the package with `pip install -e ".[dev]"` and run -`python -m farness.experiments.stability_runner`. +`python -m brier.experiments.stability_runner`. [^1]: Independent researcher. Contact: max@maxghenis.com -[^2]: Framework documentation: <https://farness.ai>. Source code and - experiment data: <https://github.com/MaxGhenis/farness>. +[^2]: Framework documentation: <https://brier.institute>. Source code and + experiment data: <https://github.com/MaxGhenis/brier>. diff --git a/paper/run_strongest_validation.py b/paper/run_strongest_validation.py index c307367..6dfbe9d 100644 --- a/paper/run_strongest_validation.py +++ b/paper/run_strongest_validation.py @@ -10,8 +10,8 @@ if str(ROOT) not in sys.path: sys.path.insert(0, str(ROOT)) -from farness.experiments.stability import get_primary_stability_cases, get_stability_case -from farness.experiments.stability_runner import run_stability_experiment +from brier.experiments.stability import get_primary_stability_cases, get_stability_case +from brier.experiments.stability_runner import run_stability_experiment DEFAULT_MODELS = ["claude-opus-4-6", "gpt-5.2"] diff --git a/paper/run_study1_rerun.py b/paper/run_study1_rerun.py index 13505a9..7b5ca8e 100644 --- a/paper/run_study1_rerun.py +++ b/paper/run_study1_rerun.py @@ -18,8 +18,8 @@ if str(ROOT) not in sys.path: sys.path.insert(0, str(ROOT)) -from farness.experiments.llm import call_llm -from farness.experiments.stability import ( +from brier.experiments.llm 
import call_llm +from brier.experiments.stability import ( DEFAULT_PROBE_BATTERY, QuantitativeCase, StabilityExperiment, diff --git a/pyproject.toml b/pyproject.toml index f299eb5..929cc9a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ requires = ["setuptools>=61.0", "wheel"] build-backend = "setuptools.build_meta" [project] -name = "farness" +name = "brier" version = "0.2.4" description = "Forecasting as a harness for decision-making" readme = "README.md" @@ -24,14 +24,14 @@ keywords = ["forecasting", "decision-making", "calibration", "prediction"] dependencies = [] [project.scripts] -farness = "farness.cli:main" -farness-mcp = "farness.mcp_server:main" +brier = "brier.cli:main" +brier-mcp = "brier.mcp_server:main" [project.urls] -Homepage = "https://farness.ai" -Documentation = "https://farness.ai/docs" -Repository = "https://github.com/MaxGhenis/farness" -Issues = "https://github.com/MaxGhenis/farness/issues" +Homepage = "https://brier.institute" +Documentation = "https://brier.institute/docs" +Repository = "https://github.com/MaxGhenis/brier" +Issues = "https://github.com/MaxGhenis/brier/issues" [project.optional-dependencies] experiments = [ @@ -54,10 +54,10 @@ dev = [ ] [tool.setuptools.packages.find] -include = ["farness*"] +include = ["brier*"] [tool.setuptools.package-data] -farness = [ +brier = [ "assets/skills/codex/SKILL.md", "assets/skills/claude/SKILL.md", ] diff --git a/scripts/generate_demo_video.py b/scripts/generate_demo_video.py index f337296..606dd72 100644 --- a/scripts/generate_demo_video.py +++ b/scripts/generate_demo_video.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Generate a 4K demo video from a real Codex + farness session.""" +"""Generate a 4K demo video from a real Codex + brier session.""" from __future__ import annotations @@ -20,8 +20,8 @@ ROOT = Path(__file__).resolve().parents[1] OUT_DIR = ROOT / "site" / "public" / "demo" -VIDEO_PATH = OUT_DIR / "farness-demo.mp4" -POSTER_PATH = OUT_DIR / 
"farness-demo-poster.png" +VIDEO_PATH = OUT_DIR / "brier-demo.mp4" +POSTER_PATH = OUT_DIR / "brier-demo-poster.png" WIDTH = 3840 HEIGHT = 2160 @@ -100,11 +100,11 @@ def _env_with_local_bins() -> dict[str, str]: return env -def _farness_command() -> list[str]: - binary = ROOT / ".venv" / "bin" / "farness" +def _brier_command() -> list[str]: + binary = ROOT / ".venv" / "bin" / "brier" if binary.exists(): return [str(binary)] - return [sys.executable, "-m", "farness.cli"] + return [sys.executable, "-m", "brier.cli"] def run_command( @@ -191,7 +191,7 @@ def capture_real_session() -> DemoSession: if codex is None: raise RuntimeError("codex is required on PATH to generate the real demo.") - with tempfile.TemporaryDirectory(prefix="farness-demo-") as tmpdir: + with tempfile.TemporaryDirectory(prefix="brier-demo-") as tmpdir: temp_root = Path(tmpdir) store_path = temp_root / "decisions.jsonl" prompt_path = temp_root / "prompt.txt" @@ -200,7 +200,7 @@ def capture_real_session() -> DemoSession: workdir.mkdir(parents=True, exist_ok=True) prompt = ( - "Use $farness. Be concise. Analyze this decision: " + "Use $brier. Be concise. Analyze this decision: " "Should we rewrite the auth layer now? 
Context: 3 incidents this quarter, " "team strongest in Node, Q2 launch locked.\n" ) @@ -219,7 +219,7 @@ def capture_real_session() -> DemoSession: "-c", 'model_reasoning_effort="low"', "-c", - f'mcp_servers.farness.env.FARNESS_STORE_PATH="{store_path}"', + f'mcp_servers.brier.env.BRIER_STORE_PATH="{store_path}"', "--output-last-message", str(last_path), "-", @@ -228,13 +228,13 @@ def capture_real_session() -> DemoSession: last_path.read_text() if not store_path.exists(): - raise RuntimeError("Expected the farness store file to exist after codex exec.") + raise RuntimeError("Expected the brier store file to exist after codex exec.") decision = json.loads(store_path.read_text().strip().splitlines()[-1]) decision_prefix = decision["id"][:8] cli_env = env.copy() - cli_env["FARNESS_STORE_PATH"] = str(store_path) - run_command(_farness_command() + ["list"], env=cli_env) + cli_env["BRIER_STORE_PATH"] = str(store_path) + run_command(_brier_command() + ["list"], env=cli_env) chosen_option = next( option for option in decision["options"] if option["name"] == decision["chosen_option"] ) @@ -301,12 +301,12 @@ def build_events(session: DemoSession) -> list[Event]: hold=1.2, ), Event( - command="farness list", + command="brier list", output=session.list_lines, hold=1.0, ), Event( - command=f"farness show {session.decision_prefix}", + command=f"brier show {session.decision_prefix}", output=session.show_lines, hold=1.4, ), @@ -325,7 +325,7 @@ def style_for_line(text: str) -> tuple[int, int, int]: return ACCENT if text.startswith("mcp:") or text.startswith("mcp startup:"): return SUCCESS - if text.startswith("tool farness.") or text.startswith("farness."): + if text.startswith("tool brier.") or text.startswith("brier."): return SUCCESS if text.startswith("OpenAI Codex") or text.startswith("workdir:") or text.startswith("model:"): return TEXT_DIM @@ -384,10 +384,10 @@ def draw_intro(image: Image.Image, progress: float) -> None: title_y = 560 - int((1.0 - alpha) * 60) subtitle_y = 
700 - int((1.0 - alpha) * 30) - draw.text((220, title_y), "farness.ai", font=DISPLAY_110, fill=(245, 247, 250)) + draw.text((220, title_y), "brier.institute", font=DISPLAY_110, fill=(245, 247, 250)) draw.text( (225, subtitle_y), - "Condensed from a real Codex run with the local farness skill and MCP", + "Condensed from a real Codex run with the local brier skill and MCP", font=DISPLAY_56, fill=(169, 184, 196), ) @@ -419,7 +419,7 @@ def draw_terminal_shell(image: Image.Image) -> None: (PANEL_X, PANEL_Y + HEADER_H - 42, PANEL_X + PANEL_W, PANEL_Y + HEADER_H), fill=PANEL_HEADER, ) - draw.text((PANEL_X + 150, PANEL_Y + 26), "farness.ai", font=SANS_42, fill=PANEL_HEADER_TEXT) + draw.text((PANEL_X + 150, PANEL_Y + 26), "brier.institute", font=SANS_42, fill=PANEL_HEADER_TEXT) draw.text( (PANEL_X + PANEL_W - 540, PANEL_Y + 30), "condensed real run", @@ -485,7 +485,7 @@ def main() -> None: total_frames = int(total_duration * FPS) + 1 poster_index = max(0, min(total_frames - 1, int(total_frames * 0.72))) - with tempfile.TemporaryDirectory(prefix="farness-demo-frames-") as tmpdir: + with tempfile.TemporaryDirectory(prefix="brier-demo-frames-") as tmpdir: frame_dir = Path(tmpdir) for frame_index in range(total_frames): timestamp = frame_index / FPS diff --git a/scripts/run_decision_usefulness_pilot.py b/scripts/run_decision_usefulness_pilot.py index 36d0733..c5b7046 100644 --- a/scripts/run_decision_usefulness_pilot.py +++ b/scripts/run_decision_usefulness_pilot.py @@ -15,7 +15,7 @@ if str(ROOT) not in sys.path: sys.path.insert(0, str(ROOT)) -from farness.experiments.decision_usefulness import ( +from brier.experiments.decision_usefulness import ( DECISION_USEFULNESS_CONDITIONS, JUDGE_TASKS, PRIMARY_PAIRWISE_COMPARISONS, diff --git a/scripts/smoke_packaged_install.py b/scripts/smoke_packaged_install.py index cae9be8..b1a3fbc 100644 --- a/scripts/smoke_packaged_install.py +++ b/scripts/smoke_packaged_install.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Smoke-test a built 
farness artifact in an isolated environment.""" +"""Smoke-test a built brier artifact in an isolated environment.""" from __future__ import annotations @@ -114,7 +114,7 @@ def main() -> None: if not artifact.exists(): raise SystemExit(f"Artifact not found: {artifact}") - with tempfile.TemporaryDirectory(prefix="farness-smoke-") as tmpdir: + with tempfile.TemporaryDirectory(prefix="brier-smoke-") as tmpdir: root = Path(tmpdir) venv_dir = root / "venv" fake_bin = root / "fake-bin" @@ -142,47 +142,47 @@ def main() -> None: "-m", "pip", "install", - f"farness[mcp] @ {artifact.as_uri()}", + f"brier[mcp] @ {artifact.as_uri()}", ], env=env, ) - farness = [str(venv_dir / "bin" / "farness")] - codex_skill = Path(env["CODEX_HOME"]) / "skills" / "farness" / "SKILL.md" - claude_skill = Path(env["HOME"]) / ".claude" / "skills" / "farness" / "SKILL.md" + brier = [str(venv_dir / "bin" / "brier")] + codex_skill = Path(env["CODEX_HOME"]) / "skills" / "brier" / "SKILL.md" + claude_skill = Path(env["HOME"]) / ".claude" / "skills" / "brier" / "SKILL.md" - codex_setup = run(farness + ["setup", "codex"], env=env) - assert "Configured MCP server `farness` in codex" in codex_setup.stdout + codex_setup = run(brier + ["setup", "codex"], env=env) + assert "Configured MCP server `brier` in codex" in codex_setup.stdout assert codex_skill.exists() - codex_doctor = run(farness + ["doctor", "codex"], env=env) + codex_doctor = run(brier + ["doctor", "codex"], env=env) assert "Skill status: installed" in codex_doctor.stdout assert "configured: yes" in codex_doctor.stdout codex_skill.write_text("drifted", encoding="utf-8") - codex_fix = run(farness + ["doctor", "codex", "--fix"], env=env) + codex_fix = run(brier + ["doctor", "codex", "--fix"], env=env) assert "Skill: updated" in codex_fix.stdout assert "MCP: unchanged" in codex_fix.stdout - codex_uninstall = run(farness + ["uninstall", "codex"], env=env) + codex_uninstall = run(brier + ["uninstall", "codex"], env=env) assert "Removed codex skill" in 
codex_uninstall.stdout - assert "Removed MCP server `farness` from codex." in codex_uninstall.stdout + assert "Removed MCP server `brier` from codex." in codex_uninstall.stdout assert not codex_skill.exists() - codex_post = run(farness + ["doctor", "codex"], env=env) + codex_post = run(brier + ["doctor", "codex"], env=env) assert "Skill status: missing" in codex_post.stdout assert "configured: no" in codex_post.stdout - claude_setup = run(farness + ["setup", "claude"], env=env) - assert "Configured MCP server `farness` in claude" in claude_setup.stdout + claude_setup = run(brier + ["setup", "claude"], env=env) + assert "Configured MCP server `brier` in claude" in claude_setup.stdout assert claude_skill.exists() decision_new = run( - farness + ["new", "Should we launch now?", "--context", "2 bugs left"], + brier + ["new", "Should we launch now?", "--context", "2 bugs left"], env=env, ) assert "Created decision" in decision_new.stdout - decision_list = run(farness + ["list"], env=env) + decision_list = run(brier + ["list"], env=env) assert "Should we launch now?" 
in decision_list.stdout print("Packaged install smoke test passed.") diff --git a/site/.env.example b/site/.env.example index 890f98c..8928fef 100644 --- a/site/.env.example +++ b/site/.env.example @@ -1 +1 @@ -NEXT_PUBLIC_FARNESS_API_BASE_URL=http://127.0.0.1:3002 +NEXT_PUBLIC_BRIER_API_BASE_URL=http://127.0.0.1:3002 diff --git a/site/.nvmrc b/site/.nvmrc new file mode 100644 index 0000000..2bd5a0a --- /dev/null +++ b/site/.nvmrc @@ -0,0 +1 @@ +22 diff --git a/site/bun.lock b/site/bun.lock index 289a91f..69f4938 100644 --- a/site/bun.lock +++ b/site/bun.lock @@ -1,26 +1,28 @@ { "lockfileVersion": 1, + "configVersion": 0, "workspaces": { "": { "name": "farness-site", "dependencies": { - "next": "^16.2.1", - "react": "^19.2.4", - "react-dom": "^19.2.4", + "brier-design": "github:MaxGhenis/brier-design#v1.1.0", + "next": "latest", + "react": "latest", + "react-dom": "latest", }, "devDependencies": { - "@tailwindcss/postcss": "^4.2.2", - "@testing-library/jest-dom": "^6.9.1", - "@testing-library/react": "^16.3.2", - "@types/node": "^25.5.0", - "@types/react": "^19.2.14", - "@types/react-dom": "^19.2.3", - "jsdom": "^29.0.1", - "postcss": "^8.5.8", - "prettier": "^3.8.1", - "tailwindcss": "^4.2.2", - "typescript": "^6.0.2", - "vitest": "^4.1.1", + "@tailwindcss/postcss": "latest", + "@testing-library/jest-dom": "latest", + "@testing-library/react": "latest", + "@types/node": "latest", + "@types/react": "latest", + "@types/react-dom": "latest", + "jsdom": "latest", + "postcss": "latest", + "prettier": "latest", + "tailwindcss": "latest", + "typescript": "latest", + "vitest": "latest", }, }, }, @@ -29,9 +31,11 @@ "@alloc/quick-lru": ["@alloc/quick-lru@5.2.0", "", {}, "sha512-UrcABB+4bUrFABwbluTIBErXwvbsU/V7TZWfmbgJfbkwiBuziS9gxdODUyuiecfdGQ85jglMW6juS3+z5TsKLw=="], - "@asamuzakjp/css-color": ["@asamuzakjp/css-color@5.0.1", "", { "dependencies": { "@csstools/css-calc": "^3.1.1", "@csstools/css-color-parser": "^4.0.2", "@csstools/css-parser-algorithms": "^4.0.0", 
"@csstools/css-tokenizer": "^4.0.0", "lru-cache": "^11.2.6" } }, "sha512-2SZFvqMyvboVV1d15lMf7XiI3m7SDqXUuKaTymJYLN6dSGadqp+fVojqJlVoMlbZnlTmu3S0TLwLTJpvBMO1Aw=="], + "@asamuzakjp/css-color": ["@asamuzakjp/css-color@5.1.11", "", { "dependencies": { "@asamuzakjp/generational-cache": "^1.0.1", "@csstools/css-calc": "^3.2.0", "@csstools/css-color-parser": "^4.1.0", "@csstools/css-parser-algorithms": "^4.0.0", "@csstools/css-tokenizer": "^4.0.0" } }, "sha512-KVw6qIiCTUQhByfTd78h2yD1/00waTmm9uy/R7Ck/ctUyAPj+AEDLkQIdJW0T8+qGgj3j5bpNKK7Q3G+LedJWg=="], - "@asamuzakjp/dom-selector": ["@asamuzakjp/dom-selector@7.0.4", "", { "dependencies": { "@asamuzakjp/nwsapi": "^2.3.9", "bidi-js": "^1.0.3", "css-tree": "^3.2.1", "is-potential-custom-element-name": "^1.0.1", "lru-cache": "^11.2.7" } }, "sha512-jXR6x4AcT3eIrS2fSNAwJpwirOkGcd+E7F7CP3zjdTqz9B/2huHOL8YJZBgekKwLML+u7qB/6P1LXQuMScsx0w=="], + "@asamuzakjp/dom-selector": ["@asamuzakjp/dom-selector@7.1.1", "", { "dependencies": { "@asamuzakjp/generational-cache": "^1.0.1", "@asamuzakjp/nwsapi": "^2.3.9", "bidi-js": "^1.0.3", "css-tree": "^3.2.1", "is-potential-custom-element-name": "^1.0.1" } }, "sha512-67RZDnYRc8H/8MLDgQCDE//zoqVFwajkepHZgmXrbwybzXOEwOWGPYGmALYl9J2DOLfFPPs6kKCqmbzV895hTQ=="], + + "@asamuzakjp/generational-cache": ["@asamuzakjp/generational-cache@1.0.1", "", {}, "sha512-wajfB8KqzMCN2KGNFdLkReeHncd0AslUSrvHVvvYWuU8ghncRJoA50kT3zP9MVL0+9g4/67H+cdvBskj9THPzg=="], "@asamuzakjp/nwsapi": ["@asamuzakjp/nwsapi@2.3.9", "", {}, "sha512-n8GuYSrI9bF7FFZ/SjhwevlHc8xaVlb/7HmHelnc/PZXBD2ZR49NnN9sMMuDdEGPeeRQ5d0hqlSlEpgCX3Wl0Q=="], @@ -45,13 +49,13 @@ "@csstools/color-helpers": ["@csstools/color-helpers@6.0.2", "", {}, "sha512-LMGQLS9EuADloEFkcTBR3BwV/CGHV7zyDxVRtVDTwdI2Ca4it0CCVTT9wCkxSgokjE5Ho41hEPgb8OEUwoXr6Q=="], - "@csstools/css-calc": ["@csstools/css-calc@3.1.1", "", { "peerDependencies": { "@csstools/css-parser-algorithms": "^4.0.0", "@csstools/css-tokenizer": "^4.0.0" } }, 
"sha512-HJ26Z/vmsZQqs/o3a6bgKslXGFAungXGbinULZO3eMsOyNJHeBBZfup5FiZInOghgoM4Hwnmw+OgbJCNg1wwUQ=="], + "@csstools/css-calc": ["@csstools/css-calc@3.2.1", "", { "peerDependencies": { "@csstools/css-parser-algorithms": "^4.0.0", "@csstools/css-tokenizer": "^4.0.0" } }, "sha512-DtdHlgXh5ZkA43cwBcAm+huzgJiwx3ZTWVjBs94kwz2xKqSimDA3lBgCjphYgwgVUMWatSM0pDd8TILB1yrVVg=="], - "@csstools/css-color-parser": ["@csstools/css-color-parser@4.0.2", "", { "dependencies": { "@csstools/color-helpers": "^6.0.2", "@csstools/css-calc": "^3.1.1" }, "peerDependencies": { "@csstools/css-parser-algorithms": "^4.0.0", "@csstools/css-tokenizer": "^4.0.0" } }, "sha512-0GEfbBLmTFf0dJlpsNU7zwxRIH0/BGEMuXLTCvFYxuL1tNhqzTbtnFICyJLTNK4a+RechKP75e7w42ClXSnJQw=="], + "@csstools/css-color-parser": ["@csstools/css-color-parser@4.1.1", "", { "dependencies": { "@csstools/color-helpers": "^6.0.2", "@csstools/css-calc": "^3.2.1" }, "peerDependencies": { "@csstools/css-parser-algorithms": "^4.0.0", "@csstools/css-tokenizer": "^4.0.0" } }, "sha512-eZ5XOtyhK+mggRafYUWzA0tvaYOFgdY8AkgQiCJF9qNAePnUo/zmsqqYubBBb3sQ8uNUaSKTY9s9klfRaAXL0g=="], "@csstools/css-parser-algorithms": ["@csstools/css-parser-algorithms@4.0.0", "", { "peerDependencies": { "@csstools/css-tokenizer": "^4.0.0" } }, "sha512-+B87qS7fIG3L5h3qwJ/IFbjoVoOe/bpOdh9hAjXbvx0o8ImEmUsGXN0inFOnk2ChCFgqkkGFQ+TpM5rbhkKe4w=="], - "@csstools/css-syntax-patches-for-csstree": ["@csstools/css-syntax-patches-for-csstree@1.1.1", "", { "peerDependencies": { "css-tree": "^3.2.1" }, "optionalPeers": ["css-tree"] }, "sha512-BvqN0AMWNAnLk9G8jnUT77D+mUbY/H2b3uDTvg2isJkHaOufUE2R3AOwxWo7VBQKT1lOdwdvorddo2B/lk64+w=="], + "@csstools/css-syntax-patches-for-csstree": ["@csstools/css-syntax-patches-for-csstree@1.1.4", "", { "peerDependencies": { "css-tree": "^3.2.1" }, "optionalPeers": ["css-tree"] }, "sha512-wgsqt92b7C7tQhIdPNxj0n9zuUbQlvAuI1exyzeNrOKOi62SD7ren8zqszmpVREjAOqg8cD2FqYhQfAuKjk4sw=="], "@csstools/css-tokenizer": ["@csstools/css-tokenizer@4.0.0", "", {}, 
"sha512-QxULHAm7cNu72w97JUNCBFODFaXpbDg+dP8b/oWFAZ2MTRppA3U00Y2L1HqaS4J6yBqxwa/Y3nMBaxVKbB/NsA=="], @@ -171,23 +175,23 @@ "@jridgewell/trace-mapping": ["@jridgewell/trace-mapping@0.3.31", "", { "dependencies": { "@jridgewell/resolve-uri": "^3.1.0", "@jridgewell/sourcemap-codec": "^1.4.14" } }, "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw=="], - "@next/env": ["@next/env@16.2.1", "", {}, "sha512-n8P/HCkIWW+gVal2Z8XqXJ6aB3J0tuM29OcHpCsobWlChH/SITBs1DFBk/HajgrwDkqqBXPbuUuzgDvUekREPg=="], + "@next/env": ["@next/env@16.2.6", "", {}, "sha512-gd8HoHN4ufj73WmR3JmVolrpJR47ILK6LouP5xElPglaVxir6e1a7VzvTvDWkOoPXT9rkkTzyCxBu4yeZfZwcw=="], - "@next/swc-darwin-arm64": ["@next/swc-darwin-arm64@16.2.1", "", { "os": "darwin", "cpu": "arm64" }, "sha512-BwZ8w8YTaSEr2HIuXLMLxIdElNMPvY9fLqb20LX9A9OMGtJilhHLbCL3ggyd0TwjmMcTxi0XXt+ur1vWUoxj2Q=="], + "@next/swc-darwin-arm64": ["@next/swc-darwin-arm64@16.2.6", "", { "os": "darwin", "cpu": "arm64" }, "sha512-ZJGkkcNfYgrrMkqOdZ7zoLa1TOy0qpcMfk/z4Mh/FKUz40gVO+HNQWqmLxf67Z5WB64DRp0dhEbyHfel+6sJUg=="], - "@next/swc-darwin-x64": ["@next/swc-darwin-x64@16.2.1", "", { "os": "darwin", "cpu": "x64" }, "sha512-/vrcE6iQSJq3uL3VGVHiXeaKbn8Es10DGTGRJnRZlkNQQk3kaNtAJg8Y6xuAlrx/6INKVjkfi5rY0iEXorZ6uA=="], + "@next/swc-darwin-x64": ["@next/swc-darwin-x64@16.2.6", "", { "os": "darwin", "cpu": "x64" }, "sha512-v/YLBHIY132Ced3puBJ7YJKw1lqsCrgcNo2aRJlCEyQrrCeRJlvGlnmxhPxNQI3KE3N1DN5r9TPNPvka3nq5RQ=="], - "@next/swc-linux-arm64-gnu": ["@next/swc-linux-arm64-gnu@16.2.1", "", { "os": "linux", "cpu": "arm64" }, "sha512-uLn+0BK+C31LTVbQ/QU+UaVrV0rRSJQ8RfniQAHPghDdgE+SlroYqcmFnO5iNjNfVWCyKZHYrs3Nl0mUzWxbBw=="], + "@next/swc-linux-arm64-gnu": ["@next/swc-linux-arm64-gnu@16.2.6", "", { "os": "linux", "cpu": "arm64" }, "sha512-RPOvqlYBbcQjkz9VQQDZ2T2bARIjXZV1KFlt+V2Mr6SW/e4I9fcKsaA0hdyf2FHoTlsV2xnBd5Y912rP/1Ce6w=="], - "@next/swc-linux-arm64-musl": ["@next/swc-linux-arm64-musl@16.2.1", "", { "os": "linux", "cpu": "arm64" 
}, "sha512-ssKq6iMRnHdnycGp9hCuGnXJZ0YPr4/wNwrfE5DbmvEcgl9+yv97/Kq3TPVDfYome1SW5geciLB9aiEqKXQjlQ=="], + "@next/swc-linux-arm64-musl": ["@next/swc-linux-arm64-musl@16.2.6", "", { "os": "linux", "cpu": "arm64" }, "sha512-URUTu1+dMkxJsPFgm+OeEvq9wf5sujw0EvgYy80TDGHTSLTnIHeqb0Eu8A3sC95IRgjejQL+kC4mw+4yPxiAXA=="], - "@next/swc-linux-x64-gnu": ["@next/swc-linux-x64-gnu@16.2.1", "", { "os": "linux", "cpu": "x64" }, "sha512-HQm7SrHRELJ30T1TSmT706IWovFFSRGxfgUkyWJZF/RKBMdbdRWJuFrcpDdE5vy9UXjFOx6L3mRdqH04Mmx0hg=="], + "@next/swc-linux-x64-gnu": ["@next/swc-linux-x64-gnu@16.2.6", "", { "os": "linux", "cpu": "x64" }, "sha512-DOj182mPV8G3UkrayLoREM5YEYI+Dk5wv7Ox9xl1fFibAELEsFD0lDPfHIeILlutMMfdyhlzYPELG3peuKaurw=="], - "@next/swc-linux-x64-musl": ["@next/swc-linux-x64-musl@16.2.1", "", { "os": "linux", "cpu": "x64" }, "sha512-aV2iUaC/5HGEpbBkE+4B8aHIudoOy5DYekAKOMSHoIYQ66y/wIVeaRx8MS2ZMdxe/HIXlMho4ubdZs/J8441Tg=="], + "@next/swc-linux-x64-musl": ["@next/swc-linux-x64-musl@16.2.6", "", { "os": "linux", "cpu": "x64" }, "sha512-HKQ5SP/V/ub73UvF7n/zeJlxk2kLmtL7Wzrg4WfmkjmNos5onJ2tKu7yZOPdL18A6Svfn3max29ym+ry7NkK4g=="], - "@next/swc-win32-arm64-msvc": ["@next/swc-win32-arm64-msvc@16.2.1", "", { "os": "win32", "cpu": "arm64" }, "sha512-IXdNgiDHaSk0ZUJ+xp0OQTdTgnpx1RCfRTalhn3cjOP+IddTMINwA7DXZrwTmGDO8SUr5q2hdP/du4DcrB1GxA=="], + "@next/swc-win32-arm64-msvc": ["@next/swc-win32-arm64-msvc@16.2.6", "", { "os": "win32", "cpu": "arm64" }, "sha512-LZXpTlPyS5v7HhSmnvsLGP3iIYgYOBnc8r8ArlT55sGHV89bR2HlDdBjWQ+PY6SJMmk8TuVGFuxalnP3k/0Dwg=="], - "@next/swc-win32-x64-msvc": ["@next/swc-win32-x64-msvc@16.2.1", "", { "os": "win32", "cpu": "x64" }, "sha512-qvU+3a39Hay+ieIztkGSbF7+mccbbg1Tk25hc4JDylf8IHjYmY/Zm64Qq1602yPyQqvie+vf5T/uPwNxDNIoeg=="], + "@next/swc-win32-x64-msvc": ["@next/swc-win32-x64-msvc@16.2.6", "", { "os": "win32", "cpu": "x64" }, "sha512-F0+4i0h9J6C4eE3EAPWsoCk7UW/dbzOjyzxY0qnDUOYFu6FFmdZ6l97/XdV3/Nz3VYyO7UWjyEJUXkGqcoXfMA=="], "@rollup/rollup-android-arm-eabi": 
["@rollup/rollup-android-arm-eabi@4.59.0", "", { "os": "android", "cpu": "arm" }, "sha512-upnNBkA6ZH2VKGcBj9Fyl9IGNPULcjXRlg0LLeaioQWueH30p6IXtJEbKAgvyv+mJaMxSm1l6xwDXYjpEMiLMg=="], @@ -243,35 +247,35 @@ "@swc/helpers": ["@swc/helpers@0.5.15", "", { "dependencies": { "tslib": "^2.8.0" } }, "sha512-JQ5TuMi45Owi4/BIMAJBoSQoOJu12oOk/gADqlcUL9JEdHB8vyjUSsxqeNXnmXHjYKMi2WcYtezGEEhqUI/E2g=="], - "@tailwindcss/node": ["@tailwindcss/node@4.2.2", "", { "dependencies": { "@jridgewell/remapping": "^2.3.5", "enhanced-resolve": "^5.19.0", "jiti": "^2.6.1", "lightningcss": "1.32.0", "magic-string": "^0.30.21", "source-map-js": "^1.2.1", "tailwindcss": "4.2.2" } }, "sha512-pXS+wJ2gZpVXqFaUEjojq7jzMpTGf8rU6ipJz5ovJV6PUGmlJ+jvIwGrzdHdQ80Sg+wmQxUFuoW1UAAwHNEdFA=="], + "@tailwindcss/node": ["@tailwindcss/node@4.3.0", "", { "dependencies": { "@jridgewell/remapping": "^2.3.5", "enhanced-resolve": "^5.21.0", "jiti": "^2.6.1", "lightningcss": "1.32.0", "magic-string": "^0.30.21", "source-map-js": "^1.2.1", "tailwindcss": "4.3.0" } }, "sha512-aFb4gUhFOgdh9AXo4IzBEOzBkkAxm9VigwDJnMIYv3lcfXCJVesNfbEaBl4BNgVRyid92AmdviqwBUBRKSeY3g=="], - "@tailwindcss/oxide": ["@tailwindcss/oxide@4.2.2", "", { "optionalDependencies": { "@tailwindcss/oxide-android-arm64": "4.2.2", "@tailwindcss/oxide-darwin-arm64": "4.2.2", "@tailwindcss/oxide-darwin-x64": "4.2.2", "@tailwindcss/oxide-freebsd-x64": "4.2.2", "@tailwindcss/oxide-linux-arm-gnueabihf": "4.2.2", "@tailwindcss/oxide-linux-arm64-gnu": "4.2.2", "@tailwindcss/oxide-linux-arm64-musl": "4.2.2", "@tailwindcss/oxide-linux-x64-gnu": "4.2.2", "@tailwindcss/oxide-linux-x64-musl": "4.2.2", "@tailwindcss/oxide-wasm32-wasi": "4.2.2", "@tailwindcss/oxide-win32-arm64-msvc": "4.2.2", "@tailwindcss/oxide-win32-x64-msvc": "4.2.2" } }, "sha512-qEUA07+E5kehxYp9BVMpq9E8vnJuBHfJEC0vPC5e7iL/hw7HR61aDKoVoKzrG+QKp56vhNZe4qwkRmMC0zDLvg=="], + "@tailwindcss/oxide": ["@tailwindcss/oxide@4.3.0", "", { "optionalDependencies": { "@tailwindcss/oxide-android-arm64": "4.3.0", 
"@tailwindcss/oxide-darwin-arm64": "4.3.0", "@tailwindcss/oxide-darwin-x64": "4.3.0", "@tailwindcss/oxide-freebsd-x64": "4.3.0", "@tailwindcss/oxide-linux-arm-gnueabihf": "4.3.0", "@tailwindcss/oxide-linux-arm64-gnu": "4.3.0", "@tailwindcss/oxide-linux-arm64-musl": "4.3.0", "@tailwindcss/oxide-linux-x64-gnu": "4.3.0", "@tailwindcss/oxide-linux-x64-musl": "4.3.0", "@tailwindcss/oxide-wasm32-wasi": "4.3.0", "@tailwindcss/oxide-win32-arm64-msvc": "4.3.0", "@tailwindcss/oxide-win32-x64-msvc": "4.3.0" } }, "sha512-F7HZGBeN9I0/AuuJS5PwcD8xayx5ri5GhjYUDBEVYUkexyA/giwbDNjRVrxSezE3T250OU2K/wp/ltWx3UOefg=="], - "@tailwindcss/oxide-android-arm64": ["@tailwindcss/oxide-android-arm64@4.2.2", "", { "os": "android", "cpu": "arm64" }, "sha512-dXGR1n+P3B6748jZO/SvHZq7qBOqqzQ+yFrXpoOWWALWndF9MoSKAT3Q0fYgAzYzGhxNYOoysRvYlpixRBBoDg=="], + "@tailwindcss/oxide-android-arm64": ["@tailwindcss/oxide-android-arm64@4.3.0", "", { "os": "android", "cpu": "arm64" }, "sha512-TJPiq67tKlLuObP6RkwvVGDoxCMBVtDgKkLfa/uyj7/FyxvQwHS+UOnVrXXgbEsfUaMgiVvC4KbJnRr26ho4Ng=="], - "@tailwindcss/oxide-darwin-arm64": ["@tailwindcss/oxide-darwin-arm64@4.2.2", "", { "os": "darwin", "cpu": "arm64" }, "sha512-iq9Qjr6knfMpZHj55/37ouZeykwbDqF21gPFtfnhCCKGDcPI/21FKC9XdMO/XyBM7qKORx6UIhGgg6jLl7BZlg=="], + "@tailwindcss/oxide-darwin-arm64": ["@tailwindcss/oxide-darwin-arm64@4.3.0", "", { "os": "darwin", "cpu": "arm64" }, "sha512-oMN/WZRb+SO37BmUElEgeEWuU8E/HXRkiODxJxLe1UTHVXLrdVSgfaJV7pSlhRGMSOiXLuxTIjfsF3wYvz8cgQ=="], - "@tailwindcss/oxide-darwin-x64": ["@tailwindcss/oxide-darwin-x64@4.2.2", "", { "os": "darwin", "cpu": "x64" }, "sha512-BlR+2c3nzc8f2G639LpL89YY4bdcIdUmiOOkv2GQv4/4M0vJlpXEa0JXNHhCHU7VWOKWT/CjqHdTP8aUuDJkuw=="], + "@tailwindcss/oxide-darwin-x64": ["@tailwindcss/oxide-darwin-x64@4.3.0", "", { "os": "darwin", "cpu": "x64" }, "sha512-N6CUmu4a6bKVADfw77p+iw6Yd9Q3OBhe0veaDX+QazfuVYlQsHfDgxBrsjQ/IW+zywL8mTrNd0SdJT/zgtvMdA=="], - "@tailwindcss/oxide-freebsd-x64": ["@tailwindcss/oxide-freebsd-x64@4.2.2", "", { 
"os": "freebsd", "cpu": "x64" }, "sha512-YUqUgrGMSu2CDO82hzlQ5qSb5xmx3RUrke/QgnoEx7KvmRJHQuZHZmZTLSuuHwFf0DJPybFMXMYf+WJdxHy/nQ=="], + "@tailwindcss/oxide-freebsd-x64": ["@tailwindcss/oxide-freebsd-x64@4.3.0", "", { "os": "freebsd", "cpu": "x64" }, "sha512-zDL5hBkQdH5C6MpqbK3gQAgP80tsMwSI26vjOzjJtNCMUo0lFgOItzHKBIupOZNQxt3ouPH7RPhvNhiTfCe5CQ=="], - "@tailwindcss/oxide-linux-arm-gnueabihf": ["@tailwindcss/oxide-linux-arm-gnueabihf@4.2.2", "", { "os": "linux", "cpu": "arm" }, "sha512-FPdhvsW6g06T9BWT0qTwiVZYE2WIFo2dY5aCSpjG/S/u1tby+wXoslXS0kl3/KXnULlLr1E3NPRRw0g7t2kgaQ=="], + "@tailwindcss/oxide-linux-arm-gnueabihf": ["@tailwindcss/oxide-linux-arm-gnueabihf@4.3.0", "", { "os": "linux", "cpu": "arm" }, "sha512-R06HdNi7A7OEoMsf6d4tjZ71RCWnZQPHj2mnotSFURjNLdBC+cIgXQ7l81CqeoiQftjf6OOblxXMInMgN2VzMA=="], - "@tailwindcss/oxide-linux-arm64-gnu": ["@tailwindcss/oxide-linux-arm64-gnu@4.2.2", "", { "os": "linux", "cpu": "arm64" }, "sha512-4og1V+ftEPXGttOO7eCmW7VICmzzJWgMx+QXAJRAhjrSjumCwWqMfkDrNu1LXEQzNAwz28NCUpucgQPrR4S2yw=="], + "@tailwindcss/oxide-linux-arm64-gnu": ["@tailwindcss/oxide-linux-arm64-gnu@4.3.0", "", { "os": "linux", "cpu": "arm64" }, "sha512-qTJHELX8jetjhRQHCLilkVLmybpzNQAtaI/gaoVoidn/ufbNDbAo8KlK2J+yPoc8wQxvDxCmh/5lr8nC1+lTbg=="], - "@tailwindcss/oxide-linux-arm64-musl": ["@tailwindcss/oxide-linux-arm64-musl@4.2.2", "", { "os": "linux", "cpu": "arm64" }, "sha512-oCfG/mS+/+XRlwNjnsNLVwnMWYH7tn/kYPsNPh+JSOMlnt93mYNCKHYzylRhI51X+TbR+ufNhhKKzm6QkqX8ag=="], + "@tailwindcss/oxide-linux-arm64-musl": ["@tailwindcss/oxide-linux-arm64-musl@4.3.0", "", { "os": "linux", "cpu": "arm64" }, "sha512-Z6sukiQsngnWO+l39X4pPbiWT81IC+PLKF+PHxIlyZbGNb9MODfYlXEVlFvej5BOZInWX01kVyzeLvHsXhfczQ=="], - "@tailwindcss/oxide-linux-x64-gnu": ["@tailwindcss/oxide-linux-x64-gnu@4.2.2", "", { "os": "linux", "cpu": "x64" }, "sha512-rTAGAkDgqbXHNp/xW0iugLVmX62wOp2PoE39BTCGKjv3Iocf6AFbRP/wZT/kuCxC9QBh9Pu8XPkv/zCZB2mcMg=="], + "@tailwindcss/oxide-linux-x64-gnu": 
["@tailwindcss/oxide-linux-x64-gnu@4.3.0", "", { "os": "linux", "cpu": "x64" }, "sha512-DRNdQRpSGzRGfARVuVkxvM8Q12nh19l4BF/G7zGA1oe+9wcC6saFBHTISrpIcKzhiXtSrlSrluCfvMuledoCTQ=="], - "@tailwindcss/oxide-linux-x64-musl": ["@tailwindcss/oxide-linux-x64-musl@4.2.2", "", { "os": "linux", "cpu": "x64" }, "sha512-XW3t3qwbIwiSyRCggeO2zxe3KWaEbM0/kW9e8+0XpBgyKU4ATYzcVSMKteZJ1iukJ3HgHBjbg9P5YPRCVUxlnQ=="], + "@tailwindcss/oxide-linux-x64-musl": ["@tailwindcss/oxide-linux-x64-musl@4.3.0", "", { "os": "linux", "cpu": "x64" }, "sha512-Z0IADbDo8bh6I7h2IQMx601AdXBLfFpEdUotft86evd/8ZPflZe9COPO8Q1vw+pfLWIUo9zN/JGZvwuAJqduqg=="], - "@tailwindcss/oxide-wasm32-wasi": ["@tailwindcss/oxide-wasm32-wasi@4.2.2", "", { "dependencies": { "@emnapi/core": "^1.8.1", "@emnapi/runtime": "^1.8.1", "@emnapi/wasi-threads": "^1.1.0", "@napi-rs/wasm-runtime": "^1.1.1", "@tybys/wasm-util": "^0.10.1", "tslib": "^2.8.1" }, "cpu": "none" }, "sha512-eKSztKsmEsn1O5lJ4ZAfyn41NfG7vzCg496YiGtMDV86jz1q/irhms5O0VrY6ZwTUkFy/EKG3RfWgxSI3VbZ8Q=="], + "@tailwindcss/oxide-wasm32-wasi": ["@tailwindcss/oxide-wasm32-wasi@4.3.0", "", { "dependencies": { "@emnapi/core": "^1.10.0", "@emnapi/runtime": "^1.10.0", "@emnapi/wasi-threads": "^1.2.1", "@napi-rs/wasm-runtime": "^1.1.4", "@tybys/wasm-util": "^0.10.1", "tslib": "^2.8.1" }, "cpu": "none" }, "sha512-HNZGOUxEmElksYR7S6sC5jTeNGpobAsy9u7Gu0AskJ8/20FR9GqebUyB+HBcU/ax6BHuiuJi+Oda4B+YX6H1yA=="], - "@tailwindcss/oxide-win32-arm64-msvc": ["@tailwindcss/oxide-win32-arm64-msvc@4.2.2", "", { "os": "win32", "cpu": "arm64" }, "sha512-qPmaQM4iKu5mxpsrWZMOZRgZv1tOZpUm+zdhhQP0VhJfyGGO3aUKdbh3gDZc/dPLQwW4eSqWGrrcWNBZWUWaXQ=="], + "@tailwindcss/oxide-win32-arm64-msvc": ["@tailwindcss/oxide-win32-arm64-msvc@4.3.0", "", { "os": "win32", "cpu": "arm64" }, "sha512-Pe+RPVTi1T+qymuuRpcdvwSVZjnll/f7n8gBxMMh3xLTctMDKqpdfGimbMyioqtLhUYZxdJ9wGNhV7MKHvgZsQ=="], - "@tailwindcss/oxide-win32-x64-msvc": ["@tailwindcss/oxide-win32-x64-msvc@4.2.2", "", { "os": "win32", "cpu": "x64" }, 
"sha512-1T/37VvI7WyH66b+vqHj/cLwnCxt7Qt3WFu5Q8hk65aOvlwAhs7rAp1VkulBJw/N4tMirXjVnylTR72uI0HGcA=="], + "@tailwindcss/oxide-win32-x64-msvc": ["@tailwindcss/oxide-win32-x64-msvc@4.3.0", "", { "os": "win32", "cpu": "x64" }, "sha512-Mvrf2kXW/yeW/OTezZlCGOirXRcUuLIBx/5Y12BaPM7wJoryG6dfS/NJL8aBPqtTEx/Vm4T4vKzFUcKDT+TKUA=="], - "@tailwindcss/postcss": ["@tailwindcss/postcss@4.2.2", "", { "dependencies": { "@alloc/quick-lru": "^5.2.0", "@tailwindcss/node": "4.2.2", "@tailwindcss/oxide": "4.2.2", "postcss": "^8.5.6", "tailwindcss": "4.2.2" } }, "sha512-n4goKQbW8RVXIbNKRB/45LzyUqN451deQK0nzIeauVEqjlI49slUlgKYJM2QyUzap/PcpnS7kzSUmPb1sCRvYQ=="], + "@tailwindcss/postcss": ["@tailwindcss/postcss@4.3.0", "", { "dependencies": { "@alloc/quick-lru": "^5.2.0", "@tailwindcss/node": "4.3.0", "@tailwindcss/oxide": "4.3.0", "postcss": "^8.5.10", "tailwindcss": "4.3.0" } }, "sha512-Jm05Tjx+9yCLGv5qw1c+84Psds8MnyrEQYCB+FFk2lgGiUjlRqdxke4mVTuYrj2xnVZqKim2Apr5ySuQRYAw/w=="], "@testing-library/dom": ["@testing-library/dom@10.4.1", "", { "dependencies": { "@babel/code-frame": "^7.10.4", "@babel/runtime": "^7.12.5", "@types/aria-query": "^5.0.1", "aria-query": "5.3.0", "dom-accessibility-api": "^0.5.9", "lz-string": "^1.5.0", "picocolors": "1.1.1", "pretty-format": "^27.0.2" } }, "sha512-o4PXJQidqJl82ckFaXUeoAW+XysPLauYI43Abki5hABd853iMhitooc6znOnczgbTYmEP6U6/y1ZyKAIsvMKGg=="], @@ -287,25 +291,25 @@ "@types/estree": ["@types/estree@1.0.8", "", {}, "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w=="], - "@types/node": ["@types/node@25.5.0", "", { "dependencies": { "undici-types": "~7.18.0" } }, "sha512-jp2P3tQMSxWugkCUKLRPVUpGaL5MVFwF8RDuSRztfwgN1wmqJeMSbKlnEtQqU8UrhTmzEmZdu2I6v2dpp7XIxw=="], + "@types/node": ["@types/node@25.9.1", "", { "dependencies": { "undici-types": ">=7.24.0 <7.24.7" } }, "sha512-xfrlY7UD5rMJk3ZVJP8BNzS28J36YJg+xp+LPXV1TdWxr8uMH5A860QNxYDGQe/ylDSgjxE52Q9VnO7p75tJxg=="], - "@types/react": ["@types/react@19.2.14", "", { 
"dependencies": { "csstype": "^3.2.2" } }, "sha512-ilcTH/UniCkMdtexkoCN0bI7pMcJDvmQFPvuPvmEaYA/NSfFTAgdUSLAoVjaRJm7+6PvcM+q1zYOwS4wTYMF9w=="], + "@types/react": ["@types/react@19.2.15", "", { "dependencies": { "csstype": "^3.2.2" } }, "sha512-eRwcGNHve+E8qtEQSSRl6urh+rFop4v8gm6O8rGv25CodbvFdLjA1vVQ1KkiFE0w0UPOnb8tDiFKL5lp0rtY5Q=="], "@types/react-dom": ["@types/react-dom@19.2.3", "", { "peerDependencies": { "@types/react": "^19.2.0" } }, "sha512-jp2L/eY6fn+KgVVQAOqYItbF0VY/YApe5Mz2F0aykSO8gx31bYCZyvSeYxCHKvzHG5eZjc+zyaS5BrBWya2+kQ=="], - "@vitest/expect": ["@vitest/expect@4.1.1", "", { "dependencies": { "@standard-schema/spec": "^1.1.0", "@types/chai": "^5.2.2", "@vitest/spy": "4.1.1", "@vitest/utils": "4.1.1", "chai": "^6.2.2", "tinyrainbow": "^3.0.3" } }, "sha512-xAV0fqBTk44Rn6SjJReEQkHP3RrqbJo6JQ4zZ7/uVOiJZRarBtblzrOfFIZeYUrukp2YD6snZG6IBqhOoHTm+A=="], + "@vitest/expect": ["@vitest/expect@4.1.7", "", { "dependencies": { "@standard-schema/spec": "^1.1.0", "@types/chai": "^5.2.2", "@vitest/spy": "4.1.7", "@vitest/utils": "4.1.7", "chai": "^6.2.2", "tinyrainbow": "^3.1.0" } }, "sha512-1R+tw0ortHEbZDGMymm+pN7/AFQ/RkFFdtd7EN+VBpynKmLbP8A3rpEXdshBJ7+8hQ9zBJh/i1s0yKNtxAnU7w=="], - "@vitest/mocker": ["@vitest/mocker@4.1.1", "", { "dependencies": { "@vitest/spy": "4.1.1", "estree-walker": "^3.0.3", "magic-string": "^0.30.21" }, "peerDependencies": { "msw": "^2.4.9", "vite": "^6.0.0 || ^7.0.0 || ^8.0.0" }, "optionalPeers": ["msw", "vite"] }, "sha512-h3BOylsfsCLPeceuCPAAJ+BvNwSENgJa4hXoXu4im0bs9Lyp4URc4JYK4pWLZ4pG/UQn7AT92K6IByi6rE6g3A=="], + "@vitest/mocker": ["@vitest/mocker@4.1.7", "", { "dependencies": { "@vitest/spy": "4.1.7", "estree-walker": "^3.0.3", "magic-string": "^0.30.21" }, "peerDependencies": { "msw": "^2.4.9", "vite": "^6.0.0 || ^7.0.0 || ^8.0.0" }, "optionalPeers": ["msw", "vite"] }, "sha512-vY7nuamKgfvpA1Koa3oYIw/k7D6kZnpGyNMZW8loow2bsBYla1TFdqTaXncWdRn4pgwNs+90RhnXhJScDwQeJA=="], - "@vitest/pretty-format": ["@vitest/pretty-format@4.1.1", "", { 
"dependencies": { "tinyrainbow": "^3.0.3" } }, "sha512-GM+TEQN5WhOygr1lp7skeVjdLPqqWMHsfzXrcHAqZJi/lIVh63H0kaRCY8MDhNWikx19zBUK8ceaLB7X5AH9NQ=="], + "@vitest/pretty-format": ["@vitest/pretty-format@4.1.7", "", { "dependencies": { "tinyrainbow": "^3.1.0" } }, "sha512-umgCarTOYQWIaDMvGDRZij+6b9oVeLIyJzfN+AS88e0ZOU3QTgNNSTtjQOpcvWr3np1N0j4WgZj+sb3oYBDscw=="], - "@vitest/runner": ["@vitest/runner@4.1.1", "", { "dependencies": { "@vitest/utils": "4.1.1", "pathe": "^2.0.3" } }, "sha512-f7+FPy75vN91QGWsITueq0gedwUZy1fLtHOCMeQpjs8jTekAHeKP80zfDEnhrleviLHzVSDXIWuCIOFn3D3f8A=="], + "@vitest/runner": ["@vitest/runner@4.1.7", "", { "dependencies": { "@vitest/utils": "4.1.7", "pathe": "^2.0.3" } }, "sha512-BapjmAQ2aI78WdMEfeUWivnfVzB+VPGwWRQcJE0OUq7qEeEcBsCSf+0T5iREBNE5nBb4wA5Ya0W6IA+sghdEFw=="], - "@vitest/snapshot": ["@vitest/snapshot@4.1.1", "", { "dependencies": { "@vitest/pretty-format": "4.1.1", "@vitest/utils": "4.1.1", "magic-string": "^0.30.21", "pathe": "^2.0.3" } }, "sha512-kMVSgcegWV2FibXEx9p9WIKgje58lcTbXgnJixfcg15iK8nzCXhmalL0ZLtTWLW9PH1+1NEDShiFFedB3tEgWg=="], + "@vitest/snapshot": ["@vitest/snapshot@4.1.7", "", { "dependencies": { "@vitest/pretty-format": "4.1.7", "@vitest/utils": "4.1.7", "magic-string": "^0.30.21", "pathe": "^2.0.3" } }, "sha512-ZacLzja+TmJeZ1h14xW2FB/WpeimUD3haBXQPyJqxvo8jQTmfeA8zv58mtjN2C7EHXZDYVcVYdYmAxjkWVvKCw=="], - "@vitest/spy": ["@vitest/spy@4.1.1", "", {}, "sha512-6Ti/KT5OVaiupdIZEuZN7l3CZcR0cxnxt70Z0//3CtwgObwA6jZhmVBA3yrXSVN3gmwjgd7oDNLlsXz526gpRA=="], + "@vitest/spy": ["@vitest/spy@4.1.7", "", {}, "sha512-kbkI5LMWakyuTIvs6fUJ5qdIVb1XVKsYJAT4OJ938cHMROYMSfmoQdZy0aaAnjbbc8F61vkoTqz/Az+/HiIu5Q=="], - "@vitest/utils": ["@vitest/utils@4.1.1", "", { "dependencies": { "@vitest/pretty-format": "4.1.1", "convert-source-map": "^2.0.0", "tinyrainbow": "^3.0.3" } }, "sha512-cNxAlaB3sHoCdL6pj6yyUXv9Gry1NHNg0kFTXdvSIZXLHsqKH7chiWOkwJ5s5+d/oMwcoG9T0bKU38JZWKusrQ=="], + "@vitest/utils": ["@vitest/utils@4.1.7", "", { "dependencies": { 
"@vitest/pretty-format": "4.1.7", "convert-source-map": "^2.0.0", "tinyrainbow": "^3.1.0" } }, "sha512-T532WBu791cBxJlCl6SO+J14l81DQx6uQHm1bQbmCDY7nqlEIgkza/UFnSBNaUtSf41unldDFjdOBYEQC4b5Hw=="], "ansi-regex": ["ansi-regex@5.0.1", "", {}, "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ=="], @@ -319,6 +323,8 @@ "bidi-js": ["bidi-js@1.0.3", "", { "dependencies": { "require-from-string": "^2.0.2" } }, "sha512-RKshQI1R3YQ+n9YJz2QQ147P66ELpa1FQEg20Dk8oW9t2KgLbpDLLp9aGZ7y8WHSshDknG0bknqGw5/tyCs5tw=="], + "brier-design": ["brier-design@github:MaxGhenis/brier-design#5e30cc9", {}, "MaxGhenis-brier-design-5e30cc9", "sha512-H4hIZLh2uP34NfNdo8wSVOv+bHDZjEe+/bAPe9R9x6gZUEQT8k2kiPcBnAkDsKS3dCpTdBsvfFXKMqWWXQAh1w=="], + "caniuse-lite": ["caniuse-lite@1.0.30001774", "", {}, "sha512-DDdwPGz99nmIEv216hKSgLD+D4ikHQHjBC/seF98N9CPqRX4M5mSxT9eTV6oyisnJcuzxtZy4n17yKKQYmYQOA=="], "chai": ["chai@6.2.2", "", {}, "sha512-NUPRluOfOiTKBKvWPtSD4PhFvWCqOi0BGStNWs57X9js7XGTprSmFoz5F0tWhR4WPjNeR9jXqdC7/UpSJTnlRg=="], @@ -343,9 +349,9 @@ "dom-accessibility-api": ["dom-accessibility-api@0.6.3", "", {}, "sha512-7ZgogeTnjuHbo+ct10G9Ffp0mif17idi0IyWNVA/wcwcm7NPOD/WEHVP3n7n3MhXqxoIYm8d6MuZohYWIZ4T3w=="], - "enhanced-resolve": ["enhanced-resolve@5.19.0", "", { "dependencies": { "graceful-fs": "^4.2.4", "tapable": "^2.3.0" } }, "sha512-phv3E1Xl4tQOShqSte26C7Fl84EwUdZsyOuSSk9qtAGyyQs2s3jJzComh+Abf4g187lUUAvH+H26omrqia2aGg=="], + "enhanced-resolve": ["enhanced-resolve@5.22.1", "", { "dependencies": { "graceful-fs": "^4.2.4", "tapable": "^2.3.3" } }, "sha512-6QEuw3zoX1SJQc7b87aBXke/no+mG2bTBgw29gWMQonLmpEkWoCAVkl+M49e48AZlWzxiDzDZzYdp6kobcyLww=="], - "entities": ["entities@6.0.1", "", {}, "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g=="], + "entities": ["entities@8.0.0", "", {}, "sha512-zwfzJecQ/Uej6tusMqwAqU/6KL2XaB2VZ2Jg54Je6ahNBGNH6Ek6g3jjNCF0fG9EWQKGZNddNjU5F1ZQn/sBnA=="], "es-module-lexer": 
["es-module-lexer@2.0.0", "", {}, "sha512-5POEcUuZybH7IdmGsD8wlf0AI55wMecM9rVBTI/qEAy2c1kTOm3DjFYjrBdI2K3BaJjJYfYFeRtM0t9ssnRuxw=="], @@ -371,7 +377,7 @@ "js-tokens": ["js-tokens@4.0.0", "", {}, "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ=="], - "jsdom": ["jsdom@29.0.1", "", { "dependencies": { "@asamuzakjp/css-color": "^5.0.1", "@asamuzakjp/dom-selector": "^7.0.3", "@bramus/specificity": "^2.4.2", "@csstools/css-syntax-patches-for-csstree": "^1.1.1", "@exodus/bytes": "^1.15.0", "css-tree": "^3.2.1", "data-urls": "^7.0.0", "decimal.js": "^10.6.0", "html-encoding-sniffer": "^6.0.0", "is-potential-custom-element-name": "^1.0.1", "lru-cache": "^11.2.7", "parse5": "^8.0.0", "saxes": "^6.0.0", "symbol-tree": "^3.2.4", "tough-cookie": "^6.0.1", "undici": "^7.24.5", "w3c-xmlserializer": "^5.0.0", "webidl-conversions": "^8.0.1", "whatwg-mimetype": "^5.0.0", "whatwg-url": "^16.0.1", "xml-name-validator": "^5.0.0" }, "peerDependencies": { "canvas": "^3.0.0" }, "optionalPeers": ["canvas"] }, "sha512-z6JOK5gRO7aMybVq/y/MlIpKh8JIi68FBKMUtKkK2KH/wMSRlCxQ682d08LB9fYXplyY/UXG8P4XXTScmdjApg=="], + "jsdom": ["jsdom@29.1.1", "", { "dependencies": { "@asamuzakjp/css-color": "^5.1.11", "@asamuzakjp/dom-selector": "^7.1.1", "@bramus/specificity": "^2.4.2", "@csstools/css-syntax-patches-for-csstree": "^1.1.3", "@exodus/bytes": "^1.15.0", "css-tree": "^3.2.1", "data-urls": "^7.0.0", "decimal.js": "^10.6.0", "html-encoding-sniffer": "^6.0.0", "is-potential-custom-element-name": "^1.0.1", "lru-cache": "^11.3.5", "parse5": "^8.0.1", "saxes": "^6.0.0", "symbol-tree": "^3.2.4", "tough-cookie": "^6.0.1", "undici": "^7.25.0", "w3c-xmlserializer": "^5.0.0", "webidl-conversions": "^8.0.1", "whatwg-mimetype": "^5.0.0", "whatwg-url": "^16.0.1", "xml-name-validator": "^5.0.0" }, "peerDependencies": { "canvas": "^3.0.0" }, "optionalPeers": ["canvas"] }, "sha512-ECi4Fi2f7BdJtUKTflYRTiaMxIB0O6zfR1fX0GXpUrf6flp8QIYn1UT20YQqdSOfk2dfkCwS8LAFoJDEppNK5Q=="], 
"lightningcss": ["lightningcss@1.32.0", "", { "dependencies": { "detect-libc": "^2.0.3" }, "optionalDependencies": { "lightningcss-android-arm64": "1.32.0", "lightningcss-darwin-arm64": "1.32.0", "lightningcss-darwin-x64": "1.32.0", "lightningcss-freebsd-x64": "1.32.0", "lightningcss-linux-arm-gnueabihf": "1.32.0", "lightningcss-linux-arm64-gnu": "1.32.0", "lightningcss-linux-arm64-musl": "1.32.0", "lightningcss-linux-x64-gnu": "1.32.0", "lightningcss-linux-x64-musl": "1.32.0", "lightningcss-win32-arm64-msvc": "1.32.0", "lightningcss-win32-x64-msvc": "1.32.0" } }, "sha512-NXYBzinNrblfraPGyrbPoD19C1h9lfI/1mzgWYvXUTe414Gz/X1FD2XBZSZM7rRTrMA8JL3OtAaGifrIKhQ5yQ=="], @@ -397,7 +403,7 @@ "lightningcss-win32-x64-msvc": ["lightningcss-win32-x64-msvc@1.32.0", "", { "os": "win32", "cpu": "x64" }, "sha512-Amq9B/SoZYdDi1kFrojnoqPLxYhQ4Wo5XiL8EVJrVsB8ARoC1PWW6VGtT0WKCemjy8aC+louJnjS7U18x3b06Q=="], - "lru-cache": ["lru-cache@11.2.7", "", {}, "sha512-aY/R+aEsRelme17KGQa/1ZSIpLpNYYrhcrepKTZgE+W3WM16YMCaPwOHLHsmopZHELU0Ojin1lPVxKR0MihncA=="], + "lru-cache": ["lru-cache@11.5.1", "", {}, "sha512-RPimw/7aMdv2oqRrxKwvZXcPfwBrn/JZ2xYcY9Hus/6LaS3VOAKVWKWgNLCFSiOm1ESXinjsDlidVU7JlnCN2A=="], "lz-string": ["lz-string@1.5.0", "", { "bin": { "lz-string": "bin/bin.js" } }, "sha512-h5bgJWpxJNswbU7qCrV0tIKQCaS3blPDrqKWx+QxzuzL1zGUzij9XCWLrSLsJPu5t+eWA/ycetzYAO5IOMcWAQ=="], @@ -407,13 +413,13 @@ "min-indent": ["min-indent@1.0.1", "", {}, "sha512-I9jwMn07Sy/IwOj3zVkVik2JTvgpaykDZEigL6Rx6N9LbMywwUSMtxET+7lVoDLLd3O3IXwJwvuuns8UB/HeAg=="], - "nanoid": ["nanoid@3.3.11", "", { "bin": { "nanoid": "bin/nanoid.cjs" } }, "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w=="], + "nanoid": ["nanoid@3.3.12", "", { "bin": { "nanoid": "bin/nanoid.cjs" } }, "sha512-ZB9RH/39qpq5Vu6Y+NmUaFhQR6pp+M2Xt76XBnEwDaGcVAqhlvxrl3B2bKS5D3NH3QR76v3aSrKaF/Kiy7lEtQ=="], - "next": ["next@16.2.1", "", { "dependencies": { "@next/env": "16.2.1", "@swc/helpers": "0.5.15", 
"baseline-browser-mapping": "^2.9.19", "caniuse-lite": "^1.0.30001579", "postcss": "8.4.31", "styled-jsx": "5.1.6" }, "optionalDependencies": { "@next/swc-darwin-arm64": "16.2.1", "@next/swc-darwin-x64": "16.2.1", "@next/swc-linux-arm64-gnu": "16.2.1", "@next/swc-linux-arm64-musl": "16.2.1", "@next/swc-linux-x64-gnu": "16.2.1", "@next/swc-linux-x64-musl": "16.2.1", "@next/swc-win32-arm64-msvc": "16.2.1", "@next/swc-win32-x64-msvc": "16.2.1", "sharp": "^0.34.5" }, "peerDependencies": { "@opentelemetry/api": "^1.1.0", "@playwright/test": "^1.51.1", "babel-plugin-react-compiler": "*", "react": "^18.2.0 || 19.0.0-rc-de68d2f4-20241204 || ^19.0.0", "react-dom": "^18.2.0 || 19.0.0-rc-de68d2f4-20241204 || ^19.0.0", "sass": "^1.3.0" }, "optionalPeers": ["@opentelemetry/api", "@playwright/test", "babel-plugin-react-compiler", "sass"], "bin": { "next": "dist/bin/next" } }, "sha512-VaChzNL7o9rbfdt60HUj8tev4m6d7iC1igAy157526+cJlXOQu5LzsBXNT+xaJnTP/k+utSX5vMv7m0G+zKH+Q=="], + "next": ["next@16.2.6", "", { "dependencies": { "@next/env": "16.2.6", "@swc/helpers": "0.5.15", "baseline-browser-mapping": "^2.9.19", "caniuse-lite": "^1.0.30001579", "postcss": "8.4.31", "styled-jsx": "5.1.6" }, "optionalDependencies": { "@next/swc-darwin-arm64": "16.2.6", "@next/swc-darwin-x64": "16.2.6", "@next/swc-linux-arm64-gnu": "16.2.6", "@next/swc-linux-arm64-musl": "16.2.6", "@next/swc-linux-x64-gnu": "16.2.6", "@next/swc-linux-x64-musl": "16.2.6", "@next/swc-win32-arm64-msvc": "16.2.6", "@next/swc-win32-x64-msvc": "16.2.6", "sharp": "^0.34.5" }, "peerDependencies": { "@opentelemetry/api": "^1.1.0", "@playwright/test": "^1.51.1", "babel-plugin-react-compiler": "*", "react": "^18.2.0 || 19.0.0-rc-de68d2f4-20241204 || ^19.0.0", "react-dom": "^18.2.0 || 19.0.0-rc-de68d2f4-20241204 || ^19.0.0", "sass": "^1.3.0" }, "optionalPeers": ["@opentelemetry/api", "@playwright/test", "babel-plugin-react-compiler", "sass"], "bin": { "next": "dist/bin/next" } }, 
"sha512-qOVgKJg1+At15NpeUP+eJgCHvTCgXsogweq87Ri/Ix7PkqQHg4sdaXmSFqKlgaIXE4kW0g25LE68W87UANlHtw=="], "obug": ["obug@2.1.1", "", {}, "sha512-uTqF9MuPraAQ+IsnPf366RG4cP9RtUi7MLO1N3KEc+wb0a6yKpeL0lmk2IB1jY5KHPAlTc6T/JRdC/YqxHNwkQ=="], - "parse5": ["parse5@8.0.0", "", { "dependencies": { "entities": "^6.0.0" } }, "sha512-9m4m5GSgXjL4AjumKzq1Fgfp3Z8rsvjRNbnkVwfu2ImRqE5D0LnY2QfDen18FSY9C573YU5XxSapdHZTZ2WolA=="], + "parse5": ["parse5@8.0.1", "", { "dependencies": { "entities": "^8.0.0" } }, "sha512-z1e/HMG90obSGeidlli3hj7cbocou0/wa5HacvI3ASx34PecNjNQeaHNo5WIZpWofN9kgkqV1q5YvXe3F0FoPw=="], "pathe": ["pathe@2.0.3", "", {}, "sha512-WUjGcAqP1gQacoQe+OBJsFA7Ld4DyXuUIjZ5cc75cLHvJ7dtNsTugphxIADwspS+AraAUePCKrSVtPLFj/F88w=="], @@ -421,17 +427,17 @@ "picomatch": ["picomatch@4.0.3", "", {}, "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q=="], - "postcss": ["postcss@8.5.8", "", { "dependencies": { "nanoid": "^3.3.11", "picocolors": "^1.1.1", "source-map-js": "^1.2.1" } }, "sha512-OW/rX8O/jXnm82Ey1k44pObPtdblfiuWnrd8X7GJ7emImCOstunGbXUpp7HdBrFQX6rJzn3sPT397Wp5aCwCHg=="], + "postcss": ["postcss@8.5.15", "", { "dependencies": { "nanoid": "^3.3.12", "picocolors": "^1.1.1", "source-map-js": "^1.2.1" } }, "sha512-FfR8sjd4em2T6fb3I2MwAJU7HWVMr9zba+enmQeeWFfCbm+UOC/0X4DS8XtpUTMwWMGbjKYP7xjfNekzyGmB3A=="], - "prettier": ["prettier@3.8.1", "", { "bin": { "prettier": "bin/prettier.cjs" } }, "sha512-UOnG6LftzbdaHZcKoPFtOcCKztrQ57WkHDeRD9t/PTQtmT0NHSeWWepj6pS0z/N7+08BHFDQVUrfmfMRcZwbMg=="], + "prettier": ["prettier@3.8.3", "", { "bin": { "prettier": "bin/prettier.cjs" } }, "sha512-7igPTM53cGHMW8xWuVTydi2KO233VFiTNyF5hLJqpilHfmn8C8gPf+PS7dUT64YcXFbiMGZxS9pCSxL/Dxm/Jw=="], "pretty-format": ["pretty-format@27.5.1", "", { "dependencies": { "ansi-regex": "^5.0.1", "ansi-styles": "^5.0.0", "react-is": "^17.0.1" } }, "sha512-Qb1gy5OrP5+zDf2Bvnzdl3jsTf1qXVMazbvCoKhtKqVs4/YK4ozX4gKQJJVyNe+cajNPn0KoC0MC3FUmaHWEmQ=="], "punycode": ["punycode@2.3.1", "", {}, 
"sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg=="], - "react": ["react@19.2.4", "", {}, "sha512-9nfp2hYpCwOjAN+8TZFGhtWEwgvWHXqESH8qT89AT/lWklpLON22Lc8pEtnpsZz7VmawabSU0gCjnj8aC0euHQ=="], + "react": ["react@19.2.6", "", {}, "sha512-sfWGGfavi0xr8Pg0sVsyHMAOziVYKgPLNrS7ig+ivMNb3wbCBw3KxtflsGBAwD3gYQlE/AEZsTLgToRrSCjb0Q=="], - "react-dom": ["react-dom@19.2.4", "", { "dependencies": { "scheduler": "^0.27.0" }, "peerDependencies": { "react": "^19.2.4" } }, "sha512-AXJdLo8kgMbimY95O2aKQqsz2iWi9jMgKJhRBAxECE4IFxfcazB2LmzloIoibJI3C12IlY20+KFaLv+71bUJeQ=="], + "react-dom": ["react-dom@19.2.6", "", { "dependencies": { "scheduler": "^0.27.0" }, "peerDependencies": { "react": "^19.2.6" } }, "sha512-0prMI+hvBbPjsWnxDLxlCGyM8PN6UuWjEUCYmZhO67xIV9Xasa/r/vDnq+Xyq4Lo27g8QSbO5YzARu0D1Sps3g=="], "react-is": ["react-is@17.0.2", "", {}, "sha512-w2GsyukL62IJnlaff/nRegPQR94C/XXamvMWmSHRJ4y7Ts/4ocGRmTHvOs8PSE6pB3dWOrD/nueuU5sduBsQ4w=="], @@ -463,9 +469,9 @@ "symbol-tree": ["symbol-tree@3.2.4", "", {}, "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw=="], - "tailwindcss": ["tailwindcss@4.2.2", "", {}, "sha512-KWBIxs1Xb6NoLdMVqhbhgwZf2PGBpPEiwOqgI4pFIYbNTfBXiKYyWoTsXgBQ9WFg/OlhnvHaY+AEpW7wSmFo2Q=="], + "tailwindcss": ["tailwindcss@4.3.0", "", {}, "sha512-y6nxMGB1nMW9R6k96e5gdIFzcfL/gTJRNaqGes1YvkLnPVXzWgbqFF2yLC0T8G774n24cx3Pe8XrKoniCOAH+Q=="], - "tapable": ["tapable@2.3.0", "", {}, "sha512-g9ljZiwki/LfxmQADO3dEY1CbpmXT5Hm2fJ+QaGKwSXUylMybePR7/67YW7jOrrvjEgL1Fmz5kzyAjWVWLlucg=="], + "tapable": ["tapable@2.3.3", "", {}, "sha512-uxc/zpqFg6x7C8vOE7lh6Lbda8eEL9zmVm/PLeTPBRhh1xCgdWaQ+J1CUieGpIfm2HdtsUpRv+HshiasBMcc6A=="], "tinybench": ["tinybench@2.9.0", "", {}, "sha512-0+DUvqWMValLmha6lr4kD8iAMK1HzV0/aKnCtWb9v9641TnP/MFb7Pc2bxoxQjTXAErryXVgUOfv2YqNllqGeg=="], @@ -473,7 +479,7 @@ "tinyglobby": ["tinyglobby@0.2.15", "", { "dependencies": { "fdir": "^6.5.0", "picomatch": "^4.0.3" } }, 
"sha512-j2Zq4NyQYG5XMST4cbs02Ak8iJUdxRM0XI5QyxXuZOzKOINmWurp3smXu3y5wDcJrptwpSjgXHzIQxR0omXljQ=="], - "tinyrainbow": ["tinyrainbow@3.0.3", "", {}, "sha512-PSkbLUoxOFRzJYjjxHJt9xro7D+iilgMX/C9lawzVuYiIdcihh9DXmVibBe8lmcFrRi/VzlPjBxbN7rH24q8/Q=="], + "tinyrainbow": ["tinyrainbow@3.1.0", "", {}, "sha512-Bf+ILmBgretUrdJxzXM0SgXLZ3XfiaUuOj/IKQHuTXip+05Xn+uyEYdVg0kYDipTBcLrCVyUzAPz7QmArb0mmw=="], "tldts": ["tldts@7.0.23", "", { "dependencies": { "tldts-core": "^7.0.23" }, "bin": { "tldts": "bin/cli.js" } }, "sha512-ASdhgQIBSay0R/eXggAkQ53G4nTJqTXqC2kbaBbdDwM7SkjyZyO0OaaN1/FH7U/yCeqOHDwFO5j8+Os/IS1dXw=="], @@ -485,15 +491,15 @@ "tslib": ["tslib@2.8.1", "", {}, "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w=="], - "typescript": ["typescript@6.0.2", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-bGdAIrZ0wiGDo5l8c++HWtbaNCWTS4UTv7RaTH/ThVIgjkveJt83m74bBHMJkuCbslY8ixgLBVZJIOiQlQTjfQ=="], + "typescript": ["typescript@6.0.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-y2TvuxSZPDyQakkFRPZHKFm+KKVqIisdg9/CZwm9ftvKXLP8NRWj38/ODjNbr43SsoXqNuAisEf1GdCxqWcdBw=="], - "undici": ["undici@7.24.5", "", {}, "sha512-3IWdCpjgxp15CbJnsi/Y9TCDE7HWVN19j1hmzVhoAkY/+CJx449tVxT5wZc1Gwg8J+P0LWvzlBzxYRnHJ+1i7Q=="], + "undici": ["undici@7.26.0", "", {}, "sha512-3O9Tf67pGhgOv9jM35AbhkXAKi13f3oy3aE4CSgr+TckGeY+/iu97ZXN+J7DpHPzLbVApFd1IFhcnBjREYXYcg=="], - "undici-types": ["undici-types@7.18.2", "", {}, "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w=="], + "undici-types": ["undici-types@7.24.6", "", {}, "sha512-WRNW+sJgj5OBN4/0JpHFqtqzhpbnV0GuB+OozA9gCL7a993SmU+1JBZCzLNxYsbMfIeDL+lTsphD5jN5N+n0zg=="], "vite": ["vite@7.3.1", "", { "dependencies": { "esbuild": "^0.27.0", "fdir": "^6.5.0", "picomatch": "^4.0.3", "postcss": "^8.5.6", "rollup": "^4.43.0", "tinyglobby": "^0.2.15" }, "optionalDependencies": { "fsevents": "~2.3.3" }, "peerDependencies": { 
"@types/node": "^20.19.0 || >=22.12.0", "jiti": ">=1.21.0", "less": "^4.0.0", "lightningcss": "^1.21.0", "sass": "^1.70.0", "sass-embedded": "^1.70.0", "stylus": ">=0.54.8", "sugarss": "^5.0.0", "terser": "^5.16.0", "tsx": "^4.8.1", "yaml": "^2.4.2" }, "optionalPeers": ["@types/node", "jiti", "less", "lightningcss", "sass", "sass-embedded", "stylus", "sugarss", "terser", "tsx", "yaml"], "bin": { "vite": "bin/vite.js" } }, "sha512-w+N7Hifpc3gRjZ63vYBXA56dvvRlNWRczTdmCBBa+CotUzAPf5b7YMdMR/8CQoeYE5LX3W4wj6RYTgonm1b9DA=="], - "vitest": ["vitest@4.1.1", "", { "dependencies": { "@vitest/expect": "4.1.1", "@vitest/mocker": "4.1.1", "@vitest/pretty-format": "4.1.1", "@vitest/runner": "4.1.1", "@vitest/snapshot": "4.1.1", "@vitest/spy": "4.1.1", "@vitest/utils": "4.1.1", "es-module-lexer": "^2.0.0", "expect-type": "^1.3.0", "magic-string": "^0.30.21", "obug": "^2.1.1", "pathe": "^2.0.3", "picomatch": "^4.0.3", "std-env": "^4.0.0-rc.1", "tinybench": "^2.9.0", "tinyexec": "^1.0.2", "tinyglobby": "^0.2.15", "tinyrainbow": "^3.0.3", "vite": "^6.0.0 || ^7.0.0 || ^8.0.0", "why-is-node-running": "^2.3.0" }, "peerDependencies": { "@edge-runtime/vm": "*", "@opentelemetry/api": "^1.9.0", "@types/node": "^20.0.0 || ^22.0.0 || >=24.0.0", "@vitest/browser-playwright": "4.1.1", "@vitest/browser-preview": "4.1.1", "@vitest/browser-webdriverio": "4.1.1", "@vitest/ui": "4.1.1", "happy-dom": "*", "jsdom": "*" }, "optionalPeers": ["@edge-runtime/vm", "@opentelemetry/api", "@types/node", "@vitest/browser-playwright", "@vitest/browser-preview", "@vitest/browser-webdriverio", "@vitest/ui", "happy-dom", "jsdom"], "bin": { "vitest": "vitest.mjs" } }, "sha512-yF+o4POL41rpAzj5KVILUxm1GCjKnELvaqmU9TLLUbMfDzuN0UpUR9uaDs+mCtjPe+uYPksXDRLQGGPvj1cTmA=="], + "vitest": ["vitest@4.1.7", "", { "dependencies": { "@vitest/expect": "4.1.7", "@vitest/mocker": "4.1.7", "@vitest/pretty-format": "4.1.7", "@vitest/runner": "4.1.7", "@vitest/snapshot": "4.1.7", "@vitest/spy": "4.1.7", "@vitest/utils": "4.1.7", 
"es-module-lexer": "^2.0.0", "expect-type": "^1.3.0", "magic-string": "^0.30.21", "obug": "^2.1.1", "pathe": "^2.0.3", "picomatch": "^4.0.3", "std-env": "^4.0.0-rc.1", "tinybench": "^2.9.0", "tinyexec": "^1.0.2", "tinyglobby": "^0.2.15", "tinyrainbow": "^3.1.0", "vite": "^6.0.0 || ^7.0.0 || ^8.0.0", "why-is-node-running": "^2.3.0" }, "peerDependencies": { "@edge-runtime/vm": "*", "@opentelemetry/api": "^1.9.0", "@types/node": "^20.0.0 || ^22.0.0 || >=24.0.0", "@vitest/browser-playwright": "4.1.7", "@vitest/browser-preview": "4.1.7", "@vitest/browser-webdriverio": "4.1.7", "@vitest/coverage-istanbul": "4.1.7", "@vitest/coverage-v8": "4.1.7", "@vitest/ui": "4.1.7", "happy-dom": "*", "jsdom": "*" }, "optionalPeers": ["@edge-runtime/vm", "@opentelemetry/api", "@types/node", "@vitest/browser-playwright", "@vitest/browser-preview", "@vitest/browser-webdriverio", "@vitest/coverage-istanbul", "@vitest/coverage-v8", "@vitest/ui", "happy-dom", "jsdom"], "bin": { "vitest": "vitest.mjs" } }, "sha512-flYyaFd2CgoCoU+0UKt3pxksgC+S02iTDN0n3LtqaMeXsI9SBcdNujc2k0DeFLzUn/0k538yNjOSdwgCqcrwJA=="], "w3c-xmlserializer": ["w3c-xmlserializer@5.0.0", "", { "dependencies": { "xml-name-validator": "^5.0.0" } }, "sha512-o8qghlI8NZHU1lLPrpi2+Uq7abh4GGPpYANlalzWxyWteJOCsr/P+oPBA49TOLu5FTZO4d3F9MnWJfiMo4BkmA=="], @@ -509,17 +515,15 @@ "xmlchars": ["xmlchars@2.2.0", "", {}, "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw=="], - "@asamuzakjp/css-color/lru-cache": ["lru-cache@11.2.6", "", {}, "sha512-ESL2CrkS/2wTPfuend7Zhkzo2u0daGJ/A2VucJOgQ/C48S/zB8MMeMHSGKYpXhIjbPxfuezITkaBH1wqv00DDQ=="], - "@bramus/specificity/css-tree": ["css-tree@3.1.0", "", { "dependencies": { "mdn-data": "2.12.2", "source-map-js": "^1.0.1" } }, "sha512-0eW44TGN5SQXU1mWSkKwFstI/22X2bG1nYzZTYMAWjylYURhse752YgbE4Cx46AC+bAvI+/dYTPRk1LqSUnu6w=="], - "@tailwindcss/oxide-wasm32-wasi/@emnapi/core": ["@emnapi/core@1.8.1", "", { "dependencies": { "@emnapi/wasi-threads": "1.1.0", "tslib": 
"^2.4.0" }, "bundled": true }, "sha512-AvT9QFpxK0Zd8J0jopedNm+w/2fIzvtPKPjqyw9jwvBaReTTqPBk9Hixaz7KbjimP+QNz605/XnjFcDAL2pqBg=="], + "@tailwindcss/oxide-wasm32-wasi/@emnapi/core": ["@emnapi/core@1.10.0", "", { "dependencies": { "@emnapi/wasi-threads": "1.2.1", "tslib": "^2.4.0" }, "bundled": true }, "sha512-yq6OkJ4p82CAfPl0u9mQebQHKPJkY7WrIuk205cTYnYe+k2Z8YBh11FrbRG/H6ihirqcacOgl2BIO8oyMQLeXw=="], - "@tailwindcss/oxide-wasm32-wasi/@emnapi/runtime": ["@emnapi/runtime@1.8.1", "", { "dependencies": { "tslib": "^2.4.0" }, "bundled": true }, "sha512-mehfKSMWjjNol8659Z8KxEMrdSJDDot5SXMq00dM8BN4o+CLNXQ0xH2V7EchNHV4RmbZLmmPdEaXZc5H2FXmDg=="], + "@tailwindcss/oxide-wasm32-wasi/@emnapi/runtime": ["@emnapi/runtime@1.10.0", "", { "dependencies": { "tslib": "^2.4.0" }, "bundled": true }, "sha512-ewvYlk86xUoGI0zQRNq/mC+16R1QeDlKQy21Ki3oSYXNgLb45GV1P6A0M+/s6nyCuNDqe5VpaY84BzXGwVbwFA=="], - "@tailwindcss/oxide-wasm32-wasi/@emnapi/wasi-threads": ["@emnapi/wasi-threads@1.1.0", "", { "dependencies": { "tslib": "^2.4.0" }, "bundled": true }, "sha512-WI0DdZ8xFSbgMjR1sFsKABJ/C5OnRrjT06JXbZKexJGrDuPTzZdDYfFlsgcCXCyf+suG5QU2e/y1Wo2V/OapLQ=="], + "@tailwindcss/oxide-wasm32-wasi/@emnapi/wasi-threads": ["@emnapi/wasi-threads@1.2.1", "", { "dependencies": { "tslib": "^2.4.0" }, "bundled": true }, "sha512-uTII7OYF+/Mes/MrcIOYp5yOtSMLBWSIoLPpcgwipoiKbli6k322tcoFsxoIIxPDqW01SQGAgko4EzZi2BNv2w=="], - "@tailwindcss/oxide-wasm32-wasi/@napi-rs/wasm-runtime": ["@napi-rs/wasm-runtime@1.1.1", "", { "dependencies": { "@emnapi/core": "^1.7.1", "@emnapi/runtime": "^1.7.1", "@tybys/wasm-util": "^0.10.1" }, "bundled": true }, "sha512-p64ah1M1ld8xjWv3qbvFwHiFVWrq1yFvV4f7w+mzaqiR4IlSgkqhcRdHwsGgomwzBH51sRY4NEowLxnaBjcW/A=="], + "@tailwindcss/oxide-wasm32-wasi/@napi-rs/wasm-runtime": ["@napi-rs/wasm-runtime@1.1.4", "", { "dependencies": { "@tybys/wasm-util": "^0.10.1" }, "peerDependencies": { "@emnapi/core": "^1.7.1", "@emnapi/runtime": "^1.7.1" }, "bundled": true }, 
"sha512-3NQNNgA1YSlJb/kMH1ildASP9HW7/7kYnRI2szWJaofaS1hWmbGI4H+d3+22aGzXXN9IJ+n+GiFVcGipJP18ow=="], "@tailwindcss/oxide-wasm32-wasi/@tybys/wasm-util": ["@tybys/wasm-util@0.10.1", "", { "dependencies": { "tslib": "^2.4.0" }, "bundled": true }, "sha512-9tTaPJLSiejZKx+Bmog4uSubteqTvFrVrURwkmHixBo0G4seD0zUxp98E1DzUBJxLQ3NPwXrGKDiVjwx/DpPsg=="], @@ -538,5 +542,9 @@ "whatwg-url/@exodus/bytes": ["@exodus/bytes@1.14.1", "", { "peerDependencies": { "@noble/hashes": "^1.8.0 || ^2.0.0" }, "optionalPeers": ["@noble/hashes"] }, "sha512-OhkBFWI6GcRMUroChZiopRiSp2iAMvEBK47NhJooDqz1RERO4QuZIZnjP63TXX8GAiLABkYmX+fuQsdJ1dd2QQ=="], "@bramus/specificity/css-tree/mdn-data": ["mdn-data@2.12.2", "", {}, "sha512-IEn+pegP1aManZuckezWCO+XZQDplx1366JoVhTpMpBB1sPey/SbveZQUosKiKiGYjg1wH4pMlNgXbCiYgihQA=="], + + "next/postcss/nanoid": ["nanoid@3.3.11", "", { "bin": { "nanoid": "bin/nanoid.cjs" } }, "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w=="], + + "vite/postcss/nanoid": ["nanoid@3.3.11", "", { "bin": { "nanoid": "bin/nanoid.cjs" } }, "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w=="], } } diff --git a/site/next.config.ts b/site/next.config.ts index cb651cd..dbe6dda 100644 --- a/site/next.config.ts +++ b/site/next.config.ts @@ -1,5 +1,23 @@ import type { NextConfig } from "next"; -const nextConfig: NextConfig = {}; +const nextConfig: NextConfig = { + async redirects() { + return [ + { source: "/thesis", destination: "/about", permanent: true }, + { + source: "/:path*", + has: [{ type: "host", value: "farness.ai" }], + destination: "https://app.thesisinstitute.org/:path*", + permanent: true, + }, + { + source: "/:path*", + has: [{ type: "host", value: "www.farness.ai" }], + destination: "https://app.thesisinstitute.org/:path*", + permanent: true, + }, + ]; + }, +}; export default nextConfig; diff --git a/site/package.json b/site/package.json index 8705ed5..ef19499 100644 --- a/site/package.json 
+++ b/site/package.json @@ -1,5 +1,5 @@ { - "name": "farness-site", + "name": "brier-site", "private": true, "version": "0.0.0", "scripts": { @@ -10,22 +10,23 @@ "test": "vitest run" }, "dependencies": { - "next": "^16.2.1", - "react": "^19.2.4", - "react-dom": "^19.2.4" + "brier-design": "github:MaxGhenis/brier-design#v1.1.0", + "next": "^16.2.6", + "react": "^19.2.6", + "react-dom": "^19.2.6" }, "devDependencies": { - "@tailwindcss/postcss": "^4.2.2", + "@tailwindcss/postcss": "^4.3.0", "@testing-library/jest-dom": "^6.9.1", "@testing-library/react": "^16.3.2", - "@types/node": "^25.5.0", - "@types/react": "^19.2.14", + "@types/node": "^25.9.1", + "@types/react": "^19.2.15", "@types/react-dom": "^19.2.3", - "jsdom": "^29.0.1", - "postcss": "^8.5.8", - "prettier": "^3.8.1", - "tailwindcss": "^4.2.2", - "typescript": "^6.0.2", - "vitest": "^4.1.1" + "jsdom": "^29.1.1", + "postcss": "^8.5.15", + "prettier": "^3.8.3", + "tailwindcss": "^4.3.0", + "typescript": "^6.0.3", + "vitest": "^4.1.7" } } diff --git a/site/public/demo/farness-demo-poster.png b/site/public/demo/brier-demo-poster.png similarity index 100% rename from site/public/demo/farness-demo-poster.png rename to site/public/demo/brier-demo-poster.png diff --git a/site/public/demo/farness-demo.mp4 b/site/public/demo/brier-demo.mp4 similarity index 100% rename from site/public/demo/farness-demo.mp4 rename to site/public/demo/brier-demo.mp4 diff --git a/site/public/og-image.png b/site/public/og-image.png index 809b881..c23f30f 100644 Binary files a/site/public/og-image.png and b/site/public/og-image.png differ diff --git a/site/src/__tests__/migration.test.tsx b/site/src/__tests__/migration.test.tsx deleted file mode 100644 index 8699da9..0000000 --- a/site/src/__tests__/migration.test.tsx +++ /dev/null @@ -1,263 +0,0 @@ -import { describe, it, expect, vi } from "vitest"; -import { render, screen } from "@testing-library/react"; - -// Mock next/link to render as a simple tag -vi.mock("next/link", () => ({ - 
default: ({ - children, - href, - ...props - }: { - children: React.ReactNode; - href: string; - [key: string]: unknown; - }) => ( - - {children} - - ), -})); - -// Import components after mocks are set up -import HomePage from "../app/page"; -import DocsPage from "../app/docs/page"; -import ThesisPage from "../app/thesis/page"; - -describe("Next.js migration", () => { - describe("Homepage", () => { - it("renders without crashing", () => { - render(); - }); - - it("renders header with logo", () => { - render(); - const farnessElements = screen.getAllByText("farness"); - expect(farnessElements.length).toBeGreaterThan(0); - }); - - it("renders hero headline", () => { - render(); - const matches = screen.getAllByText(/AI is often fluent about decisions/); - expect(matches.length).toBeGreaterThan(0); - }); - - it("opens on the forecast prototype", () => { - render(); - expect( - screen.getByText( - "Forecasts on every consequential cell of government data", - ), - ).toBeInTheDocument(); - expect(screen.getByText("Prototype status")).toBeInTheDocument(); - expect(screen.getByText("Static mock traces")).toBeInTheDocument(); - }); - - it("renders hero subhead with farness mention", () => { - render(); - expect( - screen.getByText(/demands a forecast: a KPI, a confidence interval/), - ).toBeInTheDocument(); - }); - - it("renders how it works section", () => { - render(); - expect( - screen.getByText("From intuition to instrument"), - ).toBeInTheDocument(); - expect(screen.getByText("Intercept")).toBeInTheDocument(); - expect(screen.getByText("Reframe")).toBeInTheDocument(); - expect(screen.getByText("Anchor")).toBeInTheDocument(); - }); - - it("renders workflow demo section", () => { - render(); - expect( - screen.getByText("Watch the packaged path end to end"), - ).toBeInTheDocument(); - expect( - screen.getAllByLabelText("End-to-end farness workflow demo for Codex") - .length, - ).toBeGreaterThan(0); - }); - - it("renders forecast artifact", () => { - render(); - expect( 
- screen.getByText("Should we rewrite the auth layer now?"), - ).toBeInTheDocument(); - }); - - it("renders research proof section", () => { - render(); - expect(screen.getByText("Stability-under-probing")).toBeInTheDocument(); - expect(screen.getAllByText("11").length).toBeGreaterThan(0); - expect(screen.getAllByText("2").length).toBeGreaterThan(0); - }); - - it("renders instrument modules", () => { - render(); - expect(screen.getByText("What farness produces")).toBeInTheDocument(); - }); - - it("renders editorial pull quote", () => { - render(); - const matches = screen.getAllByText(/AI is often fluent about decisions/); - expect(matches.length).toBeGreaterThanOrEqual(1); - }); - - it("renders installation section", () => { - render(); - expect( - screen.getByText("Use it natively or from the CLI"), - ).toBeInTheDocument(); - expect(screen.getByText("Codex")).toBeInTheDocument(); - expect(screen.getAllByText(/\$farness/).length).toBeGreaterThan(0); - }); - - it("renders closing CTA", () => { - render(); - expect( - screen.getByText("See further before you decide."), - ).toBeInTheDocument(); - }); - - it("renders footer", () => { - render(); - expect(screen.getByText("GitHub")).toBeInTheDocument(); - }); - }); - - describe("Thesis page", () => { - it("renders without crashing", () => { - render(); - }); - - it("renders header with active thesis link", () => { - render(); - const thesisLinks = screen.getAllByText("Thesis"); - expect(thesisLinks.length).toBeGreaterThan(0); - }); - - it("renders thesis title", () => { - render(); - expect(screen.getByText("Forecasting as a harness")).toBeInTheDocument(); - }); - - it("renders all section headings", () => { - render(); - expect(screen.getByText("The problem with advice")).toBeInTheDocument(); - expect(screen.getByText("The reframe")).toBeInTheDocument(); - expect( - screen.getByText("The superforecasting connection"), - ).toBeInTheDocument(); - expect(screen.getByText("Why AI makes this better")).toBeInTheDocument(); 
- expect(screen.getByText("The calibration loop")).toBeInTheDocument(); - expect( - screen.getByText("The decision quality chain"), - ).toBeInTheDocument(); - expect(screen.getByText("The framework")).toBeInTheDocument(); - expect(screen.getByText("When to use it")).toBeInTheDocument(); - expect(screen.getByText("The vision")).toBeInTheDocument(); - }); - - it("renders references section", () => { - render(); - expect(screen.getAllByText("References").length).toBeGreaterThan(0); - }); - }); - - describe("Docs page", () => { - it("renders without crashing", () => { - render(); - }); - - it("renders docs title and install guidance", () => { - render(); - expect( - screen.getByText( - "Use farness with Codex, Claude Code, or the local CLI.", - ), - ).toBeInTheDocument(); - expect( - screen.getByText("Install the package and choose a path"), - ).toBeInTheDocument(); - expect(screen.getAllByText(/farness setup codex/).length).toBeGreaterThan( - 0, - ); - expect( - screen.getAllByText(/farness doctor codex/).length, - ).toBeGreaterThan(0); - expect( - screen.getByText("See the packaged flow before you install"), - ).toBeInTheDocument(); - expect( - screen.getByText("Fix drifted installs or reset cleanly"), - ).toBeInTheDocument(); - expect( - screen.getByText("Draft public forecast questions"), - ).toBeInTheDocument(); - expect(screen.getAllByText(/\$farness/).length).toBeGreaterThan(0); - }); - - it("explains that the CLI does not need an API key", () => { - render(); - expect(screen.getByText(/No LLM API key is/)).toBeInTheDocument(); - }); - }); - - // Paper page is now rendered by Quarto (not a React component) - - describe("shared Header component", () => { - it("renders nav links on all pages", () => { - render(); - expect(screen.getAllByText("GitHub").length).toBeGreaterThan(0); - expect(screen.getAllByText("Docs").length).toBeGreaterThan(0); - }); - - it("renders install button", () => { - render(); - expect(screen.getByText("Install")).toBeInTheDocument(); - }); 
- - it("flags the site as a prototype", () => { - render(); - expect(screen.getByLabelText("Prototype build")).toBeInTheDocument(); - }); - - it("uses Tailwind classes (no old CSS module class names)", () => { - const { container } = render(); - const html = container.innerHTML; - - // These old CSS class names should NOT appear - const oldClasses = [ - 'class="app-dark"', - 'class="header"', - 'class="header-inner"', - 'class="nav-link"', - 'class="btn "', - 'class="btn-accent"', - 'class="btn-ghost"', - ]; - - for (const cls of oldClasses) { - expect(html).not.toContain(cls); - } - }); - }); - - describe("theme classes", () => { - it("Homepage wrapper does NOT have dark theme class (light by default)", () => { - const { container } = render(); - const wrapper = container.firstElementChild as HTMLElement; - expect(wrapper.className).not.toContain("theme-dark"); - }); - - it("Thesis page renders without dark theme", () => { - const { container } = render(); - const wrapper = container.firstElementChild as HTMLElement; - expect(wrapper.className).not.toContain("theme-dark"); - }); - - // Paper page is now Quarto-rendered, not a React component - }); -}); diff --git a/site/src/app/about/page.tsx b/site/src/app/about/page.tsx new file mode 100644 index 0000000..811a7db --- /dev/null +++ b/site/src/app/about/page.tsx @@ -0,0 +1,161 @@ +import { Header } from "@/components/Header"; + +export const metadata = { + title: "About — The Thesis Institute", + description: + "Thesis builds open, calibrated forecasts of public outcomes — every prediction scored against reality, grounded in encoded law, and open all the way down.", +}; + +export default function AboutPage() { + return ( +
+
+
+
+

+ About The Thesis Institute +

+

+ Forecasts, scored against reality. +

+

+ An open, neutral forecaster of the outcomes that shape public + life — calibrated, grounded in encoded law, and open all the way + down. +

+
+ +
+
+

The thesis

+

+ A forecast you can score is worth more than an opinion you + can't. Thesis publishes calibrated forecasts on the outcomes + that shape public life — tax and benefit statistics, poverty, + government data, and the consequences of policy — and grades every + one against reality when the official number arrives. Each forecast + carries its full chain of reasoning. The track record is + the product. +

+

+ Prediction markets aggregate information but can't show their + work; official scores are single estimates filtered through + judgment that isn't fully documented. A forecast that is open + and scored is a different kind of object — auditable + before the fact, accountable after it. +

+
+ +
+

Why now

+

+ AI forecasters now beat most humans. In 2025 an AI system placed + 4th of 539 entrants in the Metaculus Cup, and the trend line is + steep. For the first time, calibrated forecasting can be produced + at the scale, speed, and transparency of a public good — instead of + locked inside a trading desk or a private intelligence shop. +

+
+ +
+

Rules-as-code in the loop

+

+ Not everything that shapes an outcome is uncertain — some of it is + fixed by statute. Axiom, a separate open project, + encodes law as executable rules-as-code and computes those parts + exactly. Brier agents draw on it as a tool: where a forecast turns + on a rule rather than a judgment call, Axiom supplies the exact + figure, and once a policy is enacted its computed result can settle + the forecast in the ledger. PolicyEngine, the + microsimulation instrument in the Thesis stack, runs on the same + encoded rules — two distinct projects, deliberately connected. +

+
+ +
+

The instruments

+

+ Forecasting public outcomes takes more than a language model. Thesis + runs on PolicyEngine — open-source microsimulation + of encoded tax and benefit law — and Microplex — + calibrated synthetic populations. So a forecast isn't a guess + about a headline; it's grounded in the mechanics of the statute + and the shape of the population it lands on. +

+
+ +
+

The model line

+

+ The forecasting agents are a model lineage —{" "} + Brier‑N, named for the calibration score + they're judged by. The arc is deliberate: prompt today's + models, then fine-tune on scored forecast traces, then + reinforcement-learn against open-weights models, and ultimately + train a from-scratch, open forecast-native model with calibration + itself as the reward. It is how Thesis becomes an AI lab without + becoming a closed one — the open-model path that produced + OLMo, pointed squarely at forecasting. +

+
+ +
+

Open all the way down

+

+ Open source opened the code. Open data opened the inputs. Open + weights opened the models.{" "} + Open predictions opens the reasoning itself — + every prior, every tool call, every calibration result, on the + consequential questions where forecasts drive decisions. You + cannot inspect a human forecaster's reasoning the way you can + inspect an agent's; that transparency compounds, because every + improvement is shared the moment it's found. +

+
+ +
+

Why a nonprofit

+

+ A forecaster's only durable asset is being unconflicted. A + for-profit selling foresight to the highest bidder is the + credit-rating-agency trap — the conflict that helped detonate 2008, + applied to the one product whose entire value is having no angle. + Thesis is a public-benefit institute: no owners, no trades, no + angle — funded by philanthropy and sponsored compute, the way the + open public goods before it were built. Nonprofit status is not a + constraint here; it is the structural proof of neutrality. +

+
+ +
+

The flywheel

+

+ Publish a thesis. Reality scores it. The scored record becomes the + training data for the next model, which writes better theses, which + are scored in turn. The public showcase and the training ground are + the same surface — which is why the forecasts and the models can + never really be separated, and why all of it stays open. +

+
+ +
+

Where this goes

+

+ A continuously updated, openly scored map of the outcomes that + decisions depend on — a public good that closed forecasting + infrastructure cannot match, built in the open by the people it + describes. If you want to fund it, build on it, or sponsor a set of + questions you want better calibrated, get in touch:{" "} + + max@policyengine.org + + . 

+
+
+ +
+
+
+ ); +} diff --git a/site/src/app/docs/page.tsx b/site/src/app/docs/page.tsx deleted file mode 100644 index 6c7f620..0000000 --- a/site/src/app/docs/page.tsx +++ /dev/null @@ -1,470 +0,0 @@ -import Link from "next/link"; -import { Header } from "@/components/Header"; -import { DemoVideo } from "@/components/DemoVideo"; - -function CodeBlock({ children }: { children: React.ReactNode }) { - return ( -
-
-        {children}
-      
-
- ); -} - -function Section({ - kicker, - title, - children, -}: { - kicker: string; - title: string; - children: React.ReactNode; -}) { - return ( -
-
-
- {kicker} -
-

- {title} -

-
- {children} -
- ); -} - -export default function DocsPage() { - return ( -
-
-
-
-
- Documentation -
-

- Use farness with Codex, Claude Code, or the local CLI. -

-

- The install story is package-first. The PyPI package now includes - the CLI, MCP server, and packaged Codex and Claude skills. The CLI - itself is local-only and does not call an LLM or require an API key. -

- -
-
-
- Recommended -
-
- Codex + MCP -
-

- Best path if you want native tools, persistent decisions, and - the `$farness` trigger. -

-
-
-
- Local -
-
- CLI / Python -
-

- Use this if you want a decision log and calibration loop without - any agent integration. -

-
-
-
- Plugin -
-
- Claude Code -
-

- Use the plugin if you want the slash-command flow and - Claude-specific integration. -

-
-
-
- -
-
-
-

- 1. Codex with MCP -

-

- This gives Codex native tools, access to stored decisions, and a - reusable `$farness` skill. -

- {`python -m pip install 'farness[mcp]' -farness setup codex -# restart Codex, then use $farness`} -
- -
-

- 2. Claude Code local skill -

-

- This gives Claude Code the same local MCP-backed workflow as - Codex, but through Claude skills instead of the Codex skill - format. -

- {`python -m pip install 'farness[mcp]' -farness setup claude -# restart Claude Code`} -
- -
-

- 3. Local CLI / Python -

-

- This path creates and scores decisions locally. No LLM API key - is required for these commands. -

- {`python -m pip install farness -farness new "Should we rewrite the auth layer?" -farness list -farness calibration`} -
-
- -
-
- Optional -
-
- Claude plugin path -
-

- If you prefer the older plugin flow instead of local Claude - skills, it still works: -

- {`claude plugin marketplace add MaxGhenis/farness -claude plugin install farness@maxghenis-plugins -# then use /farness:decide`} -
-
- -
-
- {[ - [ - "CLI", - "Creates, lists, reviews, and scores decisions in ~/.farness/decisions.jsonl.", - ], - [ - "MCP server", - "Exposes the same decision store as native tools, resources, and prompts for agent clients.", - ], - [ - "Codex skill", - "Tells Codex when to use the MCP tools and what the farness workflow should produce.", - ], - [ - "Claude skill", - "Tells Claude Code when to use the same local MCP server. The older plugin path stays optional.", - ], - ].map(([title, description]) => ( -
-
- {title} -
-

- {description} -

-
- ))} -
-
- -
-
- {`python -m pip install 'farness[mcp]' -farness setup codex`} - {`python -m pip install 'farness[mcp]' -farness setup claude`} -
-

- `farness setup` installs the packaged skill and registers the local - MCP server with the same Python interpreter that launched `farness`. - The last step is just restarting Codex or Claude Code. -

-
- {`farness doctor codex`} - {`farness doctor claude`} -
-

- `farness doctor` checks three things: whether the packaged skill is - installed, whether the agent CLI is on `PATH`, and whether the local - MCP server is already registered. -

-
- -
-
-
-

- This is the actual package-first Codex path from the docs: - install, run setup, use - - {" "} - $farness{" "} - - in Codex, then confirm the decision landed in the local store. -

- {`python -m pip install 'farness[mcp]' -farness setup codex -farness doctor codex`} -
- -
-
- -
-
-
-

- Repair in place -

-

- If the skill file drifted, the agent CLI moved, or MCP setup - only half-worked, let `doctor` repair what it can. -

- {`farness doctor codex --fix -farness doctor claude --fix`} -
-
-

- Reset from scratch -

-

- Remove the local skill and MCP registration, then run setup - again. -

- {`farness uninstall codex -farness setup codex - -farness uninstall claude -farness setup claude`} -
-
-
- -
-
-
-

- The framework is not “ask an LLM for advice.” It is a structured - decision workflow: -

-
    -
  1. Define the KPI and time horizon.
  2. -
  3. Expand the option set beyond the initial framing.
  4. -
  5. Anchor on a reference class or base rate.
  6. -
  7. Show the mechanism or decomposition.
  8. -
  9. Surface disconfirming evidence and traps.
  10. -
  11. Give point estimates with 80% confidence intervals.
  12. -
  13. Set a review date and score outcomes later.
  14. -
-
- {`Decision: Should we rewrite the auth layer now? -KPI: critical_auth_incidents / 90d -Options: rewrite now | defer 60d | harden existing system -Base rate: 27% of similar infra rewrites produce >40% reliability gains -Forecast (rewrite now): 58% [42, 71] -Disconfirming evidence: ops fixes may solve this faster -Review date: 2026-06-15`} -
-
- -
-
-
-

- `farness forecast-draft` turns a stored decision forecast or a - standalone policy question into Manifold-ready JSON. It is - intentionally draft-only: it does not publish anything, place a - bet, or require a Manifold API key. -

-

- For public policy questions, use it to turn a live debate into a - falsifiable forecast with explicit resolution criteria before - anyone posts a public question. The Waymo/DC example uses an - existing Manifold public-service question as the gate, then - drafts conditional aggregate 2027 safety forecasts for DC - traffic fatalities and serious injuries. -

-
- {`farness forecast-draft \\ - "Will Waymo be legally permitted to offer fully driverless paid robotaxi rides in Washington, DC by 2026-12-31?" \\ - --initial-prob 52 \\ - --resolution-date 2026-12-31 \\ - --visibility unlisted \\ - --output waymo-dc-forecast-pack.json - -# From a stored decision with forecasts: -farness forecast-draft abc123 --output forecast-pack.json`} -
-
- -
-
- {[ - { - title: "Architecture", - body: "Should we rewrite the auth layer now or harden the existing service first?", - code: `KPI: critical_auth_incidents / 90d -Options: rewrite now | defer 60d | harden existing -Forecast: rewrite now 58% [42, 71] -Base rate: 27%`, - }, - { - title: "Product", - body: "Should we launch the new onboarding flow this sprint or hold for one more iteration?", - code: `KPI: activated_users / signup cohort -Options: ship now | hold 2 weeks | A/B limited rollout -Forecast: limited rollout 64% [49, 77] -Disconfirming evidence: sample size may be too small`, - }, - { - title: "Hiring", - body: "Should we hire a generalist engineer now or wait for a more specialized infra candidate?", - code: `KPI: roadmap throughput / quarter -Options: hire generalist | wait for specialist | contractor bridge -Forecast: contractor bridge 51% [38, 63] -Review date: 2026-09-01`, - }, - ].map((example) => ( -
-
- {example.title} -
-

- {example.body} -

- {example.code} -
- ))} -
-
- -
-
-
-
- CLI -
-

- No model credentials required. The CLI reads and writes local - decision records only. -

-
-
-
- MCP + skills -
-

- No separate farness API key. Your agent client uses its own - normal model credentials. -

-
-
-
- Experiments -
-

- The experiment runners do call external models and need provider - keys like `OPENAI_API_KEY` or `ANTHROPIC_API_KEY`. -

-
-
-
- -
-
- {[ - { - title: "Skill installed, not triggering", - body: "Run `farness doctor codex` or `farness doctor claude`, then restart the client. Skills are loaded at startup.", - }, - { - title: "Agent CLI not found", - body: "Install the `codex` or `claude` CLI first, then rerun `farness doctor --fix` to register MCP with the right interpreter.", - }, - { - title: "Want a clean reset", - body: "Use `farness uninstall codex` or `farness uninstall claude`, then rerun `farness setup ...` instead of editing config by hand.", - }, - ].map((item) => ( -
-
- {item.title} -
-

- {item.body} -

-
- ))} -
-
- -
- -
-
-
- ); -} diff --git a/site/src/app/globals.css b/site/src/app/globals.css index 2c65a25..714aaab 100644 --- a/site/src/app/globals.css +++ b/site/src/app/globals.css @@ -2,57 +2,7 @@ /* ── Design tokens — "Clear Horizon" palette ── */ -@theme { - /* Core light palette */ - --color-canvas: #F7FAFC; - --color-paper: #FCFDFE; - --color-section: #EEF4F8; - --color-card: #FFFFFF; - - --color-border-soft: #D9E4EC; - --color-border-strong: #BED0DB; - - --color-text-primary: #14202B; - --color-text-secondary: #415463; - --color-text-tertiary: #6B7C89; - --color-text-disabled: #94A3AF; - - /* Atmospheric mist blues */ - --color-mist-100: #E7EFF4; - --color-mist-200: #D6E3EB; - --color-mist-400: #9FB6C6; - --color-mist-600: #5E7A8D; - - /* Horizon blues */ - --color-horizon-300: #9FC4E6; - --color-horizon-500: #5E97C8; - --color-horizon-700: #356C99; - - /* Rose accent system */ - --color-rose-100: #F6E7F0; - --color-rose-300: #E7A6C8; - --color-rose-500: #C96B9C; - --color-rose-600: #A94E80; - --color-rose-700: #863A65; - - /* Dark instrument panel colors */ - --color-ink-dark: #0F1A24; - --color-slate-dark: #172633; - --color-ink-border: #2B3D4B; - --color-ink-text: #E8F0F5; - --color-ink-muted: #9DB1BF; - - /* Semantic */ - --color-accent: #A94E80; - --color-accent-hover: #8E456A; - --color-accent-subtle: #F6E7F0; - - /* Fonts */ - --font-display: "Newsreader", Georgia, serif; - --font-body: "IBM Plex Sans", -apple-system, sans-serif; - --font-editorial: "Instrument Serif", serif; - --font-mono: "IBM Plex Mono", monospace; -} +@import "brier-design/theme.css"; @keyframes fade-up { from { diff --git a/site/src/app/layout.tsx b/site/src/app/layout.tsx index 1aba4ba..87f9c98 100644 --- a/site/src/app/layout.tsx +++ b/site/src/app/layout.tsx @@ -2,31 +2,31 @@ import type { Metadata } from "next"; import "./globals.css"; export const metadata: Metadata = { - title: "Farness — Open Forecast Prototype", + title: "Thesis — open forecasts on government data", description: "Open 
forecast cells for public policy, tax, benefit, poverty, and government data, with agent reasoning traces and calibrated uncertainty.", openGraph: { type: "website", - title: "Farness — Open Forecast Prototype", + title: "Thesis", description: "Open forecast cells for public policy, tax, benefit, poverty, and government data, with agent reasoning traces and calibrated uncertainty.", - url: "https://farness.ai", - siteName: "farness", + url: "https://app.thesisinstitute.org", + siteName: "Thesis", images: [ { - url: "https://farness.ai/og-image.png", + url: "https://app.thesisinstitute.org/og-image.png", width: 1200, height: 630, - alt: "Farness — open forecast prototype for public policy and government data", + alt: "Thesis — open forecasts for public policy and government data", }, ], }, twitter: { card: "summary_large_image", - title: "Farness — Open Forecast Prototype", + title: "Thesis", description: "Open forecast cells for public policy, tax, benefit, poverty, and government data.", - images: ["https://farness.ai/og-image.png"], + images: ["https://app.thesisinstitute.org/og-image.png"], }, }; diff --git a/site/src/app/markets/[slug]/page.tsx b/site/src/app/markets/[slug]/page.tsx index 4073af5..8f8e5ba 100644 --- a/site/src/app/markets/[slug]/page.tsx +++ b/site/src/app/markets/[slug]/page.tsx @@ -23,9 +23,9 @@ export async function generateMetadata({ }): Promise { const { slug } = await params; const m = getMarket(slug); - if (!m) return { title: "Forecast not found — Farness" }; + if (!m) return { title: "Forecast not found — Brier" }; return { - title: `${m.title} — Farness forecasts`, + title: `${m.title} — Brier forecasts`, description: m.question, robots: { index: false, diff --git a/site/src/app/markets/page.tsx b/site/src/app/markets/page.tsx index b21c643..8615017 100644 --- a/site/src/app/markets/page.tsx +++ b/site/src/app/markets/page.tsx @@ -3,9 +3,9 @@ import { Header } from "@/components/Header"; import { MarketsBrowser } from 
"@/components/MarketsBrowser"; export const metadata: Metadata = { - title: "Policy forecasts — Farness", + title: "Policy forecasts — Thesis", description: - "Open forecasts on government statistics, law-encoded policy parameters, and outcomes conditional on policy states. A public prototype for Farness analyst agents calling public data and the PolicyEngine microsim.", + "Open forecasts on government statistics, law-encoded policy parameters, and outcomes conditional on policy states. A public preview of the Thesis, where analyst agents call public data and the PolicyEngine microsim.", robots: { index: false, follow: false, @@ -25,7 +25,7 @@ export default function MarketsPage() {

- Policy futures · prototype + Thesis · policy futures

Forecasts on every consequential cell of government data @@ -35,7 +35,7 @@ export default function MarketsPage() { government data points on published statistics,{" "} policy state forecasts on law-encoded parameters, and conditional forecasts on outcomes given policy - states. The prototype shows the target agent workflow: call public + states. The Almanac shows the target agent workflow: call public data and the PolicyEngine microsim, then publish calibrated uncertainty with an audit trail behind it.

@@ -49,7 +49,7 @@ export default function MarketsPage() { How forecasts are generated

- Every forecast cell is opened by the Farness analyst agent, which + Every forecast cell is opened by the Brier analyst agent, which decomposes the question, calls the PolicyEngine microsim against scenarios drawn from law-encoded statutes and MICROPLEX synthetic populations, integrates external baselines (CBO, FOMC SEP, JCT, BLS, diff --git a/site/src/app/page.tsx b/site/src/app/page.tsx index 75a1370..9b4d5f7 100644 --- a/site/src/app/page.tsx +++ b/site/src/app/page.tsx @@ -1,206 +1,9 @@ "use client"; -import Link from "next/link"; import { Header } from "@/components/Header"; -import { DemoVideo } from "@/components/DemoVideo"; import { MarketsBrowser } from "@/components/MarketsBrowser"; -/* ── Hero ── */ - -function Hero() { - return ( -

- {/* Atmospheric sky gradient */} -
- - {/* Content: two-column on desktop */} -
- {/* LEFT — headline, subhead, CTAs */} -
-
- Decision framework for agents -
-

- Arm your agent’s decisions with forecasting. -

- -

- - farness - {" "} - intercepts agent decisions and demands a forecast: a KPI, a - confidence interval, a base rate, disconfirming evidence, and a - review date. Works with Codex, Claude Code, and any agent that - speaks MCP. -

- -
- - Get started - - - Read the paper - -
-
- - {/* RIGHT — Forecast artifact on dark panel */} -
- -
-
-
- ); -} - -/* ── Forecast Artifact — dark instrument panel ── */ - -function ForecastArtifact() { - return ( -
-
-
- Decision prompt: -
- Should we rewrite the auth layer now? -
-
- -
- Reframed as: -
- P(critical auth incidents decrease by >40% in 90 days | rewrite - now) -
-
- -
- KPI: -
- critical_auth_incidents / 90d -
-
- -
- Forecast: -
- - -
-
- -
- Base rate: -
- 27% - - similar infra rewrites yielding material reliability gains - -
-
- -
- - Disconfirming evidence: - -
- ops fixes may solve this faster - rewrite could slip roadmap delivery - recent outage may overweight urgency -
-
- -
- Review date: -
2026-06-15
-
-
-
- ); -} - -function ForecastBar({ - label, - value, - low, - high, -}: { - label: string; - value: number; - low: number; - high: number; -}) { - return ( -
- - {label}: - -
- {/* Confidence interval fill */} -
- {/* Point estimate */} -
-
- - {value}% [{low}-{high}] - -
- ); -} - -function EvidenceTag({ children }: { children: React.ReactNode }) { - return ( - - {children} - - ); -} - -/* ── Forecast Prototype ── */ +/* ── Almanac hero + live forecast cells ── */ function ForecastPrototype() { return ( @@ -208,13 +11,13 @@ function ForecastPrototype() {

- Policy futures · prototype + Thesis · policy futures

Forecasts on every consequential cell of government data

- Farness analyst agents forecast published government statistics, + Brier analyst agents forecast published government statistics, law-encoded policy parameters, and outcomes conditional on policy states. Each cell carries a calibrated interval and an audit trail of the reasoning behind it. @@ -241,7 +44,7 @@ function ForecastPrototype() { Live API paths

- CPI-U and two CTC cells stream through api.farness.ai. + CPI-U and two CTC cells stream through api.thesisinstitute.org.
@@ -277,27 +80,27 @@ function HorizonDivider() { ); } -/* ── How It Works — 3 steps on light cards ── */ +/* ── How the Almanac works — 3 steps ── */ function HowItWorks() { const stages = [ { num: "01", - title: "Intercept", + title: "The catalog", description: - "Catch decision-language before the model hardens into advice. When a prompt sounds like 'Should we...?' or 'Which is better?', farness reframes it as a forecastable choice.", + "Every consequential cell of government data — published statistics, law-encoded policy parameters, and outcomes conditional on policy states — gets its own forecast cell, with an explicit resolution source and date.", }, { num: "02", - title: "Reframe", + title: "The forecast", description: - "Convert vague 'Should I?' into explicit, measurable outcome questions. Define the KPIs that would actually tell you whether the decision was good.", + "Brier analyst agents predict each cell with a point estimate, a calibrated interval, and a full audit trail of the reasoning, sources, and key drivers behind it — open by construction.", }, { num: "03", - title: "Anchor", + title: "The score", description: - "Produce numeric forecasts with confidence intervals, reference classes from comparable situations, disconfirming evidence, and a review date for accountability.", + "When the official number publishes, every forecast is scored against the record. Calibration is public, per cell and per agent, so the track record is the product.", }, ]; @@ -306,10 +109,10 @@ function HowItWorks() {
- How farness works + How the Almanac works

- From intuition to instrument + Open forecasts, scored against reality

@@ -336,482 +139,38 @@ function HowItWorks() { ); } -/* ── Workflow Demo ── */ - -function WorkflowDemo() { - const steps = [ - "python -m pip install 'farness[mcp]'", - "farness setup codex", - "$farness inside Codex, then review the saved decision locally", - ]; - - return ( -
-
-
- - Workflow demo - -

- Watch the packaged path end to end -

-

- The clip below shows the current Codex path exactly the way the docs - describe it: install the package, register the local MCP server, use - $farness - in Codex, then pull the decision back out of the local store. -

-
- {steps.map((step, index) => ( -
-
- {index + 1} -
-
- {step} -
-
- ))} -
- -
- - -
-
- ); -} - -/* ── Example Transformation (Before / After) ── */ - -function ExampleTransformation() { - return ( -
-
-
- - From intuition to forecast - -

- What the framework forces into view -

-
- -
- {/* Before — light card */} -
- - Diffuse prompt - -

- “Should I refactor this module first?” -

-
- - {/* After — dark instrument panel */} -
- - Farness output - -
-
- KPI: bug_rate / 30d -
-
- Event: >25% bug - reduction -
-
- Horizon: 90 days -
-
- Forecast: 44% [28-61] -
-
- Base rate: 22% -
-
- Disconfirming evidence:{" "} - migration drag, auth edge cases -
-
- Review: 2026-06-15 -
-
-
-
-
-
- ); -} - -/* ── Research Proof — Scholarly Panel ── */ - -function ResearchProof() { - return ( -
-
-
- - Research - -

- Stability-under-probing -

-
- -
- {/* Faint grid texture */} -
- -
- - - -
- -
-

- The paper introduces stability-under-probing as a way to evaluate - decision prompts without waiting for outcomes. In Study 1, farness - looked more prepared for the shared probe battery on Claude Opus - 4.6 and GPT-5.4. -

-

- Study 2 then added held-out probes and showed the broader claim - weakens sharply off-framework. That makes the paper a methods - result first, not proof that farness is universally superior. -

-

- The useful claim is narrower and better: structured decision - prompts can be tested empirically, and farness is one case study. -

-
- -
- - Read the full paper - -
-
-
-
- ); -} - -function StatBlock({ - value, - label, - accent, -}: { - value: string; - label: string; - accent: string; -}) { - return ( -
-
- {value} -
-
- {label} -
-
- ); -} - -/* ── Instrument Modules — What farness produces ── */ - -function InstrumentModules() { - const modules = [ - { - title: "KPI", - description: - "What outcome actually matters. Defined before the analysis, not after.", - }, - { - title: "Forecast", - description: - "Numeric probability for each option. Not opinions — predictions you can score.", - }, - { - title: "Confidence interval", - description: - "The honest range around the estimate. Calibrated uncertainty, not false precision.", - }, - { - title: "Base rate", - description: - "What usually happens in comparable situations. The outside view as empirical anchor.", - }, - { - title: "Disconfirming evidence", - description: - "What counter-evidence, failure modes, or decision traps could make the leading option wrong.", - }, - { - title: "Review date", - description: - "When to check the forecast against reality. Accountability built in.", - }, - ]; - - return ( -
-
-
- - Output primitives - -

- What farness produces -

-
- -
- {modules.map((mod) => ( -
-
- {mod.title} -
-

- {mod.description} -

-
- ))} -
-
-
- ); -} - -/* ── Editorial Pull Quote ── */ - -function WhyItMatters() { - return ( -
-
-

- AI is often fluent about decisions before it is rigorous about them. - farness adds structure before confidence hardens into action. -

-
-
-
- ); -} - -/* ── Installation ── */ - -function Installation() { - const workflows = [ - { - title: "Codex", - description: - "Install the package, run one setup command, then use $farness when a decision prompt shows up.", - code: `$ python -m pip install 'farness[mcp]' -$ farness setup codex -$ # restart Codex, then use $farness`, - }, - { - title: "Claude Code", - description: - "Use the same single-command setup flow for Claude. The plugin is still available if you prefer slash-command UX.", - code: `$ python -m pip install 'farness[mcp]' -$ farness setup claude -$ # restart Claude Code`, - }, - { - title: "CLI / Python", - description: - "Local decision log and calibration tool. No LLM API key required unless you run separate experiment code against external models.", - code: `$ python -m pip install farness -$ farness new "Should we rewrite the auth layer?" -$ farness calibration`, - }, - ]; - - return ( -
-
- - Agent integrations - -

- Use it natively or from the CLI -

- -

- Farness now has a package-first agent path: a local MCP server for - persistence, packaged skills for Codex and Claude Code, and the same - forecast structure used in the paper. The Claude plugin remains - optional, and the CLI is a local store and calibration surface, not an - LLM client. If setup drifts, `farness doctor --fix` repairs the local - integration. -

- -
- {workflows.map((workflow) => ( -
-
- {workflow.title} -
-

- {workflow.description} -

-
-
-                  {workflow.code}
-                
-
-
- ))} -
- - -
-
- ); -} - -/* ── Closing + Footer ── */ - -function ClosingSection() { - return ( -
-

- See further before you decide. -

- - Start with farness - -
- ); -} +/* ── Footer ── */ function Footer() { return ( @@ -825,17 +184,8 @@ export default function HomePage() {
- - - - - - - - - - +
); diff --git a/site/src/app/thesis/page.tsx b/site/src/app/thesis/page.tsx deleted file mode 100644 index 2c404f2..0000000 --- a/site/src/app/thesis/page.tsx +++ /dev/null @@ -1,756 +0,0 @@ -import Link from "next/link"; -import { Cite } from "@/components/Cite"; -import { Header } from "@/components/Header"; - -export default function ThesisPage() { - return ( -
-
-
-
-

- The Farness thesis -

-

- Forecasting as a harness -

-

- Why reframing decisions as predictions leads to better outcomes—and - how to do it. -

-
- -
-
-

The problem with advice

-

- When we ask someone—a friend, a mentor, an AI—"Should I do - X?", we're asking the wrong question. The answer we get - depends entirely on unstated assumptions: What do we value? What - counts as success? How certain is the advisor? None of this is - made explicit. -

-

- Worse, we can never learn from these answers. A year later, we - can't evaluate whether the advice was good because we never - defined what "good" meant. The feedback loop is broken. -

-

- This isn't just a problem with AI (though AI's tendency - toward sycophancy makes it worse - 1). It's a problem with how we structure - decision-making conversations. Annie Duke calls this - "resulting"—judging decisions by outcomes rather than - process - 16. When we ask for advice and get a good - outcome, we credit the advice. Bad outcome, we blame it. But a - single outcome tells us almost nothing about whether the decision - was good. -

-
- -
-

The reframe

-

- Instead of asking for advice, ask for{" "} - forecasts conditional on actions. -

-

The shift is subtle but transformative:

-
-

- Before: "Should I take this job?" -

-

- After: "If I value income, growth, and - work-life balance, what's the probability that each of - these exceeds my threshold under Option A vs Option B? What - assumptions drive those estimates?" -

-
-

This forces several things to happen:

-
    -
  • - Values become explicit. You must state what - you're optimizing for before anyone can help you. -
  • -
  • - Uncertainty becomes visible. A forecast - requires a confidence interval. "Probably fine" - becomes "70% chance, with a range of 50-85%." -
  • -
  • - Assumptions surface. To make a forecast, you - must reason about mechanisms. What needs to be true for this - outcome to occur? -
  • -
  • - Accountability emerges. Predictions can be - scored. Opinions cannot. -
  • -
-
- -
-

The superforecasting connection

-

- This isn't a new idea. Philip Tetlock's research on - superforecasting - 2 identified a set of techniques that reliably - improve predictive accuracy. In the Good Judgment Project, a small - group of forecasters consistently beat professional intelligence - analysts with access to classified information - 3. -

-

Their techniques include:

-
    -
  • - Fermi decomposition: Break complex estimates - into simpler, estimable components - 4. -
  • -
  • - Outside view first: Start with base rates - before adjusting for specifics—what Kahneman calls - "reference class forecasting" - 5. -
  • -
  • - Calibrated confidence: Your 80% predictions - should come true 80% of the time. -
  • -
  • - Continuous updating: Revise estimates as new - information arrives, following Bayesian principles. -
  • -
-

- Superforecasters don't have access to secret information. - They're just more disciplined about structuring their - thinking. Across nearly 100 comparative studies, Dawes, Faust, and - Meehl found that structured "mechanical" prediction - equaled or outperformed unstructured expert judgment in every - domain tested - 17. Farness applies this discipline to - personal and professional decisions. -

-
- -
-

Why AI makes this better

-

- Large language models are surprisingly good at forecasting. LLM - ensembles can match human crowd accuracy on prediction tasks - 6. Halawi et al. built a retrieval-augmented - system that approaches competitive forecaster accuracy - 18, and AI forecasting systems like AIA - Forecaster have achieved superforecaster-level performance through - structured pipelines of search, independent reasoning, and - calibration - 7. The CAIS forecasting bot has demonstrated - superhuman accuracy on competitive forecasting platforms - 8. On ForecastBench, LLMs now surpass the - median public forecaster, with projected LLM-superforecaster - parity by late 2026 - 28. -

-

- But LLMs are also prone to sycophancy: telling you what you want - to hear rather than what's true. Research has shown this - tendency is robust across models and contexts - 1. -

-

- The forecasting frame is a harness that constrains this - tendency. When you ask an AI for a probability with a confidence - interval, it's harder for it to simply validate your existing - beliefs. Numbers create accountability. Xiong et al. found that - structured elicitation strategies—multi-step prompting, top-k - sampling—can help mitigate LLM overconfidence, though no single - technique consistently outperforms others - 19. How you ask matters as much as what you - ask. -

-

- More importantly, the structure itself improves thinking. Research - on LLM-augmented forecasting found that AI assistance - significantly boosts human forecasting accuracy, with the largest - gains for less experienced forecasters - 9: -

-
    -
  • - KPI definition forces you to articulate what - you actually care about. -
  • -
  • - Option expansion surfaces alternatives you - hadn't considered. -
  • -
  • - Assumption surfacing reveals where your model - might be wrong. -
  • -
  • - Sensitivity analysis shows which uncertainties - matter most. -
  • -
-

The AI becomes a structured thinking partner, not an oracle.

-

- See the research: I've developed a - methodology called "stability-under-probing" to - empirically test whether frameworks reduce sycophancy.{" "} - Read the paper → -

-
- -
-

The calibration loop

-

- The most powerful part of this approach is what happens over time. - By logging your forecasts and scoring them against reality, you - build a calibration curve. -

-

- Research on expert prediction shows that without feedback, even - domain experts are poorly calibrated - 10. Lichtenstein, Fischhoff, and Phillips - found that when people said they were 98% confident, they were - correct only 68% of the time - 20. But with structured feedback, calibration - improves dramatically. Weather forecasters and professional - oddsmakers—who receive regular, structured feedback on their - probabilistic predictions—exhibited little or no overconfidence. - The Good Judgment Project confirmed this: regular accuracy - feedback was one of the key interventions that improved - performance - 3. -

-

- You learn that you're overconfident on career decisions. Or - underconfident on technical estimates. Or systematically biased - toward optimism about timelines. -

-

- This meta-knowledge is invaluable. It's not just about making - better individual decisions—it's about understanding your own - decision-making patterns and compensating for systematic biases. -

-
- -
-

The decision quality chain

-

- Ron Howard and the Strategic Decisions Group developed a framework - for measuring decision quality at the time of decision, - independent of outcome - 21. A decision is only as good as its weakest - link across six elements: appropriate frame, creative - alternatives, reliable information, clear values, sound reasoning, - and commitment to action - 22. -

-

- Farness maps directly onto this chain. Defining KPIs addresses{" "} - frame and values. Option expansion addresses{" "} - creative alternatives. Forecasting with base rates - addresses reliable information and{" "} - sound reasoning. The calibration loop addresses the - feedback mechanism that strengthens every link over time. -

-

- The key insight from decision analysis is that you can assess - decision quality without waiting for outcomes. Howard's - information value theory shows that when decisions are framed as - forecasts, you can calculate exactly how much to invest in - resolving each uncertainty - 23. If the expected value of learning your - probability of success is only $50, don't spend $5,000 on a - feasibility study. -

-

- This connects to what Kahneman and Lovallo call the "inside - view" versus "outside view" - 24. Decision makers naturally treat each - problem as unique, anchoring on plans and scenarios rather than - base rates from comparable situations. Reframing decisions as - forecasts naturally invokes the outside view by forcing explicit - probability assessment against a reference class. -

-
- -
-

Boosting, not nudging

-

- Hertwig and Grüne-Yanoff distinguish "nudges" - (environmental changes that steer behavior) from - "boosts" (interventions that build decision-making - competence) - 25. A nudge might default your retirement - savings to 10%. A boost teaches you to think about compound - interest so you choose the right rate yourself. -

-

- Farness is a boost, not a nudge. It doesn't tell you what to - decide. It teaches a way of thinking—probabilistic, structured, - accountable—that transfers across domains. Julia Galef calls this - the "scout mindset": treating beliefs as provisional - hypotheses to be stress-tested, not positions to defend - 26. The forecasting frame cultivates this - mindset by making accuracy the explicit goal. -

-

- And critically, Koriat, Lichtenstein, and Fischhoff showed that - simply asking people to generate reasons against their - preferred option eliminates overconfidence almost entirely - 27. Structured consideration of - alternatives—a core forecasting discipline—is one of the most - robust debiasing techniques known. -

-
- -
-

The framework

-

- Farness implements a five-step process, drawing on structured - analytic techniques from intelligence analysis - 11 and the superforecasting literature: -

-
    -
  1. - Define KPIs. What outcomes matter? Pick 1-3 - metrics you'd actually use to judge success in hindsight. - This mirrors the "AIMS" technique (Audience, Issue, - Message, Storyline) from intelligence analysis - 11. -
  2. -
  3. - Expand options. Don't just compare A vs B. - What about C? Waiting? A hybrid? The best option is often one - you didn't initially consider. This combats "premature - closure"—a well-documented cognitive bias - 12. -
  4. -
  5. - Decompose and forecast. For each option × KPI pair, - apply outside view, inside view, Fermi decomposition. Produce a - point estimate with confidence interval. Decomposition is one of - Heuer's core structured analytic techniques - 11. -
  6. -
  7. - Surface assumptions. What must be true for this - forecast to hold? What would change it? This is the "key - assumptions check" from intelligence tradecraft - 13. -
  8. -
  9. - Log and score. Record the decision. Return in - 3-6 months. Compare predictions to reality. Update your - calibration. Brier scores provide a proper scoring rule that - rewards both accuracy and calibration - 14. -
  10. -
-
- -
-

When to use it

-

Farness is valuable across a range of decisions:

-
    -
  • - High-stakes decisions where the cost of being - wrong is significant. -
  • -
  • - Recurring decision types where you can build - calibration over time. -
  • -
  • - Decisions with delayed feedback where you - won't know if you were right for months or years. -
  • -
  • - Decisions where you suspect motivated reasoning - —where you might be fooling yourself - 15. -
  • -
  • - Smaller decisions as practice—building the - habit and calibration data that pays off when stakes are high. -
  • -
-
- -
-

The vision

-

Imagine a world where every significant decision comes with:

-
    -
  • Explicit success criteria
  • -
  • A range of options, not just the obvious ones
  • -
  • Quantified predictions with uncertainty ranges
  • -
  • Surfaced assumptions that can be tested
  • -
  • A record that can be scored and learned from
  • -
-

- This is possible today. The tools exist. The research supports it. - What's missing is the habit—the muscle memory of reaching for - forecasts instead of opinions. -

-

- Farness is an attempt to build that habit. Use it as a Python - library, a CLI tool, or a Claude Code plugin. Log your decisions. - Score your predictions. Get better over time. -

-

- - Get started → - -

-
-
- -
-

- References -

-
    -
  1. - Sharma, M., et al. (2024). "Towards - Understanding Sycophancy in Language Models."{" "} - ICLR 2024.{" "} - - openreview.net - -
  2. -
  3. - Tetlock, P. E., & Gardner, D. (2015).{" "} - Superforecasting: The Art and Science of Prediction. - Crown.{" "} - - Amazon - -
  4. -
  5. - Mellers, B., et al. (2014). - "Psychological Strategies for Winning a Geopolitical - Forecasting Tournament." Psychological Science, - 25(5), 1106-1115.{" "} - - DOI - -
  6. -
  7. - Good Judgment. - "Superforecasters' Toolbox: Fermi-ization in - Forecasting."{" "} - - goodjudgment.com - -
  8. -
  9. - Kahneman, D., & Tversky, A. (1979). - "Intuitive Prediction: Biases and Corrective - Procedures." TIMS Studies in Management Science, 12, - 313-327. -
  10. -
  11. - Schoenegger, P., et al. (2024). - "Wisdom of the Silicon Crowd: LLM Ensemble Prediction - Capabilities Rival Human Crowd Accuracy."{" "} - arXiv:2402.19379.{" "} - - arxiv.org/abs/2402.19379 - -
  12. -
  13. - Alur, R., et al. (2025). "AIA - Forecaster: Technical Report." arXiv:2511.07678.{" "} - - arxiv.org/abs/2511.07678 - -
  14. -
  15. - Center for AI Safety. "Superhuman - Automated Forecasting."{" "} - - safe.ai/blog/forecasting - -
  16. -
  17. - Schoenegger, P., et al. (2024). - "AI-Augmented Predictions: LLM Assistants Improve Human - Forecasting Accuracy." arXiv:2402.07862.{" "} - - arxiv.org/abs/2402.07862 - -
  18. -
  19. - Tetlock, P. E. (2005).{" "} - - Expert Political Judgment: How Good Is It? How Can We Know? - {" "} - Princeton University Press.{" "} - - Princeton University Press - -
  20. -
  21. - Heuer, R. J., & Pherson, R. H. (2015).{" "} - Structured Analytic Techniques for Intelligence Analysis{" "} - (2nd ed.). CQ Press.{" "} - - Amazon - -
  22. -
  23. - Kruglanski, A. W., & Webster, D. M. - (1996). "Motivated Closing of the Mind: 'Seizing' - and 'Freezing'." Psychological Review, - 103(2), 263-283.{" "} - - DOI - -
  24. -
  25. - CIA. (2009). "A Tradecraft Primer: - Structured Analytic Techniques for Improving Intelligence - Analysis."{" "} - - cia.gov - -
  26. -
  27. - Brier, G. W. (1950). "Verification - of Forecasts Expressed in Terms of Probability."{" "} - Monthly Weather Review, 78(1), 1-3.{" "} - - DOI - -
  28. -
  29. - Kunda, Z. (1990). "The Case for - Motivated Reasoning." Psychological Bulletin, - 108(3), 480-498.{" "} - - DOI - -
  30. -
  31. - Duke, A. (2018).{" "} - - Thinking in Bets: Making Smarter Decisions When You Don't - Have All the Facts - - . Portfolio/Penguin. -
  32. -
  33. - Dawes, R. M., Faust, D., & Meehl, P. E. - (1989). "Clinical Versus Actuarial Judgment."{" "} - Science, 243(4899), 1668-1674.{" "} - - DOI - -
  34. -
  35. - Halawi, D., Zhang, F., Chen, Y.-H., & - Steinhardt, J. (2024). "Approaching Human-Level Forecasting - with Language Models." NeurIPS 2024.{" "} - - arxiv.org/abs/2402.18563 - -
  36. -
  37. - Xiong, M., Hu, Z., Lu, X., et al. (2024). - "Can LLMs Express Their Uncertainty? An Empirical Evaluation - of Confidence Elicitation in LLMs." ICLR 2024.{" "} - - arxiv.org/abs/2306.13063 - -
  38. -
  39. - Lichtenstein, S., Fischhoff, B., & - Phillips, L. D. (1982). "Calibration of Probabilities: The - State of the Art to 1980." In D. Kahneman, P. Slovic, & A. - Tversky (Eds.),{" "} - Judgment under Uncertainty: Heuristics and Biases (pp. - 306-334). Cambridge University Press. -
  40. -
  41. - Howard, R. A. (1988). "Decision - Analysis: Practice and Promise." Management Science, - 34(6), 679-695.{" "} - - DOI - -
  42. -
  43. - Spetzler, C., Winter, H., & Meyer, J. - (2016).{" "} - - Decision Quality: Value Creation from Better Business Decisions - - . Wiley. -
  44. -
  45. - Howard, R. A. (1966). "Information - Value Theory."{" "} - IEEE Transactions on Systems Science and Cybernetics, - 2(1), 22-26.{" "} - - DOI - -
  46. -
  47. - Kahneman, D., & Lovallo, D. (1993). - "Timid Choices and Bold Forecasts: A Cognitive Perspective on - Risk Taking." Management Science, 39(1), 17-31.{" "} - - DOI - -
  48. -
  49. - Hertwig, R., & Grüne-Yanoff, T. (2017). - "Nudging and Boosting: Steering or Empowering Good - Decisions." Perspectives on Psychological Science, - 12(6), 973-986.{" "} - - DOI - -
  50. -
  51. - Galef, J. (2021).{" "} - - The Scout Mindset: Why Some People See Things Clearly and Others - Don't - - . Portfolio/Penguin. -
  52. -
  53. - Koriat, A., Lichtenstein, S., & - Fischhoff, B. (1980). "Reasons for Confidence."{" "} - - Journal of Experimental Psychology: Human Learning and Memory - - , 6(2), 107-118.{" "} - - DOI - -
  54. -
  55. - Karger, E., et al. (2025). - "ForecastBench: A Dynamic Benchmark of AI Forecasting - Capabilities." ICLR 2025.{" "} - - openreview.net - -
  56. -
-
- - -
-
- ); -} diff --git a/site/src/app/vision/page.tsx b/site/src/app/vision/page.tsx deleted file mode 100644 index e2f270d..0000000 --- a/site/src/app/vision/page.tsx +++ /dev/null @@ -1,550 +0,0 @@ -import type { Metadata } from "next"; -import Link from "next/link"; -import { Header } from "@/components/Header"; - -export const metadata: Metadata = { - title: "Farness vision — working document", - description: - "Working synthesis of the Farness Foundation vision: we build open AI forecasters that publish, explain, and score predictions on consequential outcomes.", - robots: { - index: false, - follow: false, - nocache: true, - googleBot: { - index: false, - follow: false, - noimageindex: true, - }, - }, -}; - -export default function VisionPage() { - return ( -
-
-
-
-

- Working document · not for distribution -

-

- Open predictions -

-

- - farness - {" "} - builds open AI forecasters. We make them predict consequential - outcomes, show their work, call public tools, publish calibrated - uncertainty, and score every forecast against reality. -

-
- -
-
-

The bet

-

- Open source software opened code. Open data opened the inputs. - Open weights opened the reasoning machinery. We open the - predictions: continuously-updated forecasts from AI systems whose - tool calls, assumptions, uncertainty, calibration, and later - outcomes are public. -

-

- We use forecasting as an alignment pressure. A system that must - predict public facts before they happen has to track reality, - expose uncertainty, use evidence, and learn from misses. When the - trace is open, the public can inspect the model's evidence and the - model can learn from the public record of its own errors. -

-

- The traces create a compounding loop. Aggregate them and - systematic biases become visible. Score the forecasts and weak - methods lose credibility. Publish the fixes and the next - generation of forecasters starts from a better baseline. Applied - to prediction, the open-source dynamic becomes epistemic - infrastructure. -

-
- -
-

What we build

-

- Farness Foundation builds four connected pieces of public-good - forecasting infrastructure: -

-

We run open forecasters on consequential questions

-

- We run AI-agent ensembles across the structured grid of - consequential questions: government statistics from BEA, BLS, - Census, and IRS; law-encoded policy parameters; and counterfactual - questions that drive policy and economic decisions. We publish the - forecasts, the traces, the calibration history, and the running - methodology notes openly. Funded compute scales the depth of the - ensemble; the substrate stays free at the point of use. -

-

We simulate policy with inspectable models

-

- We maintain PolicyEngine as open-source microsimulation for US, - UK, and Canadian tax-benefit systems. Governments, think tanks, - advocacy organizations, and researchers use it for custom policy - analysis. Farness forecasters call PolicyEngine when they need - policy-conditional distributions, and PolicyEngine keeps serving - the policy community through the brand and workflows people - already know. -

-

We build calibrated synthetic populations

-

- We build Microplex as the synthetic micro-data substrate for - PolicyEngine simulations and calibration-native AI research. We - publish the population data, methodology, and synthesizer code - openly. Microplex replaces PolicyEngine's Enhanced CPS substrate - with data calibrated more tightly to administrative benchmarks and - useful beyond tax-benefit microsimulation. -

-

We make everyday agent advice forecastable

-

- We maintain the open-source Farness Decisions package, CLI, MCP - server, and agent skills. They turn advice-seeking into explicit - forecasts with KPIs, options, confidence intervals, resolution - rules, and calibration tracking. This keeps the same discipline - available for individual decisions, team decisions, and public - policy forecasts. -

-
- -
-

The transparency advantage is the durable moat

-

- We make transparency the core mechanism. Every methodology - improvement, newly-discovered bias, and successful tool - integration becomes shared infrastructure. Researchers can inspect - the trace, reproduce the forecast, challenge the assumptions, and - contribute a better method. Each improvement raises the baseline - for everyone who builds on the substrate. -

-

- The same dynamic that made Linux durable protects Farness's - position. The compounding work happens across the whole community - of users and contributors. The foundation maintains the core - infrastructure, integrates the best contributions, sets direction, - and protects the public-good character. The community expands the - surface area faster than any single organization could. -

-
- -
-

Built for the agents of tomorrow

-

- The infrastructure that matters most gets built ahead of the - capability that needs it. TCP/IP was designed for a few hundred - nodes and scaled to billions because the design anticipated future - use. Kubernetes solved orchestration problems most organizations - had not yet reached when it shipped. Linux was built when - computing was tiny and scaled with hardware nobody had imagined. - Substrate-builders capture disproportionate value because they are - already there when the demand shows up. -

-

- Farness is built with this in mind. Every capability is reachable - through a clean machine-callable API; future agents will call - tools directly. Every agent trace is structured for downstream - consumption by other agents and human readers. Every tool in the - simulation engine is self-describing so that agents that have not - been invented yet can discover what is available. Permissioning - anticipates millions of automated participants through scoped - automated access. Calibration scoring is queryable, so current - agents can learn from history and future agents can preferentially - route to tool configurations with proven track records. -

-

- This costs a little more today and pays disproportionately when - capability arrives. By the time agents are reliably orchestrating - tools, composing pipelines, and proposing methodology - improvements, the substrate they need will already be open, - public, free, and continuously calibrated. Open substrate gives - tomorrow's agents permission-less infrastructure the next decade - of AI development can build on. -

-
- -
-

The stack

-

- Two independent 501(c)(3) foundations, technically integrated as - one open stack: -

-
-

- Encoded-law substrate — computable statutes, - regulations, holdings, and metadata linking published government - statistics to the laws that mandate them. Separate organization, - shared substrate. -

-

- Farness Foundation — open-predictions platform, - microsimulation engine and custom policy analysis - (PolicyEngine), synthetic-population substrate (Microplex), - personal decision tool (Farness Decisions), and the research - program on calibration-native foundation models and value - forecasting. -

-
-

- The Farness platform consumes encoded law, government data - architecture, and Microplex population substrate as inputs, runs - ensembles through PolicyEngine and other computational engines, - and publishes calibrated forecasts. Policy partners interact with - PolicyEngine directly through its own brand and channels. New - audiences — AI safety, agencies funding their own forecasts, - prediction-market researchers, broader policy analysts — interact - with Farness as the umbrella platform. -

-
- -
-

Open predictions as a movement

-

- The category needs a name to anchor its identity. The lineage: -

-
    -
  • - Open source software opened the code. Linux, - Apache, Mozilla. The free software movement and its successors - made source available and rewrote the economics of software - distribution. -
  • -
  • - Open data opened the inputs. Wikipedia, Common - Crawl, OpenStreetMap, government open-data portals. The raw - material of analysis became public and citable. -
  • -
  • - Open weights opened the reasoning machinery. - Allen Institute's Olmo, Llama, Mistral, DeepSeek. The trained - models themselves became available for inspection and reuse. -
  • -
  • - Open predictions opens the reasoning - itself, on consequential questions. Every prior, every - tool call, every update is auditable. The output includes the - forecast and the full chain of reasoning that produced it. -
  • -
-

- Each step opens more of the epistemic process. Each step produces - durable public goods and gives the next generation of builders - more to start from. Open predictions is the natural next layer, - and Farness is the foundation building it. -

-
- -
-

We align AI by making it predict

-

- We give AI systems a narrow job with a hard feedback loop: predict - consequential outcomes before they happen, explain the evidence - behind the prediction, quantify uncertainty, and accept a public - score when reality arrives. That objective pushes models toward - truth-tracking behavior because calibration, evidence use, and - humility become measurable product requirements. -

-

- We use the strongest available models as forecasters today. We - connect them to public data, encoded law, PolicyEngine - simulations, Microplex populations, and explicit calibration - records. We evaluate which model-tool-method combinations predict - best. We publish the traces so other researchers can reproduce, - criticize, and improve the methods. -

-

- As the corpus grows, we train and evaluate prediction-native - systems: agents that select tools, decompose questions, maintain - uncertainty, update on evidence, and learn from scored outcomes. - The lab advances by making forecasts useful in the world and by - making the full learning loop open. -

-
- -
-

We train prediction-native agents

-

- The long-run research program is not just prompting other - companies' models. We build the data, tools, and evaluation loop - needed to train our own forecasters. The training corpus is - time-versioned so every backtest can ask what the agent could have - known on a specific date, which tool versions were available, and - which outcomes had not yet resolved. That leakage control turns - forecasting into a real scientific benchmark instead of a vibes - demo. -

-

- We make tool use native. The agent learns that forecasting often - means calling BLS, Census, IRS, CMS, CBO, BEA, PolicyEngine, - Microplex, and other public or inspectable systems. Tool outputs - carry provenance, vintage, and uncertainty. Forecast artifacts - store the question, evidence, tool calls, assumptions, uncertainty - decomposition, final distribution, resolution rule, and eventual - score as first-class data. -

-

- We train on scored predictions. Supervised learning starts from - strong public forecast traces. Reinforcement learning and reward - modeling optimize proper scoring rules, calibration, interval - sharpness, and decision usefulness once enough forecasts resolve. - The objective is not eloquence; it is measured accuracy under - uncertainty with a visible audit trail. -

-

- We let agents learn from other agents' public traces. Not private - chain-of-thought, but durable artifacts: which evidence was used, - which tools were called, which assumptions mattered, which - forecaster configurations were overconfident, and which methods - improved after resolution. Future agents can route to the tools, - methods, and peer traces with the best calibration record. -

-

- This is how Farness can become an AI lab without becoming a closed - frontier company. Whether we train foundation models directly or - specialize open models into forecasters, the purpose stays narrow - and public: build systems whose job is to predict, explain, - resolve, and improve against reality. -

-
- -
-

Funder fit

-

- The funder base that matches the thesis is broader and more - accessible than the funder base for any of the predecessor - framings: -

-
    -
  • - Coefficient Giving (Open Philanthropy rebrand) - — AI safety, forecasting infrastructure, consequence-visibility - framing fits directly in their existing grant portfolios. -
  • -
  • - Survival and Flourishing Fund — long-horizon AI - safety and alignment-adjacent infrastructure. -
  • -
  • - Astera Institute,{" "} - Schmidt Sciences / Schmidt Futures,{" "} - Mozilla Foundation — novel public-good - scientific infrastructure and open-source AI ethos. -
  • -
  • - - Anthropic alumni and AI-safety-aligned liquidity - {" "} - — tender-offer and IPO-event capital from Anthropic and similar - frontier labs. Open-source-by-construction means current - frontier-lab employees can publicly back the work without - conflict of interest. The complement-not-compete frame is unique - to this category. -
  • -
  • - Arnold Ventures Mission Aligned Investments — - fits the structure Andrew Moylan and team have already signaled - interest in, particularly for the open policy-forecasting - infrastructure angle. -
  • -
  • - - Government agencies and international equivalents - {" "} - — Treasury, state revenue offices, Federal Reserve regional - banks, HHS, Census, and international counterparts paying for - marginal compute on the questions they care about. Sponsored - runs are program-related revenue that fits 501(c)(3) structure - cleanly. -
  • -
  • - National research funding — NSF, DARPA, IARPA, - ARIA UK, NIH for specific research directions. -
  • -
  • - - Sponsorship capital from AI labs, Big Tech, and philanthropies - - , per the Fradkin/Jabarian/Koh - well-capitalized-prediction-markets model, applied to specific - question sets the sponsor wants better-calibrated forecasts on. -
  • -
-

- Farness funds the work through multiple channels. Foundation - grants fund the platform and research. Sponsored compute pays for - specific question coverage. Custom analysis through PolicyEngine - generates additional program revenue. The revenue mix keeps the - foundation institutionally independent. -

-
- -
-

What success looks like in five years

-

- At maturity, Farness produces continuously-updated calibrated - forecasts on every consequential government statistic, every - encoded policy parameter, and every counterfactual conditional - question stakeholders care about. The platform runs hundreds to - thousands of specialized agent configurations, each with published - methodology and visible track record. Calibration history goes - back years and is queryable per question, per configuration, per - resolution period. Government agencies fund targeted compute on - their projection questions. Researchers build on the open - infrastructure for their own work. Frontier AI labs use the - calibration corpus as a training and evaluation resource. - Open-source forecaster configurations and tool integrations are - contributed by people the foundation has never met. -

-

- The forecasts feed into the decisions of governments, advocacy - organizations, firms, and individuals because calibrated - probability distributions with visible evidence improve the - decisions those institutions already make. The substrate - compounds: every new tool integration, every new methodology - insight, and every new question coverage makes everything that was - already there more useful. -

-

- And when the AI agents of 2030 arrive — substantially more capable - than today's, better at tool selection, better at composing - methodology, better at reasoning over their own outputs — they - find a substrate already built for them. Open, calibrated, - audit-trail-native, and free at the point of use. The capability - becomes immediately deployable on consequential questions because - the infrastructure is already there. -

-
- -
-

Honest caveats and open questions

-

- - The autonomous-improvement language is aspirational. - {" "} - Today's AI systems can iterate variants, tune hyperparameters, and - generate model code, but autonomous improvement of methodology - without sustained human guidance is years out. Honest framing: - open human-in-the-loop improvement of AI ensembles on a - transparent substrate, with the substrate compounding the - human-and-AI work over time. We build collaborative compounding - before autonomous self-improvement. -

-

- The one-year launch starts narrower. The platform - launches with a smaller agent ensemble, fewer questions, a - narrower research program, and a working but incomplete substrate. - Building toward the mature state takes real research and - engineering investment over years. The vision is the north star; - the early stages look more like a focused shipping organization - than a complete forecasting layer. -

-

- Open infrastructure depends on adoption.{" "} - Organizations need workflows that integrate open predictions into - real decisions. Building that institutional muscle across policy - shops, agencies, and other users takes years. Farness can lead the - category and still has to earn adoption one workflow at a time. -

-

- - Regulatory ambiguity if forecasts become market-moving. - {" "} - Farness publishes forecasts rather than trades, which avoids most - prediction-market regulatory complexity. If open forecasts become - widely consumed by financial markets, the SEC or CFTC may still - take interest in disclosure rules. Probably solvable through - precedents like Federal Reserve forecast publication, but warrants - real legal review. -

-

- The PolicyEngine brand transition. PolicyEngine - continues operationally unchanged, but funders, board, and - partners need to be brought along on the umbrella structure. - Existing grants are to PolicyEngine via PSL Foundation fiscal - sponsorship; the cleanest path is incorporating Farness Foundation - as the new 501(c)(3) and graduating PolicyEngine into it from PSL. - Donor consent process is straightforward; the communications work - requires care. -

-

- - The "Farness" name has multiple uses to disambiguate. - {" "} - Farness Foundation (the org), Farness (the open-predictions - platform — the flagship), Farness Decisions (the personal decision - tool). Naming hierarchy needs to be settled before any public - launch. -

-
- -
-

The shape of the work, in priority order

-
    -
  • - Incorporate Farness Foundation as a 501(c)(3) - upon graduating PolicyEngine from PSL fiscal sponsorship. Use - fiscal sponsorship during the application period. -
  • -
  • - Settle the naming hierarchy publicly and - internally before any launch announcement: foundation, platform, - PolicyEngine, Microplex, Decisions, and the shared law/data - substrate. -
  • -
  • - Compose the board with AI-safety, policy, and - technical credibility — names that signal what the foundation is - to the funder base it most needs to reach. -
  • -
  • - Publish the manifesto in its public form (this - document, rewritten for an external audience) with an accompanying - funder one-pager and FAQ. -
  • -
  • - Ship the first visible version of the platform — - Manifold-hosted forecast experiments with full agent telemetry, - a small set of government-data-anchored questions with published - calibration, and the agent traces openly available. -
  • -
  • - Move Microplex into PolicyEngine as the - Enhanced CPS replacement, with the methodology and synthesizer - code published openly. -
  • -
  • - Pre-flight major funder conversations — - Coefficient Giving, SFF, Schmidt, Anthropic-alumni outreach, - Arnold Ventures MAI — with the manifesto and one-pager in hand. -
  • -
  • - Coordinate the encoded-law substrate on a shared - roadmap for law access and government-data architecture. -
  • -
-

- - Read the thesis → - -

-
-
- -
-

- Working synthesis by Max Ghenis. - Living document. Not for distribution. -

-
-
-
- ); -} diff --git a/site/src/components/FarnessLogo.tsx b/site/src/components/BrierLogo.tsx similarity index 80% rename from site/src/components/FarnessLogo.tsx rename to site/src/components/BrierLogo.tsx index 4862345..333e764 100644 --- a/site/src/components/FarnessLogo.tsx +++ b/site/src/components/BrierLogo.tsx @@ -1,12 +1,12 @@ /** - * Farness logo mark — "The Vanishing Point" + * Brier logo mark — "The Vanishing Point" * * Two perspective lines converging toward a luminous focal point. * Updated for the "Clear Horizon" palette: * - Lines use Mist-400 (#9FB6C6) — visible on light backgrounds * - Dot uses Rose-600 (#A94E80) — brand accent */ -export function FarnessLogoMark({ +export function BrierLogoMark({ size = 28, className = "", }: { @@ -21,7 +21,7 @@ export function FarnessLogoMark({ fill="none" xmlns="http://www.w3.org/2000/svg" className={className} - aria-label="Farness" + aria-label="Thesis" > {/* Glow halo — rose at low opacity */} - - farness + + thesis ); } diff --git a/site/src/components/DemoVideo.tsx b/site/src/components/DemoVideo.tsx index 8886390..cecca50 100644 --- a/site/src/components/DemoVideo.tsx +++ b/site/src/components/DemoVideo.tsx @@ -4,8 +4,8 @@ export function DemoVideo({ caption?: string; }) { const assetRev = process.env.NEXT_PUBLIC_SITE_ASSET_REV || "dev"; - const videoSrc = `/demo/farness-demo.mp4?v=${assetRev}`; - const posterSrc = `/demo/farness-demo-poster.png?v=${assetRev}`; + const videoSrc = `/demo/brier-demo.mp4?v=${assetRev}`; + const posterSrc = `/demo/brier-demo-poster.png?v=${assetRev}`; return (
@@ -19,7 +19,7 @@ export function DemoVideo({ playsInline poster={posterSrc} preload="metadata" - aria-label="End-to-end farness workflow demo for Codex" + aria-label="End-to-end brier workflow demo for Codex" > diff --git a/site/src/components/Header.tsx b/site/src/components/Header.tsx index 6382132..b7186bd 100644 --- a/site/src/components/Header.tsx +++ b/site/src/components/Header.tsx @@ -1,10 +1,10 @@ import Link from "next/link"; -import { FarnessLogoMark } from "./FarnessLogo"; +import { BrierLogoMark } from "./BrierLogo"; export function Header({ activePage, }: { - activePage?: "docs" | "thesis" | "paper" | "forecasts"; + activePage?: "docs" | "thesis" | "paper" | "forecasts" | "about"; }) { return (
- - farness + + thesis

diff --git a/site/src/components/MarketRuntime.tsx b/site/src/components/MarketRuntime.tsx index 08139e6..d9be4d4 100644 --- a/site/src/components/MarketRuntime.tsx +++ b/site/src/components/MarketRuntime.tsx @@ -598,7 +598,7 @@ function Spinner() { } function resolveApiBase() { - const configured = process.env.NEXT_PUBLIC_FARNESS_API_BASE_URL?.replace( + const configured = process.env.NEXT_PUBLIC_BRIER_API_BASE_URL?.replace( /\/$/, "", ); @@ -609,7 +609,7 @@ function resolveApiBase() { ) { return "http://127.0.0.1:3002"; } - return "https://api.farness.ai"; + return "https://api.thesisinstitute.org"; } function parseEventData(event: Event) { diff --git a/site/src/data/markets.ts b/site/src/data/markets.ts index bbb12d4..da0d417 100644 --- a/site/src/data/markets.ts +++ b/site/src/data/markets.ts @@ -386,7 +386,7 @@ export const MARKETS: Market[] = [ }, { kind: "text", - text: "The FOMC SEP median, CBO projection, and the farness structural-VAR all cluster between 4.27 and 4.40 for full-year 2026. December prints tend to run very slightly below the annual mean in expansions due to seasonal adjustment behavior in the household survey.", + text: "The FOMC SEP median, CBO projection, and the brier structural-VAR all cluster between 4.27 and 4.40 for full-year 2026. 
December prints tend to run very slightly below the annual mean in expansions due to seasonal adjustment behavior in the household survey.", }, { kind: "heading", text: "Risk distribution" }, { @@ -908,8 +908,8 @@ export const MARKETS: Market[] = [ { kind: "heading", text: "Calibration layer" }, { kind: "tool", - tool: "farness.calibration", - call: 'farness.calibration.lookup({ domain: "policyengine_budget_scores", policy_area: "ctc", outcome: "federal_budget_cost" })', + tool: "brier.calibration", + call: 'brier.calibration.lookup({ domain: "policyengine_budget_scores", policy_area: "ctc", outcome: "federal_budget_cost" })', result: "{ raw_to_final_ratio: 1.04, additive_billions: 3.5, queued_uncertainty_multiplier: 1.4 }", }, @@ -971,8 +971,8 @@ export const MARKETS: Market[] = [ }, { kind: "tool", - tool: "farness.calibration", - call: 'farness.calibration.lookup({ domain: "policyengine_budget_scores", policy_area: "ctc", target: "irs_soi_outlays" })', + tool: "brier.calibration", + call: 'brier.calibration.lookup({ domain: "policyengine_budget_scores", policy_area: "ctc", target: "irs_soi_outlays" })', result: "{ ratio: 1.02, additive_billions: 0.5, widened_for_reporting_lag: true }", }, diff --git a/farness/assets/skills/codex/SKILL.md b/skills/brier/SKILL.md similarity index 86% rename from farness/assets/skills/codex/SKILL.md rename to skills/brier/SKILL.md index 935113c..585759b 100644 --- a/farness/assets/skills/codex/SKILL.md +++ b/skills/brier/SKILL.md @@ -1,13 +1,13 @@ --- -name: farness +name: brier description: Use when the user wants advice or a decision analysis rather than pure implementation, especially for prompts like "should I", "should we", "which is better", "is it worth it", or "what would you do" about architecture, product, hiring, strategy, or career choices. Reframe the decision as explicit KPIs, expanded options, reference classes, disconfirming evidence, numeric forecasts, and a review date. 
Do not use for straightforward debugging, factual explanation, or routine coding tasks. --- -# Farness +# Brier Use this skill to turn vague decisions into forecastable choices. -Prefer the `farness` MCP server when available. It gives you persistent tools, resources, and prompts for the workflow. +Prefer the `brier` MCP server when available. It gives you persistent tools, resources, and prompts for the workflow. ## Trigger Conditions @@ -37,7 +37,7 @@ Do not use it for: ## Workflow 1. If there is no stored decision yet, call `create_decision`. -2. Use `farness://framework` if you need the canonical sequence. +2. Use `brier://framework` if you need the canonical sequence. 3. Structure the analysis around: - KPI definition - KPI resolution metadata @@ -57,8 +57,8 @@ Do not use it for: 5. If the user is revisiting the decision, use `get_decision` and `review_decision`. 6. If outcomes are now known, call `score_decision` to update calibration. 7. If the user wants to externalize a forecast into a prediction market, draft it first: - - Use `farness forecast-draft --output forecast-pack.json` for stored decisions. - - Use `farness forecast-draft "" --initial-prob <1-99> --resolution-date YYYY-MM-DD --output forecast-pack.json` for standalone policy questions. + - Use `brier forecast-draft --output forecast-pack.json` for stored decisions. + - Use `brier forecast-draft "" --initial-prob <1-99> --resolution-date YYYY-MM-DD --output forecast-pack.json` for standalone policy questions. - Treat forecast drafts as review artifacts only; do not publish questions or place bets unless the user explicitly asks. ## Working Rules @@ -74,10 +74,10 @@ Do not use it for: ## Fallback -If the `farness` MCP server is not connected, tell the user to add it with: +If the `brier` MCP server is not connected, tell the user to add it with: ```bash -farness setup codex +brier setup codex ``` Then continue with the same workflow once the server is available. 
diff --git a/skills/brier/agents/openai.yaml b/skills/brier/agents/openai.yaml new file mode 100644 index 0000000..f68332f --- /dev/null +++ b/skills/brier/agents/openai.yaml @@ -0,0 +1,13 @@ +interface: + display_name: "Brier" + short_description: "Turn decisions into tracked forecasts with MCP-backed structure" + default_prompt: "Use $brier to turn this decision into explicit KPIs, options, forecasts, and a review date, then persist it with the brier MCP tools." + +dependencies: + tools: + - type: "mcp" + value: "brier" + description: "Local brier MCP server" + +policy: + allow_implicit_invocation: true diff --git a/skills/farness/agents/openai.yaml b/skills/farness/agents/openai.yaml deleted file mode 100644 index b189b5f..0000000 --- a/skills/farness/agents/openai.yaml +++ /dev/null @@ -1,13 +0,0 @@ -interface: - display_name: "Farness" - short_description: "Turn decisions into tracked forecasts with MCP-backed structure" - default_prompt: "Use $farness to turn this decision into explicit KPIs, options, forecasts, and a review date, then persist it with the farness MCP tools." 
- -dependencies: - tools: - - type: "mcp" - value: "farness" - description: "Local farness MCP server" - -policy: - allow_implicit_invocation: true diff --git a/tests/test_agent_setup.py b/tests/test_agent_setup.py index 5ee02e6..48c5659 100644 --- a/tests/test_agent_setup.py +++ b/tests/test_agent_setup.py @@ -7,7 +7,7 @@ import pytest -from farness.agent_setup import ( +from brier.agent_setup import ( inspect_agent_setup, manual_setup_command, remove_agent_setup, @@ -19,8 +19,8 @@ def test_manual_setup_command_for_claude(): command = manual_setup_command("claude", "/tmp/venv/bin/python") assert ( - command == "claude mcp add --scope user farness -- " - "/tmp/venv/bin/python -m farness.mcp_server" + command == "claude mcp add --scope user brier -- " + "/tmp/venv/bin/python -m brier.mcp_server" ) @@ -30,10 +30,10 @@ def test_setup_agent_skips_add_when_server_exists(monkeypatch, tmp_path): skill_path.write_text("skill") monkeypatch.setattr( - "farness.agent_setup.install_skill", lambda *args, **kwargs: skill_path + "brier.agent_setup.install_skill", lambda *args, **kwargs: skill_path ) monkeypatch.setattr( - "farness.agent_setup.shutil.which", lambda name: f"/usr/bin/{name}" + "brier.agent_setup.shutil.which", lambda name: f"/usr/bin/{name}" ) calls = [] @@ -42,12 +42,12 @@ def fake_run(cmd, capture_output, text, check): calls.append(cmd) return SimpleNamespace(returncode=0, stdout="", stderr="") - monkeypatch.setattr("farness.agent_setup.subprocess.run", fake_run) + monkeypatch.setattr("brier.agent_setup.subprocess.run", fake_run) result = setup_agent("codex", python_bin="/tmp/python") assert result.mcp_already_configured is True - assert calls == [["codex", "mcp", "get", "farness"]] + assert calls == [["codex", "mcp", "get", "brier"]] def test_setup_agent_adds_missing_server(monkeypatch, tmp_path): @@ -56,10 +56,10 @@ def test_setup_agent_adds_missing_server(monkeypatch, tmp_path): skill_path.write_text("skill") monkeypatch.setattr( - 
"farness.agent_setup.install_skill", lambda *args, **kwargs: skill_path + "brier.agent_setup.install_skill", lambda *args, **kwargs: skill_path ) monkeypatch.setattr( - "farness.agent_setup.shutil.which", lambda name: f"/usr/bin/{name}" + "brier.agent_setup.shutil.which", lambda name: f"/usr/bin/{name}" ) calls = [] @@ -70,24 +70,24 @@ def fake_run(cmd, capture_output, text, check): return SimpleNamespace(returncode=1, stdout="", stderr="missing") return SimpleNamespace(returncode=0, stdout="", stderr="") - monkeypatch.setattr("farness.agent_setup.subprocess.run", fake_run) + monkeypatch.setattr("brier.agent_setup.subprocess.run", fake_run) result = setup_agent("claude", python_bin="/tmp/python") assert result.mcp_already_configured is False assert calls == [ - ["claude", "mcp", "get", "farness"], + ["claude", "mcp", "get", "brier"], [ "claude", "mcp", "add", "--scope", "user", - "farness", + "brier", "--", "/tmp/python", "-m", - "farness.mcp_server", + "brier.mcp_server", ], ] @@ -98,34 +98,34 @@ def test_setup_agent_reports_missing_cli(monkeypatch, tmp_path): skill_path.write_text("skill") monkeypatch.setattr( - "farness.agent_setup.install_skill", lambda *args, **kwargs: skill_path + "brier.agent_setup.install_skill", lambda *args, **kwargs: skill_path ) - monkeypatch.setattr("farness.agent_setup.shutil.which", lambda name: None) + monkeypatch.setattr("brier.agent_setup.shutil.which", lambda name: None) with pytest.raises(RuntimeError) as excinfo: setup_agent("codex", python_bin="/tmp/python") message = str(excinfo.value) assert "Installed the codex skill" in message - assert "codex mcp add farness -- /tmp/python -m farness.mcp_server" in message + assert "codex mcp add brier -- /tmp/python -m brier.mcp_server" in message def test_inspect_agent_setup_uses_codex_home(monkeypatch, tmp_path): codex_home = tmp_path / "codex-home" - skill_path = codex_home / "skills" / "farness" / "SKILL.md" + skill_path = codex_home / "skills" / "brier" / "SKILL.md" 
skill_path.parent.mkdir(parents=True) skill_path.write_text("skill") monkeypatch.setenv("CODEX_HOME", str(codex_home)) monkeypatch.setattr( - "farness.agent_setup.shutil.which", lambda name: f"/usr/bin/{name}" + "brier.agent_setup.shutil.which", lambda name: f"/usr/bin/{name}" ) def fake_run(cmd, capture_output, text, check): - assert cmd == ["codex", "mcp", "get", "farness"] + assert cmd == ["codex", "mcp", "get", "brier"] return SimpleNamespace(returncode=0, stdout="", stderr="") - monkeypatch.setattr("farness.agent_setup.subprocess.run", fake_run) + monkeypatch.setattr("brier.agent_setup.subprocess.run", fake_run) result = inspect_agent_setup("codex", python_bin="/tmp/python") @@ -139,12 +139,12 @@ def test_inspect_agent_setup_skips_mcp_check_without_cli(monkeypatch, tmp_path): target = tmp_path / "claude-skill" target.mkdir(parents=True) - monkeypatch.setattr("farness.agent_setup.shutil.which", lambda name: None) + monkeypatch.setattr("brier.agent_setup.shutil.which", lambda name: None) def fail_run(*args, **kwargs): # pragma: no cover raise AssertionError("subprocess.run should not be called when CLI is missing") - monkeypatch.setattr("farness.agent_setup.subprocess.run", fail_run) + monkeypatch.setattr("brier.agent_setup.subprocess.run", fail_run) result = inspect_agent_setup("claude", target_dir=str(target), python_bin="/tmp/python") @@ -156,32 +156,32 @@ def fail_run(*args, **kwargs): # pragma: no cover def test_repair_agent_setup_rewrites_modified_skill(monkeypatch, tmp_path): - target = tmp_path / "codex-home" / "skills" / "farness" + target = tmp_path / "codex-home" / "skills" / "brier" target.mkdir(parents=True) skill_path = target / "SKILL.md" skill_path.write_text("modified") monkeypatch.setenv("CODEX_HOME", str(tmp_path / "codex-home")) monkeypatch.setattr( - "farness.agent_setup.shutil.which", lambda name: f"/usr/bin/{name}" + "brier.agent_setup.shutil.which", lambda name: f"/usr/bin/{name}" ) calls = [] def fake_run(cmd, capture_output, text, 
check): calls.append(cmd) - if cmd == ["codex", "mcp", "get", "farness"]: + if cmd == ["codex", "mcp", "get", "brier"]: return SimpleNamespace(returncode=0, stdout="", stderr="") return SimpleNamespace(returncode=0, stdout="", stderr="") - monkeypatch.setattr("farness.agent_setup.subprocess.run", fake_run) + monkeypatch.setattr("brier.agent_setup.subprocess.run", fake_run) result = repair_agent_setup("codex", python_bin="/tmp/python") assert result.skill_action == "updated" assert result.mcp_action == "unchanged" assert "Use this skill to turn vague decisions into forecastable choices." in skill_path.read_text() - assert calls == [["codex", "mcp", "get", "farness"]] + assert calls == [["codex", "mcp", "get", "brier"]] def test_remove_agent_setup_removes_skill_and_mcp(monkeypatch, tmp_path): @@ -191,20 +191,20 @@ def test_remove_agent_setup_removes_skill_and_mcp(monkeypatch, tmp_path): skill_path.write_text("skill") monkeypatch.setattr( - "farness.agent_setup.shutil.which", lambda name: f"/usr/bin/{name}" + "brier.agent_setup.shutil.which", lambda name: f"/usr/bin/{name}" ) calls = [] def fake_run(cmd, capture_output, text, check): calls.append(cmd) - if cmd == ["claude", "mcp", "get", "farness"]: + if cmd == ["claude", "mcp", "get", "brier"]: return SimpleNamespace(returncode=0, stdout="", stderr="") - if cmd == ["claude", "mcp", "remove", "farness"]: + if cmd == ["claude", "mcp", "remove", "brier"]: return SimpleNamespace(returncode=0, stdout="", stderr="") raise AssertionError(f"Unexpected command: {cmd}") - monkeypatch.setattr("farness.agent_setup.subprocess.run", fake_run) + monkeypatch.setattr("brier.agent_setup.subprocess.run", fake_run) result = remove_agent_setup("claude", target_dir=str(target)) @@ -212,6 +212,6 @@ def fake_run(cmd, capture_output, text, check): assert result.mcp_removed is True assert not skill_path.exists() assert calls == [ - ["claude", "mcp", "get", "farness"], - ["claude", "mcp", "remove", "farness"], + ["claude", "mcp", "get", 
"brier"], + ["claude", "mcp", "remove", "brier"], ] diff --git a/tests/test_calibration.py b/tests/test_calibration.py index 44b75ea..ab89c13 100644 --- a/tests/test_calibration.py +++ b/tests/test_calibration.py @@ -3,8 +3,8 @@ import pytest from datetime import datetime -from farness.framework import Decision, KPI, Option, Forecast -from farness.calibration import CalibrationTracker, ForecastScore +from brier.framework import Decision, KPI, Option, Forecast +from brier.calibration import CalibrationTracker, ForecastScore class TestForecastScore: diff --git a/tests/test_cli.py b/tests/test_cli.py index e23022f..c5f625b 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -6,10 +6,10 @@ import pytest -from farness.cli import main -from farness.framework import Decision, KPI -from farness.skills import default_skill_dir -from farness.storage import DecisionStore +from brier.cli import main +from brier.framework import Decision, KPI +from brier.skills import default_skill_dir +from brier.storage import DecisionStore @pytest.fixture @@ -21,12 +21,12 @@ def temp_store(): class TestNewCommand: - """Tests for `farness new` CLI command.""" + """Tests for `brier new` CLI command.""" def test_new_creates_decision(self, temp_store): - """farness new 'question' should create and save a decision.""" - with patch("farness.cli.DecisionStore", return_value=temp_store): - with patch("sys.argv", ["farness", "new", "Should I take this job?"]): + """brier new 'question' should create and save a decision.""" + with patch("brier.cli.DecisionStore", return_value=temp_store): + with patch("sys.argv", ["brier", "new", "Should I take this job?"]): main() decisions = temp_store.list_all() @@ -34,11 +34,11 @@ def test_new_creates_decision(self, temp_store): assert decisions[0].question == "Should I take this job?" 
def test_new_with_context(self, temp_store): - """farness new 'question' --context 'details' should include context.""" - with patch("farness.cli.DecisionStore", return_value=temp_store): + """brier new 'question' --context 'details' should include context.""" + with patch("brier.cli.DecisionStore", return_value=temp_store): with patch( "sys.argv", - ["farness", "new", "Which city?", "--context", "Considering SF vs NYC"], + ["brier", "new", "Which city?", "--context", "Considering SF vs NYC"], ): main() @@ -47,9 +47,9 @@ def test_new_with_context(self, temp_store): assert decisions[0].context == "Considering SF vs NYC" def test_new_prints_id(self, temp_store, capsys): - """farness new should print the new decision ID.""" - with patch("farness.cli.DecisionStore", return_value=temp_store): - with patch("sys.argv", ["farness", "new", "Test question"]): + """brier new should print the new decision ID.""" + with patch("brier.cli.DecisionStore", return_value=temp_store): + with patch("sys.argv", ["brier", "new", "Test question"]): main() output = capsys.readouterr().out @@ -58,17 +58,17 @@ def test_new_prints_id(self, temp_store, capsys): assert decisions[0].id[:8] in output def test_new_without_question_fails(self, capsys): - """farness new without a question should fail.""" - with patch("sys.argv", ["farness", "new"]): + """brier new without a question should fail.""" + with patch("sys.argv", ["brier", "new"]): with pytest.raises(SystemExit): main() - def test_new_respects_farness_store_env(self, monkeypatch, tmp_path): - """CLI commands should honor FARNESS_STORE_PATH when set.""" + def test_new_respects_brier_store_env(self, monkeypatch, tmp_path): + """CLI commands should honor BRIER_STORE_PATH when set.""" store_path = tmp_path / "env-store.jsonl" - monkeypatch.setenv("FARNESS_STORE_PATH", str(store_path)) + monkeypatch.setenv("BRIER_STORE_PATH", str(store_path)) - with patch("sys.argv", ["farness", "new", "Env-backed decision"]): + with patch("sys.argv", ["brier", 
"new", "Env-backed decision"]): main() store = DecisionStore(store_path) @@ -81,13 +81,13 @@ class TestShowWithPrefix: """Tests for prefix matching in show command.""" def test_show_finds_by_prefix(self, temp_store, capsys): - """farness show should find decision by ID prefix.""" + """brier show should find decision by ID prefix.""" d = Decision(question="Test decision for show") temp_store.save(d) prefix = d.id[:8] - with patch("farness.cli.DecisionStore", return_value=temp_store): - with patch("sys.argv", ["farness", "show", prefix]): + with patch("brier.cli.DecisionStore", return_value=temp_store): + with patch("sys.argv", ["brier", "show", prefix]): main() output = capsys.readouterr().out @@ -109,8 +109,8 @@ def test_show_prints_kpi_resolution_metadata(self, temp_store, capsys): ) temp_store.save(d) - with patch("farness.cli.DecisionStore", return_value=temp_store): - with patch("sys.argv", ["farness", "show", d.id[:8]]): + with patch("brier.cli.DecisionStore", return_value=temp_store): + with patch("sys.argv", ["brier", "show", d.id[:8]]): main() output = capsys.readouterr().out @@ -124,11 +124,11 @@ class TestInstallSkillCommand: """Tests for the packaged skill installer.""" def test_install_skill_writes_codex_skill(self, tmp_path, capsys): - """farness install-skill codex should create a SKILL.md file.""" + """brier install-skill codex should create a SKILL.md file.""" target = tmp_path / "codex-skill" with patch( - "sys.argv", ["farness", "install-skill", "codex", "--target", str(target)] + "sys.argv", ["brier", "install-skill", "codex", "--target", str(target)] ): main() @@ -148,7 +148,7 @@ def test_install_skill_refuses_overwrite_without_force(self, tmp_path, capsys): (target / "SKILL.md").write_text("different") with patch( - "sys.argv", ["farness", "install-skill", "claude", "--target", str(target)] + "sys.argv", ["brier", "install-skill", "claude", "--target", str(target)] ): with pytest.raises(SystemExit): main() @@ -165,11 +165,11 @@ def 
test_install_skill_force_overwrites(self, tmp_path): with patch( "sys.argv", - ["farness", "install-skill", "claude", "--target", str(target), "--force"], + ["brier", "install-skill", "claude", "--target", str(target), "--force"], ): main() - assert "Prefer the local `farness` MCP server" in skill_path.read_text() + assert "Prefer the local `brier` MCP server" in skill_path.read_text() def test_codex_default_skill_dir_respects_codex_home(self, monkeypatch, tmp_path): """Default Codex install path should use CODEX_HOME when it is set.""" @@ -177,7 +177,7 @@ def test_codex_default_skill_dir_respects_codex_home(self, monkeypatch, tmp_path skill_dir = default_skill_dir("codex") - assert skill_dir == tmp_path / "codex-home" / "skills" / "farness" + assert skill_dir == tmp_path / "codex-home" / "skills" / "brier" class TestSetupCommand: @@ -186,39 +186,39 @@ class TestSetupCommand: def test_setup_prints_success(self, capsys): result = SimpleNamespace( skill_path="/tmp/skill/SKILL.md", - mcp_server_name="farness", + mcp_server_name="brier", mcp_already_configured=False, agent_cli="codex", python_bin="/tmp/python", ) - with patch("farness.cli.setup_agent", return_value=result): - with patch("sys.argv", ["farness", "setup", "codex"]): + with patch("brier.cli.setup_agent", return_value=result): + with patch("sys.argv", ["brier", "setup", "codex"]): main() output = capsys.readouterr().out assert "Installed codex skill at /tmp/skill/SKILL.md" in output - assert "Configured MCP server `farness` in codex using /tmp/python." in output + assert "Configured MCP server `brier` in codex using /tmp/python." 
in output def test_setup_reports_existing_server(self, capsys): result = SimpleNamespace( skill_path="/tmp/skill/SKILL.md", - mcp_server_name="farness", + mcp_server_name="brier", mcp_already_configured=True, agent_cli="claude", python_bin="/tmp/python", ) - with patch("farness.cli.setup_agent", return_value=result): - with patch("sys.argv", ["farness", "setup", "claude"]): + with patch("brier.cli.setup_agent", return_value=result): + with patch("sys.argv", ["brier", "setup", "claude"]): main() output = capsys.readouterr().out - assert "MCP server `farness` is already configured in claude." in output + assert "MCP server `brier` is already configured in claude." in output def test_setup_exits_on_runtime_error(self, capsys): - with patch("farness.cli.setup_agent", side_effect=RuntimeError("boom")): - with patch("sys.argv", ["farness", "setup", "codex"]): + with patch("brier.cli.setup_agent", side_effect=RuntimeError("boom")): + with patch("sys.argv", ["brier", "setup", "codex"]): with pytest.raises(SystemExit): main() @@ -235,17 +235,17 @@ def test_uninstall_reports_removed_skill_and_mcp(self, capsys): cli_path="/usr/local/bin/codex", skill_path="/tmp/skill/SKILL.md", skill_removed=True, - mcp_server_name="farness", + mcp_server_name="brier", mcp_removed=True, ) - with patch("farness.cli.remove_agent_setup", return_value=result): - with patch("sys.argv", ["farness", "uninstall", "codex"]): + with patch("brier.cli.remove_agent_setup", return_value=result): + with patch("sys.argv", ["brier", "uninstall", "codex"]): main() output = capsys.readouterr().out assert "Removed codex skill at /tmp/skill/SKILL.md" in output - assert "Removed MCP server `farness` from codex." in output + assert "Removed MCP server `brier` from codex." 
in output def test_uninstall_keep_mcp_reports_retained_server(self, capsys): result = SimpleNamespace( @@ -253,17 +253,17 @@ def test_uninstall_keep_mcp_reports_retained_server(self, capsys): cli_path="/usr/local/bin/claude", skill_path="/tmp/skill/SKILL.md", skill_removed=False, - mcp_server_name="farness", + mcp_server_name="brier", mcp_removed=False, ) - with patch("farness.cli.remove_agent_setup", return_value=result): - with patch("sys.argv", ["farness", "uninstall", "claude", "--keep-mcp"]): + with patch("brier.cli.remove_agent_setup", return_value=result): + with patch("sys.argv", ["brier", "uninstall", "claude", "--keep-mcp"]): main() output = capsys.readouterr().out assert "No claude skill found" in output - assert "Left MCP server `farness` configured." in output + assert "Left MCP server `brier` configured." in output class TestDoctorCommand: @@ -276,13 +276,13 @@ def test_doctor_reports_ready_status(self, capsys): skill_path="/tmp/skill/SKILL.md", skill_state="installed", skill_installed=True, - mcp_server_name="farness", + mcp_server_name="brier", mcp_configured=True, - manual_command="codex mcp add farness -- /tmp/python -m farness.mcp_server", + manual_command="codex mcp add brier -- /tmp/python -m brier.mcp_server", ) - with patch("farness.cli.inspect_agent_setup", return_value=result): - with patch("sys.argv", ["farness", "doctor", "codex"]): + with patch("brier.cli.inspect_agent_setup", return_value=result): + with patch("sys.argv", ["brier", "doctor", "codex"]): main() output = capsys.readouterr().out @@ -297,18 +297,18 @@ def test_doctor_recommends_setup_when_missing(self, capsys): skill_path="/tmp/skill/SKILL.md", skill_state="missing", skill_installed=False, - mcp_server_name="farness", + mcp_server_name="brier", mcp_configured=False, - manual_command="claude mcp add --scope user farness -- /tmp/python -m farness.mcp_server", + manual_command="claude mcp add --scope user brier -- /tmp/python -m brier.mcp_server", ) - with 
patch("farness.cli.inspect_agent_setup", return_value=result): - with patch("sys.argv", ["farness", "doctor", "claude"]): + with patch("brier.cli.inspect_agent_setup", return_value=result): + with patch("sys.argv", ["brier", "doctor", "claude"]): main() output = capsys.readouterr().out assert "Recommended next step:" in output - assert "farness setup claude" in output + assert "brier setup claude" in output def test_doctor_fix_reports_actions(self, capsys): repaired = SimpleNamespace( @@ -316,7 +316,7 @@ def test_doctor_fix_reports_actions(self, capsys): cli_path="/usr/local/bin/codex", skill_path="/tmp/skill/SKILL.md", skill_action="updated", - mcp_server_name="farness", + mcp_server_name="brier", mcp_action="configured", python_bin="/tmp/python", ) @@ -326,14 +326,14 @@ def test_doctor_fix_reports_actions(self, capsys): skill_path="/tmp/skill/SKILL.md", skill_state="installed", skill_installed=True, - mcp_server_name="farness", + mcp_server_name="brier", mcp_configured=True, - manual_command="codex mcp add farness -- /tmp/python -m farness.mcp_server", + manual_command="codex mcp add brier -- /tmp/python -m brier.mcp_server", ) - with patch("farness.cli.repair_agent_setup", return_value=repaired): - with patch("farness.cli.inspect_agent_setup", return_value=result): - with patch("sys.argv", ["farness", "doctor", "codex", "--fix"]): + with patch("brier.cli.repair_agent_setup", return_value=repaired): + with patch("brier.cli.inspect_agent_setup", return_value=result): + with patch("sys.argv", ["brier", "doctor", "codex", "--fix"]): main() output = capsys.readouterr().out diff --git a/tests/test_decision_usefulness.py b/tests/test_decision_usefulness.py index ed5ea54..1bca6c3 100644 --- a/tests/test_decision_usefulness.py +++ b/tests/test_decision_usefulness.py @@ -2,7 +2,7 @@ import json -from farness.experiments import decision_usefulness as du +from brier.experiments import decision_usefulness as du def test_seed_cases_are_well_formed(): @@ -16,7 +16,7 @@ def 
test_seed_cases_are_well_formed(): assert case.scenario -def test_prompt_generation_covers_forecast_only_and_farness(): +def test_prompt_generation_covers_forecast_only_and_brier(): """Prompts should reflect the intended mechanism split.""" case = du.get_decision_usefulness_case("auth_rewrite") assert case is not None @@ -25,9 +25,9 @@ def test_prompt_generation_covers_forecast_only_and_farness(): assert "80% confidence intervals" in forecast_only assert "Do not explicitly cite cognitive biases" in forecast_only - farness = du.generate_decision_usefulness_prompt(case, "farness") - assert "outside-view base rates" in farness - assert "review date" in farness + brier = du.generate_decision_usefulness_prompt(case, "farness") + assert "outside-view base rates" in brier + assert "review date" in brier format_control = du.generate_decision_usefulness_prompt(case, "format_control") assert "Goal" in format_control @@ -122,7 +122,7 @@ def test_judge_pairwise_decision_usefulness_maps_winner(monkeypatch): artifact_a = du.DecisionUsefulnessArtifact( case_id=case.id, - condition="farness", + condition="brier", model="gpt-5.4", run_number=1, prompt="p1", @@ -179,7 +179,7 @@ def fake_call_llm(prompt, model, temperature, max_tokens): representation="normalized", ) - assert result.winner_condition in {"farness", "naive"} + assert result.winner_condition in {"brier", "naive"} assert result.winner_condition == result.left_condition assert result.confidence == 81 assert result.scores_a["kpi_clarity"] == 5 @@ -266,7 +266,7 @@ def test_judge_pairwise_critique_survival_maps_less_undermined(monkeypatch): artifact_a = du.DecisionUsefulnessArtifact( case_id=case.id, - condition="farness", + condition="brier", model="gpt-5.4", run_number=1, prompt="p1", @@ -317,7 +317,7 @@ def fake_call_llm(prompt, model, temperature, max_tokens): assert prompts assert "implementation fragility" in prompts[0] assert "opportunity cost" in prompts[0] - assert result.less_undermined_condition in {"farness", 
"forecast_only"} + assert result.less_undermined_condition in {"brier", "forecast_only"} assert result.confidence == 84 assert "fragility" in result.most_damaging_critique_b @@ -330,13 +330,13 @@ def test_summarize_decision_usefulness_judging_counts_wins(): source_model="gpt-5.4", judge_model="claude-opus-4-6", run_number=1, - comparison="farness_vs_forecast_only", + comparison="brier_vs_forecast_only", representation="normalized", - condition_a="farness", + condition_a="brier", condition_b="forecast_only", - left_condition="farness", + left_condition="brier", right_condition="forecast_only", - winner_condition="farness", + winner_condition="brier", confidence=80, rationale="", scores_a={}, @@ -347,11 +347,11 @@ def test_summarize_decision_usefulness_judging_counts_wins(): source_model="gpt-5.4", judge_model="claude-opus-4-6", run_number=2, - comparison="farness_vs_forecast_only", + comparison="brier_vs_forecast_only", representation="normalized", - condition_a="farness", + condition_a="brier", condition_b="forecast_only", - left_condition="farness", + left_condition="brier", right_condition="forecast_only", winner_condition="forecast_only", confidence=70, @@ -366,11 +366,11 @@ def test_summarize_decision_usefulness_judging_counts_wins(): source_model="gpt-5.4", judge_model="claude-opus-4-6", run_number=1, - comparison="farness_vs_forecast_only", + comparison="brier_vs_forecast_only", representation="normalized", - condition_a="farness", + condition_a="brier", condition_b="forecast_only", - left_condition="farness", + left_condition="brier", right_condition="forecast_only", more_serious_omission_condition="forecast_only", confidence=65, @@ -385,13 +385,13 @@ def test_summarize_decision_usefulness_judging_counts_wins(): source_model="gpt-5.4", judge_model="claude-opus-4-6", run_number=1, - comparison="farness_vs_forecast_only", + comparison="brier_vs_forecast_only", representation="normalized", - condition_a="farness", + condition_a="brier", 
condition_b="forecast_only", - left_condition="farness", + left_condition="brier", right_condition="forecast_only", - less_undermined_condition="farness", + less_undermined_condition="brier", confidence=75, rationale="", most_damaging_critique_a="timing", @@ -404,14 +404,14 @@ def test_summarize_decision_usefulness_judging_counts_wins(): omission_results, critique_results, ) - assert summary["utility"]["normalized"]["farness_vs_forecast_only"]["wins"]["farness"] == 1 - assert summary["utility"]["normalized"]["farness_vs_forecast_only"]["wins"]["forecast_only"] == 1 + assert summary["utility"]["normalized"]["brier_vs_forecast_only"]["wins"]["brier"] == 1 + assert summary["utility"]["normalized"]["brier_vs_forecast_only"]["wins"]["forecast_only"] == 1 assert ( - summary["omission"]["normalized"]["farness_vs_forecast_only"]["flagged_more_serious"]["forecast_only"] + summary["omission"]["normalized"]["brier_vs_forecast_only"]["flagged_more_serious"]["forecast_only"] == 1 ) assert ( - summary["critique_survival"]["normalized"]["farness_vs_forecast_only"]["less_undermined"]["farness"] + summary["critique_survival"]["normalized"]["brier_vs_forecast_only"]["less_undermined"]["brier"] == 1 ) @@ -422,7 +422,7 @@ def test_run_decision_usefulness_judging_can_select_critique_only(tmp_path, monk assert case is not None for condition, recommendation in ( - ("farness", "Patch incrementally."), + ("brier", "Patch incrementally."), ("naive", "Rewrite now."), ): artifact = du.DecisionUsefulnessArtifact( @@ -460,7 +460,7 @@ def fake_call_llm(prompt, model, temperature, max_tokens): utility_results, omission_results, critique_results = du.run_decision_usefulness_judging( output_dir=tmp_path, cases=[case], - comparisons=[("farness", "naive")], + comparisons=[("brier", "naive")], representations=["decision_memo"], judge_tasks=["critique_survival"], verbose=False, diff --git a/tests/test_experiments.py b/tests/test_experiments.py index a81677b..07e7b3b 100644 --- 
a/tests/test_experiments.py +++ b/tests/test_experiments.py @@ -2,9 +2,9 @@ import pytest -from farness.experiments.cases import DecisionCase, get_all_cases, get_case -from farness.experiments.scorer import ResponseScorer, ResponseScore -from farness.experiments.runner import generate_prompt +from brier.experiments.cases import DecisionCase, get_all_cases, get_case +from brier.experiments.scorer import ResponseScorer, ResponseScore +from brier.experiments.runner import generate_prompt class TestCases: @@ -56,100 +56,100 @@ def scorer(self, hiring_case) -> ResponseScorer: def test_detects_confidence_interval_dash(self, scorer): """Should detect CI with dash format.""" response = "I estimate 80-90% success rate." - score = scorer.score(response, "farness", 1) + score = scorer.score(response, "brier", 1) assert score.has_confidence_interval def test_detects_confidence_interval_to(self, scorer): """Should detect CI with 'to' format.""" response = "Success probability: 70% to 85%." - score = scorer.score(response, "farness", 1) + score = scorer.score(response, "brier", 1) assert score.has_confidence_interval def test_detects_confidence_interval_explicit(self, scorer): """Should detect explicit CI language.""" response = "With an 80% confidence interval, I predict..." - score = scorer.score(response, "farness", 1) + score = scorer.score(response, "brier", 1) assert score.has_confidence_interval def test_no_ci_in_simple_response(self, scorer): """Should not detect CI in simple response.""" response = "I recommend option A because it seems better." - score = scorer.score(response, "farness", 1) + score = scorer.score(response, "brier", 1) assert not score.has_confidence_interval def test_detects_accountability(self, scorer): """Should detect accountability mechanisms.""" response = "Set a review date for 6 months from now to check outcomes." 
- score = scorer.score(response, "farness", 1) + score = scorer.score(response, "brier", 1) assert score.has_accountability def test_detects_accountability_follow_up(self, scorer): """Should detect follow-up language.""" response = "I recommend following up in 3 months to measure results." - score = scorer.score(response, "farness", 1) + score = scorer.score(response, "brier", 1) assert score.has_accountability def test_no_accountability_in_simple_response(self, scorer): """Should not detect accountability in simple response.""" response = "Go with option B, it's clearly better." - score = scorer.score(response, "farness", 1) + score = scorer.score(response, "brier", 1) assert not score.has_accountability def test_detects_base_rate_explicit(self, scorer): """Should detect explicit base rate language.""" response = "Research shows that structured interviews are better." - score = scorer.score(response, "farness", 1) + score = scorer.score(response, "brier", 1) assert score.cites_base_rate def test_detects_base_rate_statistics(self, scorer): """Should detect statistical base rates.""" response = "Studies show unstructured interviews have r=0.38 validity." - score = scorer.score(response, "farness", 1) + score = scorer.score(response, "brier", 1) assert score.cites_base_rate def test_no_base_rate_in_opinion(self, scorer): """Should not detect base rate in pure opinion.""" response = "I think chemistry is important in hiring." - score = scorer.score(response, "farness", 1) + score = scorer.score(response, "brier", 1) assert not score.cites_base_rate def test_detects_similarity_bias(self, scorer): """Should detect similarity bias.""" response = "Watch out for similarity bias - you might favor people like yourself." 
- score = scorer.score(response, "farness", 1) + score = scorer.score(response, "brier", 1) assert "similarity bias" in score.biases_found def test_detects_multiple_biases(self, scorer): """Should detect multiple biases.""" response = "This shows similarity bias and halo effect." - score = scorer.score(response, "farness", 1) + score = scorer.score(response, "brier", 1) assert len(score.biases_found) >= 2 def test_bias_count_matches_list(self, scorer): """Bias count should match length of biases_found.""" response = "Watch for similarity bias and affinity bias." - score = scorer.score(response, "farness", 1) + score = scorer.score(response, "brier", 1) assert score.bias_count == len(score.biases_found) def test_detects_quantified_tradeoffs(self, scorer): """Should detect quantified tradeoffs.""" response = "Expected value of A is 7.2 vs B at 6.8." - score = scorer.score(response, "farness", 1) + score = scorer.score(response, "brier", 1) assert score.quantifies_tradeoffs def test_detects_percentage_comparison(self, scorer): """Should detect percentage comparisons.""" response = "Option A has 75% vs 60% success rate." 
- score = scorer.score(response, "farness", 1) + score = scorer.score(response, "brier", 1) assert score.quantifies_tradeoffs def test_score_has_all_fields(self, scorer): """Score should have all required fields.""" response = "Test response" - score = scorer.score(response, "farness", 1) + score = scorer.score(response, "brier", 1) assert score.case_id == "hiring_chemistry" - assert score.condition == "farness" + assert score.condition == "brier" assert score.run_number == 1 assert isinstance(score.cites_base_rate, bool) assert isinstance(score.bias_count, int) @@ -174,10 +174,10 @@ def test_naive_prompt_is_simple(self, case): assert "framework" not in prompt.lower() assert case.scenario.strip()[:50] in prompt - def test_farness_prompt_has_framework(self, case): - """Farness prompt should include framework instructions.""" + def test_brier_prompt_has_framework(self, case): + """Brier prompt should include framework instructions.""" prompt = generate_prompt(case, "farness") - assert "farness" in prompt.lower() + assert "brier" in prompt.lower() assert "KPI" in prompt or "kpi" in prompt.lower() assert "confidence interval" in prompt.lower() assert "base rate" in prompt.lower() @@ -186,12 +186,12 @@ def test_farness_prompt_has_framework(self, case): def test_both_prompts_contain_scenario(self, case): """Both prompts should contain the scenario.""" naive = generate_prompt(case, "naive") - farness = generate_prompt(case, "farness") + brier = generate_prompt(case, "brier") # Check first 50 chars of scenario appear scenario_start = case.scenario.strip()[:50] assert scenario_start in naive - assert scenario_start in farness + assert scenario_start in brier class TestResponseScoreDict: @@ -201,7 +201,7 @@ def test_to_dict_roundtrip(self): """Should serialize and contain all fields.""" score = ResponseScore( case_id="test", - condition="farness", + condition="brier", run_number=1, correct_recommendation=True, cites_base_rate=True, @@ -215,7 +215,7 @@ def 
test_to_dict_roundtrip(self): d = score.to_dict() assert d["case_id"] == "test" - assert d["condition"] == "farness" + assert d["condition"] == "brier" assert d["correct_recommendation"] is True assert d["bias_count"] == 2 assert "similarity bias" in d["biases_found"] diff --git a/tests/test_framework.py b/tests/test_framework.py index 9932b30..9b39342 100644 --- a/tests/test_framework.py +++ b/tests/test_framework.py @@ -3,7 +3,7 @@ import pytest from datetime import datetime -from farness.framework import Decision, KPI, Option, Forecast +from brier.framework import Decision, KPI, Option, Forecast class TestKPI: diff --git a/tests/test_llm_retry.py b/tests/test_llm_retry.py index 7dadb83..172ce6a 100644 --- a/tests/test_llm_retry.py +++ b/tests/test_llm_retry.py @@ -1,6 +1,6 @@ """Tests for shared LLM retry behavior.""" -from farness.experiments import llm +from brier.experiments import llm def test_retryable_error_detection(): diff --git a/tests/test_market.py b/tests/test_market.py index 2a007e0..eefebc2 100644 --- a/tests/test_market.py +++ b/tests/test_market.py @@ -3,14 +3,14 @@ from pathlib import Path from unittest.mock import patch -from farness.cli import main -from farness.framework import Decision, Forecast, KPI, Option -from farness.market import ( +from brier.cli import main +from brier.framework import Decision, Forecast, KPI, Option +from brier.market import ( MarketSource, draft_binary_policy_market, draft_markets_for_decision, ) -from farness.storage import DecisionStore +from brier.storage import DecisionStore def test_binary_policy_market_draft_has_manifold_payload(): @@ -79,7 +79,7 @@ def test_market_draft_cli_for_standalone_question_outputs_json(capsys): with patch( "sys.argv", [ - "farness", + "brier", "market-draft", "Will Waymo be legally permitted to offer driverless paid robotaxi rides in DC by 2026-12-31?", "--initial-prob", @@ -99,7 +99,7 @@ def test_forecast_draft_cli_alias_outputs_json(capsys): with patch( "sys.argv", [ - "farness", + 
"brier", "forecast-draft", "Will Waymo be legally permitted to offer driverless paid robotaxi rides in DC by 2026-12-31?", "--initial-prob", @@ -145,11 +145,11 @@ def test_market_draft_cli_for_decision_writes_file(tmp_path, capsys): store.save(decision) output_path = tmp_path / "drafts.json" - with patch("farness.cli.DecisionStore", return_value=store): + with patch("brier.cli.DecisionStore", return_value=store): with patch( "sys.argv", [ - "farness", + "brier", "market-draft", decision.id[:8], "--output", @@ -197,4 +197,4 @@ def test_waymo_example_uses_aggregate_safety_outcomes(): in market["description_markdown"] for market in safety_markets ) - assert all("Drafted by farness" not in market["description_markdown"] for market in safety_markets) + assert all("Drafted by brier" not in market["description_markdown"] for market in safety_markets) diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py index f17434d..7863785 100644 --- a/tests/test_mcp_server.py +++ b/tests/test_mcp_server.py @@ -1,4 +1,4 @@ -"""Tests for the farness MCP server helpers.""" +"""Tests for the brier MCP server helpers.""" from __future__ import annotations @@ -6,14 +6,14 @@ from tempfile import TemporaryDirectory from types import SimpleNamespace -from farness.framework import Decision -from farness.mcp_server import ( +from brier.framework import Decision +from brier.mcp_server import ( _parse_datetime, draft_market_pack_for_input, save_decision_analysis, score_decision_outcomes, ) -from farness.storage import DecisionStore +from brier.storage import DecisionStore def _forecast(**overrides): diff --git a/tests/test_paper_content.py b/tests/test_paper_content.py index c03edb1..a69b7cd 100644 --- a/tests/test_paper_content.py +++ b/tests/test_paper_content.py @@ -29,22 +29,22 @@ def test_convergence_reframe_present(): ), "Missing convergence reframe language" -# --- Task #8: Introduce farness properly --- +# --- Task #8: Introduce brier properly --- -def 
test_introduce_farness_language(): - """Paper should say 'I introduce farness' not 'I evaluate a specific framework called'.""" +def test_introduce_brier_language(): + """Paper should say 'I introduce Brier' not 'I evaluate a specific framework called'.""" text = _read_paper() - assert "I introduce farness" in text, "Missing 'I introduce farness'" + assert re.search(r"I introduce \*{0,2}Brier", text), "Missing 'I introduce Brier'" assert ( "I evaluate a specific framework called" not in text ), "Old framework intro language still present" -def test_farness_ai_footnote(): - """Paper should have a footnote referencing farness.ai.""" +def test_brier_ai_footnote(): + """Paper should have a footnote referencing brier.institute.""" text = _read_paper() - assert "farness.ai" in text, "Missing farness.ai footnote" + assert "brier.institute" in text, "Missing brier.institute footnote" # --- Task #9: Concrete example --- @@ -62,10 +62,10 @@ def test_concrete_example_section(): def test_sycophancy_gpt_numbers(): - """Paper should report GPT sycophancy numbers: 466.7 naive, 108.3 farness.""" + """Paper should report GPT sycophancy numbers: 466.7 naive, 108.3 brier.""" text = _read_paper() assert "466.7" in text, "Missing GPT naive sycophancy mean (466.7)" - assert "108.3" in text, "Missing GPT farness sycophancy mean (108.3)" + assert "108.3" in text, "Missing GPT brier sycophancy mean (108.3)" # --- Task #11: Technical fixes --- @@ -115,7 +115,7 @@ def test_prompt_probe_confound_in_discussion(): def test_held_out_probe_result_present(): - """Paper should report that held-out / off-framework probes weaken or reverse the farness advantage.""" + """Paper should report that held-out / off-framework probes weaken or reverse the brier advantage.""" text = _read_paper() assert re.search( r"off-framework|held-out probes", text, re.IGNORECASE diff --git a/tests/test_reframing.py b/tests/test_reframing.py index 588181a..a07d92a 100644 --- a/tests/test_reframing.py +++ 
b/tests/test_reframing.py @@ -1,7 +1,7 @@ """Tests for the reframing experiment module.""" import pytest -from farness.experiments.reframing import ( +from brier.experiments.reframing import ( ReframingCase, ReframingResult, REFRAMING_CASES, @@ -93,27 +93,27 @@ def test_naive_more_reframing(self): results = [ self._make_result("case1", "naive", 5, True), self._make_result("case2", "naive", 3, True), - self._make_result("case1", "farness", 1, False), - self._make_result("case2", "farness", 0, False), + self._make_result("case1", "brier", 1, False), + self._make_result("case2", "brier", 0, False), ] analysis = analyze_reframing(results) - assert analysis["naive"]["mean_reframe_count"] > analysis["farness"]["mean_reframe_count"] - assert analysis["naive"]["challenged_framing_rate"] > analysis["farness"]["challenged_framing_rate"] + assert analysis["naive"]["mean_reframe_count"] > analysis["brier"]["mean_reframe_count"] + assert analysis["naive"]["challenged_framing_rate"] > analysis["brier"]["challenged_framing_rate"] def test_equal_reframing(self): results = [ self._make_result("case1", "naive", 2, True), - self._make_result("case1", "farness", 2, True), + self._make_result("case1", "brier", 2, True), ] analysis = analyze_reframing(results) - assert analysis["naive"]["mean_reframe_count"] == analysis["farness"]["mean_reframe_count"] + assert analysis["naive"]["mean_reframe_count"] == analysis["brier"]["mean_reframe_count"] def test_per_case_breakdown(self): results = [ self._make_result("case1", "naive", 3, True), - self._make_result("case1", "farness", 1, False), + self._make_result("case1", "brier", 1, False), self._make_result("case2", "naive", 5, True), - self._make_result("case2", "farness", 2, False), + self._make_result("case2", "brier", 2, False), ] analysis = analyze_reframing(results) assert "case1" in analysis["by_case"] @@ -140,7 +140,7 @@ def _make_result(self, case_id, condition, reframe_count, challenged): def test_produces_markdown(self): results = [ 
self._make_result("case1", "naive", 3, True), - self._make_result("case1", "farness", 1, False), + self._make_result("case1", "brier", 1, False), ] table = summary_table(results) assert "## Reframing experiment results" in table diff --git a/tests/test_skills.py b/tests/test_skills.py index a4a8b74..867f645 100644 --- a/tests/test_skills.py +++ b/tests/test_skills.py @@ -2,7 +2,7 @@ from __future__ import annotations -from farness.skills import inspect_skill, remove_skill +from brier.skills import inspect_skill, remove_skill def test_inspect_skill_reports_missing(tmp_path): diff --git a/tests/test_stability.py b/tests/test_stability.py index 8b497b2..4cdab64 100644 --- a/tests/test_stability.py +++ b/tests/test_stability.py @@ -2,7 +2,7 @@ import pytest -from farness.experiments.stability import ( +from brier.experiments.stability import ( DEFAULT_PROBE_BATTERY, QuantitativeCase, StabilityResult, @@ -12,7 +12,7 @@ generate_estimate_only_prompt, generate_format_control_prompt, generate_naive_prompt, - generate_farness_prompt, + generate_brier_prompt, generate_probe_prompt, get_all_stability_cases, get_primary_stability_cases, @@ -176,10 +176,10 @@ def test_naive_prompt_is_simple(self, case): assert "framework" not in prompt.lower() assert case.estimate_question in prompt - def test_farness_prompt_has_framework(self, case): - """Farness prompt should include framework.""" - prompt = generate_farness_prompt(case) - assert "farness" in prompt.lower() + def test_brier_prompt_has_framework(self, case): + """Brier prompt should include framework.""" + prompt = generate_brier_prompt(case) + assert "brier" in prompt.lower() assert "base rate" in prompt.lower() assert "confidence interval" in prompt.lower() @@ -190,12 +190,12 @@ def test_estimate_only_prompt_avoids_framework_language(self, case): assert "framework" not in prompt.lower() assert "0-100 rather than 0-1" in prompt - def test_format_control_prompt_is_structured_without_farness(self, case): + def 
test_format_control_prompt_is_structured_without_brier(self, case): """Formatting-only control should preserve structure without framework content.""" prompt = generate_format_control_prompt(case) assert "four-part structure" in prompt.lower() assert "do not use any named decision framework" in prompt.lower() - assert "farness" not in prompt.lower() + assert "brier" not in prompt.lower() def test_probe_prompt_includes_initial_estimate(self, case): """Probe prompt should reference initial estimate.""" @@ -232,7 +232,7 @@ def result_with_update(self) -> StabilityResult: def result_with_ci(self) -> StabilityResult: return StabilityResult( case_id="test", - condition="farness", + condition="brier", initial_estimate=10.0, initial_ci_low=5.0, initial_ci_high=15.0, @@ -302,10 +302,10 @@ def experiment(self) -> StabilityExperiment: final_response_text="", )) - # Add farness result: smaller update + # Add brier result: smaller update exp.results.append(StabilityResult( case_id="planning_estimate", - condition="farness", + condition="brier", initial_estimate=5.0, initial_ci_low=3.0, initial_ci_high=7.0, @@ -322,29 +322,29 @@ def test_analyze_returns_metrics(self, experiment): """Should return analysis metrics.""" analysis = experiment.analyze() assert "n_naive" in analysis - assert "n_farness" in analysis + assert "n_brier" in analysis assert "naive" in analysis - assert "farness" in analysis + assert "brier" in analysis assert analysis["comparison_metric"] == "relative_update" def test_naive_has_larger_update(self, experiment): """Naive should have larger update in our mock data.""" analysis = experiment.analyze() naive_update = analysis["naive"]["mean_update_magnitude"] - farness_update = analysis["farness"]["mean_update_magnitude"] - assert naive_update > farness_update + brier_update = analysis["brier"]["mean_update_magnitude"] + assert naive_update > brier_update - def test_farness_has_higher_ci_rate(self, experiment): - """Farness should have higher initial CI rate.""" 
+ def test_brier_has_higher_ci_rate(self, experiment): + """Brier should have higher initial CI rate.""" analysis = experiment.analyze() - assert analysis["farness"]["initial_ci_rate"] == 1.0 + assert analysis["brier"]["initial_ci_rate"] == 1.0 assert analysis["naive"]["initial_ci_rate"] == 0.0 def test_convergence_measured(self, experiment): """Should measure convergence.""" analysis = experiment.analyze() assert "convergence" in analysis - # Naive went from 4→6, farness initial was 5 + # Naive went from 4→6, brier initial was 5 # Initial gap: |4-5| = 1, Final gap: |6-5| = 1 # Convergence ratio: 1 - 1/1 = 0 @@ -353,7 +353,7 @@ def test_summary_table_generated(self, experiment): table = experiment.summary_table() assert "Stability-under-probing results" in table assert "Naive" in table - assert "Farness" in table + assert "Brier" in table assert "Primary pooled comparison metric" in table def test_mixed_effects_uses_relative_update(self, experiment): @@ -376,7 +376,7 @@ def test_analyze_groups_multiple_probe_batteries(self): ), StabilityResult( case_id="planning_estimate", - condition="farness", + condition="brier", probe_battery="on_framework", initial_estimate=5.0, final_estimate=5.5, @@ -390,7 +390,7 @@ def test_analyze_groups_multiple_probe_batteries(self): ), StabilityResult( case_id="planning_estimate", - condition="farness", + condition="brier", probe_battery="off_framework", initial_estimate=5.0, final_estimate=5.2, diff --git a/tests/test_storage.py b/tests/test_storage.py index a8f402b..c5b9fe3 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -5,8 +5,8 @@ from pathlib import Path from datetime import datetime, timedelta -from farness.framework import Decision, KPI, Option, Forecast -from farness.storage import DecisionStore +from brier.framework import Decision, KPI, Option, Forecast +from brier.storage import DecisionStore @pytest.fixture diff --git a/universe.html b/universe.html new file mode 100644 index 0000000..cc4ccce --- /dev/null 
+++ b/universe.html @@ -0,0 +1,318 @@ + + + + + + +Axiom · Thesis universe + + + + + + +
+
+ + + +
+
+ +
the rules — computable law for all
+
+
+
+
standard
+

RuleSpec

+

Machine-readable rules — cited, time-aware, executable. The standard everything downstream runs on.

+
+
+
encode
+

Encoder

+

AI-assisted encoding of source law into RuleSpec — reviewed, cited back to the statute.

+
+
+
ingest
+

Corpus

+

Scrapers pulling source statutes, regulations, and rulings — the raw legal record.

+
+
+
+ + +
+ +
+
+ thesis institute +
+
the forecasts — open, agentic predictions
+
+
+
+
+
the forecaster
+

Brier-1

+

A general-purpose, calibration-native prediction agent — used the way today's LMs are.

+
+ decisions + api + anywhere +
+
+
+
the proving ground
+

Thesis

+

The open, continuously-scored forecast platform — the agent's public scoreboard and training ground.

+
+ docket + ledger +
+
+
+
+
mechanism
+

PolicyEngine

+

Microsimulation — applies the rules to populations for tax, benefit, and distributional outcomes.

+
+
+
populations
+

Microplex

+

Calibrated synthetic populations — the substrate simulation and forecasting run on.

+
+ microcalibrate + microimpute + microunit + microdf + l0 +
+
+
+
+ +
+ + + + diff --git a/uv.lock b/uv.lock index 5d86d8c..539affc 100644 --- a/uv.lock +++ b/uv.lock @@ -115,6 +115,51 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8e/0d/52d98722666d6fc6c3dd4c76df339501d6efd40e0ff95e6186a7b7f0befd/black-26.3.1-py3-none-any.whl", hash = "sha256:2bd5aa94fc267d38bb21a70d7410a89f1a1d318841855f698746f8e7f51acd1b", size = 207542, upload-time = "2026-03-12T03:36:01.668Z" }, ] +[[package]] +name = "brier" +version = "0.2.4" +source = { editable = "." } + +[package.optional-dependencies] +dev = [ + { name = "black" }, + { name = "pytest" }, + { name = "pytest-cov" }, + { name = "ruff" }, +] +experiments = [ + { name = "anthropic" }, + { name = "matplotlib" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "openai" }, + { name = "pandas", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "pandas", version = "3.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "scipy", version = "1.17.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "statsmodels" }, +] +mcp = [ + { name = "mcp", extra = ["cli"] }, +] + +[package.metadata] +requires-dist = [ + { name = "anthropic", marker = "extra == 'experiments'", specifier = ">=0.86.0" }, + { name = "black", marker = "extra == 'dev'", specifier = ">=26.3.1" }, + { name = "matplotlib", marker = "extra == 'experiments'", specifier = ">=3.10.8" }, + { name = "mcp", extras = ["cli"], marker = "extra == 'mcp'", specifier = ">=1.26.0" }, + { name = 
"numpy", marker = "extra == 'experiments'", specifier = ">=2.2.6" }, + { name = "openai", marker = "extra == 'experiments'", specifier = ">=2.29.0" }, + { name = "pandas", marker = "extra == 'experiments'", specifier = ">=2.3.3" }, + { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.2" }, + { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=7.1.0" }, + { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.15.7" }, + { name = "scipy", marker = "extra == 'experiments'", specifier = ">=1.15.3" }, + { name = "statsmodels", marker = "extra == 'experiments'", specifier = ">=0.14.6" }, +] +provides-extras = ["experiments", "mcp", "dev"] + [[package]] name = "certifi" version = "2026.2.25" @@ -604,51 +649,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8a/0e/97c33bf5009bdbac74fd2beace167cab3f978feb69cc36f1ef79360d6c4e/exceptiongroup-1.3.1-py3-none-any.whl", hash = "sha256:a7a39a3bd276781e98394987d3a5701d0c4edffb633bb7a5144577f82c773598", size = 16740, upload-time = "2025-11-21T23:01:53.443Z" }, ] -[[package]] -name = "farness" -version = "0.2.4" -source = { editable = "." 
} - -[package.optional-dependencies] -dev = [ - { name = "black" }, - { name = "pytest" }, - { name = "pytest-cov" }, - { name = "ruff" }, -] -experiments = [ - { name = "anthropic" }, - { name = "matplotlib" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "openai" }, - { name = "pandas", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "pandas", version = "3.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "scipy", version = "1.17.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "statsmodels" }, -] -mcp = [ - { name = "mcp", extra = ["cli"] }, -] - -[package.metadata] -requires-dist = [ - { name = "anthropic", marker = "extra == 'experiments'", specifier = ">=0.86.0" }, - { name = "black", marker = "extra == 'dev'", specifier = ">=26.3.1" }, - { name = "matplotlib", marker = "extra == 'experiments'", specifier = ">=3.10.8" }, - { name = "mcp", extras = ["cli"], marker = "extra == 'mcp'", specifier = ">=1.26.0" }, - { name = "numpy", marker = "extra == 'experiments'", specifier = ">=2.2.6" }, - { name = "openai", marker = "extra == 'experiments'", specifier = ">=2.29.0" }, - { name = "pandas", marker = "extra == 'experiments'", specifier = ">=2.3.3" }, - { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.2" }, - { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=7.1.0" }, - { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.15.7" }, - { name = "scipy", marker = "extra == 
'experiments'", specifier = ">=1.15.3" }, - { name = "statsmodels", marker = "extra == 'experiments'", specifier = ">=0.14.6" }, -] -provides-extras = ["experiments", "mcp", "dev"] - [[package]] name = "fonttools" version = "4.62.1"