Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -833,7 +833,7 @@ cmd/waza/ CLI entrypoint and command definitions
tokens/ Token counting subcommand
internal/
config/ Configuration with functional options
execution/ AgentEngine interface (mock, copilot)
execution/ AgentEngine interface (mock, copilot, codex)
graders/ Validator registry and built-in graders
metrics/ Scoring metrics
models/ Data structures (EvalSpec, TestCase, EvaluationOutcome)
Expand All @@ -857,8 +857,9 @@ config:
max_attempts: 3 # Retry failed graders up to 3 times (default: 1, no retries)
timeout_seconds: 300
parallel: false
executor: mock # or copilot-sdk
executor: mock # or copilot-sdk, codex
model: claude-sonnet-4-20250514
model_reasoning_effort: high # codex only; none, minimal, low, medium, high, xhigh
group_by: model # Group results by model (or other dimension)

# Custom input variables available as {{.Vars.key}} in tasks and hooks
Expand Down Expand Up @@ -1149,6 +1150,8 @@ jobs:
| **Go Version** | 1.26 or higher |
| **Executor** | Use `mock` executor for CI (no API keys needed) |
| **GitHub Token** | Only required for `copilot-sdk` executor: set `GITHUB_TOKEN` env var |
| **Codex Auth** | Only required for `codex` executor: uses the local Codex CLI config/auth from `~/.codex` |
| **Codex Reasoning** | Optional `model_reasoning_effort` uses Codex's config key; common values are `none`, `minimal`, `low`, `medium`, `high`, `xhigh` |
| **Exit Codes** | 0=success, 1=test failure, 2=config error |

#### Expected Skill Structure
Expand Down
5 changes: 5 additions & 0 deletions cmd/waza/cmd_init.go
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,7 @@
Description("Choose how evals are executed").
Options(
huh.NewOption("Copilot SDK — real model execution", "copilot-sdk"),
huh.NewOption("Codex — use ~/.codex config/auth", "codex"),
huh.NewOption("Mock — fast iteration, no API calls", "mock"),
).
Value(&engine),
Expand All @@ -333,7 +334,7 @@
}

// Model selector (hidden when engine ≠ copilot-sdk)
if engine == "copilot-sdk" {

Check failure on line 337 in cmd/waza/cmd_init.go

View workflow job for this annotation

GitHub Actions / Lint

QF1003: could use tagged switch on engine (staticcheck)
modelForm := huh.NewForm(
huh.NewGroup(
huh.NewSelect[string]().
Expand Down Expand Up @@ -366,6 +367,10 @@
if err := modelForm.Run(); err != nil {
model = projectconfig.DefaultModel
}
} else if engine == "codex" {
// Let Codex read the default model from ~/.codex/config.toml unless
// the eval later sets config.model or the user passes --model.
model = ""
}

pathsForm := huh.NewForm(
Expand Down
61 changes: 60 additions & 1 deletion cmd/waza/cmd_run.go
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,9 @@ func runCommandForSpec(cmd *cobra.Command, sp skillSpecPath, defaultSkills []str
if err != nil {
return nil, fmt.Errorf("failed to load spec: %w", err)
}
if cfg, cfgErr := projectconfig.Load(filepath.Dir(specPath)); cfgErr == nil {
applyProjectDefaultsToEvalSpec(spec, cfg)
}

// CLI flags override spec config
if parallel {
Expand Down Expand Up @@ -572,9 +575,51 @@ func runCommandForSpec(cmd *cobra.Command, sp skillSpecPath, defaultSkills []str
return allResults, nil
}

func applyProjectDefaultsToEvalSpec(spec *models.EvalSpec, cfg *projectconfig.ProjectConfig) {
if spec == nil || cfg == nil {
return
}

defaultEngine := cfg.Defaults.Engine
if defaultEngine == "" {
defaultEngine = projectconfig.DefaultEngine
}

engineWasDefault := spec.Config.EngineType == "" ||
(spec.Config.EngineType == projectconfig.DefaultEngine && defaultEngine != projectconfig.DefaultEngine)
if engineWasDefault {
spec.Config.EngineType = defaultEngine
}

defaultModel := cfg.Defaults.Model
modelWasDefault := spec.Config.ModelID == "" ||
(spec.Config.ModelID == projectconfig.DefaultModel &&
(defaultModel != projectconfig.DefaultModel || engineWasDefault))
if modelWasDefault {
Comment on lines +587 to +598
spec.Config.ModelID = defaultModel
}
if spec.Config.ModelReasoningEffort == "" {
spec.Config.ModelReasoningEffort = cfg.Defaults.ModelReasoningEffort
}
}

// displayModel returns the model label to print for a run. An explicit model
// ID is shown verbatim; with no model ID, the codex engine gets a placeholder
// label, and every other engine yields an empty string.
func displayModel(cfg models.Config) string {
	switch {
	case cfg.ModelID != "":
		return cfg.ModelID
	case cfg.EngineType == "codex":
		return "default (Codex config)"
	default:
		return ""
	}
}

// runSingleModel executes a benchmark for one model and returns the outcome.
// It prints the per-model summary and saves output for single-model runs.
func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string, defaultSkills []string) (*models.EvaluationOutcome, error) {
if err := validateEngineFeatureSupport(spec); err != nil {
return nil, err
}

// Get spec directory for resolving relative paths
specDir := filepath.Dir(specPath)
if !filepath.IsAbs(specDir) {
Expand Down Expand Up @@ -641,6 +686,8 @@ func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string,
engine = execution.NewCopilotEngineBuilder(spec.Config.ModelID, &execution.CopilotEngineBuilderOptions{
NewCopilotClient: newCopilotClientFn, // if nil, uses the real function, otherwise overridable for tests.
}).Build()
case "codex":
engine = execution.NewCodexEngine(spec.Config.ModelID)
default:
return nil, fmt.Errorf("unknown engine type: %s", spec.Config.EngineType)
}
Expand Down Expand Up @@ -735,7 +782,7 @@ func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string,
fmt.Printf("Running benchmark: %s\n", spec.Name)
fmt.Printf("Skill: %s\n", spec.SkillName)
fmt.Printf("Engine: %s\n", spec.Config.EngineType)
fmt.Printf("Model: %s\n", spec.Config.ModelID)
fmt.Printf("Model: %s\n", displayModel(spec.Config))
if spec.Config.JudgeModel != "" {
fmt.Printf("Judge Model: %s\n", spec.Config.JudgeModel)
}
Expand Down Expand Up @@ -906,6 +953,18 @@ func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string,
return outcome, nil
}

func validateEngineFeatureSupport(spec *models.EvalSpec) error {
if spec == nil || spec.Config.EngineType != "codex" {
return nil
}
for _, grader := range spec.Graders {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[MEDIUM] Logic: Codex runs still execute discovered trigger.yaml suites even though the engine rejects CancelOnSkillInvocation. Those trigger runs hit CodexEngine.Execute, produce per-prompt errors, and Waza can record misleading trigger metrics instead of failing fast with a clear configuration error. Extend this validation to reject Codex when trigger tests are present before the run starts, matching the unsupported skill_invocation grader check.

if grader.Kind == models.GraderKindSkillInvocation {
return fmt.Errorf("grader %q uses skill_invocation, which is not supported by the codex executor because Codex CLI does not emit skill invocation telemetry", grader.Identifier)
}
}
return nil
}

// printModelComparison renders a comparison table for multi-model runs.
func printModelComparison(results []modelResult) {
slices.SortFunc(results, func(a, b modelResult) int {
Expand Down
Loading
Loading