diff --git a/README.md b/README.md index 9ef3973e7..05b0ce765 100644 --- a/README.md +++ b/README.md @@ -833,7 +833,7 @@ cmd/waza/ CLI entrypoint and command definitions tokens/ Token counting subcommand internal/ config/ Configuration with functional options - execution/ AgentEngine interface (mock, copilot) + execution/ AgentEngine interface (mock, copilot, codex) graders/ Validator registry and built-in graders metrics/ Scoring metrics models/ Data structures (EvalSpec, TestCase, EvaluationOutcome) @@ -857,8 +857,9 @@ config: max_attempts: 3 # Retry failed graders up to 3 times (default: 1, no retries) timeout_seconds: 300 parallel: false - executor: mock # or copilot-sdk + executor: mock # or copilot-sdk, codex model: claude-sonnet-4-20250514 + model_reasoning_effort: high # codex only; none, minimal, low, medium, high, xhigh group_by: model # Group results by model (or other dimension) # Custom input variables available as {{.Vars.key}} in tasks and hooks @@ -1149,6 +1150,8 @@ jobs: | **Go Version** | 1.26 or higher | | **Executor** | Use `mock` executor for CI (no API keys needed) | | **GitHub Token** | Only required for `copilot-sdk` executor: set `GITHUB_TOKEN` env var | +| **Codex Auth** | Only required for `codex` executor: uses the local Codex CLI config/auth from `~/.codex` | +| **Codex Reasoning** | Optional `model_reasoning_effort` uses Codex's config key; common values are `none`, `minimal`, `low`, `medium`, `high`, `xhigh` | | **Exit Codes** | 0=success, 1=test failure, 2=config error | #### Expected Skill Structure diff --git a/cmd/waza/cmd_init.go b/cmd/waza/cmd_init.go index 31ec04fda..ad236aa39 100644 --- a/cmd/waza/cmd_init.go +++ b/cmd/waza/cmd_init.go @@ -322,6 +322,7 @@ func initCommandE(cmd *cobra.Command, args []string, noSkill bool, flagSkillsDir Description("Choose how evals are executed"). 
Options( huh.NewOption("Copilot SDK — real model execution", "copilot-sdk"), + huh.NewOption("Codex — use ~/.codex config/auth", "codex"), huh.NewOption("Mock — fast iteration, no API calls", "mock"), ). Value(&engine), @@ -366,6 +367,10 @@ func initCommandE(cmd *cobra.Command, args []string, noSkill bool, flagSkillsDir if err := modelForm.Run(); err != nil { model = projectconfig.DefaultModel } + } else if engine == "codex" { + // Let Codex read the default model from ~/.codex/config.toml unless + // the eval later sets config.model or the user passes --model. + model = "" } pathsForm := huh.NewForm( diff --git a/cmd/waza/cmd_run.go b/cmd/waza/cmd_run.go index c3ddd2a29..8eb8d28ed 100644 --- a/cmd/waza/cmd_run.go +++ b/cmd/waza/cmd_run.go @@ -451,6 +451,9 @@ func runCommandForSpec(cmd *cobra.Command, sp skillSpecPath, defaultSkills []str if err != nil { return nil, fmt.Errorf("failed to load spec: %w", err) } + if cfg, cfgErr := projectconfig.Load(filepath.Dir(specPath)); cfgErr == nil { + applyProjectDefaultsToEvalSpec(spec, cfg) + } // CLI flags override spec config if parallel { @@ -572,9 +575,51 @@ func runCommandForSpec(cmd *cobra.Command, sp skillSpecPath, defaultSkills []str return allResults, nil } +func applyProjectDefaultsToEvalSpec(spec *models.EvalSpec, cfg *projectconfig.ProjectConfig) { + if spec == nil || cfg == nil { + return + } + + defaultEngine := cfg.Defaults.Engine + if defaultEngine == "" { + defaultEngine = projectconfig.DefaultEngine + } + + engineWasDefault := spec.Config.EngineType == "" || + (spec.Config.EngineType == projectconfig.DefaultEngine && defaultEngine != projectconfig.DefaultEngine) + if engineWasDefault { + spec.Config.EngineType = defaultEngine + } + + defaultModel := cfg.Defaults.Model + modelWasDefault := spec.Config.ModelID == "" || + (spec.Config.ModelID == projectconfig.DefaultModel && + (defaultModel != projectconfig.DefaultModel || engineWasDefault)) + if modelWasDefault { + spec.Config.ModelID = defaultModel + } + if 
spec.Config.ModelReasoningEffort == "" { + spec.Config.ModelReasoningEffort = cfg.Defaults.ModelReasoningEffort + } +} + +func displayModel(cfg models.Config) string { + if cfg.ModelID != "" { + return cfg.ModelID + } + if cfg.EngineType == "codex" { + return "default (Codex config)" + } + return "" +} + // runSingleModel executes a benchmark for one model and returns the outcome. // It prints the per-model summary and saves output for single-model runs. func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string, defaultSkills []string) (*models.EvaluationOutcome, error) { + if err := validateEngineFeatureSupport(spec); err != nil { + return nil, err + } + // Get spec directory for resolving relative paths specDir := filepath.Dir(specPath) if !filepath.IsAbs(specDir) { @@ -641,6 +686,8 @@ func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string, engine = execution.NewCopilotEngineBuilder(spec.Config.ModelID, &execution.CopilotEngineBuilderOptions{ NewCopilotClient: newCopilotClientFn, // if nil, uses the real function, otherwise overridable for tests. 
}).Build() + case "codex": + engine = execution.NewCodexEngine(spec.Config.ModelID) default: return nil, fmt.Errorf("unknown engine type: %s", spec.Config.EngineType) } @@ -735,7 +782,7 @@ func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string, fmt.Printf("Running benchmark: %s\n", spec.Name) fmt.Printf("Skill: %s\n", spec.SkillName) fmt.Printf("Engine: %s\n", spec.Config.EngineType) - fmt.Printf("Model: %s\n", spec.Config.ModelID) + fmt.Printf("Model: %s\n", displayModel(spec.Config)) if spec.Config.JudgeModel != "" { fmt.Printf("Judge Model: %s\n", spec.Config.JudgeModel) } @@ -906,6 +953,18 @@ func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string, return outcome, nil } +func validateEngineFeatureSupport(spec *models.EvalSpec) error { + if spec == nil || spec.Config.EngineType != "codex" { + return nil + } + for _, grader := range spec.Graders { + if grader.Kind == models.GraderKindSkillInvocation { + return fmt.Errorf("grader %q uses skill_invocation, which is not supported by the codex executor because Codex CLI does not emit skill invocation telemetry", grader.Identifier) + } + } + return nil +} + // printModelComparison renders a comparison table for multi-model runs. 
func printModelComparison(results []modelResult) { slices.SortFunc(results, func(a, b modelResult) int { diff --git a/cmd/waza/cmd_run_test.go b/cmd/waza/cmd_run_test.go index 11c7559b0..f0ee6d87a 100644 --- a/cmd/waza/cmd_run_test.go +++ b/cmd/waza/cmd_run_test.go @@ -11,6 +11,7 @@ import ( "maps" "os" "path/filepath" + "runtime" "slices" "sort" "strings" @@ -290,6 +291,309 @@ func TestRunCommand_MockEngineRun(t *testing.T) { assert.NoError(t, err) } +func TestRunCommand_CodexEngineRun(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("fake codex shell script is POSIX-only") + } + resetRunGlobals() + defer resetRunGlobals() + + fakeDir := t.TempDir() + fakeCodex := filepath.Join(fakeDir, "codex") + fakeScript := `#!/bin/sh +work="" +out="" +while [ "$#" -gt 0 ]; do + case "$1" in + --cd) + work="$2" + shift 2 + ;; + --output-last-message) + out="$2" + shift 2 + ;; + *) + shift + ;; + esac +done +if [ -n "$work" ]; then + cd "$work" +fi +cat > prompt.txt +printf "codex command output" > generated.txt +printf "codex final output" > "$out" +` + require.NoError(t, os.WriteFile(fakeCodex, []byte(fakeScript), 0o755)) + t.Setenv("PATH", fakeDir+string(os.PathListSeparator)+os.Getenv("PATH")) + + specPath := createTestSpec(t, "codex") + outFile := filepath.Join(t.TempDir(), "results.json") + + cmd := newRunCommand() + cmd.SetArgs([]string{specPath, "--output", outFile}) + cmd.SetOut(io.Discard) + cmd.SetErr(io.Discard) + + err := cmd.Execute() + require.NoError(t, err) + + data, err := os.ReadFile(outFile) + require.NoError(t, err) + var result models.EvaluationOutcome + require.NoError(t, json.Unmarshal(data, &result)) + assert.Equal(t, "codex", result.Setup.EngineType) + assert.Equal(t, "test-model", result.Setup.ModelID) + require.Len(t, result.TestOutcomes, 1) + require.Len(t, result.TestOutcomes[0].Runs, 1) + assert.Equal(t, "codex final output", result.TestOutcomes[0].Runs[0].FinalOutput) +} + +func 
TestRunCommand_WazaYamlCodexOverridesScaffoldedCopilotDefault(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("fake codex shell script is POSIX-only") + } + resetRunGlobals() + defer resetRunGlobals() + + fakeDir := t.TempDir() + fakeCodex := filepath.Join(fakeDir, "codex") + fakeScript := `#!/bin/sh +work="" +out="" +args="" +while [ "$#" -gt 0 ]; do + case "$1" in + --cd) + work="$2" + args="$args $1 $2" + shift 2 + ;; + --output-last-message) + out="$2" + args="$args $1 $2" + shift 2 + ;; + *) + args="$args $1" + shift + ;; + esac +done +if [ -n "$work" ]; then + cd "$work" +fi +cat > prompt.txt +printf "%s" "$args" > args.txt +if [ -n "${WAZA_FAKE_CODEX_ARGS:-}" ]; then + printf "%s" "$args" > "$WAZA_FAKE_CODEX_ARGS" +fi +printf "codex final output" > "$out" +` + require.NoError(t, os.WriteFile(fakeCodex, []byte(fakeScript), 0o755)) + t.Setenv("PATH", fakeDir+string(os.PathListSeparator)+os.Getenv("PATH")) + argsPath := filepath.Join(t.TempDir(), "args.txt") + t.Setenv("WAZA_FAKE_CODEX_ARGS", argsPath) + + dir := t.TempDir() + require.NoError(t, os.WriteFile(filepath.Join(dir, ".waza.yaml"), []byte("defaults:\n engine: codex\n"), 0o644)) + + taskDir := filepath.Join(dir, "tasks") + require.NoError(t, os.MkdirAll(taskDir, 0o755)) + task := `id: default-engine-task +name: Default Engine Task +inputs: + prompt: "Hello" +` + require.NoError(t, os.WriteFile(filepath.Join(taskDir, "task.yaml"), []byte(task), 0o644)) + + spec := `name: default-engine-test +skill: cfg-skill +version: "1.0" +config: + trials_per_task: 1 + timeout_seconds: 10 + executor: copilot-sdk + model: claude-sonnet-4.6 +tasks: + - "tasks/*.yaml" +` + specPath := filepath.Join(dir, "eval.yaml") + require.NoError(t, os.WriteFile(specPath, []byte(spec), 0o644)) + + outFile := filepath.Join(t.TempDir(), "results.json") + cmd := newRunCommand() + cmd.SetArgs([]string{specPath, "--output", outFile}) + cmd.SetOut(io.Discard) + cmd.SetErr(io.Discard) + + err := cmd.Execute() + require.NoError(t, 
err) + + data, err := os.ReadFile(outFile) + require.NoError(t, err) + var result models.EvaluationOutcome + require.NoError(t, json.Unmarshal(data, &result)) + assert.Equal(t, "codex", result.Setup.EngineType) + assert.Equal(t, "", result.Setup.ModelID, "codex should use ~/.codex/config.toml when default model is omitted") + assert.Equal(t, "codex final output", result.TestOutcomes[0].Runs[0].FinalOutput) + + argsData, err := os.ReadFile(argsPath) + require.NoError(t, err) + assert.NotContains(t, string(argsData), "--model") +} + +func TestRunCommand_WazaYamlCodexConfigModelAndReasoningEffort(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("fake codex shell script is POSIX-only") + } + resetRunGlobals() + defer resetRunGlobals() + + fakeDir := t.TempDir() + fakeCodex := filepath.Join(fakeDir, "codex") + fakeScript := `#!/bin/sh +work="" +out="" +args="" +while [ "$#" -gt 0 ]; do + case "$1" in + --cd) + work="$2" + args="$args $1 $2" + shift 2 + ;; + --output-last-message) + out="$2" + args="$args $1 $2" + shift 2 + ;; + *) + args="$args $1" + shift + ;; + esac +done +if [ -n "$work" ]; then + cd "$work" +fi +cat > prompt.txt +printf "%s" "$args" > args.txt +if [ -n "${WAZA_FAKE_CODEX_ARGS:-}" ]; then + printf "%s" "$args" > "$WAZA_FAKE_CODEX_ARGS" +fi +printf "codex final output" > "$out" +` + require.NoError(t, os.WriteFile(fakeCodex, []byte(fakeScript), 0o755)) + t.Setenv("PATH", fakeDir+string(os.PathListSeparator)+os.Getenv("PATH")) + argsPath := filepath.Join(t.TempDir(), "args.txt") + t.Setenv("WAZA_FAKE_CODEX_ARGS", argsPath) + + dir := t.TempDir() + require.NoError(t, os.WriteFile(filepath.Join(dir, ".waza.yaml"), []byte(`defaults: + engine: codex + model: gpt-4o + model_reasoning_effort: high +`), 0o644)) + + taskDir := filepath.Join(dir, "tasks") + require.NoError(t, os.MkdirAll(taskDir, 0o755)) + task := `id: config-model-task +name: Config Model Task +inputs: + prompt: "Hello" +` + require.NoError(t, os.WriteFile(filepath.Join(taskDir, 
"task.yaml"), []byte(task), 0o644)) + + spec := `name: config-model-test +skill: cfg-skill +version: "1.0" +config: + trials_per_task: 1 + timeout_seconds: 10 + executor: copilot-sdk + model: claude-sonnet-4.6 +tasks: + - "tasks/*.yaml" +` + specPath := filepath.Join(dir, "eval.yaml") + require.NoError(t, os.WriteFile(specPath, []byte(spec), 0o644)) + + outFile := filepath.Join(t.TempDir(), "results.json") + cmd := newRunCommand() + cmd.SetArgs([]string{specPath, "--output", outFile}) + cmd.SetOut(io.Discard) + cmd.SetErr(io.Discard) + + err := cmd.Execute() + require.NoError(t, err) + + data, err := os.ReadFile(outFile) + require.NoError(t, err) + var result models.EvaluationOutcome + require.NoError(t, json.Unmarshal(data, &result)) + assert.Equal(t, "codex", result.Setup.EngineType) + assert.Equal(t, "gpt-4o", result.Setup.ModelID) + + argsData, err := os.ReadFile(argsPath) + require.NoError(t, err) + args := string(argsData) + assert.Contains(t, args, "--model gpt-4o") + assert.Contains(t, args, `model_reasoning_effort="high"`) +} + +func TestDisplayModelShowsCodexConfigDefault(t *testing.T) { + assert.Equal(t, "default (Codex config)", displayModel(models.Config{EngineType: "codex"})) + assert.Equal(t, "gpt-4o", displayModel(models.Config{ + EngineType: "codex", + ModelID: "gpt-4o", + })) +} + +func TestRunCommand_CodexRejectsSkillInvocationGrader(t *testing.T) { + resetRunGlobals() + defer resetRunGlobals() + + dir := t.TempDir() + taskDir := filepath.Join(dir, "tasks") + require.NoError(t, os.MkdirAll(taskDir, 0o755)) + require.NoError(t, os.WriteFile(filepath.Join(taskDir, "task.yaml"), []byte(`id: skill-telemetry-task +name: Skill Telemetry Task +inputs: + prompt: "Hello" +`), 0o644)) + + spec := `name: skill-telemetry-test +skill: cfg-skill +version: "1.0" +config: + trials_per_task: 1 + timeout_seconds: 10 + executor: codex +graders: + - type: skill_invocation + name: required_skill + config: + required_skills: + - cfg-skill +tasks: + - "tasks/*.yaml" +` 
+ specPath := filepath.Join(dir, "eval.yaml") + require.NoError(t, os.WriteFile(specPath, []byte(spec), 0o644)) + + cmd := newRunCommand() + cmd.SetArgs([]string{specPath}) + cmd.SetOut(io.Discard) + cmd.SetErr(io.Discard) + + err := cmd.Execute() + require.Error(t, err) + assert.Contains(t, err.Error(), "skill_invocation") + assert.Contains(t, err.Error(), "not supported by the codex executor") +} + func TestRunCommand_MockEngineVerbose(t *testing.T) { resetRunGlobals() diff --git a/internal/execution/codex.go b/internal/execution/codex.go new file mode 100644 index 000000000..4914d7f56 --- /dev/null +++ b/internal/execution/codex.go @@ -0,0 +1,513 @@ +package execution + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + "sync" + "sync/atomic" + "time" + + copilot "github.com/github/copilot-sdk/go" + "github.com/microsoft/waza/internal/models" +) + +// CodexEngine executes tasks through the local Codex CLI. +// +// The Codex CLI owns its own configuration and authentication discovery, so this +// engine intentionally does not parse ~/.codex/config.toml or auth.json. It +// invokes `codex exec` in Waza's isolated workspace and lets Codex load its +// normal config/auth state. +type CodexEngine struct { + defaultModelID string + binary string + binaryPath string + + workspacesMu sync.Mutex + workspaces []string + keepWorkspace bool + + initCalled atomic.Bool +} + +// CodexEngineOption configures a CodexEngine. +type CodexEngineOption func(*CodexEngine) + +// WithCodexBinary overrides the Codex executable path. It is mainly useful for +// tests and for users who keep Codex outside PATH. +func WithCodexBinary(path string) CodexEngineOption { + return func(e *CodexEngine) { + if path != "" { + e.binary = path + } + } +} + +// NewCodexEngine creates a Codex-backed execution engine. 
+func NewCodexEngine(defaultModelID string, opts ...CodexEngineOption) *CodexEngine { + e := &CodexEngine{ + defaultModelID: defaultModelID, + binary: "codex", + } + for _, opt := range opts { + opt(e) + } + return e +} + +// SetKeepWorkspace enables or disables workspace preservation on shutdown. +func (e *CodexEngine) SetKeepWorkspace(keep bool) { + e.keepWorkspace = keep +} + +// Initialize verifies that the Codex CLI can be found. Codex itself handles +// config/auth loading when the first task is executed. +func (e *CodexEngine) Initialize(ctx context.Context) error { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + + path, err := exec.LookPath(e.binary) + if err != nil { + return fmt.Errorf("codex executable %q not found in PATH: %w", e.binary, err) + } + e.binaryPath = path + e.initCalled.Store(true) + return nil +} + +// Execute runs a test prompt with `codex exec`. +func (e *CodexEngine) Execute(ctx context.Context, req *ExecutionRequest) (*ExecutionResponse, error) { + if req == nil { + return nil, fmt.Errorf("nil req was passed to CodexEngine.Execute") + } + if !e.initCalled.Load() { + return nil, fmt.Errorf("engine was not initialized. 
Initialize needs to be called before Execute") + } + if req.Timeout <= 0 { + return nil, fmt.Errorf("positive Timeout is required") + } + + modelID := e.defaultModelID + if req.ModelID != "" { + modelID = req.ModelID + } + + sourceDir := req.SourceDir + if sourceDir == "" { + cwd, err := os.Getwd() + if err != nil { + return nil, fmt.Errorf("failed to get current directory: %w", err) + } + sourceDir = cwd + } + + start := time.Now() + + workspaceDir := req.WorkspaceDir + if workspaceDir == "" { + tmpDir, err := os.MkdirTemp("", "waza-codex-*") + if err != nil { + return nil, fmt.Errorf("failed to create codex workspace: %w", err) + } + workspaceDir = tmpDir + e.trackWorkspace(workspaceDir) + + if err := setupWorkspaceResources(workspaceDir, req.Resources); err != nil { + return nil, fmt.Errorf("failed to setup codex workspace resources: %w", err) + } + } + + if _, hasDeadline := ctx.Deadline(); !hasDeadline { + var cancel context.CancelFunc + ctx, cancel = context.WithTimeout(ctx, req.Timeout) + defer cancel() + } + + outputFile, err := os.CreateTemp("", "waza-codex-output-*.txt") + if err != nil { + return nil, fmt.Errorf("failed to create codex output file: %w", err) + } + outputPath := outputFile.Name() + _ = outputFile.Close() + defer os.Remove(outputPath) //nolint:errcheck + + if req.CancelOnSkillInvocation { + return nil, fmt.Errorf("codex engine does not support skill invocation telemetry required by trigger tests") + } + + args := e.buildArgs(req, modelID, workspaceDir, outputPath) + + prompt := e.buildPrompt(sourceDir, req) + cmd := exec.CommandContext(ctx, e.binaryPath, args...) 
+ cmd.Dir = workspaceDir + cmd.Env = os.Environ() + cmd.Stdin = strings.NewReader(prompt) + + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + runErr := cmd.Run() + telemetry := parseCodexJSONEvents(stdout.String()) + finalOutput := readCodexOutput(outputPath, telemetry.FinalOutput()) + + errMsg := "" + success := true + if runErr != nil { + success = false + errMsg = strings.TrimSpace(stderr.String()) + if errMsg == "" { + errMsg = runErr.Error() + } else { + errMsg = fmt.Sprintf("%s: %v", errMsg, runErr) + } + } + + sessionID := telemetry.SessionID + if sessionID == "" { + sessionID = req.SessionID + } + if sessionID == "" { + sessionID = fmt.Sprintf("codex-session-%d", time.Now().UnixNano()) + } + + return &ExecutionResponse{ + FinalOutput: finalOutput, + Events: telemetry.Events, + ModelID: modelID, + DurationMs: time.Since(start).Milliseconds(), + ToolCalls: models.FilterToolCalls(telemetry.Events), + ErrorMsg: errMsg, + Success: success, + WorkspaceDir: workspaceDir, + WorkspaceFiles: captureWorkspaceFiles(workspaceDir), + SessionID: sessionID, + Usage: telemetry.Usage, + }, nil +} + +func (e *CodexEngine) buildArgs(req *ExecutionRequest, modelID, workspaceDir, outputPath string) []string { + common := []string{ + "-c", `approval_policy="never"`, + "--skip-git-repo-check", + "--output-last-message", outputPath, + } + if modelID != "" { + common = append(common, "--model", modelID) + } + if req.ModelReasoningEffort != "" { + common = append(common, "-c", fmt.Sprintf("model_reasoning_effort=%q", req.ModelReasoningEffort)) + } + + if req.SessionID != "" { + args := []string{ + "exec", + "resume", + "--json", + "-c", `sandbox_mode="workspace-write"`, + } + args = append(args, common...) + args = append(args, req.SessionID, "-") + return args + } + + args := []string{ + "exec", + "--json", + "--cd", workspaceDir, + "--sandbox", "workspace-write", + "--color", "never", + } + args = append(args, common...) 
+ args = append(args, "-") + return args +} + +// Shutdown removes Codex workspaces created by this engine. +func (e *CodexEngine) Shutdown(ctx context.Context) error { + workspaces := func() []string { + e.workspacesMu.Lock() + defer e.workspacesMu.Unlock() + ws := e.workspaces + e.workspaces = nil + return ws + }() + + for _, ws := range workspaces { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + if ws == "" { + continue + } + if e.keepWorkspace { + fmt.Fprintf(os.Stderr, "Workspace preserved: %s\n", ws) + continue + } + if err := os.RemoveAll(ws); err != nil { + return fmt.Errorf("failed to remove codex workspace %s: %w", ws, err) + } + } + return nil +} + +// SessionUsage returns nil because codex exec does not currently expose Waza's +// Copilot-style session usage digest. +func (e *CodexEngine) SessionUsage(sessionID string) *models.UsageStats { + return nil +} + +func (e *CodexEngine) trackWorkspace(path string) { + e.workspacesMu.Lock() + defer e.workspacesMu.Unlock() + e.workspaces = append(e.workspaces, path) +} + +func (e *CodexEngine) buildPrompt(sourceDir string, req *ExecutionRequest) string { + var sb strings.Builder + + if !req.NoSkills { + skillDirs := skillDirsForRequest(sourceDir, req) + if msg := buildSkillSystemMessage(skillDirs, req.SkillName); msg != "" { + sb.WriteString(msg) + sb.WriteString("\n") + } + } + + if req.TaskName != "" || req.TaskDescription != "" || len(req.Context) > 0 { + sb.WriteString("\n") + if req.TaskName != "" { + fmt.Fprintf(&sb, "Name: %s\n", req.TaskName) + } + if req.TaskDescription != "" { + fmt.Fprintf(&sb, "Description: %s\n", req.TaskDescription) + } + if len(req.Context) > 0 { + sb.WriteString("Metadata:\n") + for k, v := range req.Context { + fmt.Fprintf(&sb, "- %s: %v\n", k, v) + } + } + sb.WriteString("\n\n") + } + + sb.WriteString(req.Message) + return sb.String() +} + +func readCodexOutput(outputPath, stdout string) string { + data, err := os.ReadFile(outputPath) + if err == nil && 
len(data) > 0 { + return string(data) + } + return stdout +} + +type codexTelemetry struct { + SessionID string + Events []copilot.SessionEvent + Usage *models.UsageStats +} + +func (t codexTelemetry) FinalOutput() string { + for i := len(t.Events) - 1; i >= 0; i-- { + evt := t.Events[i] + if evt.Type == copilot.AssistantMessage && evt.Data.Content != nil { + return *evt.Data.Content + } + } + return "" +} + +type codexJSONEvent struct { + Type string `json:"type"` + ThreadID string `json:"thread_id"` + Item codexJSONItem `json:"item"` + Usage codexJSONUsage `json:"usage"` +} + +type codexJSONItem struct { + ID string `json:"id"` + Type string `json:"type"` + Text string `json:"text"` + Name string `json:"name"` + ToolName string `json:"tool_name"` + Command string `json:"command"` + AggregatedOutput string `json:"aggregated_output"` + Output string `json:"output"` + Status string `json:"status"` + ExitCode *int `json:"exit_code"` + Arguments any `json:"arguments"` + Changes []codexJSONFileChange `json:"changes"` + Extra map[string]interface{} `json:"-"` +} + +type codexJSONFileChange struct { + Path string `json:"path"` + Kind string `json:"kind"` +} + +type codexJSONUsage struct { + InputTokens int `json:"input_tokens"` + CachedInputTokens int `json:"cached_input_tokens"` + OutputTokens int `json:"output_tokens"` + ReasoningOutputTokens int `json:"reasoning_output_tokens"` +} + +func parseCodexJSONEvents(stdout string) codexTelemetry { + var telemetry codexTelemetry + for _, line := range strings.Split(stdout, "\n") { + line = strings.TrimSpace(line) + if line == "" || !strings.HasPrefix(line, "{") { + continue + } + + var event codexJSONEvent + if err := json.Unmarshal([]byte(line), &event); err != nil { + continue + } + + switch event.Type { + case "thread.started": + telemetry.SessionID = event.ThreadID + case "item.started": + if evt, ok := codexItemToSessionEvent(event.Item, false); ok { + telemetry.Events = append(telemetry.Events, evt) + } + case 
"item.completed": + if event.Item.Type == "agent_message" { + if event.Item.Text != "" { + text := event.Item.Text + telemetry.Events = append(telemetry.Events, copilot.SessionEvent{ + Type: copilot.AssistantMessage, + Data: copilot.Data{Content: &text}, + }) + } + continue + } + if evt, ok := codexItemToSessionEvent(event.Item, true); ok { + telemetry.Events = append(telemetry.Events, evt) + } + case "turn.completed": + usage := &models.UsageStats{ + Turns: 1, + InputTokens: event.Usage.InputTokens, + OutputTokens: event.Usage.OutputTokens, + CacheReadTokens: event.Usage.CachedInputTokens, + } + if !usage.IsZero() { + telemetry.Usage = usage + } + } + } + return telemetry +} + +func codexItemToSessionEvent(item codexJSONItem, completed bool) (copilot.SessionEvent, bool) { + toolName, args, resultText, ok := codexToolFields(item) + if !ok { + return copilot.SessionEvent{}, false + } + + toolCallID := item.ID + if toolCallID == "" { + toolCallID = fmt.Sprintf("%s-%s", item.Type, toolName) + } + + if !completed { + return copilot.SessionEvent{ + Type: copilot.ToolExecutionStart, + Data: copilot.Data{ + ToolCallID: &toolCallID, + ToolName: &toolName, + Arguments: args, + }, + }, true + } + + success := item.Status != "failed" + if item.ExitCode != nil && *item.ExitCode != 0 { + success = false + } + return copilot.SessionEvent{ + Type: copilot.ToolExecutionComplete, + Data: copilot.Data{ + ToolCallID: &toolCallID, + ToolName: &toolName, + Success: &success, + Result: &copilot.Result{ + Content: &resultText, + }, + }, + }, true +} + +func codexToolFields(item codexJSONItem) (string, any, string, bool) { + switch item.Type { + case "command_execution": + return "bash", map[string]any{"command": item.Command}, item.AggregatedOutput, true + case "file_change": + path := "" + kind := "" + if len(item.Changes) > 0 { + path = item.Changes[0].Path + kind = item.Changes[0].Kind + } + return "edit", map[string]any{"path": path, "command": kind}, item.Status, true + } + + if 
strings.Contains(item.Type, "tool") { + name := item.Name + if name == "" { + name = item.ToolName + } + if name == "" { + name = item.Type + } + result := item.Output + if result == "" { + result = item.AggregatedOutput + } + if result == "" { + result = item.Status + } + return name, item.Arguments, result, true + } + + return "", nil, "", false +} + +func skillDirsForRequest(cwd string, req *ExecutionRequest) []string { + skillDirs := []string{cwd} + seen := map[string]bool{cwd: true} + + for _, path := range req.SkillPaths { + if !seen[path] { + seen[path] = true + skillDirs = append(skillDirs, path) + } + } + + return cleanSkillDirs(skillDirs) +} + +func cleanSkillDirs(paths []string) []string { + cleaned := make([]string, 0, len(paths)) + for _, path := range paths { + if path == "" { + continue + } + cleaned = append(cleaned, filepath.Clean(path)) + } + return cleaned +} diff --git a/internal/execution/codex_test.go b/internal/execution/codex_test.go new file mode 100644 index 000000000..52f862b5e --- /dev/null +++ b/internal/execution/codex_test.go @@ -0,0 +1,195 @@ +package execution + +import ( + "context" + "os" + "path/filepath" + "runtime" + "strconv" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestCodexEngineExecuteUsesCLIWorkspaceAndSkillContext(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("fake codex shell script is POSIX-only") + } + + fakeCodex := writeFakeCodex(t, 0) + sourceDir := t.TempDir() + skillDir := filepath.Join(sourceDir, "skills", "demo") + require.NoError(t, os.MkdirAll(skillDir, 0o755)) + require.NoError(t, os.WriteFile(filepath.Join(skillDir, "SKILL.md"), []byte("---\nname: demo\n---\nAlways mention workspace facts."), 0o644)) + + engine := NewCodexEngine("test-model", WithCodexBinary(fakeCodex)) + require.NoError(t, engine.Initialize(context.Background())) + defer func() { + require.NoError(t, engine.Shutdown(context.Background())) + }() + + resp, err := 
engine.Execute(context.Background(), &ExecutionRequest{ + Message: "Inspect the fixture.", + ModelReasoningEffort: "high", + Resources: []ResourceFile{{Path: "input.txt", Content: []byte("fixture data")}}, + SourceDir: sourceDir, + SkillName: "demo", + TaskName: "Codex task", + TaskDescription: "Verify fake execution.", + SkillPaths: []string{filepath.Join(sourceDir, "skills")}, + Timeout: 10 * time.Second, + }) + + require.NoError(t, err) + require.True(t, resp.Success) + require.Equal(t, "final from fake codex", resp.FinalOutput) + require.Equal(t, "test-model", resp.ModelID) + require.Contains(t, resp.WorkspaceFiles, "created.txt") + require.Equal(t, []byte("fixture data"), resp.WorkspaceFiles["input.txt"]) + + prompt := string(resp.WorkspaceFiles["prompt.txt"]) + require.Contains(t, prompt, "") + require.Contains(t, prompt, "Always mention workspace facts.") + require.Contains(t, prompt, "Name: Codex task") + + args := string(resp.WorkspaceFiles["args.txt"]) + require.Contains(t, args, "--model test-model") + require.Contains(t, args, `approval_policy="never"`) + require.Contains(t, args, `model_reasoning_effort="high"`) + require.Contains(t, args, "--sandbox workspace-write") + require.NotContains(t, args, "--ephemeral") + require.Len(t, resp.ToolCalls, 1) + require.Equal(t, "bash", resp.ToolCalls[0].Name) + require.Equal(t, "codex-test-session", resp.SessionID) + require.NotNil(t, resp.Usage) + require.Equal(t, 12, resp.Usage.InputTokens) +} + +func TestCodexEngineExecuteReportsCLIError(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("fake codex shell script is POSIX-only") + } + + fakeCodex := writeFakeCodex(t, 7) + engine := NewCodexEngine("", WithCodexBinary(fakeCodex)) + require.NoError(t, engine.Initialize(context.Background())) + defer func() { + require.NoError(t, engine.Shutdown(context.Background())) + }() + + resp, err := engine.Execute(context.Background(), &ExecutionRequest{ + Message: "fail", + Timeout: 10 * time.Second, + }) + + 
require.NoError(t, err) + require.False(t, resp.Success) + require.Contains(t, resp.ErrorMsg, "fake codex failed") + require.Equal(t, "final from fake codex", resp.FinalOutput) +} + +func TestCodexEngineExecuteResumesSessionForFollowUp(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("fake codex shell script is POSIX-only") + } + + fakeCodex := writeFakeCodex(t, 0) + engine := NewCodexEngine("", WithCodexBinary(fakeCodex)) + require.NoError(t, engine.Initialize(context.Background())) + defer func() { + require.NoError(t, engine.Shutdown(context.Background())) + }() + + first, err := engine.Execute(context.Background(), &ExecutionRequest{ + Message: "Remember apple.", + Timeout: 10 * time.Second, + }) + require.NoError(t, err) + require.Equal(t, "codex-test-session", first.SessionID) + + second, err := engine.Execute(context.Background(), &ExecutionRequest{ + Message: "What did I ask you to remember?", + SessionID: first.SessionID, + WorkspaceDir: first.WorkspaceDir, + Timeout: 10 * time.Second, + }) + require.NoError(t, err) + + args := string(second.WorkspaceFiles["args.txt"]) + require.Contains(t, args, "exec resume") + require.Contains(t, args, "codex-test-session") + require.NotContains(t, args, "--ephemeral") + require.Equal(t, first.WorkspaceDir, second.WorkspaceDir) +} + +func TestCodexEngineExecuteRejectsSkillTriggerTelemetry(t *testing.T) { + fakeCodex := writeFakeCodex(t, 0) + engine := NewCodexEngine("", WithCodexBinary(fakeCodex)) + require.NoError(t, engine.Initialize(context.Background())) + defer func() { + require.NoError(t, engine.Shutdown(context.Background())) + }() + + _, err := engine.Execute(context.Background(), &ExecutionRequest{ + Message: "trigger?", + Timeout: 10 * time.Second, + CancelOnSkillInvocation: true, + }) + require.ErrorContains(t, err, "does not support skill invocation telemetry") +} + +func writeFakeCodex(t *testing.T, exitCode int) string { + t.Helper() + + dir := t.TempDir() + path := filepath.Join(dir, "codex") + 
script := `#!/bin/sh +set -u +work="" +out="" +args="" +while [ "$#" -gt 0 ]; do + case "$1" in + --cd) + work="$2" + args="$args $1 $2" + shift 2 + ;; + --output-last-message) + out="$2" + args="$args $1 $2" + shift 2 + ;; + *) + args="$args $1" + shift + ;; + esac +done +if [ -n "$work" ]; then + cd "$work" +fi +cat > prompt.txt +printf "%s" "$args" > args.txt +printf "created by fake codex" > created.txt +if [ -n "$out" ]; then + printf "final from fake codex" > "$out" +else + printf "final from fake codex" +fi +cat <<'JSON' +{"type":"thread.started","thread_id":"codex-test-session"} +{"type":"item.started","item":{"id":"item_0","type":"command_execution","command":"/bin/sh -c pwd","aggregated_output":"","exit_code":null,"status":"in_progress"}} +{"type":"item.completed","item":{"id":"item_0","type":"command_execution","command":"/bin/sh -c pwd","aggregated_output":"fake pwd\n","exit_code":0,"status":"completed"}} +{"type":"item.completed","item":{"id":"item_1","type":"agent_message","text":"final from fake codex"}} +{"type":"turn.completed","usage":{"input_tokens":12,"cached_input_tokens":3,"output_tokens":4,"reasoning_output_tokens":1}} +JSON +if [ ` + strconv.Itoa(exitCode) + ` -ne 0 ]; then + printf "fake codex failed\n" >&2 + exit ` + strconv.Itoa(exitCode) + ` +fi +` + require.NoError(t, os.WriteFile(path, []byte(script), 0o755)) + return path +} diff --git a/internal/execution/engine.go b/internal/execution/engine.go index acbd870e5..907e7d54a 100644 --- a/internal/execution/engine.go +++ b/internal/execution/engine.go @@ -36,10 +36,11 @@ type WorkspaceKeeper interface { // ExecutionRequest represents a test execution request type ExecutionRequest struct { - ModelID string - Message string - Context map[string]any - Resources []ResourceFile + ModelID string + ModelReasoningEffort string + Message string + Context map[string]any + Resources []ResourceFile SessionID string WorkspaceDir string // Reuse an existing workspace directory (for follow-up prompts) 
diff --git a/internal/models/spec.go b/internal/models/spec.go index 6f8c33545..de423bfb7 100644 --- a/internal/models/spec.go +++ b/internal/models/spec.go @@ -35,20 +35,21 @@ type SpecIdentity struct { // Config controls execution behavior type Config struct { - TrialsPerTask int `yaml:"trials_per_task" json:"runs_per_test"` - TimeoutSec int `yaml:"timeout_seconds" json:"timeout_sec"` - Concurrent bool `yaml:"parallel" json:"concurrent"` - Workers int `yaml:"workers,omitempty" json:"workers,omitempty"` - StopOnError bool `yaml:"fail_fast,omitempty" json:"stop_on_error,omitempty"` - EngineType string `yaml:"executor" json:"engine_type"` - ModelID string `yaml:"model" json:"model_id"` - SkillPaths []string `yaml:"skill_directories,omitempty" json:"skill_paths,omitempty"` - DisabledSkills []string `yaml:"disabled_skills,omitempty" json:"disabled_skills,omitempty"` - RequiredSkills []string `yaml:"required_skills,omitempty" json:"required_skills,omitempty"` - ServerConfigs map[string]any `yaml:"mcp_servers,omitempty" json:"server_configs,omitempty"` - MaxAttempts int `yaml:"max_attempts,omitempty" json:"max_attempts,omitempty"` - GroupBy string `yaml:"group_by,omitempty" json:"group_by,omitempty"` - JudgeModel string `yaml:"judge_model,omitempty" json:"judge_model,omitempty"` + TrialsPerTask int `yaml:"trials_per_task" json:"runs_per_test"` + TimeoutSec int `yaml:"timeout_seconds" json:"timeout_sec"` + Concurrent bool `yaml:"parallel" json:"concurrent"` + Workers int `yaml:"workers,omitempty" json:"workers,omitempty"` + StopOnError bool `yaml:"fail_fast,omitempty" json:"stop_on_error,omitempty"` + EngineType string `yaml:"executor" json:"engine_type"` + ModelID string `yaml:"model" json:"model_id"` + ModelReasoningEffort string `yaml:"model_reasoning_effort,omitempty" json:"model_reasoning_effort,omitempty"` + SkillPaths []string `yaml:"skill_directories,omitempty" json:"skill_paths,omitempty"` + DisabledSkills []string `yaml:"disabled_skills,omitempty" 
json:"disabled_skills,omitempty"` + RequiredSkills []string `yaml:"required_skills,omitempty" json:"required_skills,omitempty"` + ServerConfigs map[string]any `yaml:"mcp_servers,omitempty" json:"server_configs,omitempty"` + MaxAttempts int `yaml:"max_attempts,omitempty" json:"max_attempts,omitempty"` + GroupBy string `yaml:"group_by,omitempty" json:"group_by,omitempty"` + JudgeModel string `yaml:"judge_model,omitempty" json:"judge_model,omitempty"` } // GraderConfig defines a validator/grader diff --git a/internal/models/spec_test.go b/internal/models/spec_test.go index db0a437ba..25ce0e0f6 100644 --- a/internal/models/spec_test.go +++ b/internal/models/spec_test.go @@ -364,6 +364,32 @@ config: }) } +func TestEvalSpec_ModelReasoningEffort(t *testing.T) { + tempDir := t.TempDir() + yamlContent := `name: reasoning-test +skill: test +config: + trials_per_task: 1 + timeout_seconds: 60 + executor: codex + model: gpt-4o + model_reasoning_effort: high +` + specPath := filepath.Join(tempDir, "reasoning.yaml") + if err := os.WriteFile(specPath, []byte(yamlContent), 0644); err != nil { + t.Fatalf("Failed to write spec file: %v", err) + } + + spec, err := LoadEvalSpec(specPath) + if err != nil { + t.Fatalf("Failed to load spec: %v", err) + } + + if spec.Config.ModelReasoningEffort != "high" { + t.Errorf("Expected model_reasoning_effort='high', got '%s'", spec.Config.ModelReasoningEffort) + } +} + func TestConfig_AllSkillsDisabled(t *testing.T) { tests := []struct { name string diff --git a/internal/orchestration/runner.go b/internal/orchestration/runner.go index f9eaf731a..6d3ecb159 100644 --- a/internal/orchestration/runner.go +++ b/internal/orchestration/runner.go @@ -1158,16 +1158,18 @@ func (r *EvalRunner) buildExecutionRequest(tc *models.TestCase) *execution.Execu noSkills := spec.Config.AllSkillsDisabled() return &execution.ExecutionRequest{ - Message: tc.Stimulus.Message, - Context: tc.Stimulus.Metadata, - Resources: resources, - SkillName: spec.SkillName, - TaskName: 
tc.DisplayName, - TaskDescription: tc.Summary, - SkillPaths: resolvedSkillPaths, - NoSkills: noSkills, - Timeout: time.Duration(timeout) * time.Second, - MCPServers: convertMCPServers(spec.Config.ServerConfigs), + ModelID: spec.Config.ModelID, + ModelReasoningEffort: spec.Config.ModelReasoningEffort, + Message: tc.Stimulus.Message, + Context: tc.Stimulus.Metadata, + Resources: resources, + SkillName: spec.SkillName, + TaskName: tc.DisplayName, + TaskDescription: tc.Summary, + SkillPaths: resolvedSkillPaths, + NoSkills: noSkills, + Timeout: time.Duration(timeout) * time.Second, + MCPServers: convertMCPServers(spec.Config.ServerConfigs), } } diff --git a/internal/projectconfig/config.go b/internal/projectconfig/config.go index cac235e6e..7ec5fb579 100644 --- a/internal/projectconfig/config.go +++ b/internal/projectconfig/config.go @@ -56,14 +56,15 @@ type PathsConfig struct { // DefaultsConfig holds default execution parameters. type DefaultsConfig struct { - Engine string `yaml:"engine,omitempty"` - Model string `yaml:"model,omitempty"` - JudgeModel string `yaml:"judgeModel,omitempty"` - Timeout int `yaml:"timeout,omitempty"` - Parallel *bool `yaml:"parallel,omitempty"` - Workers int `yaml:"workers,omitempty"` - Verbose *bool `yaml:"verbose,omitempty"` - SessionLog *bool `yaml:"sessionLog,omitempty"` + Engine string `yaml:"engine,omitempty"` + Model string `yaml:"model,omitempty"` + ModelReasoningEffort string `yaml:"model_reasoning_effort,omitempty"` + JudgeModel string `yaml:"judgeModel,omitempty"` + Timeout int `yaml:"timeout,omitempty"` + Parallel *bool `yaml:"parallel,omitempty"` + Workers int `yaml:"workers,omitempty"` + Verbose *bool `yaml:"verbose,omitempty"` + SessionLog *bool `yaml:"sessionLog,omitempty"` } // CacheConfig holds cache settings. 
@@ -146,14 +147,15 @@ func New() *ProjectConfig { Results: DefaultResultsDir, }, Defaults: DefaultsConfig{ - Engine: DefaultEngine, - Model: DefaultModel, - JudgeModel: "", - Timeout: DefaultTimeout, - Parallel: boolPtr(false), - Workers: DefaultWorkers, - Verbose: boolPtr(false), - SessionLog: boolPtr(false), + Engine: DefaultEngine, + Model: DefaultModel, + ModelReasoningEffort: "", + JudgeModel: "", + Timeout: DefaultTimeout, + Parallel: boolPtr(false), + Workers: DefaultWorkers, + Verbose: boolPtr(false), + SessionLog: boolPtr(false), }, Cache: CacheConfig{ Enabled: boolPtr(false), @@ -206,9 +208,17 @@ func Load(startDir string) (*ProjectConfig, error) { if err := decoder.Decode(&fileCfg); err != nil { return nil, fmt.Errorf("parsing .waza.yaml: %w", err) } + defaultsModelSet := hasDefaultsField(data, "model") // Merge file values onto defaults. mergeConfig(cfg, &fileCfg) + if defaultsModelSet { + // An explicit empty model is meaningful for engines such as codex, where + // the underlying tool can read its default model from its own config. 
+ cfg.Defaults.Model = fileCfg.Defaults.Model + } else if fileCfg.Defaults.Engine == "codex" { + cfg.Defaults.Model = "" + } return cfg, nil } @@ -262,6 +272,9 @@ func mergeConfig(dst, src *ProjectConfig) { if src.Defaults.Model != "" { dst.Defaults.Model = src.Defaults.Model } + if src.Defaults.ModelReasoningEffort != "" { + dst.Defaults.ModelReasoningEffort = src.Defaults.ModelReasoningEffort + } if src.Defaults.JudgeModel != "" { dst.Defaults.JudgeModel = src.Defaults.JudgeModel } @@ -340,3 +353,26 @@ func mergeConfig(dst, src *ProjectConfig) { func boolPtr(b bool) *bool { return &b } + +func hasDefaultsField(data []byte, field string) bool { + var root yaml.Node + if err := yaml.Unmarshal(data, &root); err != nil { + return false + } + if len(root.Content) == 0 || root.Content[0].Kind != yaml.MappingNode { + return false + } + top := root.Content[0] + for i := 0; i+1 < len(top.Content); i += 2 { + if top.Content[i].Value != "defaults" || top.Content[i+1].Kind != yaml.MappingNode { + continue + } + defaults := top.Content[i+1] + for j := 0; j+1 < len(defaults.Content); j += 2 { + if defaults.Content[j].Value == field { + return true + } + } + } + return false +} diff --git a/internal/projectconfig/config_test.go b/internal/projectconfig/config_test.go index 47b497990..04b19d4bc 100644 --- a/internal/projectconfig/config_test.go +++ b/internal/projectconfig/config_test.go @@ -17,6 +17,7 @@ func TestNew_ReturnsAllDefaults(t *testing.T) { // Defaults assertEqual(t, "Defaults.Engine", "copilot-sdk", cfg.Defaults.Engine) assertEqual(t, "Defaults.Model", "claude-sonnet-4.6", cfg.Defaults.Model) + assertEqual(t, "Defaults.ModelReasoningEffort", "", cfg.Defaults.ModelReasoningEffort) assertEqual(t, "Defaults.JudgeModel", "", cfg.Defaults.JudgeModel) assertEqualInt(t, "Defaults.Timeout", 300, cfg.Defaults.Timeout) assertBoolPtr(t, "Defaults.Parallel", false, cfg.Defaults.Parallel) @@ -58,6 +59,7 @@ paths: defaults: engine: mock model: gpt-4o + model_reasoning_effort: 
high judgeModel: claude-sonnet-4.6 timeout: 600 parallel: true @@ -96,6 +98,7 @@ graders: assertEqual(t, "Paths.Results", "custom-results/", cfg.Paths.Results) assertEqual(t, "Defaults.Engine", "mock", cfg.Defaults.Engine) assertEqual(t, "Defaults.Model", "gpt-4o", cfg.Defaults.Model) + assertEqual(t, "Defaults.ModelReasoningEffort", "high", cfg.Defaults.ModelReasoningEffort) assertEqual(t, "Defaults.JudgeModel", "claude-sonnet-4.6", cfg.Defaults.JudgeModel) assertEqualInt(t, "Defaults.Timeout", 600, cfg.Defaults.Timeout) assertBoolPtr(t, "Defaults.Parallel", true, cfg.Defaults.Parallel) @@ -148,6 +151,39 @@ defaults: assertEqualInt(t, "Graders.ProgramTimeout", 30, cfg.Graders.ProgramTimeout) } +func TestLoad_CodexDefaultsAllowConfigModel(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, ".waza.yaml", ` +defaults: + engine: codex +`) + + cfg, err := Load(dir) + if err != nil { + t.Fatalf("Load() error: %v", err) + } + + assertEqual(t, "Defaults.Engine", "codex", cfg.Defaults.Engine) + assertEqual(t, "Defaults.Model", "", cfg.Defaults.Model) +} + +func TestLoad_ExplicitEmptyModelOverridesDefault(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, ".waza.yaml", ` +defaults: + engine: codex + model: "" +`) + + cfg, err := Load(dir) + if err != nil { + t.Fatalf("Load() error: %v", err) + } + + assertEqual(t, "Defaults.Engine", "codex", cfg.Defaults.Engine) + assertEqual(t, "Defaults.Model", "", cfg.Defaults.Model) +} + func TestLoad_MissingFile_ReturnsDefaults(t *testing.T) { dir := t.TempDir() diff --git a/internal/projectconfig/schema_parity_test.go b/internal/projectconfig/schema_parity_test.go index 7487df360..69961ec5b 100644 --- a/internal/projectconfig/schema_parity_test.go +++ b/internal/projectconfig/schema_parity_test.go @@ -65,6 +65,7 @@ func TestSchemaDefaultsMatchGoDefaults(t *testing.T) { // --- defaults --- assertStringDefault(t, getDefault("defaults", "engine"), cfg.Defaults.Engine, "defaults.engine") assertStringDefault(t, 
getDefault("defaults", "model"), cfg.Defaults.Model, "defaults.model") + assertStringDefault(t, getDefault("defaults", "model_reasoning_effort"), cfg.Defaults.ModelReasoningEffort, "defaults.model_reasoning_effort") assertIntDefault(t, getDefault("defaults", "timeout"), cfg.Defaults.Timeout, "defaults.timeout") assertBoolDefault(t, getDefault("defaults", "parallel"), *cfg.Defaults.Parallel, "defaults.parallel") assertIntDefault(t, getDefault("defaults", "workers"), cfg.Defaults.Workers, "defaults.workers") diff --git a/internal/scaffold/scaffold.go b/internal/scaffold/scaffold.go index 838c4e23a..ca2d0f5f5 100644 --- a/internal/scaffold/scaffold.go +++ b/internal/scaffold/scaffold.go @@ -44,7 +44,8 @@ func TitleCase(s string) string { } // ReadProjectDefaults reads engine and model from .waza.yaml if it exists. -// Falls back to copilot-sdk and claude-sonnet-4.6. +// Falls back to copilot-sdk and claude-sonnet-4.6. Codex projects may return +// an empty model so the Codex CLI can use ~/.codex/config.toml. func ReadProjectDefaults() (engine, model string) { dir, err := os.Getwd() if err != nil { @@ -59,6 +60,11 @@ func ReadProjectDefaults() (engine, model string) { // EvalYAML returns a default eval.yaml template for the given skill name. func EvalYAML(name, engine, model string) string { + modelLine := "" + if model != "" { + modelLine = fmt.Sprintf(" model: %s\n", model) + } + return fmt.Sprintf(`name: %s-eval description: Evaluation suite for %s. skill: %s @@ -68,7 +74,7 @@ config: timeout_seconds: 300 parallel: false executor: %s - model: %s +%s metrics: - name: task_completion weight: 1.0 @@ -87,7 +93,7 @@ graders: - "(?i)(explain|describe|analyze|implement)" tasks: - "tasks/*.yaml" -`, name, name, name, engine, model) +`, name, name, name, engine, modelLine) } // TaskFiles returns a map of task filename to content. 
diff --git a/internal/scaffold/scaffold_test.go b/internal/scaffold/scaffold_test.go index 651651c81..4b89eaea0 100644 --- a/internal/scaffold/scaffold_test.go +++ b/internal/scaffold/scaffold_test.go @@ -81,6 +81,13 @@ func TestEvalYAML_CustomEngine(t *testing.T) { assert.Contains(t, content, "model: gpt-4o") } +func TestEvalYAML_OmitsEmptyModel(t *testing.T) { + content := EvalYAML("my-skill", "codex", "") + + assert.Contains(t, content, "executor: codex") + assert.NotContains(t, content, "model:") +} + func TestTaskFiles(t *testing.T) { tasks := TaskFiles("my-skill") diff --git a/internal/validation/schema_test.go b/internal/validation/schema_test.go index 874b05f7e..f22cd64e9 100644 --- a/internal/validation/schema_test.go +++ b/internal/validation/schema_test.go @@ -56,6 +56,26 @@ func TestValidateEvalBytes_Valid(t *testing.T) { require.Empty(t, errs, "valid eval should have no errors") } +func TestValidateEvalBytes_CodexModelOptional(t *testing.T) { + yaml := `name: test-eval +description: Test evaluation +skill: test-skill +version: "1.0" +config: + trials_per_task: 1 + timeout_seconds: 60 + executor: codex +metrics: + - name: accuracy + weight: 1.0 + threshold: 0.8 +tasks: + - "tasks/*.yaml" +` + errs := ValidateEvalBytes([]byte(yaml)) + require.Empty(t, errs, "codex eval should allow model to come from ~/.codex/config.toml") +} + func TestValidateEvalBytes_Invalid(t *testing.T) { errs := ValidateEvalBytes([]byte(invalidEvalYAML)) require.NotEmpty(t, errs, "invalid eval should have errors") diff --git a/schemas/config.schema.json b/schemas/config.schema.json index a413eee22..45cb270e1 100644 --- a/schemas/config.schema.json +++ b/schemas/config.schema.json @@ -31,19 +31,25 @@ "type": "object", "description": "Default values applied to evaluations. 
Used as fallbacks by 'waza run' and as defaults for 'waza new'.", "properties": { - "engine": { - "type": "string", - "description": "Execution engine for evaluations.", - "enum": ["copilot-sdk", "mock"], - "default": "copilot-sdk" - }, - "model": { - "type": "string", - "description": "Default model identifier for evaluations.", - "examples": ["claude-sonnet-4.6", "gpt-4o", "claude-sonnet-4"], - "default": "claude-sonnet-4.6" - }, - "judgeModel": { + "engine": { + "type": "string", + "description": "Execution engine for evaluations. Use 'codex' to run through the local Codex CLI and its ~/.codex config/auth.", + "enum": ["copilot-sdk", "codex", "mock"], + "default": "copilot-sdk" + }, + "model": { + "type": "string", + "description": "Default model identifier for evaluations. Optional for the 'codex' engine when ~/.codex/config.toml provides the model.", + "examples": ["claude-sonnet-4.6", "gpt-4o", "claude-sonnet-4"], + "default": "claude-sonnet-4.6" + }, + "model_reasoning_effort": { + "type": "string", + "description": "Codex model reasoning effort passed as a Codex config override. Valid values depend on the Codex CLI/model; commonly: none, minimal, low, medium, high, xhigh.", + "enum": ["none", "minimal", "low", "medium", "high", "xhigh"], + "default": "" + }, + "judgeModel": { "type": "string", "description": "Model used for LLM-as-judge grading. If omitted, uses the same model as the evaluation.", "examples": ["claude-sonnet-4.6", "gpt-4o"] diff --git a/schemas/eval.schema.json b/schemas/eval.schema.json index b5b355242..98b675743 100644 --- a/schemas/eval.schema.json +++ b/schemas/eval.schema.json @@ -95,8 +95,7 @@ "required": [ "trials_per_task", "timeout_seconds", - "executor", - "model" + "executor" ], "additionalProperties": false, "description": "Execution configuration for the evaluation.", @@ -130,20 +129,33 @@ "type": "string", "enum": [ "copilot-sdk", + "codex", "mock" ], - "description": "Execution engine to use. 
'copilot-sdk' for real evaluations, 'mock' for testing." + "description": "Execution engine to use. 'copilot-sdk' for GitHub Copilot SDK evaluations, 'codex' for local Codex CLI evaluations, 'mock' for testing." }, "model": { "type": "string", "minLength": 1, - "description": "Default model identifier for evaluations.", + "description": "Default model identifier for evaluations. Optional for the 'codex' executor when ~/.codex/config.toml provides the model.", "examples": [ "gpt-4o", "claude-sonnet-4-20250514", "gpt-4o-mini" ] }, + "model_reasoning_effort": { + "type": "string", + "enum": [ + "none", + "minimal", + "low", + "medium", + "high", + "xhigh" + ], + "description": "Codex model reasoning effort passed as a Codex config override." + }, "max_attempts": { "type": "integer", "minimum": 1, diff --git a/site/src/content/docs/guides/eval-yaml.mdx b/site/src/content/docs/guides/eval-yaml.mdx index e9b1bfa5d..3aa6b1464 100644 --- a/site/src/content/docs/guides/eval-yaml.mdx +++ b/site/src/content/docs/guides/eval-yaml.mdx @@ -97,8 +97,9 @@ config: parallel: false # Run tasks sequentially (true = concurrent) workers: 4 # Parallel workers if parallel: true model: claude-sonnet-4.6 # Default model (override with --model) + model_reasoning_effort: high # Codex-only reasoning effort override judge_model: gpt-4o # Model for LLM-as-judge graders (optional) - executor: mock # mock (local) or copilot-sdk (real API) + executor: mock # mock (local), copilot-sdk (GitHub Copilot), or codex (local Codex CLI) ``` | Field | Type | Default | Description | @@ -107,9 +108,10 @@ config: | `timeout_seconds` | int | 300 | Task timeout in seconds | | `parallel` | bool | false | Run tasks concurrently | | `workers` | int | 4 | Number of parallel workers | -| `model` | string | _required_ | Default model for tasks (override with `--model` flag) | +| `model` | string | _required_ | Default model for tasks (override with `--model` flag). 
Optional for `codex` when `~/.codex/config.toml` provides the model. | +| `model_reasoning_effort` | string | — | Codex reasoning effort passed via `-c model_reasoning_effort=...`; commonly `none`, `minimal`, `low`, `medium`, `high`, or `xhigh` | | `judge_model` | string | (same as `model`) | Model for `prompt`-type graders (LLM-as-judge) | -| `executor` | string | `copilot-sdk` | Executor: `mock` (local, echoes task metadata and file content) or `copilot-sdk` (real API) | +| `executor` | string | `copilot-sdk` | Executor: `mock` (local, echoes task metadata and file content), `copilot-sdk` (GitHub Copilot SDK), or `codex` (local Codex CLI using `~/.codex` config/auth). | | `max_attempts` | int | 0 | Maximum retry attempts per task on failure (0 = no retries) | | `group_by` | string | — | Group results by a field (e.g., `tags`, `task_id`) | | `fail_fast` | bool | false | Stop the entire run on first task failure | diff --git a/site/src/content/docs/reference/schema.mdx b/site/src/content/docs/reference/schema.mdx index 612d0312e..586fa6550 100644 --- a/site/src/content/docs/reference/schema.mdx +++ b/site/src/content/docs/reference/schema.mdx @@ -145,18 +145,31 @@ config: **Type:** string **Default:** (empty) -Default LLM model. Override with `--model` flag. +Default LLM model. Override with `--model` flag. Optional for `codex` when `~/.codex/config.toml` provides the model. ```yaml config: model: claude-sonnet-4.6 ``` +### model_reasoning_effort + +**Type:** string +**Default:** (empty) + +Codex reasoning effort passed through as `-c model_reasoning_effort=...`. Common values are `none`, `minimal`, `low`, `medium`, `high`, and `xhigh`. 
+ +```yaml +config: + executor: codex + model_reasoning_effort: high +``` + ### executor **Type:** string **Default:** mock -**Options:** `mock`, `copilot-sdk` +**Options:** `mock`, `copilot-sdk`, `codex` Execution engine: @@ -609,6 +622,8 @@ paths: # Model defaults defaults: - model: claude-sonnet-4.6 + engine: codex + model: gpt-4o + model_reasoning_effort: high timeout: 300 workers: 4 diff --git a/site/src/content/docs/reference/waza-yaml.mdx b/site/src/content/docs/reference/waza-yaml.mdx index 57f408313..896e89aaa 100644 --- a/site/src/content/docs/reference/waza-yaml.mdx +++ b/site/src/content/docs/reference/waza-yaml.mdx @@ -15,7 +15,9 @@ paths: results: results/ defaults: - model: claude-sonnet-4.6 + engine: codex + model: gpt-4o + model_reasoning_effort: high timeout: 300 workers: 4 ``` @@ -38,8 +40,9 @@ Default execution parameters applied to all commands unless overridden by CLI fl | Field | Type | Default | Description | |-------|------|---------|-------------| -| `engine` | string | `copilot-sdk` | Execution engine | -| `model` | string | `claude-sonnet-4.6` | Default model for execution | +| `engine` | string | `copilot-sdk` | Execution engine: `copilot-sdk`, `codex`, or `mock` | +| `model` | string | `claude-sonnet-4.6` | Default model for execution. Optional for `codex` when `~/.codex/config.toml` provides it | +| `model_reasoning_effort` | string | | Codex reasoning effort passed via `-c model_reasoning_effort=...`; commonly `none`, `minimal`, `low`, `medium`, `high`, or `xhigh` | | `judgeModel` | string | | Model for LLM-as-judge graders | | `timeout` | int | `300` | Task timeout in seconds | | `parallel` | bool | `false` | Enable parallel execution |