Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 21 additions & 3 deletions cmd/entire/cli/review/manifest_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,13 @@ const manifestTokenTestAgentType agenttypes.AgentType = "Review Token Test"

func TestHydrateReviewSummaryTokensFromStates_PopulatesTokensFromSessionState(t *testing.T) {
t.Parallel()
started := time.Date(2026, 5, 8, 10, 0, 0, 0, time.UTC)
// Time-relative so this test doesn't go stale: session.StateStore.Load
// auto-deletes sessions whose StartedAt is older than 7 days
// (StaleSessionThreshold), and a hardcoded fixed date silently starts
// failing once the calendar clock crosses that threshold. Use "an hour
// ago" so we exercise the 5-second jitter check inside
// matchReviewSessionState while staying well inside the staleness window.
started := time.Now().UTC().Add(-time.Hour)
summary := reviewtypes.RunSummary{
StartedAt: started,
AgentRuns: []reviewtypes.AgentRun{
Expand Down Expand Up @@ -67,7 +73,13 @@ func TestHydrateReviewSummaryTokensFromStates_FallsBackToTranscript(t *testing.T
return manifestTokenTestAgent{}, nil
}

started := time.Date(2026, 5, 8, 10, 0, 0, 0, time.UTC)
// Time-relative so this test doesn't go stale: session.StateStore.Load
// auto-deletes sessions whose StartedAt is older than 7 days
// (StaleSessionThreshold), and a hardcoded fixed date silently starts
// failing once the calendar clock crosses that threshold. Use "an hour
// ago" so we exercise the 5-second jitter check inside
// matchReviewSessionState while staying well inside the staleness window.
started := time.Now().UTC().Add(-time.Hour)
tmp := t.TempDir()
transcriptPath := filepath.Join(tmp, "review.jsonl")
transcript := "review transcript\n"
Expand Down Expand Up @@ -112,7 +124,13 @@ func TestReviewSummaryTokenEnricher_LoadsCurrentSessionState(t *testing.T) {
if err != nil {
t.Fatalf("NewStateStore: %v", err)
}
started := time.Date(2026, 5, 8, 10, 0, 0, 0, time.UTC)
// Time-relative so this test doesn't go stale: session.StateStore.Load
// auto-deletes sessions whose StartedAt is older than 7 days
// (StaleSessionThreshold), and a hardcoded fixed date silently starts
// failing once the calendar clock crosses that threshold. Use "an hour
// ago" so we exercise the 5-second jitter check inside
// matchReviewSessionState while staying well inside the staleness window.
started := time.Now().UTC().Add(-time.Hour)
if err := store.Save(ctx, &session.State{
SessionID: "codex-session-token",
Kind: session.KindAgentReview,
Expand Down
164 changes: 155 additions & 9 deletions redact/redact.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,16 @@ var credentialedURIPattern = regexp.MustCompile(`(?i)\b[a-z][a-z0-9+.-]{1,31}://
// compose both the env-var assignment regex and the JSON-key regex so the
// vendor list stays in one place.
const dbPasswordKeyShape = `(?:db|database|pg|postgres|postgresql|mysql|mariadb|redis|mongo|mongodb|sqlserver|mssql|jdbc)(?:[_-]+[a-z0-9]+)*[_-]*(?:password|passwd|pwd)` //nolint:gosec // regex literal, not a credential
// secretValueKeyShape matches credential-style key names. Two alternation
// branches:
// 1. Explicit shapes (api_key, auth_token, client_secret, …) with an
// optional prefix.
// 2. Bare `secret`/`token` ONLY when preceded by at least one prefix segment
// (e.g. csrf_token, auth_secret). Standalone `{"token":"…"}` /
// `{"secret":"…"}` JSON fields are intentionally NOT matched here —
// entropy + Betterleaks cover real high-entropy values, and bare
// `token`/`secret` keys appear in many non-credential contexts.
const secretValueKeyShape = `(?:(?:[a-z0-9]+[_-]+)*(?:api[_-]*key|auth[_-]*token|access[_-]*token|refresh[_-]*token|client[_-]*secret|consumer[_-]*secret|secret[_-]*key|private[_-]*key|ssh[_-]*key)|(?:[a-z0-9]+[_-]+)+(?:secret|token))` //nolint:gosec // regex literal, not a credential

var (
jdbcPattern = regexp.MustCompile(`(?i)\bjdbc:[^\s"'<>` + "`" + `]+`)
Expand All @@ -40,6 +50,14 @@ var (
// boundary, so APP_DB_PASSWORD matches via the leading `_` but mydbpassword
// does not.
credentialValuePattern = regexp.MustCompile(`(?i)(?:^|[^A-Za-z0-9])(` + dbPasswordKeyShape + `)\s*=\s*("[^"]*"|'[^']*'|[^\s,;&]+)`)
// secretValuePattern targets shell/env-var assignments (`KEY=value`). It uses
// `=` only, not `:`, to avoid colliding with English prose ("the token: foo")
// in transcripts. YAML/JSON `key: value` shapes are handled by the JSON-aware
// path (`secretJSONKeyRegex`) which sees structured boundaries, not text.
secretValuePattern = regexp.MustCompile(`(?i)(?:^|[^A-Za-z0-9])(` + secretValueKeyShape + `)\s*=\s*("[^"]*"|'[^']*'|[^\s,;&]+)`)
shellStdinSecretPattern = regexp.MustCompile(
`(?is)\bprintf\s+(?:(?:%[A-Za-z]|'%-?[0-9.]*[A-Za-z](?:\\n)?'|"%-?[0-9.]*[A-Za-z](?:\\n)?")\s+)?("[^"\r\n]{1,512}"|'[^'\r\n]{1,512}'|[^\s|&;]{1,512})\s*\|\s*[^&;\r\n]{0,300}\bsecrets?\s+(?:put|set|add|create)\s+` + secretValueKeyShape + `\b`,
)

keywordHostPattern = regexp.MustCompile(`(?i)(?:^|\s)host=`)
keywordUserPattern = regexp.MustCompile(`(?i)(?:^|\s)user=`)
Expand All @@ -49,6 +67,7 @@ var (
// credentialJSONKeyRegex operates on output of normalizeCredentialJSONKey
// (already lowercased, `-`/` `/`.` → `_`), so the `(?i)` flag is unnecessary.
credentialJSONKeyRegex = regexp.MustCompile(`^` + dbPasswordKeyShape + `$`)
secretJSONKeyRegex = regexp.MustCompile(`^` + secretValueKeyShape + `$`)
genericPasswordKeyRegex = regexp.MustCompile(`(?i)^(?:password|passwd|pwd)$`)
)

Expand All @@ -61,6 +80,25 @@ const entropyThreshold = 4.5
// RedactedPlaceholder is the replacement text used for redacted secrets.
const RedactedPlaceholder = "REDACTED"

// nonSecretTokenPrefixes lists key-name prefixes that, when followed by
// `_token` or `_secret`, denote routinely-non-sensitive debug or pagination
// identifiers (e.g. cancel_token, pagination_token). These are excluded from
// JSON-key redaction so they remain readable in transcripts. Real
// high-entropy values still get caught by the entropy detector / Betterleaks
// via the per-value String() call in collectJSONLReplacements.
var nonSecretTokenPrefixes = map[string]struct{}{
"cancel": {},
"continuation": {},
"cursor": {},
"idempotency": {},
"next": {},
"page": {},
"pagination": {},
"prev": {},
"previous": {},
"sync": {},
}

// placeholderSecretValues lists lowercase values that should be treated as
// non-secrets when they appear as a credential value: prior redactions
// (REDACTED / [REDACTED] / <REDACTED>), common documentation placeholders,
Expand Down Expand Up @@ -160,8 +198,9 @@ var connectionStringRules = []connectionStringRule{
// 3. Credentialed URIs: URLs containing userinfo passwords
// 4. Database connection strings: JDBC, keyword DSNs, and semicolon strings
// 5. User-defined custom rules: configured via ConfigureCustomRules
// 6. Bounded credential key/value pairs: DB_PASSWORD=...
// 7. PII detection: email, phone, address patterns (only when configured via ConfigurePII)
// 6. Shell stdin literals piped into secret-management commands
// 7. Bounded credential key/value pairs: DB_PASSWORD=..., GITHUB_CLIENT_SECRET=...
// 8. PII detection: email, phone, address patterns (only when configured via ConfigurePII)
// A string is redacted if ANY method flags it.
func String(s string) string {
var regions []taggedRegion
Expand All @@ -186,6 +225,10 @@ func String(s string) string {
}
}

if isNonSecretIdentifierAssignment(s[start:end]) {
continue
}

if shannonEntropy(s[start:end]) > entropyThreshold {
regions = append(regions, taggedRegion{region: region{start, end}})
}
Expand Down Expand Up @@ -221,10 +264,13 @@ func String(s string) string {
// 5. User-defined custom rules (secrets — only runs when configured).
regions = append(regions, detectCustomRules(getCustomRulesConfig(), s)...)

// 6. Bounded credential key/value detection (secrets — always on).
// 6. Shell stdin literals piped into secret-management commands.
regions = append(regions, detectShellStdinSecrets(s)...)

// 7. Bounded credential key/value detection (secrets — always on).
regions = append(regions, detectCredentialValues(s)...)

// 7. PII detection (opt-in — only runs when configured).
// 8. PII detection (opt-in — only runs when configured).
regions = append(regions, detectPII(getPIIConfig(), s)...)

if len(regions) == 0 {
Expand Down Expand Up @@ -343,20 +389,40 @@ func hasSemicolonConnectionPassword(candidate string) bool {
hasNonPlaceholderPasswordAssignment(candidate)
}

func detectCredentialValues(s string) []taggedRegion {
func detectShellStdinSecrets(s string) []taggedRegion {
var regions []taggedRegion
for _, loc := range credentialValuePattern.FindAllStringSubmatchIndex(s, -1) {
if len(loc) < 6 || loc[4] < 0 || loc[5] < 0 {
for _, loc := range shellStdinSecretPattern.FindAllStringSubmatchIndex(s, -1) {
if len(loc) < 4 || loc[2] < 0 || loc[3] < 0 {
continue
}
start, end := unquoteRange(s, loc[4], loc[5])
start, end := unquoteRange(s, loc[2], loc[3])
if hasNonPlaceholderPasswordValue(s[start:end]) {
regions = append(regions, taggedRegion{region: region{start, end}})
}
}
return regions
}

func detectCredentialValues(s string) []taggedRegion {
patterns := []*regexp.Regexp{
credentialValuePattern,
secretValuePattern,
}
var regions []taggedRegion
for _, pattern := range patterns {
for _, loc := range pattern.FindAllStringSubmatchIndex(s, -1) {
if len(loc) < 6 || loc[4] < 0 || loc[5] < 0 {
continue
}
start, end := unquoteRange(s, loc[4], loc[5])
if hasNonPlaceholderPasswordValue(s[start:end]) {
regions = append(regions, taggedRegion{region: region{start, end}})
}
}
}
return regions
}

func unquoteRange(s string, start, end int) (int, int) {
if end-start < 2 {
return start, end
Expand Down Expand Up @@ -396,12 +462,31 @@ func isPlaceholderSecretValue(value string) bool {
if strings.HasPrefix(normalized, "${") && strings.HasSuffix(normalized, "}") {
return true
}
if isShellVariableReference(trimmed) {
return true
}
if _, ok := placeholderSecretValues[normalized]; ok {
return true
}
return isRepeatedCharPlaceholder(normalized)
}

func isShellVariableReference(s string) bool {
if len(s) < 2 || s[0] != '$' {
return false
}
if s[1] != '_' && (s[1] < 'A' || s[1] > 'Z') && (s[1] < 'a' || s[1] > 'z') {
return false
}
for i := 2; i < len(s); i++ {
c := s[i]
if c != '_' && (c < 'A' || c > 'Z') && (c < 'a' || c > 'z') && (c < '0' || c > '9') {
return false
}
}
return true
}

// bracketedPlaceholderInteriorRE matches the inside of a "<…>" placeholder
// shape: lowercase letters joined by `-` or `_`. Digits, mixed case, and
// special chars are rejected so values like `<hunter2>` or `<RealPassword>`
Expand Down Expand Up @@ -442,12 +527,63 @@ func isRepeatedCharPlaceholder(s string) bool {

func isCredentialJSONSecretKey(key string, credentialContext bool) bool {
normalized := normalizeCredentialJSONKey(key)
if credentialJSONKeyRegex.MatchString(normalized) {
if isKnownNonSecretTokenKey(normalized) {
return false
}
if isSensitiveNormalizedJSONValueKey(normalized) {
return true
}
return credentialContext && genericPasswordKeyRegex.MatchString(normalized)
}

// isKnownNonSecretTokenKey reports whether the normalized JSON key is
// `<prefix>_token` or `<prefix>_secret` where prefix is in
// nonSecretTokenPrefixes. Used to short-circuit JSON-key redaction for keys
// like `cancel_token`, `pagination_token`, `idempotency_token` that are
// routinely non-credentials.
func isKnownNonSecretTokenKey(normalized string) bool {
for _, suffix := range []string{"_token", "_secret"} {
if prefix, ok := strings.CutSuffix(normalized, suffix); ok {
if _, allowed := nonSecretTokenPrefixes[prefix]; allowed {
return true
}
}
}
return false
}

func isNonSecretIdentifierAssignment(candidate string) bool {
key, value, ok := strings.Cut(candidate, "=")
if !ok || key == "" || value == "" {
return false
}
normalized := normalizeCredentialJSONKey(key)
if isSensitiveNormalizedJSONValueKey(normalized) || genericPasswordKeyRegex.MatchString(normalized) {
return false
}
if isHighEntropySecretCandidate(value) {
return false
}
return normalized == "id" ||
normalized == "account" ||
strings.HasSuffix(normalized, "_id") ||
strings.HasSuffix(normalized, "_account")
}

func isHighEntropySecretCandidate(value string) bool {
for _, loc := range secretPattern.FindAllStringIndex(value, -1) {
if shannonEntropy(value[loc[0]:loc[1]]) > entropyThreshold {
return true
}
}
return false
}

func isSensitiveNormalizedJSONValueKey(normalized string) bool {
return credentialJSONKeyRegex.MatchString(normalized) ||
secretJSONKeyRegex.MatchString(normalized)
}

func isCredentialJSONObject(obj map[string]any) bool {
var hasHost, hasUser bool
for key := range obj {
Expand Down Expand Up @@ -692,6 +828,16 @@ func shouldSkipJSONLField(key string) bool {
return true
}

// Skip account fields. These are identifier-shape (e.g.,
// google_adsense_account, aws_account, service_account), not credential
// fields. Protects against entropy / vendor-regex false positives when
// account identifiers happen to look high-entropy. A misnamed credential
// field literally named `*_account` would slip through — acceptable trade
// because real credential fields use `*_secret`, `*_token`, etc.
if strings.HasSuffix(lower, "account") {
return true
}

// Skip common path and directory fields from agent transcripts.
// These appear frequently in tool calls and are structural, not secrets.
switch lower {
Expand Down
Loading
Loading