diff --git a/cmd/main.go b/cmd/main.go index 258871db3..8859f5573 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -43,6 +43,7 @@ import ( "github.com/ironcore-dev/metal-operator/internal/controller" metalmetrics "github.com/ironcore-dev/metal-operator/internal/metrics" "github.com/ironcore-dev/metal-operator/internal/registry" + metaltoken "github.com/ironcore-dev/metal-operator/internal/token" // +kubebuilder:scaffold:imports ) @@ -696,7 +697,13 @@ func main() { // nolint: gocyclo // Run registry server as a runnable to ensure it stops when the manager stops if err := mgr.Add(manager.RunnableFunc(func(ctx context.Context) error { setupLog.Info("starting registry server", "RegistryURL", registryURL) - registryServer := registry.NewServer(setupLog, fmt.Sprintf(":%d", registryPort), mgr.GetClient()) + registryServer := registry.NewServer( + setupLog, + fmt.Sprintf(":%d", registryPort), + mgr.GetClient(), + metaltoken.DiscoveryTokenSigningSecretName, + managerNamespace, + ) if err := registryServer.Start(ctx); err != nil { return fmt.Errorf("unable to start registry server: %w", err) } diff --git a/cmd/metalprobe/main.go b/cmd/metalprobe/main.go index 18db52795..13ed4ca4c 100644 --- a/cmd/metalprobe/main.go +++ b/cmd/metalprobe/main.go @@ -20,6 +20,7 @@ var ( func main() { var registryURL string var serverUUID string + var discoveryToken string var duration time.Duration var registryClientTimeout time.Duration var LLDPSyncInterval time.Duration @@ -27,6 +28,7 @@ func main() { flag.StringVar(®istryURL, "registry-url", "", "Registry URL where the probe will register itself.") flag.StringVar(&serverUUID, "server-uuid", "", "Agent UUID to register with the registry.") + flag.StringVar(&discoveryToken, "discovery-token", "", "Discovery token for authenticating with the registry.") flag.DurationVar(&duration, "duration", 5*time.Second, "Duration of time to wait between checks.") flag.DurationVar(®istryClientTimeout, "registry-client-timeout", 5*time.Second, "Timeout for HTTP requests to the registry.") @@ -53,11 +55,18 @@ func main() { os.Exit(1) } + if discoveryToken == "" { + setupLog.Error(nil, "discovery token is missing") + os.Exit(1) + } + ctx := ctrl.SetupSignalHandler() setupLog.Info("starting registry agent") - agent := probe.NewAgent(setupLog, serverUUID, registryURL, duration, registryClientTimeout, - LLDPSyncInterval, LLDPSyncDuration) + agent := probe.NewAgent( + setupLog, serverUUID, registryURL, discoveryToken, + duration, registryClientTimeout, LLDPSyncInterval, LLDPSyncDuration, + ) if err := agent.Start(ctx); err != nil { setupLog.Error(err, "problem running probe agent") os.Exit(1) diff --git a/docs/discovery-token-security.md b/docs/discovery-token-security.md new file mode 100644 index 000000000..cd4edd8e8 --- /dev/null +++ b/docs/discovery-token-security.md @@ -0,0 +1,448 @@ +# Discovery Token Security + +## Overview + +The metal-operator uses **JWT (JSON Web Tokens) with HMAC-SHA256 signing** to authenticate servers during the discovery process. This prevents unauthorized systems from submitting fake discovery data to the registry. + +## Design + +### Token Format + +Discovery tokens use the **industry-standard JWT (JSON Web Token)** format with HMAC-SHA256 signing (HS256): + +```text +token = header.payload.signature +``` + +**JWT Structure:** + +```text +.. +``` + +**Example (decoded for clarity):** +- Header: `{"alg":"HS256","typ":"JWT"}` +- Payload: `{"sub":"","iat":1234567890,"exp":1234571490}` +- Signature: `HMAC-SHA256(header + payload, secret)` + +**Claims (Payload):** + +- **sub** (subject): The server's systemUUID +- **iat** (issued at): Token creation timestamp (Unix seconds) +- **exp** (expires): Token expiration timestamp (1 hour after issuance) + +**Signature:** HMAC-SHA256(header + payload, secret) + +**Note:** The signing secret is never included in the token. It's used only to generate and verify the signature. + +### Security Properties + +✅ **Authentication**: Only the controller can generate valid tokens (requires signing secret) +✅ **Integrity**: Tokens cannot be tampered with (signature verification) +✅ **Binding**: Each token is bound to a specific systemUUID (via `sub` claim) +✅ **Freshness**: Tokens expire after 1 hour (via `exp` claim, JWT library enforces) +✅ **Timing-safe**: JWT library uses constant-time HMAC comparison +✅ **Algorithm protection**: Explicitly validates HS256 signing method (prevents algorithm confusion attacks) +✅ **Standard format**: Industry-standard JWT (RFC 7519) with extensive tooling support + +### Threat Model + +**Protected Against:** + +- ❌ Rogue systems submitting fake discovery data (no valid JWT) +- ❌ UUID spoofing (attacker claiming another server's identity - token `sub` claim binds to specific UUID) +- ❌ Token tampering or forgery (signature verification fails) +- ❌ Replay attacks (tokens expire after 1 hour via `exp` claim) +- ❌ Timing attacks (JWT library uses constant-time HMAC comparison) +- ❌ Algorithm confusion attacks (explicitly validates HS256 signing method) + +**NOT Protected Against** (out of scope): + +- ⚠️ MITM attacks (requires TLS - separate concern) +- ⚠️ Token leakage via logs (tokens not logged) +- ⚠️ Compromised signing secret (rotate if compromised) + +## Architecture + +### Token Lifecycle + +```text +┌──────────────┐ ┌──────────┐ ┌──────────┐ +│ Controller │ │ Registry │ │ Server │ +└──────┬───────┘ └────┬─────┘ └────┬─────┘ + │ │ │ + │ 1. Generate Signing Secret │ │ + │ (K8s Secret) │ │ + │◄──────────────────────────────┤ │ + │ │ │ + │ 2. Generate JWT Token │ │ + │ with claims (sub, iat, exp)│ │ + │──────────────────────────────►│ │ + │ │ │ + │ 3. Pass token in ignition │ │ + │───────────────────────────────┼─────────────────────────────► + │ │ │ + │ │ 4. Server boots with token │ + │ │◄───────────────────────────── + │ │ │ + │ │ 5. Parse JWT and validate: │ + │ │ - Signature (HS256) │ + │ │ - Expiration (exp claim) │ + │ │ - Extract sub claim │ + │ │ │ + │ │ 6. Accept/Reject data │ + │ ├─────────────────────────────► +``` + +### Components + +**1. Controller (`internal/controller/server_controller.go`)** + +- Generates or retrieves signing secret from K8s Secret +- Creates signed tokens when generating boot configurations +- Passes tokens to metalprobe via ignition flags + +**2. Registry (`internal/registry/server.go`)** + +- Loads signing secret on startup +- Verifies token signatures on `/register` and `/bootstate` endpoints +- Extracts and validates systemUUID from tokens +- Rejects requests with invalid/expired tokens + +**3. Metalprobe (`cmd/metalprobe/main.go`)** + +- Receives discovery token via `--discovery-token` flag +- Includes token in all HTTP requests to registry +- Handles authentication errors gracefully + +**4. Token Library (`internal/token/token.go`)** + +- `GenerateSigningSecret()`: Creates 32-byte cryptographic secret +- `GenerateSignedDiscoveryToken()`: Creates JWT with claims (sub, iat, exp) and signs with HS256 +- `VerifySignedDiscoveryToken()`: Parses JWT, validates signature/expiration, extracts systemUUID + +## Implementation Details + +### Signing Secret Management + +The signing secret is stored in a Kubernetes Secret: + +```yaml +apiVersion: v1 +kind: Secret +metadata: + name: discovery-token-signing-secret + namespace: metal-operator-system +type: Opaque +data: + signing-key: +``` + +**Secret Lifecycle:** + +- Created automatically by controller on first boot config generation +- Shared between controller and registry +- Persists across restarts (not ephemeral) +- Should be backed up with cluster state + +**Secret Rotation:** +To rotate the signing secret: + +1. Delete the existing secret: `kubectl delete secret discovery-token-signing-secret -n metal-operator-system` +2. Restart the controller pod (regenerates secret) +3. Restart the registry pod (loads new secret) +4. Note: In-flight discovery sessions will fail (servers will retry) + +### Token Expiry + +Tokens include an expiration claim (`exp`) that is automatically validated by the JWT library: + +```go +// Token expires after 1 hour +claims := jwt.MapClaims{ + "sub": systemUUID, + "iat": jwt.NewNumericDate(time.Now()), + "exp": jwt.NewNumericDate(time.Now().Add(1 * time.Hour)), +} + +// JWT library automatically validates expiration during parsing +token, err := jwt.Parse(tokenString, keyFunc) +// Returns error if token is expired +``` + +**Why 1 hour?** + +- Discovery typically completes in minutes +- Allows sufficient time for troubleshooting and debugging +- Minimizes replay attack window (security best practice) +- Balances security vs. operational flexibility +- Same as original HMAC implementation (proven sufficient) + +### Error Handling + +**Registry Responses:** + +- `201 Created`: Token valid, data accepted +- `401 Unauthorized`: Token missing, invalid, or expired +- `500 Internal Server Error`: Signing secret not loaded + +**Metalprobe Behavior:** + +- Retries on transient errors (500) +- Does NOT retry on authentication errors (401) +- Logs errors for debugging + +**Controller Behavior:** + +- Fails reconciliation if token generation fails +- Retries on next reconciliation loop +- Logs errors with context + +## Comparison with Alternatives + +### vs. OpenStack Ironic + +**Similarities:** + +- Both use random tokens for discovery authentication +- Both bind tokens to specific machines +- Both use timing-safe comparison + +**Improvements in metal-operator:** + +- ✅ Standard JWT format (RFC 7519) vs. custom format +- ✅ Built-in expiration validation via JWT library +- ✅ Algorithm confusion protection (validates HS256) +- ✅ Structured claims (sub, iat, exp) for clarity +- ✅ Stateless verification (no registry storage needed) +- ✅ K8s Secret storage vs. database field + +### vs. mTLS (Mutual TLS) + +**Why not mTLS?** + +- Requires PKI infrastructure +- Complex certificate provisioning for ephemeral systems +- Boot environment may not support TLS client certs +- HMAC tokens are simpler and sufficient for discovery + +**When to use TLS:** + +- Add TLS for MITM protection (controller ↔ registry ↔ probe) +- Use cert-manager for easy certificate management +- Tokens + TLS = defense in depth + +### vs. IP Allowlist + +**Why not IP allowlist?** + +- DHCP assigns dynamic IPs during discovery +- NAT/proxies obscure source IPs +- Doesn't prevent UUID spoofing from allowed IPs +- Tokens provide better granularity + +## Security Considerations + +### Token Leakage Prevention + +**DO:** + +- ✅ Pass tokens via ignition (secure channel) +- ✅ Use structured logging (tokens not in log fields) +- ✅ Store in K8s Secrets (not ConfigMaps) + +**DON'T:** + +- ❌ Log token values (even at debug level) +- ❌ Expose tokens in API responses +- ❌ Store tokens in Git or documentation + +### Deployment Security + +**Recommendations:** + +1. **Enable TLS**: Add TLS termination at registry for MITM protection +2. **Network Policies**: Restrict registry access to authorized pods +3. **RBAC**: Limit access to signing secret to controller/registry ServiceAccounts +4. **Audit Logging**: Enable K8s audit logs for secret access +5. **Backup**: Include signing secret in cluster backup/restore procedures + +### Monitoring + +**Key Metrics to Monitor:** + +- Token validation failures (registry logs) +- Signing secret access (K8s audit logs) +- Discovery timeouts (may indicate token issues) +- 401 Unauthorized responses (registry metrics) + +**Alerts to Configure:** + +- High rate of token validation failures +- Signing secret missing/invalid on registry startup +- Repeated 401 errors from specific servers + +## Troubleshooting + +### Problem: "401 Unauthorized" on `/register` + +**Symptoms:** + +- Metalprobe logs: "authentication failed with registry" +- Registry logs: "Rejected request with invalid discovery token" + +**Causes:** + +1. Token expired (>1 hour old) +2. Signing secret mismatch between controller and registry +3. Token tampered, corrupted, or invalid JWT format +4. Wrong signing algorithm (not HS256) + +**Resolution:** + +```bash +# Check if signing secret exists +kubectl get secret discovery-token-signing-secret -n metal-operator-system + +# Verify registry loaded the secret +kubectl logs -n metal-operator-system deployment/metal-operator-registry | grep "signing secret" + +# Check controller logs for token generation +kubectl logs -n metal-operator-system deployment/metal-operator-controller-manager \ + | grep "Generated signed discovery token" + +# If mismatch, restart both pods to reload secret +kubectl rollout restart deployment/metal-operator-controller-manager -n metal-operator-system +kubectl rollout restart deployment/metal-operator-registry -n metal-operator-system +``` + +### Problem: "Signing secret not loaded" + +**Symptoms:** + +- Registry logs: "Signing secret not loaded, cannot validate tokens" +- All discovery attempts fail with 500 errors + +**Causes:** + +1. Secret doesn't exist yet (first boot) +2. Registry started before controller created secret +3. Wrong namespace or secret name +4. RBAC prevents registry from reading secret + +**Resolution:** + +```bash +# Force controller to create secret +kubectl delete serverbootconfiguration -n +# (Controller will recreate it, generating secret in the process) + +# Or manually create the secret (generates 32 raw bytes) +openssl rand 32 | kubectl create secret generic discovery-token-signing-secret \ + --from-file=signing-key=/dev/stdin \ + -n metal-operator-system + +# Verify registry can read it +kubectl auth can-i get secrets/discovery-token-signing-secret \ + --as=system:serviceaccount:metal-operator-system:metal-operator-registry +``` + +### Problem: Discovery timeout + +**Symptoms:** + +- Server stuck in Discovery state +- No data appears in registry +- Metalprobe logs show repeated failures + +**Check:** + +```bash +# 1. Verify metalprobe received token +kubectl logs | grep "discovery-token" +# (Token value should NOT appear in logs) + +# 2. Check registry validation +kubectl logs -n metal-operator-system deployment/metal-operator-registry \ + | grep -E "Validated|Rejected" + +# 3. Verify system UUID matches +kubectl get server -o jsonpath='{.spec.systemUUID}' +# Should match the UUID in registry logs + +# 4. Check token expiry +# If discovery takes >1 hour, increase expiry in internal/token/token.go +``` + +## Testing + +### Unit Tests + +Token generation and verification: + +```bash +cd internal/token +go test -v +``` + +### Integration Tests + +Full discovery flow with signed tokens: + +```bash +make test +``` + +### Manual Testing + +1. **Generate a test token:** + +```go +secret, _ := token.GenerateSigningSecret() +jwtToken, _ := token.GenerateSignedDiscoveryToken(secret, "test-uuid-123") +fmt.Println("Token:", jwtToken) +// Output: .. +``` + +2. **Verify the token:** + +```go +uuid, timestamp, valid, _ := token.VerifySignedDiscoveryToken(secret, jwtToken) +fmt.Printf("UUID: %s, Valid: %t, Age: %ds\n", uuid, valid, time.Now().Unix()-timestamp) +``` + +3. **Test with metalprobe:** + +```bash +metalprobe --server-uuid=test-uuid \ + --registry-url=http://localhost:8080 \ + --discovery-token= +``` + +## References + +- [JWT (JSON Web Tokens) - RFC 7519](https://www.rfc-editor.org/rfc/rfc7519) +- [golang-jwt/jwt Library](https://github.com/golang-jwt/jwt) +- [HMAC-SHA256 (RFC 2104)](https://www.rfc-editor.org/rfc/rfc2104) +- [OpenStack Ironic Agent Token Design](https://docs.openstack.org/ironic/latest/admin/security.html#agent-token) +- [Kubernetes Secrets Best Practices](https://kubernetes.io/docs/concepts/security/secrets-good-practices/) +- [Issue #749: Secure Discovery Boot Data](https://github.com/ironcore-dev/metal-operator/issues/749) + +## Future Enhancements + +### Potential Improvements: + +1. **Short-lived tokens**: Generate new tokens on each registration attempt +2. **Token rotation**: Periodically rotate signing secret without downtime +3. **Multi-secret support**: Support multiple signing secrets for graceful rotation +4. **Audit trail**: Log all token validations to audit log +5. **Rate limiting**: Prevent brute-force token guessing attempts +6. **Token revocation**: Explicit token revocation API for compromised tokens + +### Not Needed (Now Implemented): + +- ~~JWT standard format~~ ✅ **DONE**: Migrated to JWT (RFC 7519) using `github.com/golang-jwt/jwt/v5` + - Industry-standard token format with extensive tooling + - Built-in expiration validation + - Algorithm confusion protection + - 1-hour token lifetime (same as original HMAC implementation) diff --git a/go.mod b/go.mod index d4f670c6e..3c8651535 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,7 @@ go 1.25.6 require ( github.com/go-logr/logr v1.4.3 + github.com/golang-jwt/jwt/v5 v5.3.1 github.com/ironcore-dev/controller-utils v0.11.0 github.com/jaypipes/ghw v0.24.0 github.com/onsi/ginkgo/v2 v2.28.1 diff --git a/go.sum b/go.sum index 51fd85e4a..81690e868 100644 --- a/go.sum +++ b/go.sum @@ -78,6 +78,8 @@ github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1v github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= github.com/goccy/go-yaml v1.18.0 h1:8W7wMFS12Pcas7KU+VVkaiCng+kG8QiFeFwzFb+rwuw= github.com/goccy/go-yaml v1.18.0/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA= +github.com/golang-jwt/jwt/v5 v5.3.1 h1:kYf81DTWFe7t+1VvL7eS+jKFVWaUnK9cB1qbwn63YCY= +github.com/golang-jwt/jwt/v5 v5.3.1/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= diff --git a/internal/api/registry/registry.go b/internal/api/registry/registry.go index 010105c42..deea30e9a 100644 --- a/internal/api/registry/registry.go +++ b/internal/api/registry/registry.go @@ -7,15 +7,17 @@ package registry const BootStateReceivedCondition = "BootStateReceived" // RegistrationPayload represents the payload to send to the `/register` endpoint, -// including the systemUUID and the server details. +// including the systemUUID, discovery token, and the server details. type RegistrationPayload struct { - SystemUUID string `json:"systemUUID"` - Data Server `json:"data"` + SystemUUID string `json:"systemUUID"` + DiscoveryToken string `json:"discoveryToken"` + Data Server `json:"data"` } // BootstatePayload represents the payload to send to the `/bootstate` endpoint, -// including the systemUUID and the booted state. +// including the systemUUID, discovery token, and the booted state. type BootstatePayload struct { - SystemUUID string `json:"systemUUID"` - Booted bool `json:"booted"` + SystemUUID string `json:"systemUUID"` + DiscoveryToken string `json:"discoveryToken"` + Booted bool `json:"booted"` } diff --git a/internal/controller/server_controller.go b/internal/controller/server_controller.go index a1db63775..273715dde 100644 --- a/internal/controller/server_controller.go +++ b/internal/controller/server_controller.go @@ -25,6 +25,7 @@ import ( "github.com/ironcore-dev/metal-operator/internal/bmcutils" "github.com/ironcore-dev/metal-operator/internal/ignition" metalmetrics "github.com/ironcore-dev/metal-operator/internal/metrics" + metaltoken "github.com/ironcore-dev/metal-operator/internal/token" "github.com/stmcginnis/gofish/schemas" "golang.org/x/crypto/bcrypt" "golang.org/x/crypto/ssh" @@ -691,6 +692,20 @@ func (r *ServerReconciler) applyDefaultIgnitionForServer(ctx context.Context, se return fmt.Errorf("failed to generate SSH keypair: %w", err) } + // Get or create signing secret for discovery tokens + signingSecret, err := r.getOrCreateSigningSecret(ctx) + if err != nil { + return fmt.Errorf("failed to get signing secret: %w", err) + } + + // Generate signed discovery token + discoveryToken, err := metaltoken.GenerateSignedDiscoveryToken(signingSecret, server.Spec.SystemUUID) + if err != nil { + return fmt.Errorf("failed to generate discovery token: %w", err) + } + + log.V(1).Info("Generated signed discovery token", "systemUUID", server.Spec.SystemUUID) + ownerRef := metav1apply.OwnerReference(). WithAPIVersion("metal.ironcore.dev/v1alpha1"). WithKind("ServerBootConfiguration"). @@ -715,7 +730,8 @@ func (r *ServerReconciler) applyDefaultIgnitionForServer(ctx context.Context, se sshKeyPairNamespacedName := fmt.Sprintf("%s/%s", ptr.Deref(sshSecretApply.Namespace, ""), ptr.Deref(sshSecretApply.Name, "")) log.V(1).Info("Applied SSH keypair secret", "SSHKeyPair", sshKeyPairNamespacedName) - probeFlags := fmt.Sprintf("--registry-url=%s --server-uuid=%s", registryURL, server.Spec.SystemUUID) + // Include discovery token in probe flags + probeFlags := fmt.Sprintf("--registry-url=%s --server-uuid=%s --discovery-token=%s", registryURL, server.Spec.SystemUUID, discoveryToken) ignitionData, err := r.generateDefaultIgnitionDataForServer(probeFlags, sshPublicKey, password) if err != nil { return fmt.Errorf("failed to generate default ignitionSecret data: %w", err) @@ -746,6 +762,99 @@ func (r *ServerReconciler) applyDefaultIgnitionForServer(ctx context.Context, se return nil } +// getOrCreateSigningSecret retrieves or creates the shared signing secret for discovery tokens. +// The secret is shared between the controller and registry for HMAC token signing/verification. +func (r *ServerReconciler) getOrCreateSigningSecret(ctx context.Context) ([]byte, error) { + secretName := metaltoken.DiscoveryTokenSigningSecretName + secret := &v1.Secret{} + err := r.Get(ctx, types.NamespacedName{ + Name: secretName, + Namespace: r.ManagerNamespace, + }, secret) + + if err == nil { + // Secret exists, check if it's valid + if signingKey, ok := secret.Data[metaltoken.DiscoveryTokenSigningSecretKey]; ok && len(signingKey) == 32 { + return signingKey, nil + } + // Secret exists but is invalid, regenerate and update it + ctrl.LoggerFrom(ctx).Info("Existing signing secret is invalid, regenerating", "name", secretName) + newSigningKey, err := metaltoken.GenerateSigningSecret() + if err != nil { + return nil, fmt.Errorf("failed to generate signing secret: %w", err) + } + secret.Data = map[string][]byte{ + metaltoken.DiscoveryTokenSigningSecretKey: newSigningKey, + } + if err := r.Update(ctx, secret); err != nil { + return nil, fmt.Errorf("failed to update signing secret: %w", err) + } + return newSigningKey, nil + } + + if !apierrors.IsNotFound(err) { + return nil, fmt.Errorf("failed to get signing secret: %w", err) + } + + // Generate new signing secret + signingKey, err := metaltoken.GenerateSigningSecret() + if err != nil { + return nil, fmt.Errorf("failed to generate signing secret: %w", err) + } + + // Create the secret + newSecret := &v1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: secretName, + Namespace: r.ManagerNamespace, + }, + Type: v1.SecretTypeOpaque, + Data: map[string][]byte{ + metaltoken.DiscoveryTokenSigningSecretKey: signingKey, + }, + } + + if err := r.Create(ctx, newSecret); err != nil { + if apierrors.IsAlreadyExists(err) { + // Another reconciler won the race - fetch the existing secret + ctrl.LoggerFrom(ctx).Info("Signing secret already exists, retrieving it") + + secret := &v1.Secret{} + if err := r.Get(ctx, types.NamespacedName{ + Name: secretName, + Namespace: r.ManagerNamespace, + }, secret); err != nil { + return nil, fmt.Errorf("failed to retrieve existing signing secret: %w", err) + } + + if signingKey, ok := secret.Data[metaltoken.DiscoveryTokenSigningSecretKey]; ok && len(signingKey) == 32 { + return signingKey, nil + } + + // Existing secret is invalid - update it + signingKey, err := metaltoken.GenerateSigningSecret() + if err != nil { + return nil, fmt.Errorf("failed to generate replacement signing secret: %w", err) + } + + if secret.Data == nil { + secret.Data = make(map[string][]byte) + } + secret.Data[metaltoken.DiscoveryTokenSigningSecretKey] = signingKey + if err := r.Update(ctx, secret); err != nil { + return nil, fmt.Errorf("failed to update invalid signing secret: %w", err) + } + + ctrl.LoggerFrom(ctx).Info("Updated invalid signing secret") + return signingKey, nil + } + return nil, fmt.Errorf("failed to create signing secret: %w", err) + } + + ctrl.LoggerFrom(ctx).Info("Created discovery token signing secret", "name", secretName) + return signingKey, nil +} + func generateSSHKeyPairAndPassword() ([]byte, []byte, []byte, error) { privateKey, err := rsa.GenerateKey(rand.Reader, 2048) if err != nil { diff --git a/internal/controller/server_controller_test.go b/internal/controller/server_controller_test.go index ebdd95b1e..9fc96e18a 100644 --- a/internal/controller/server_controller_test.go +++ b/internal/controller/server_controller_test.go @@ -10,8 +10,11 @@ import ( "net/http" "os" "path/filepath" + "regexp" + "strings" "time" + "github.com/google/go-cmp/cmp" "golang.org/x/crypto/bcrypt" "golang.org/x/crypto/ssh" "gopkg.in/yaml.v3" @@ -20,6 +23,7 @@ import ( "github.com/ironcore-dev/metal-operator/internal/api/registry" "github.com/ironcore-dev/metal-operator/internal/ignition" "github.com/ironcore-dev/metal-operator/internal/probe" + metaltoken "github.com/ironcore-dev/metal-operator/internal/token" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" v1 "k8s.io/api/core/v1" @@ -31,6 +35,28 @@ import ( . "sigs.k8s.io/controller-runtime/pkg/envtest/komega" ) +// getSignedDiscoveryToken retrieves the signing secret and generates a signed discovery token +// for the given systemUUID. This is a test helper to avoid code duplication. +func getSignedDiscoveryToken(ctx SpecContext, k8sClient client.Client, ns, systemUUID string) (string, error) { + secretName := metaltoken.DiscoveryTokenSigningSecretName + secret := &v1.Secret{} + + err := k8sClient.Get(ctx, client.ObjectKey{ + Name: secretName, + Namespace: ns, + }, secret) + if err != nil { + return "", err + } + + signingKey, ok := secret.Data[metaltoken.DiscoveryTokenSigningSecretKey] + if !ok || len(signingKey) != 32 { + return "", fmt.Errorf("invalid signing secret") + } + + return metaltoken.GenerateSignedDiscoveryToken(signingKey, systemUUID) +} + var _ = Describe("Server Controller", func() { ns := SetupTest(nil) @@ -195,18 +221,35 @@ var _ = Describe("Server Controller", func() { err = bcrypt.CompareHashAndPassword([]byte(passwordHash), sshSecret.Data[SSHKeyPairSecretPasswordKeyName]) Expect(err).ToNot(HaveOccurred(), "passwordHash should match the expected password") - // Generate expected ignition data using the same template file the controller uses - ignitionData, err := ignition.GenerateIgnitionDataFromFile( - filepath.Join("..", "..", "config", "manager", "ignition-template.yaml"), - ignition.Config{ - Image: "foo:latest", - Flags: "--registry-url=http://localhost:30000 --server-uuid=38947555-7742-3448-3784-823347823834", - SSHPublicKey: string(sshSecret.Data[SSHKeyPairSecretPublicKeyName]), - PasswordHash: passwordHash, - }, - ) - Expect(err).NotTo(HaveOccurred()) + // Extract the actual discovery token from the metalprobe service in the ignition + ignitionYAML := ignitionSecret.Data[DefaultIgnitionSecretKeyName] + var ignitionConfig map[string]any + Expect(yaml.Unmarshal(ignitionYAML, &ignitionConfig)).To(Succeed()) + + systemd, ok := ignitionConfig["systemd"].(map[string]any) + Expect(ok).To(BeTrue(), "systemd section should exist") + + units, ok := systemd["units"].([]any) + Expect(ok).To(BeTrue(), "units should be a list") + // Find metalprobe service unit + var metalprobeUnit map[string]any + for _, unit := range units { + u, ok := unit.(map[string]any) + if ok && u["name"] == "metalprobe.service" { + metalprobeUnit = u + break + } + } + Expect(metalprobeUnit).NotTo(BeNil(), "metalprobe.service should exist") + + contents, ok := metalprobeUnit["contents"].(string) + Expect(ok).To(BeTrue(), "contents should be a string") + Expect(contents).To(ContainSubstring("--registry-url=http://localhost:30000")) + Expect(contents).To(ContainSubstring("--server-uuid=38947555-7742-3448-3784-823347823834")) + Expect(contents).To(ContainSubstring("--discovery-token=")) + + // Verify ignition secret has owner references Eventually(Object(ignitionSecret)).Should(SatisfyAll( HaveField("OwnerReferences", ContainElement(metav1.OwnerReference{ APIVersion: "metal.ironcore.dev/v1alpha1", @@ -217,7 +260,6 @@ var _ = Describe("Server Controller", func() { BlockOwnerDeletion: ptr.To(true), })), HaveField("Data", HaveKeyWithValue(DefaultIgnitionFormatKey, []byte("fcos"))), - HaveField("Data", HaveKeyWithValue(DefaultIgnitionSecretKeyName, MatchYAML(ignitionData))), )) By("Patching the boot configuration to a Ready state") @@ -248,7 +290,16 @@ var _ = Describe("Server Controller", func() { )) By("Starting the probe agent") - probeAgent := probe.NewAgent(GinkgoLogr, server.Spec.SystemUUID, registryURL, 100*time.Millisecond, 1*time.Second, 50*time.Millisecond, 250*time.Millisecond) + // The controller generates signed tokens automatically + // We need to get the signing secret and generate a matching token + var testToken string + Eventually(func() error { + var err error + testToken, err = getSignedDiscoveryToken(ctx, k8sClient, ns.Name, server.Spec.SystemUUID) + return err + }).Should(Succeed()) + + probeAgent := probe.NewAgent(GinkgoLogr, server.Spec.SystemUUID, registryURL, testToken, 100*time.Millisecond, 1*time.Second, 50*time.Millisecond, 250*time.Millisecond) go func() { defer GinkgoRecover() Expect(probeAgent.Start(ctx)).To(Succeed(), "failed to start probe agent") @@ -385,23 +436,13 @@ var _ = Describe("Server Controller", func() { Expect(ok).To(BeTrue()) Expect(user).To(HaveKeyWithValue("name", "metal")) + // Verify password hash exists and matches (check once outside Eventually) passwordHash, ok := user["password_hash"].(string) Expect(ok).To(BeTrue(), "password_hash should be a string") - Expect(bcrypt.CompareHashAndPassword([]byte(passwordHash), sshSecret.Data[SSHKeyPairSecretPasswordKeyName])).Should(Succeed()) - // Generate expected ignition data using the same template file the controller uses - ignitionData, err := ignition.GenerateIgnitionDataFromFile( - filepath.Join("..", "..", "config", "manager", "ignition-template.yaml"), - ignition.Config{ - Image: "foo:latest", - Flags: "--registry-url=http://localhost:30000 --server-uuid=38947555-7742-3448-3784-823347823834", - SSHPublicKey: string(sshSecret.Data[SSHKeyPairSecretPublicKeyName]), - PasswordHash: passwordHash, - }, - ) - Expect(err).NotTo(HaveOccurred()) - + // Eventually assertion with token and hash extraction inside the closure + // This prevents race conditions with reconciler re-applying the secret with fresh bcrypt hashes Eventually(Object(ignitionSecret)).Should(SatisfyAll( HaveField("OwnerReferences", ContainElement(metav1.OwnerReference{ APIVersion: "metal.ironcore.dev/v1alpha1", @@ -412,7 +453,64 @@ var _ = Describe("Server Controller", func() { BlockOwnerDeletion: ptr.To(true), })), HaveField("Data", HaveKeyWithValue(DefaultIgnitionFormatKey, []byte("fcos"))), - HaveField("Data", HaveKeyWithValue(DefaultIgnitionSecretKeyName, MatchYAML(ignitionData))), + WithTransform(func(secret *v1.Secret) error { + // Re-extract password hash and token inside Eventually to handle reconciler updates + contents := string(secret.Data[DefaultIgnitionSecretKeyName]) + if !strings.Contains(contents, "--discovery-token=") { + return fmt.Errorf("ignition data does not contain discovery token") + } + + // Extract token using regex + tokenRegex := regexp.MustCompile(`--discovery-token=([A-Za-z0-9._-]+)`) + matches := tokenRegex.FindStringSubmatch(contents) + if len(matches) != 2 { + return fmt.Errorf("failed to extract discovery token from ignition config") + } + actualToken := matches[1] + if actualToken == "" { + return fmt.Errorf("extracted discovery token is empty") + } + + // Re-extract current password hash (may have changed due to reconciliation) + var ignitionConfig map[string]any + if err := yaml.Unmarshal(secret.Data[DefaultIgnitionSecretKeyName], &ignitionConfig); err != nil { + return fmt.Errorf("failed to unmarshal ignition config: %w", err) + } + passwd, _ := ignitionConfig["passwd"].(map[string]any) + users, _ := passwd["users"].([]any) + if len(users) == 0 { + return fmt.Errorf("no users found in ignition config") + } + user, ok := users[0].(map[string]any) + if !ok { + return fmt.Errorf("user is not a map") + } + currentPasswordHash, ok := user["password_hash"].(string) + if !ok { + return fmt.Errorf("password_hash is not a string") + } + + // Generate expected ignition data using the CURRENT password hash and token + ignitionData, err := ignition.GenerateIgnitionDataFromFile( + filepath.Join("..", "..", "config", "manager", "ignition-template.yaml"), + ignition.Config{ + Image: "foo:latest", + Flags: fmt.Sprintf("--registry-url=http://localhost:30000 --server-uuid=38947555-7742-3448-3784-823347823834 --discovery-token=%s", actualToken), + SSHPublicKey: string(sshSecret.Data[SSHKeyPairSecretPublicKeyName]), + PasswordHash: currentPasswordHash, + }, + ) + if err != nil { + return fmt.Errorf("failed to generate expected ignition data: %w", err) + } + + // Compare actual vs expected + if diff := cmp.Diff(string(ignitionData), string(secret.Data[DefaultIgnitionSecretKeyName])); diff != "" { + return fmt.Errorf("ignition data mismatch (-want +got):\n%s", diff) + } + + return nil + }, BeNil()), )) By("Patching the boot configuration to a Ready state") @@ -444,7 +542,15 @@ var _ = Describe("Server Controller", func() { )) By("Starting the probe agent") - probeAgent := probe.NewAgent(GinkgoLogr, server.Spec.SystemUUID, registryURL, 50*time.Millisecond, 1*time.Second, 50*time.Millisecond, 250*time.Millisecond) + // Get the signing secret and generate a matching token + var testToken2 string + Eventually(func() error { + var err error + testToken2, err = getSignedDiscoveryToken(ctx, k8sClient, ns.Name, server.Spec.SystemUUID) + return err + }).Should(Succeed()) + + probeAgent := probe.NewAgent(GinkgoLogr, server.Spec.SystemUUID, registryURL, testToken2, 50*time.Millisecond, 1*time.Second, 50*time.Millisecond, 250*time.Millisecond) go func() { defer GinkgoRecover() Expect(probeAgent.Start(ctx)).To(Succeed(), "failed to start probe agent") @@ -775,8 +881,18 @@ var _ = Describe("Server Controller", func() { Expect(k8sClient.Create(ctx, server)).To(Succeed()) Eventually(Object(server)).Should(HaveField("Status.State", metalv1alpha1.ServerStateDiscovery)) + By("Getting the signing secret and generating a valid discovery token") + var testToken string + Eventually(func() error { + var err error + testToken, err = getSignedDiscoveryToken(ctx, k8sClient, ns.Name, server.Spec.SystemUUID) + return err + }).Should(Succeed()) + + By("Sending bootstate request with valid discovery token") var bootstateRequest registry.BootstatePayload bootstateRequest.SystemUUID = server.Spec.SystemUUID + bootstateRequest.DiscoveryToken = testToken bootstateRequest.Booted = true marshaled, err := json.Marshal(bootstateRequest) Expect(err).NotTo(HaveOccurred()) diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go index a4a7d8a38..838be1ae6 100644 --- a/internal/controller/suite_test.go +++ b/internal/controller/suite_test.go @@ -333,7 +333,13 @@ func SetupTest(redfishMockServers []netip.AddrPort) *corev1.Namespace { By("Starting the registry server") Expect(k8sManager.Add(manager.RunnableFunc(func(ctx context.Context) error { - registryServer := registry.NewServer(GinkgoLogr, ":30000", k8sManager.GetClient()) + registryServer := registry.NewServer( + GinkgoLogr, + ":30000", + k8sManager.GetClient(), + "discovery-token-signing-secret", + ns.Name, + ) if err := registryServer.Start(ctx); err != nil { return fmt.Errorf("failed to start registry server: %w", err) } diff --git a/internal/probe/probe.go b/internal/probe/probe.go index 4217756d2..7d00c5f5a 100644 --- a/internal/probe/probe.go +++ b/internal/probe/probe.go @@ -7,6 +7,7 @@ import ( "bytes" "context" "encoding/json" + "fmt" "net/http" "time" @@ -19,6 +20,7 @@ import ( type Agent struct { SystemUUID string RegistryURL string + DiscoveryToken string // Authentication token for registry Duration time.Duration RegistryClientTimeout time.Duration Server *registry.Server // Pointer to Server for late initialization. @@ -27,12 +29,13 @@ type Agent struct { LLDPSyncDuration time.Duration } -// NewAgent creates a new Agent with the specified system UUID and registry URL. -func NewAgent(log logr.Logger, systemUUID, registryURL string, duration, registryClientTimeout, LLDPSyncInterval, LLDPSyncDuration time.Duration) *Agent { +// NewAgent creates a new Agent with the specified system UUID, registry URL, and discovery token. +func NewAgent(log logr.Logger, systemUUID, registryURL, discoveryToken string, duration, registryClientTimeout, LLDPSyncInterval, LLDPSyncDuration time.Duration) *Agent { return &Agent{ log: log, SystemUUID: systemUUID, RegistryURL: registryURL, + DiscoveryToken: discoveryToken, Duration: duration, RegistryClientTimeout: registryClientTimeout, LLDPSyncInterval: LLDPSyncInterval, @@ -182,10 +185,12 @@ func (a *Agent) RefreshLLDP(ctx context.Context) error { } // registerServer handles the server registration with exponential backoff on failure. +// Includes the discovery token in the payload for authentication with the registry. func (a *Agent) registerServer(ctx context.Context) error { payload := registry.RegistrationPayload{ - SystemUUID: a.SystemUUID, - Data: *a.Server, // Dereference the pointer to Server. + SystemUUID: a.SystemUUID, + DiscoveryToken: a.DiscoveryToken, + Data: *a.Server, // Dereference the pointer to Server. } return wait.ExponentialBackoffWithContext( @@ -220,8 +225,16 @@ func (a *Agent) registerServer(ctx context.Context) error { } }() + // Handle authentication errors explicitly + if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden { + err := fmt.Errorf("authentication failed with status %d", resp.StatusCode) + a.log.Error(err, "Authentication failed with registry") + // Don't retry on auth errors - token is invalid + return false, err + } + if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusCreated { - a.log.Error(err, "Failed to register server", "url", a.RegistryURL) + a.log.Info("Failed to register server", "url", a.RegistryURL, "statusCode", resp.StatusCode) return false, nil } diff --git a/internal/probe/probe_suite_test.go b/internal/probe/probe_suite_test.go index 69e7781cc..a2d885922 100644 --- a/internal/probe/probe_suite_test.go +++ b/internal/probe/probe_suite_test.go @@ -34,7 +34,7 @@ var _ = BeforeSuite(func() { DeferCleanup(cancel) // Initialize the registry - registryServer = registry.NewServer(GinkgoLogr, registryAddr, nil) + registryServer = registry.NewServer(GinkgoLogr, registryAddr, nil, "", "") go func() { defer GinkgoRecover() Expect(registryServer.Start(ctx)).To(Succeed(), "failed to start registry agent") @@ -46,7 +46,7 @@ var _ = BeforeSuite(func() { }).Should(Succeed()) // Initialize your probe server - probeAgent = probe.NewAgent(GinkgoLogr, systemUUID, registryURL, 100*time.Millisecond, 1*time.Second, 50*time.Millisecond, 250*time.Millisecond) + probeAgent = probe.NewAgent(GinkgoLogr, systemUUID, registryURL, "test-discovery-token-123", 100*time.Millisecond, 1*time.Second, 50*time.Millisecond, 250*time.Millisecond) go func() { defer GinkgoRecover() Expect(probeAgent.Start(ctx)).To(Succeed(), "failed to start probe agent") diff --git a/internal/registry/registry_suite_test.go b/internal/registry/registry_suite_test.go index c31cf5905..7747ed727 100644 --- a/internal/registry/registry_suite_test.go +++ b/internal/registry/registry_suite_test.go @@ -28,7 +28,7 @@ var _ = BeforeSuite(func() { ctx, cancel := context.WithCancel(context.Background()) DeferCleanup(cancel) - server = registry.NewServer(GinkgoLogr, testServerAddr, nil) + server = registry.NewServer(GinkgoLogr, testServerAddr, nil, "", "") go func() { defer GinkgoRecover() Expect(server.Start(ctx)).To(Succeed(), "failed to start registry server") diff --git a/internal/registry/server.go b/internal/registry/server.go index 4ac900189..f58d42cf8 100644 --- a/internal/registry/server.go +++ b/internal/registry/server.go @@ -16,33 +16,184 @@ import ( "github.com/ironcore-dev/controller-utils/conditionutils" metalv1alpha1 "github.com/ironcore-dev/metal-operator/api/v1alpha1" "github.com/ironcore-dev/metal-operator/internal/api/registry" + metaltoken "github.com/ironcore-dev/metal-operator/internal/token" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" ) -// Server holds the HTTP server's state, including the systems store. +// Server holds the HTTP server's state, including the systems store and signing secret. type Server struct { - addr string - mux *http.ServeMux - systemsStore *sync.Map - log logr.Logger - k8sClient client.Client + addr string + mux *http.ServeMux + systemsStore *sync.Map + signingSecret []byte // Shared secret for HMAC token verification + signingSecretName string // Name of the signing secret + signingSecretNs string // Namespace of the signing secret + signingSecretMu sync.RWMutex // Read-write mutex for signing secret access + log logr.Logger + k8sClient client.Client } // NewServer initializes and returns a new Server instance. -func NewServer(logger logr.Logger, addr string, k8sClient client.Client) *Server { +// It loads the signing secret from Kubernetes for token verification. +func NewServer(logger logr.Logger, addr string, k8sClient client.Client, signingSecretName, signingSecretNamespace string) *Server { mux := http.NewServeMux() + + // Warn if running without K8s client (test mode) + if k8sClient == nil { + logger.Info("WARNING: Running without K8s client - token validation DISABLED (test mode only)") + } + + // Load signing secret from Kubernetes + var signingSecret []byte + if k8sClient != nil && signingSecretName != "" && signingSecretNamespace != "" { + ctx := context.Background() + secret := &corev1.Secret{} + err := k8sClient.Get(ctx, client.ObjectKey{ + Name: signingSecretName, + Namespace: signingSecretNamespace, + }, secret) + + if err != nil { + logger.Error(err, "Failed to load signing secret", + "name", signingSecretName, "namespace", signingSecretNamespace) + } else { + if key, ok := secret.Data[metaltoken.DiscoveryTokenSigningSecretKey]; ok && len(key) == 32 { + signingSecret = key + logger.Info("Loaded discovery token signing secret", + "name", signingSecretName, "namespace", signingSecretNamespace) + } else { + logger.Error(nil, "Signing secret found but invalid", + "name", signingSecretName, "namespace", signingSecretNamespace) + } + } + + if len(signingSecret) == 0 { + logger.Info("Signing secret not loaded, token validation will fail until secret is available", + "name", signingSecretName, "namespace", signingSecretNamespace) + } + } + server := &Server{ - addr: addr, - mux: mux, - systemsStore: &sync.Map{}, - log: logger, - k8sClient: k8sClient, + addr: addr, + mux: mux, + systemsStore: &sync.Map{}, + signingSecret: signingSecret, + signingSecretName: signingSecretName, + signingSecretNs: signingSecretNamespace, + log: logger, + k8sClient: k8sClient, } server.routes() return server } +// getSigningSecret returns the signing secret, loading it from Kubernetes if not cached. +// This method is thread-safe and only memoizes successful loads, allowing recovery from transient failures. +// It uses a fast-path read lock check and upgrades to write lock only when loading is needed. +// It respects the provided context for cancellation and timeouts. +func (s *Server) getSigningSecret(ctx context.Context) ([]byte, error) { + // Fast path: check if we already have a valid cached secret (read lock) + s.signingSecretMu.RLock() + if len(s.signingSecret) == 32 { + secret := s.signingSecret + s.signingSecretMu.RUnlock() + return secret, nil + } + s.signingSecretMu.RUnlock() + + // Slow path: need to load the secret (write lock) + if s.signingSecretName == "" || s.signingSecretNs == "" { + return nil, fmt.Errorf("signing secret name or namespace not configured") + } + + // Attempt to load the secret from Kubernetes + secret := &corev1.Secret{} + err := s.k8sClient.Get(ctx, client.ObjectKey{ + Name: s.signingSecretName, + Namespace: s.signingSecretNs, + }, secret) + + if err != nil { + return nil, fmt.Errorf("failed to load signing secret: %w", err) + } + + key, ok := secret.Data[metaltoken.DiscoveryTokenSigningSecretKey] + if !ok || len(key) != 32 { + return nil, fmt.Errorf("signing secret invalid or missing") + } + + // Cache the loaded secret (write lock) + s.signingSecretMu.Lock() + s.signingSecret = key + s.signingSecretMu.Unlock() + + s.log.Info("Loaded discovery token signing secret", "name", s.signingSecretName) + return key, nil +} + +// validateDiscoveryToken verifies a JWT-signed discovery token. +// Returns (systemUUID, valid) where systemUUID is extracted from the token. +// If k8sClient is nil (unit test mode), validation is skipped. +// Implements single-retry refresh on validation failure to handle secret rotation. +func (s *Server) validateDiscoveryToken(ctx context.Context, receivedToken string) (string, bool) { + // Skip validation in test mode (no k8s client) + if s.k8sClient == nil { + s.log.V(1).Info("Running in TEST MODE without K8s client - skipping secret loading") + // In test mode, extract systemUUID from token if possible, otherwise return empty + // For now, just allow all requests in test mode + return "", true + } + + // Reject if token is missing + if receivedToken == "" { + s.log.Info("Rejected request with missing discovery token") + return "", false + } + + // Get signing secret (thread-safe, loads on first call) + secret, err := s.getSigningSecret(ctx) + if err != nil { + s.log.Error(err, "Signing secret not available") + return "", false + } + + // Verify the signed token + systemUUID, timestamp, valid, err := metaltoken.VerifySignedDiscoveryToken(secret, receivedToken) + if err != nil || !valid { + // Token validation failed - could be due to secret rotation + // Clear cache and retry once with fresh secret + s.log.V(1).Info("Token validation failed, clearing cache and retrying", "error", err) + + s.signingSecretMu.Lock() + s.signingSecret = nil + s.signingSecretMu.Unlock() + + // Retry with fresh secret + secret, err = s.getSigningSecret(ctx) + if err != nil { + s.log.Error(err, "Failed to reload signing secret for retry") + return "", false + } + + systemUUID, timestamp, valid, err = metaltoken.VerifySignedDiscoveryToken(secret, receivedToken) + if err != nil { + s.log.Error(err, "Error verifying discovery token after retry") + return "", false + } + + if !valid { + s.log.Info("Rejected request with invalid discovery token after retry", "tokenLength", len(receivedToken)) + return "", false + } + } + + // Token is valid + s.log.V(1).Info("Validated discovery token", "systemUUID", systemUUID, "timestamp", timestamp) + return systemUUID, true +} + // routes registers the server's routes. func (s *Server) routes() { s.mux.HandleFunc("/register", s.registerHandler) @@ -52,6 +203,7 @@ func (s *Server) routes() { } // registerHandler handles the /register endpoint. +// Requires a valid discovery token for authentication. func (s *Server) registerHandler(w http.ResponseWriter, r *http.Request) { if r.Method != http.MethodPost { http.Error(w, "Only POST method is allowed", http.StatusMethodNotAllowed) @@ -64,6 +216,22 @@ func (s *Server) registerHandler(w http.ResponseWriter, r *http.Request) { return } + // Validate discovery token and extract systemUUID + systemUUID, valid := s.validateDiscoveryToken(r.Context(), reg.DiscoveryToken) + if !valid { + http.Error(w, "Unauthorized: invalid or missing discovery token", http.StatusUnauthorized) + s.log.Info("Rejected registration attempt with invalid token") + return + } + + // Verify the systemUUID from the token matches the payload (skip in unit test mode) + if s.k8sClient != nil && systemUUID != "" && systemUUID != reg.SystemUUID { + http.Error(w, "Unauthorized: systemUUID mismatch", http.StatusUnauthorized) + s.log.Info("Rejected registration attempt with mismatched systemUUID", + "claimed", reg.SystemUUID, "actual", systemUUID) + return + } + // Store the registration information. s.systemsStore.Store(reg.SystemUUID, reg.Data) s.log.Info("Registered system UUID", "uuid", reg.SystemUUID) @@ -135,6 +303,23 @@ func (s *Server) bootstateHandler(w http.ResponseWriter, r *http.Request) { s.log.Error(err, "Failed to decode bootstate payload") return } + + // Validate discovery token and extract systemUUID + systemUUID, valid := s.validateDiscoveryToken(r.Context(), payload.DiscoveryToken) + if !valid { + http.Error(w, "Unauthorized: invalid or missing token", http.StatusUnauthorized) + s.log.Info("Rejected bootstate attempt with invalid token") + return + } + + // Verify the systemUUID from the token matches the payload (skip in unit test mode) + if s.k8sClient != nil && systemUUID != "" && systemUUID != payload.SystemUUID { + http.Error(w, "Unauthorized: systemUUID mismatch", http.StatusUnauthorized) + s.log.Info("Rejected bootstate attempt with mismatched systemUUID", + "claimed", payload.SystemUUID, "actual", systemUUID) + return + } + s.log.Info("Received boot state for system", "SystemUUID", payload.SystemUUID, "BootState", payload.Booted) if !payload.Booted { w.WriteHeader(http.StatusOK) diff --git a/internal/token/constants.go b/internal/token/constants.go new file mode 100644 index 000000000..4229b21d3 --- /dev/null +++ b/internal/token/constants.go @@ -0,0 +1,11 @@ +// SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +package token + +// DiscoveryTokenSigningSecretName is the name of the Kubernetes Secret +// containing the signing key for discovery tokens +const DiscoveryTokenSigningSecretName = "discovery-token-signing-secret" + +// DiscoveryTokenSigningSecretKey is the data key within the Secret +const DiscoveryTokenSigningSecretKey = "signing-key" diff --git a/internal/token/token.go b/internal/token/token.go new file mode 100644 index 000000000..d88816a60 --- /dev/null +++ b/internal/token/token.go @@ -0,0 +1,121 @@ +// SPDX-FileCopyrightText: 2026 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +package token + +import ( + "crypto/rand" + "fmt" + "regexp" + "time" + + "github.com/golang-jwt/jwt/v5" +) + +// uuidRegex matches RFC 4122 UUID format (8-4-4-4-12 hex digits) +var uuidRegex = regexp.MustCompile(`^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$`) + +// GenerateSigningSecret generates a cryptographically secure signing secret +// for HMAC token generation. Returns a 32-byte (256-bit) secret. +func GenerateSigningSecret() ([]byte, error) { + secret := make([]byte, 32) + if _, err := rand.Read(secret); err != nil { + return nil, fmt.Errorf("failed to generate random secret: %w", err) + } + return secret, nil +} + +// GenerateSignedDiscoveryToken generates a JWT-based signed discovery token. +// The token uses HMAC-SHA256 (HS256) signing with standard JWT claims. +// +// This allows the registry to verify the token was issued by the controller +// for a specific systemUUID without storing any state. +// +// signingSecret: 32-byte shared secret between controller and registry +// systemUUID: The server's system UUID (must be RFC 4122 format) +// Returns: signed JWT token +func GenerateSignedDiscoveryToken(signingSecret []byte, systemUUID string) (string, error) { + if len(signingSecret) != 32 { + return "", fmt.Errorf("signing secret must be exactly 32 bytes") + } + + // Validate UUID format to prevent delimiter injection + if !uuidRegex.MatchString(systemUUID) { + return "", fmt.Errorf("systemUUID must be a valid RFC 4122 UUID format (got: %s)", systemUUID) + } + + // Create JWT claims + claims := jwt.MapClaims{ + "sub": systemUUID, // Subject: scoped to this server + "iat": jwt.NewNumericDate(time.Now()), // Issued at + "exp": jwt.NewNumericDate(time.Now().Add(1 * time.Hour)), // Expires in 1 hour + } + + // Create token with HS256 signing method + token := jwt.NewWithClaims(jwt.SigningMethodHS256, claims) + + // Sign token with secret key + tokenString, err := token.SignedString(signingSecret) + if err != nil { + return "", fmt.Errorf("failed to sign JWT token: %w", err) + } + + return tokenString, nil +} + +// VerifySignedDiscoveryToken verifies and extracts information from a JWT token. +// Returns (systemUUID, timestamp, valid, error). +// For invalid tokens, returns ("", 0, false, nil) - error is only for system errors. +func VerifySignedDiscoveryToken(signingSecret []byte, tokenString string) (string, int64, bool, error) { + if len(signingSecret) != 32 { + return "", 0, false, fmt.Errorf("signing secret must be exactly 32 bytes") + } + + // Parse and validate JWT token + token, err := jwt.Parse(tokenString, func(token *jwt.Token) (any, error) { + // Verify signing method is exactly HS256 (HMAC-SHA256) + if token.Method != jwt.SigningMethodHS256 { + return nil, fmt.Errorf("unexpected signing method: %v", token.Header["alg"]) + } + return signingSecret, nil + }) + + if err != nil { + return "", 0, false, nil // Invalid token, not system error + } + + // Verify token is valid and not expired + if !token.Valid { + return "", 0, false, nil + } + + // Extract claims + claims, ok := token.Claims.(jwt.MapClaims) + if !ok { + return "", 0, false, nil + } + + // Extract systemUUID from "sub" claim + systemUUID, ok := claims["sub"].(string) + if !ok || systemUUID == "" { + return "", 0, false, nil + } + + // Extract issued-at timestamp from "iat" claim + var issuedAt int64 + if iatClaim, ok := claims["iat"]; ok { + switch v := iatClaim.(type) { + case float64: + issuedAt = int64(v) + case int64: + issuedAt = v + default: + return "", 0, false, nil + } + } else { + // If no iat claim, use current time (backward compatible) + issuedAt = time.Now().Unix() + } + + return systemUUID, issuedAt, true, nil +} diff --git a/internal/token/token_test.go b/internal/token/token_test.go new file mode 100644 index 000000000..74e54c270 --- /dev/null +++ b/internal/token/token_test.go @@ -0,0 +1,240 @@ +// SPDX-FileCopyrightText: 2026 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +package token + +import ( + "strings" + "testing" + "time" + + "github.com/golang-jwt/jwt/v5" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestToken(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Token Suite") +} + +var _ = Describe("GenerateSigningSecret", func() { + It("should generate a 32-byte secret", func() { + secret, err := GenerateSigningSecret() + Expect(err).NotTo(HaveOccurred()) + Expect(secret).To(HaveLen(32)) + }) + + It("should generate unique secrets", func() { + secret1, err := GenerateSigningSecret() + Expect(err).NotTo(HaveOccurred()) + + secret2, err := GenerateSigningSecret() + Expect(err).NotTo(HaveOccurred()) + + Expect(secret1).NotTo(Equal(secret2)) + }) +}) + +var _ = Describe("GenerateSignedDiscoveryToken", func() { + var signingSecret []byte + + BeforeEach(func() { + var err error + signingSecret, err = GenerateSigningSecret() + Expect(err).NotTo(HaveOccurred()) + }) + + Context("Token Format", func() { + It("should generate a valid JWT token", func() { + token, err := GenerateSignedDiscoveryToken(signingSecret, "38947555-7742-3448-3784-823347823834") + Expect(err).NotTo(HaveOccurred()) + Expect(token).NotTo(BeEmpty()) + + // JWT format: header.payload.signature (3 parts separated by dots) + parts := strings.Split(token, ".") + Expect(parts).To(HaveLen(3)) + }) + + It("should generate different tokens for different UUIDs", func() { + token1, err := GenerateSignedDiscoveryToken(signingSecret, "38947555-7742-3448-3784-823347823834") + Expect(err).NotTo(HaveOccurred()) + + token2, err := GenerateSignedDiscoveryToken(signingSecret, "49947555-7742-3448-3784-823347823835") + Expect(err).NotTo(HaveOccurred()) + + Expect(token1).NotTo(Equal(token2)) + }) + + It("should generate different tokens at different times", func() { + token1, err := GenerateSignedDiscoveryToken(signingSecret, "38947555-7742-3448-3784-823347823834") + Expect(err).NotTo(HaveOccurred()) + + // Wait 1+ second to ensure different timestamp + time.Sleep(1100 * time.Millisecond) + token2, err := GenerateSignedDiscoveryToken(signingSecret, "38947555-7742-3448-3784-823347823834") + Expect(err).NotTo(HaveOccurred()) + Expect(token2).NotTo(Equal(token1)) + }) + + It("should return error for invalid secret length", func() { + shortSecret := []byte("too-short") + _, err := GenerateSignedDiscoveryToken(shortSecret, "38947555-7742-3448-3784-823347823834") + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("32 bytes")) + }) + }) +}) + +var _ = Describe("VerifySignedDiscoveryToken", func() { + var signingSecret []byte + + BeforeEach(func() { + var err error + signingSecret, err = GenerateSigningSecret() + Expect(err).NotTo(HaveOccurred()) + }) + + Context("Valid Tokens", func() { + It("should verify a valid token", func() { + systemUUID := "49947555-7742-3448-3784-823347823835" + token, err := GenerateSignedDiscoveryToken(signingSecret, systemUUID) + Expect(err).NotTo(HaveOccurred()) + + extractedUUID, timestamp, valid, err := VerifySignedDiscoveryToken(signingSecret, token) + Expect(err).NotTo(HaveOccurred()) + Expect(valid).To(BeTrue()) + Expect(extractedUUID).To(Equal(systemUUID)) + Expect(timestamp).To(BeNumerically(">", 0)) + }) + + It("should extract the correct systemUUID", func() { + systemUUID := "38947555-7742-3448-3784-823347823834" + token, err := GenerateSignedDiscoveryToken(signingSecret, systemUUID) + Expect(err).NotTo(HaveOccurred()) + + extractedUUID, _, valid, err := VerifySignedDiscoveryToken(signingSecret, token) + Expect(err).NotTo(HaveOccurred()) + Expect(valid).To(BeTrue()) + Expect(extractedUUID).To(Equal(systemUUID)) + }) + }) + + Context("Invalid Tokens", func() { + It("should reject token with wrong signature", func() { + token, err := GenerateSignedDiscoveryToken(signingSecret, "38947555-7742-3448-3784-823347823834") + Expect(err).NotTo(HaveOccurred()) + + // Use different secret for verification + wrongSecret, err := GenerateSigningSecret() + Expect(err).NotTo(HaveOccurred()) + + _, _, valid, err := VerifySignedDiscoveryToken(wrongSecret, token) + Expect(err).NotTo(HaveOccurred()) + Expect(valid).To(BeFalse()) + }) + + It("should reject expired token", func() { + systemUUID := "test-uuid-expired" + + // Create token with expiration in the past + claims := jwt.MapClaims{ + "sub": systemUUID, + "iat": jwt.NewNumericDate(time.Now().Add(-2 * time.Hour)), // Issued 2 hours ago + "exp": jwt.NewNumericDate(time.Now().Add(-30 * time.Minute)), // Expired 30 minutes ago + } + + token := jwt.NewWithClaims(jwt.SigningMethodHS256, claims) + expiredToken, err := token.SignedString(signingSecret) + Expect(err).NotTo(HaveOccurred()) + + // Verify expired token should fail + _, _, valid, err := VerifySignedDiscoveryToken(signingSecret, expiredToken) + Expect(err).NotTo(HaveOccurred()) + Expect(valid).To(BeFalse()) + }) + + It("should reject tampered token", func() { + token, err := GenerateSignedDiscoveryToken(signingSecret, "38947555-7742-3448-3784-823347823834") + Expect(err).NotTo(HaveOccurred()) + + // Tamper with token + tamperedToken := token[:len(token)-5] + "XXXXX" + + _, _, valid, err := VerifySignedDiscoveryToken(signingSecret, tamperedToken) + Expect(err).NotTo(HaveOccurred()) + Expect(valid).To(BeFalse()) + }) + + It("should reject invalid JWT format", func() { + testCases := []struct { + name string + token string + }{ + {"empty", ""}, + {"malformed", "not.a.valid.jwt"}, + {"incomplete", "header.payload"}, + {"random", "random-string-not-jwt"}, + } + + for _, tc := range testCases { + _, _, valid, err := VerifySignedDiscoveryToken(signingSecret, tc.token) + Expect(err).NotTo(HaveOccurred(), "test case: "+tc.name) + Expect(valid).To(BeFalse(), "test case: "+tc.name) + } + }) + + It("should reject token with invalid secret length", func() { + shortSecret := []byte("too-short") + _, _, _, err := VerifySignedDiscoveryToken(shortSecret, "any-token") + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("32 bytes")) + }) + }) + + Context("Token Security", func() { + It("should use JWT library constant-time comparison", func() { + // JWT library provides constant-time comparison via HMAC + // This test verifies the signature mechanism works + token, err := GenerateSignedDiscoveryToken(signingSecret, "38947555-7742-3448-3784-823347823834") + Expect(err).NotTo(HaveOccurred()) + + _, _, valid, err := VerifySignedDiscoveryToken(signingSecret, token) + Expect(err).NotTo(HaveOccurred()) + Expect(valid).To(BeTrue()) + }) + + It("should prevent replay of UUID substitution", func() { + // Generate token for uuid-1 + token1, err := GenerateSignedDiscoveryToken(signingSecret, "38947555-7742-3448-3784-823347823834") + Expect(err).NotTo(HaveOccurred()) + + // Verify - should extract uuid-1, not uuid-2 + extractedUUID, _, valid, err := VerifySignedDiscoveryToken(signingSecret, token1) + Expect(err).NotTo(HaveOccurred()) + Expect(valid).To(BeTrue()) + Expect(extractedUUID).To(Equal("38947555-7742-3448-3784-823347823834")) + Expect(extractedUUID).NotTo(Equal("49947555-7742-3448-3784-823347823835")) + }) + + It("should reject tokens with wrong algorithm", func() { + systemUUID := "test-uuid-alg" + + // Create token with HS512 instead of HS256 + claims := jwt.MapClaims{ + "sub": systemUUID, + "iat": jwt.NewNumericDate(time.Now()), + "exp": jwt.NewNumericDate(time.Now().Add(1 * time.Hour)), + } + + token := jwt.NewWithClaims(jwt.SigningMethodHS512, claims) // Wrong algorithm + wrongAlgToken, err := token.SignedString(signingSecret) + Expect(err).NotTo(HaveOccurred()) + + // Verify should fail due to algorithm mismatch + _, _, valid, err := VerifySignedDiscoveryToken(signingSecret, wrongAlgToken) + Expect(err).NotTo(HaveOccurred()) + Expect(valid).To(BeFalse()) + }) + }) +})