diff --git a/.gitignore b/.gitignore
index ea5ccca18..716d3fd5e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -185,6 +185,9 @@ timing_logs.csv
 config.toml
 .aider*
+# Kubernetes secrets — filled-in copies of k8s/secrets.yaml.example
+k8s/secrets.yaml
+
 CRUSH.md
 .crush/
diff --git a/k8s/README.md b/k8s/README.md
new file mode 100644
index 000000000..91bdb9607
--- /dev/null
+++ b/k8s/README.md
@@ -0,0 +1,351 @@
+# Running Honcho on Kubernetes
+
+This directory contains Kubernetes manifests for deploying all Honcho services. The setup is managed with [Kustomize](https://kustomize.io/), which is built into `kubectl` v1.14+.
+
+## What gets deployed
+
+| Resource | Kind | Notes |
+|----------|------|-------|
+| `postgres` | StatefulSet + Headless Service | pgvector/pgvector:pg15, 10 Gi PVC |
+| `redis` | StatefulSet + Headless Service | redis:8.2, 2 Gi PVC |
+| `honcho-api` | Deployment + ClusterIP Service | FastAPI server; runs migrations on start |
+| `honcho-deriver` | Deployment (no Service) | Background queue worker |
+| `honcho-api` | HorizontalPodAutoscaler | 1–5 replicas at 70% CPU |
+| `honcho-api` | PodDisruptionBudget | maxUnavailable: 1 (allows drains at minReplicas=1) |
+| NetworkPolicies | default-deny + allow rules | Postgres/Redis reachable only from API/Deriver |
+
+## Testing the manifests
+
+A pytest test suite ships with the manifests at `tests/test_k8s_manifests.py`. It renders the Kustomize output and asserts structural correctness, security posture, and architectural invariants without requiring a running cluster.
+
+```bash
+# Requires: kubectl on PATH, pytest, pyyaml
+pytest tests/test_k8s_manifests.py --noconftest -p no:xdist --override-ini="addopts=" -v
+```
+
+To also validate schemas against the official Kubernetes API spec and run security scoring:
+
+```bash
+# Schema validation (install: brew install kubeconform)
+kubectl kustomize k8s/ | kubeconform -strict -summary
+
+# Security and best-practices scoring (install: brew install kube-score)
+kubectl kustomize k8s/ | kube-score score -
+
+# Misconfiguration scanning (install: brew install trivy)
+trivy config k8s/
+```
+
+---
+
+## Prerequisites
+
+- `kubectl` v1.14+ (Kustomize is built in)
+- Docker (to build the Honcho image)
+- One of the following local Kubernetes distributions:
+  - [k3s](https://k3s.io/)
+  - [kind](https://kind.sigs.k8s.io/) (Kubernetes IN Docker)
+  - [Docker Desktop](https://docs.docker.com/desktop/kubernetes/) with Kubernetes enabled
+
+---
+
+## Quick start
+
+### Step 1 — Build the image
+
+From the repository root:
+
+```bash
+docker build -t honcho:latest .
+```
+
+### Step 2 — Load the image into your cluster
+
+The image must be available inside the cluster. How to do this depends on your distribution:
+
+**kind**
+
+```bash
+kind load docker-image honcho:latest --name <cluster-name>
+```
+
+If you haven't created a cluster yet:
+
+```bash
+kind create cluster --name honcho
+kind load docker-image honcho:latest --name honcho
+```
+
+**k3s**
+
+```bash
+docker save honcho:latest | sudo k3s ctr images import -
+```
+
+**Docker Desktop**
+
+No extra step needed — Docker Desktop shares its daemon with Kubernetes, so any locally-built image is already available.
+
+---
+
+### Step 3 — Configure secrets
+
+Honcho requires a small set of secrets before it can start.
+Copy the template and fill in your values:
+
+```bash
+cp k8s/secrets.yaml.example k8s/secrets.yaml
+```
+
+Open `k8s/secrets.yaml` and replace every placeholder:
+
+| Key | What to put here |
+|-----|-----------------|
+| `POSTGRES_PASSWORD` | A strong password for PostgreSQL |
+| `DB_CONNECTION_URI` | Full URI — use the same password as above |
+| `AUTH_JWT_SECRET` | Random 32-byte hex string (see below) |
+| `LLM_ANTHROPIC_API_KEY` | Your Anthropic API key (if using Anthropic models) |
+| `LLM_GEMINI_API_KEY` | Your Google Gemini API key (if using Gemini models) |
+| `LLM_OPENAI_API_KEY` | Your OpenAI API key (if using OpenAI models) |
+
+Generate `AUTH_JWT_SECRET`:
+
+```bash
+openssl rand -hex 32
+# or
+uv run python scripts/generate_jwt_secret.py
+```
+
+`k8s/secrets.yaml` is listed in `.gitignore` — **never commit it**.
+
+The `honcho` namespace must exist before the Secret can be applied. Create it first:
+
+```bash
+kubectl apply -f k8s/namespace.yaml
+```
+
+Then apply the secrets:
+
+```bash
+kubectl apply -f k8s/secrets.yaml
+```
+
+---
+
+### Step 4 — Deploy
+
+```bash
+kubectl apply -k k8s/
+```
+
+Kustomize assembles all resources and applies them in a single pass — it does **not** guarantee startup ordering. Startup ordering is handled by the init containers in the API and Deriver Deployments (they wait for Postgres and Redis to be ready before the main container starts). Watch pods come up:
+
+```bash
+kubectl get pods -n honcho --watch
+```
+
+Expected output once everything is running (pod name suffixes will vary):
+
+```
+NAME                    READY   STATUS    RESTARTS   AGE
+honcho-api-<hash>       1/1     Running   0          2m
+honcho-deriver-<hash>   1/1     Running   0          2m
+postgres-0              1/1     Running   0          3m
+redis-0                 1/1     Running   0          3m
+```
+
+---
+
+## Verify the deployment
+
+**Check the API**
+
+```bash
+kubectl port-forward svc/honcho-api 8000:80 -n honcho
+```
+
+Then in another terminal:
+
+```bash
+curl http://localhost:8000/openapi.json | head -5
+```
+
+You should see the OpenAPI spec JSON.
+
+**Check the deriver**
+
+```bash
+kubectl logs deploy/honcho-deriver -n honcho
+```
+
+You should see output like:
+
+```
+Starting deriver queue processor
+Running main loop
+ReconcilerScheduler started ...
+```
+
+The deriver has no HTTP server and no HTTP healthcheck — its health is managed by the Kubernetes restart policy. If the process exits, Kubernetes restarts it automatically.
+
+**Check the API logs**
+
+```bash
+kubectl logs deploy/honcho-api -n honcho
+```
+
+Look for the alembic migration output followed by the FastAPI startup message.
+
+---
+
+## Useful commands
+
+```bash
+# Show all Honcho resources
+kubectl get all -n honcho
+
+# Stream API logs
+kubectl logs -f deploy/honcho-api -n honcho
+
+# Stream deriver logs
+kubectl logs -f deploy/honcho-deriver -n honcho
+
+# Open a shell in the API pod
+kubectl exec -it deploy/honcho-api -n honcho -- bash
+
+# Port-forward to postgres (for local DB inspection)
+kubectl port-forward svc/postgres 5432:5432 -n honcho
+
+# Port-forward to redis
+kubectl port-forward svc/redis 6379:6379 -n honcho
+
+# Check HPA status
+kubectl get hpa -n honcho
+
+# Check NetworkPolicies
+kubectl get networkpolicy -n honcho
+
+# Tear down everything. Note: the namespace is part of the kustomization, so
+# this deletes it, and with it the PVCs and their data.
+kubectl delete -k k8s/
+
+# Tear down the workloads but keep the namespace, PVCs, and data
+kubectl delete deployment,statefulset,service,hpa,pdb,networkpolicy -n honcho --all
+```
+
+---
+
+## Configuration
+
+Non-secret configuration lives in `k8s/configmap.yaml`.
+Edit it and re-apply with `kubectl apply -k k8s/` to update the ConfigMap — but environment variables injected via `configMapKeyRef` are only read at pod start, so running pods will **not** pick up the change automatically. Trigger a rolling restart explicitly after re-applying:
+
+```bash
+kubectl rollout restart deployment/honcho-api deployment/honcho-deriver -n honcho
+```
+
+Notable settings:
+
+| Key | Default | Notes |
+|-----|---------|-------|
+| `AUTH_USE_AUTH` | `"true"` | Set to `"false"` only for local development |
+| `CACHE_ENABLED` | `"false"` | Set to `"true"` to activate Redis-backed caching |
+| `DERIVER_WORKERS` | `"1"` | Increase if the deriver queue is a bottleneck |
+| `METRICS_ENABLED` | `"false"` | Set to `"true"` to expose a Prometheus `/metrics` endpoint |
+
+For the full list of configuration options, see `config.toml.example` in the repository root.
+
+---
+
+## Exposing the API externally
+
+The default Service type is `ClusterIP`, which is only reachable inside the cluster. For external access:
+
+**NodePort** (works on all local distributions without extra tooling):
+
+Edit `k8s/api/service.yaml`, change `type: ClusterIP` to `type: NodePort`, and re-apply. Kubernetes will assign a random port in the 30000–32767 range. Find it with:
+
+```bash
+kubectl get svc honcho-api -n honcho
+```
+
+**LoadBalancer** (cloud providers; or bare-metal/kind with [metallb](https://metallb.universe.tf/)):
+
+Change `type: ClusterIP` to `type: LoadBalancer`. On cloud providers (EKS, GKE, AKS) this provisions a cloud load balancer automatically.
+
+**Ingress** (recommended for production):
+
+Keep the Service as `ClusterIP` and create an Ingress resource pointing to `honcho-api:80`. This works with any Ingress controller (nginx, traefik, etc.) and gives you TLS termination, path routing, and more.
+
+---
+
+## Autoscaling
+
+The HPA (`k8s/api/hpa.yaml`) scales the API between 1 and 5 replicas when average CPU exceeds 70%.
+
+**Requirements**: `metrics-server` must be running in your cluster.
+
+- **Docker Desktop**: included by default
+- **k3s**: included by default
+- **kind**: install manually:
+
+  ```bash
+  kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml
+  ```
+
+  On kind you may also need to patch metrics-server to disable TLS verification:
+
+  ```bash
+  kubectl patch deployment metrics-server -n kube-system \
+    --type='json' \
+    -p='[{"op":"add","path":"/spec/template/spec/containers/0/args/-","value":"--kubelet-insecure-tls"}]'
+  ```
+
+Check scaling activity:
+
+```bash
+kubectl describe hpa honcho-api -n honcho
+```
+
+---
+
+## Networking and security
+
+All pod-to-pod traffic within the `honcho` namespace is governed by NetworkPolicies (`k8s/network-policies.yaml`):
+
+- **Default deny** — all ingress is blocked unless explicitly allowed
+- **API** — accepts HTTP on port 8000 from any source (external traffic, port-forward)
+- **Postgres** — accepts connections on port 5432 only from API and Deriver pods
+- **Redis** — accepts connections on port 6379 only from API and Deriver pods
+- **Egress** — unrestricted; API and Deriver need outbound access to LLM provider APIs
+
+NetworkPolicies are only effective if the CNI plugin enforces them. Calico and Cilium do; Flannel on its own does not. Note that k3s, although it defaults to Flannel, ships an embedded network policy controller (based on kube-router), so policies are enforced there out of the box. On a plain Flannel cluster, install Calico or Cilium if you need enforcement. You can smoke-test enforcement with the check below.
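+A minimal sketch of such a check, assuming the stack is already deployed: launch a throwaway busybox pod that carries neither the `honcho-api` nor the `honcho-deriver` label and try to reach Postgres, which the policies should block. It reuses the same `nc -z` probe the init containers use.
+
+```bash
+# Unlabeled scratch pod: the connection attempt should be blocked.
+kubectl run netpol-check --rm -it --restart=Never -n honcho \
+  --image=busybox:1.36 -- nc -z -w 3 postgres 5432
+
+# With enforcement: nc times out and the pod exits non-zero.
+# Without enforcement: nc succeeds (exit code 0), meaning your CNI is
+# not applying the NetworkPolicies.
+```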
+ +--- + +## Production notes + +These manifests are a solid starting point, but production deployments should also consider: + +**Managed database and cache** +- Replace the in-cluster `postgres` StatefulSet with a managed service (Amazon RDS, Google Cloud SQL, Azure Database for PostgreSQL) with automated backups, point-in-time recovery, and multi-AZ failover. +- Replace in-cluster `redis` with Amazon ElastiCache or similar. +- Update `DB_CONNECTION_URI` and `CACHE_URL` in your secrets/configmap accordingly. + +**Secrets management** +- Instead of `k8s/secrets.yaml`, integrate with a secrets manager: + - [External Secrets Operator](https://external-secrets.io/) with AWS Secrets Manager, GCP Secret Manager, or HashiCorp Vault + - [Vault Agent Injector](https://developer.hashicorp.com/vault/docs/platform/k8s/injector) + - [Sealed Secrets](https://github.com/bitnami-labs/sealed-secrets) for GitOps workflows + +**Image tags** +- Replace `honcho:latest` in both Deployments with a specific, immutable tag (e.g., `honcho:v2.1.1` or a full digest). Using `latest` in production makes rollbacks harder and can cause inconsistent behavior. + +**TLS** +- Use cert-manager to issue TLS certificates and terminate TLS at your Ingress controller. + +**Ingress controller** +- Deploy nginx-ingress, traefik, or your preferred controller and create an Ingress resource pointing to `honcho-api:80` instead of using NodePort or LoadBalancer. + +**Observability** +- Set `METRICS_ENABLED: "true"` in the ConfigMap. +- Add a `Service` for the deriver on port 9090 and a `ServiceMonitor` (if using the Prometheus Operator) to scrape both the API and Deriver. +- The `docker/prometheus.yml` in the repository root shows the expected scrape targets. diff --git a/k8s/api/deployment.yaml b/k8s/api/deployment.yaml new file mode 100644 index 000000000..e84b2c3f9 --- /dev/null +++ b/k8s/api/deployment.yaml @@ -0,0 +1,233 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: honcho-api + namespace: honcho +spec: + # No static replicas — the HPA (api/hpa.yaml) manages the replica count. + # Setting replicas here would override HPA scaling on every kubectl apply. + selector: + matchLabels: + app: honcho-api + # maxSurge: 1, maxUnavailable: 0 — ensures only one new pod starts at a + # time so migrations (run inside the entrypoint) never execute concurrently. + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + template: + metadata: + labels: + app: honcho-api + spec: + # These pods don't call the Kubernetes API. + automountServiceAccountToken: false + # Pod-level security context: applies to all containers including init containers. + # Container-level securityContext blocks below add further restrictions. + securityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + initContainers: + # Block startup until PostgreSQL is accepting connections. + - name: wait-for-postgres + image: busybox:1.36 + imagePullPolicy: IfNotPresent + command: + - sh + - -c + - | + MAX_RETRIES=60; RETRY=0 + until nc -z postgres 5432; do + RETRY=$((RETRY + 1)) + if [ "$RETRY" -ge "$MAX_RETRIES" ]; then + echo "ERROR: postgres not reachable after $MAX_RETRIES attempts (120 s)" + exit 1 + fi + echo "Waiting for postgres... ($RETRY/$MAX_RETRIES)" + sleep 2 + done + echo "PostgreSQL is ready." 
+ resources: + requests: + memory: "16Mi" + cpu: "50m" + ephemeral-storage: "16Mi" + limits: + memory: "32Mi" + cpu: "100m" + ephemeral-storage: "64Mi" + securityContext: + runAsNonRoot: true + runAsUser: 65534 # nobody + runAsGroup: 65534 # nogroup + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: ["ALL"] + # Block startup until Redis is accepting connections. + - name: wait-for-redis + image: busybox:1.36 + imagePullPolicy: IfNotPresent + command: + - sh + - -c + - | + MAX_RETRIES=60; RETRY=0 + until nc -z redis 6379; do + RETRY=$((RETRY + 1)) + if [ "$RETRY" -ge "$MAX_RETRIES" ]; then + echo "ERROR: redis not reachable after $MAX_RETRIES attempts (120 s)" + exit 1 + fi + echo "Waiting for redis... ($RETRY/$MAX_RETRIES)" + sleep 2 + done + echo "Redis is ready." + resources: + requests: + memory: "16Mi" + cpu: "50m" + ephemeral-storage: "16Mi" + limits: + memory: "32Mi" + cpu: "100m" + ephemeral-storage: "64Mi" + securityContext: + runAsNonRoot: true + runAsUser: 65534 # nobody + runAsGroup: 65534 # nogroup + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: ["ALL"] + containers: + - name: api + # Use a specific image tag in production (e.g. honcho:v2.1.1) to + # ensure reproducible deployments and enable clean rollbacks. + image: honcho:latest + # IfNotPresent: works for locally-loaded images (kind/k3s) and + # registry images alike. With specific tags the image is cached + # after the first pull. + imagePullPolicy: IfNotPresent + # Matches the docker-compose entrypoint: runs idempotent alembic + # migrations, then starts the FastAPI server. + command: ["sh", "docker/entrypoint.sh"] + ports: + - containerPort: 8000 + env: + # ── Non-secret config ────────────────────────────────────────── + - name: AUTH_USE_AUTH + valueFrom: + configMapKeyRef: + name: honcho-config + key: AUTH_USE_AUTH + - name: LOG_LEVEL + valueFrom: + configMapKeyRef: + name: honcho-config + key: LOG_LEVEL + - name: CACHE_URL + valueFrom: + configMapKeyRef: + name: honcho-config + key: CACHE_URL + - name: CACHE_ENABLED + valueFrom: + configMapKeyRef: + name: honcho-config + key: CACHE_ENABLED + - name: METRICS_ENABLED + valueFrom: + configMapKeyRef: + name: honcho-config + key: METRICS_ENABLED + # ── Secrets ──────────────────────────────────────────────────── + - name: DB_CONNECTION_URI + valueFrom: + secretKeyRef: + name: honcho-secrets + key: DB_CONNECTION_URI + - name: AUTH_JWT_SECRET + valueFrom: + secretKeyRef: + name: honcho-secrets + key: AUTH_JWT_SECRET + # LLM keys are optional — unset keys simply won't appear in the + # container environment; the app will error if it calls a provider + # whose key is absent. + - name: LLM_ANTHROPIC_API_KEY + valueFrom: + secretKeyRef: + name: honcho-secrets + key: LLM_ANTHROPIC_API_KEY + optional: true + - name: LLM_GEMINI_API_KEY + valueFrom: + secretKeyRef: + name: honcho-secrets + key: LLM_GEMINI_API_KEY + optional: true + - name: LLM_OPENAI_API_KEY + valueFrom: + secretKeyRef: + name: honcho-secrets + key: LLM_OPENAI_API_KEY + optional: true + volumeMounts: + # Writable /tmp for uv cache (UV_CACHE_DIR=/tmp/uv-cache set in + # the Dockerfile) and any other runtime temp files. + - name: tmp + mountPath: /tmp + # startupProbe — absorbs slow migration runs before readiness/liveness + # take over. 60 × 5 s = up to 5 min for migrations to complete. 
+ startupProbe: + httpGet: + path: /openapi.json + port: 8000 + periodSeconds: 5 + timeoutSeconds: 5 + failureThreshold: 60 + # readinessProbe — checks that the HTTP server is serving responses. + # Probe type: httpGet (full request/response cycle). + readinessProbe: + httpGet: + path: /openapi.json + port: 8000 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + # livenessProbe — checks that the TCP port is open (lighter than a + # full HTTP request). Different probe type from readiness avoids the + # identical-probe anti-pattern: port open != app serving responses. + livenessProbe: + tcpSocket: + port: 8000 + initialDelaySeconds: 30 + periodSeconds: 15 + timeoutSeconds: 5 + failureThreshold: 3 + resources: + requests: + memory: "256Mi" + cpu: "250m" + ephemeral-storage: "256Mi" + limits: + memory: "1Gi" + cpu: "1000m" + ephemeral-storage: "1Gi" + securityContext: + # The Dockerfile creates a non-root 'app' user via adduser --system + # (UID in the 100–999 range). runAsNonRoot enforces this without + # hardcoding the UID, which varies by image build. + # Note: kube-score recommends UID > 10000; achieving that requires + # a Dockerfile change (adduser --uid 10001 app). + runAsNonRoot: true + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: ["ALL"] + volumes: + - name: tmp + emptyDir: {} diff --git a/k8s/api/hpa.yaml b/k8s/api/hpa.yaml new file mode 100644 index 000000000..4a7ba1ed8 --- /dev/null +++ b/k8s/api/hpa.yaml @@ -0,0 +1,28 @@ +# HorizontalPodAutoscaler for the Honcho API. +# +# Prerequisites: metrics-server must be running in the cluster. +# Docker Desktop: included by default +# k3s: included by default +# kind: install manually — kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml +# +# Scales the API between 1 and 5 replicas targeting 70% average CPU utilization. +# Adjust maxReplicas and targetAverageUtilization to match your workload. +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: honcho-api + namespace: honcho +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: honcho-api + minReplicas: 1 + maxReplicas: 5 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 70 diff --git a/k8s/api/pdb.yaml b/k8s/api/pdb.yaml new file mode 100644 index 000000000..a4a0d4434 --- /dev/null +++ b/k8s/api/pdb.yaml @@ -0,0 +1,15 @@ +# PodDisruptionBudget — limits voluntary disruption during node drains and +# cluster upgrades. maxUnavailable: 1 allows at most one pod to be evicted at a +# time regardless of current replica count, which avoids a deadlock with +# HPA minReplicas: 1 (minAvailable: 1 would block drains when only one replica +# is running). +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: honcho-api + namespace: honcho +spec: + maxUnavailable: 1 + selector: + matchLabels: + app: honcho-api diff --git a/k8s/api/service.yaml b/k8s/api/service.yaml new file mode 100644 index 000000000..91d642ef7 --- /dev/null +++ b/k8s/api/service.yaml @@ -0,0 +1,22 @@ +# ClusterIP Service — accessible within the cluster only. +# For local development use: kubectl port-forward svc/honcho-api 8000:80 -n honcho +# +# To expose externally, change type to: +# NodePort — works on all local k8s distributions without extra tooling +# LoadBalancer — requires a cloud provider LB or metallb on bare-metal/kind +# +# For production, prefer an Ingress controller (nginx, traefik, etc.) 
over +# changing the Service type. +apiVersion: v1 +kind: Service +metadata: + name: honcho-api + namespace: honcho +spec: + selector: + app: honcho-api + ports: + - name: http + port: 80 + targetPort: 8000 + type: ClusterIP diff --git a/k8s/configmap.yaml b/k8s/configmap.yaml new file mode 100644 index 000000000..8b7b37f6a --- /dev/null +++ b/k8s/configmap.yaml @@ -0,0 +1,26 @@ +# Non-secret application configuration for the Honcho API and Deriver pods. +# All sensitive values (passwords, API keys, JWT secrets) live in secrets.yaml. +apiVersion: v1 +kind: ConfigMap +metadata: + name: honcho-config + namespace: honcho +data: + # Authentication — set to "true" for all non-development deployments + AUTH_USE_AUTH: "true" + + # Logging + LOG_LEVEL: "INFO" + + # Redis cache — hostname matches the redis ClusterIP Service defined in redis/service.yaml + # Set CACHE_ENABLED to "true" to activate the cache (disabled by default). + CACHE_URL: "redis://redis:6379/0" + CACHE_ENABLED: "false" + + # Deriver background worker + DERIVER_ENABLED: "true" + DERIVER_WORKERS: "1" + + # Prometheus metrics — set to "true" and add a deriver Service on port 9090 + # if you are deploying Prometheus in the same cluster. + METRICS_ENABLED: "false" diff --git a/k8s/deriver/deployment.yaml b/k8s/deriver/deployment.yaml new file mode 100644 index 000000000..5558fa6f6 --- /dev/null +++ b/k8s/deriver/deployment.yaml @@ -0,0 +1,179 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: honcho-deriver + namespace: honcho +spec: + replicas: 1 + selector: + matchLabels: + app: honcho-deriver + template: + metadata: + labels: + app: honcho-deriver + spec: + automountServiceAccountToken: false + securityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + initContainers: + - name: wait-for-postgres + image: busybox:1.36 + imagePullPolicy: IfNotPresent + command: + - sh + - -c + - | + MAX_RETRIES=60; RETRY=0 + until nc -z postgres 5432; do + RETRY=$((RETRY + 1)) + if [ "$RETRY" -ge "$MAX_RETRIES" ]; then + echo "ERROR: postgres not reachable after $MAX_RETRIES attempts (120 s)" + exit 1 + fi + echo "Waiting for postgres... ($RETRY/$MAX_RETRIES)" + sleep 2 + done + echo "PostgreSQL is ready." + resources: + requests: + memory: "16Mi" + cpu: "50m" + ephemeral-storage: "16Mi" + limits: + memory: "32Mi" + cpu: "100m" + ephemeral-storage: "64Mi" + securityContext: + runAsNonRoot: true + runAsUser: 65534 # nobody + runAsGroup: 65534 # nogroup + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: ["ALL"] + - name: wait-for-redis + image: busybox:1.36 + imagePullPolicy: IfNotPresent + command: + - sh + - -c + - | + MAX_RETRIES=60; RETRY=0 + until nc -z redis 6379; do + RETRY=$((RETRY + 1)) + if [ "$RETRY" -ge "$MAX_RETRIES" ]; then + echo "ERROR: redis not reachable after $MAX_RETRIES attempts (120 s)" + exit 1 + fi + echo "Waiting for redis... ($RETRY/$MAX_RETRIES)" + sleep 2 + done + echo "Redis is ready." + resources: + requests: + memory: "16Mi" + cpu: "50m" + ephemeral-storage: "16Mi" + limits: + memory: "32Mi" + cpu: "100m" + ephemeral-storage: "64Mi" + securityContext: + runAsNonRoot: true + runAsUser: 65534 # nobody + runAsGroup: 65534 # nogroup + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: ["ALL"] + containers: + - name: deriver + image: honcho:latest + imagePullPolicy: IfNotPresent + # The deriver is a background queue worker — it is NOT an HTTP server. 
+ # No HTTP liveness probe is defined; Kubernetes restarts the container + # automatically if the process exits (restartPolicy: Always, the + # Deployment default). This resolves the healthcheck issue present + # when the image-level HEALTHCHECK is applied to a non-API service. + command: ["/app/.venv/bin/python", "-m", "src.deriver"] + env: + # ── Non-secret config ────────────────────────────────────────── + - name: LOG_LEVEL + valueFrom: + configMapKeyRef: + name: honcho-config + key: LOG_LEVEL + - name: CACHE_URL + valueFrom: + configMapKeyRef: + name: honcho-config + key: CACHE_URL + - name: CACHE_ENABLED + valueFrom: + configMapKeyRef: + name: honcho-config + key: CACHE_ENABLED + - name: DERIVER_ENABLED + valueFrom: + configMapKeyRef: + name: honcho-config + key: DERIVER_ENABLED + - name: DERIVER_WORKERS + valueFrom: + configMapKeyRef: + name: honcho-config + key: DERIVER_WORKERS + - name: METRICS_ENABLED + valueFrom: + configMapKeyRef: + name: honcho-config + key: METRICS_ENABLED + # ── Secrets ──────────────────────────────────────────────────── + - name: DB_CONNECTION_URI + valueFrom: + secretKeyRef: + name: honcho-secrets + key: DB_CONNECTION_URI + - name: LLM_ANTHROPIC_API_KEY + valueFrom: + secretKeyRef: + name: honcho-secrets + key: LLM_ANTHROPIC_API_KEY + optional: true + - name: LLM_GEMINI_API_KEY + valueFrom: + secretKeyRef: + name: honcho-secrets + key: LLM_GEMINI_API_KEY + optional: true + - name: LLM_OPENAI_API_KEY + valueFrom: + secretKeyRef: + name: honcho-secrets + key: LLM_OPENAI_API_KEY + optional: true + volumeMounts: + - name: tmp + mountPath: /tmp + resources: + # Generous limits — the deriver runs multi-step LLM tool-calling loops. + requests: + memory: "256Mi" + cpu: "250m" + ephemeral-storage: "256Mi" + limits: + memory: "2Gi" + cpu: "2000m" + ephemeral-storage: "1Gi" + securityContext: + runAsNonRoot: true + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: ["ALL"] + volumes: + - name: tmp + emptyDir: {} diff --git a/k8s/kustomization.yaml b/k8s/kustomization.yaml new file mode 100644 index 000000000..1896d1260 --- /dev/null +++ b/k8s/kustomization.yaml @@ -0,0 +1,31 @@ +# Honcho Kustomize entrypoint +# +# BEFORE running kubectl apply -k k8s/, apply secrets separately: +# cp k8s/secrets.yaml.example k8s/secrets.yaml +# # fill in all placeholder values in k8s/secrets.yaml +# kubectl apply -f k8s/secrets.yaml +# +# Then deploy everything else: +# kubectl apply -k k8s/ +# +# secrets.yaml is excluded here because it is gitignored and must be created +# locally by each operator — it should never be committed to version control. 
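+#
+# Tip: preview the fully rendered manifests without touching the cluster:
+#   kubectl kustomize k8s/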
+apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: honcho + +resources: + - namespace.yaml + - configmap.yaml + - network-policies.yaml + - postgres/configmap.yaml + - postgres/statefulset.yaml + - postgres/service.yaml + - redis/statefulset.yaml + - redis/service.yaml + - api/deployment.yaml + - api/service.yaml + - api/hpa.yaml + - api/pdb.yaml + - deriver/deployment.yaml diff --git a/k8s/namespace.yaml b/k8s/namespace.yaml new file mode 100644 index 000000000..c075fb30c --- /dev/null +++ b/k8s/namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: honcho diff --git a/k8s/network-policies.yaml b/k8s/network-policies.yaml new file mode 100644 index 000000000..a7362b939 --- /dev/null +++ b/k8s/network-policies.yaml @@ -0,0 +1,103 @@ +# Honcho NetworkPolicies +# +# Overall posture: default-deny all ingress within the honcho namespace, then +# explicitly allow only the connections that need to exist. Egress is unrestricted +# so that the API and Deriver can reach external LLM provider APIs. +# +# Egress is intentionally unrestricted: the API and Deriver pods must reach +# external LLM provider APIs (Anthropic, Google, OpenAI) over HTTPS. Adding +# egress policies would require listing all provider IP ranges, which are +# not stable. In production, use a network proxy or egress gateway to audit +# and control outbound traffic rather than restricting it at the pod level. +# +# Summary of allowed paths: +# [external / port-forward] ──► honcho-api:8000 +# honcho-api ──► postgres:5432 +# honcho-api ──► redis:6379 +# honcho-deriver ──► postgres:5432 +# honcho-deriver ──► redis:6379 + +--- +# 1. Default deny — drop all ingress to every pod in the namespace. +# Subsequent policies punch specific holes in this baseline. +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: default-deny-ingress + namespace: honcho +spec: + podSelector: {} + policyTypes: + - Ingress + +--- +# 2. Allow HTTP traffic into the API pods on port 8000. +# The empty `from` list means "from any source" — required for external +# traffic arriving via port-forward, NodePort, LoadBalancer, or Ingress. +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-api-ingress + namespace: honcho +spec: + podSelector: + matchLabels: + app: honcho-api + policyTypes: + - Ingress + ingress: + - ports: + - protocol: TCP + port: 8000 + +--- +# 3. Allow only the API and Deriver pods to reach PostgreSQL on port 5432. +# All other ingress to postgres pods is blocked by the default-deny rule. +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-postgres-from-honcho + namespace: honcho +spec: + podSelector: + matchLabels: + app: postgres + policyTypes: + - Ingress + ingress: + - from: + - podSelector: + matchLabels: + app: honcho-api + - podSelector: + matchLabels: + app: honcho-deriver + ports: + - protocol: TCP + port: 5432 + +--- +# 4. Allow only the API and Deriver pods to reach Redis on port 6379. +# All other ingress to redis pods is blocked by the default-deny rule. 
+apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-redis-from-honcho + namespace: honcho +spec: + podSelector: + matchLabels: + app: redis + policyTypes: + - Ingress + ingress: + - from: + - podSelector: + matchLabels: + app: honcho-api + - podSelector: + matchLabels: + app: honcho-deriver + ports: + - protocol: TCP + port: 6379 diff --git a/k8s/postgres/configmap.yaml b/k8s/postgres/configmap.yaml new file mode 100644 index 000000000..96ae0ffcf --- /dev/null +++ b/k8s/postgres/configmap.yaml @@ -0,0 +1,11 @@ +# Postgres init script — creates the pgvector extension when the data directory +# is first initialized. The honcho API (provision_db.py / alembic) also runs +# CREATE EXTENSION IF NOT EXISTS vector on startup, so this is belt-and-suspenders. +apiVersion: v1 +kind: ConfigMap +metadata: + name: postgres-init + namespace: honcho +data: + init.sql: | + CREATE EXTENSION IF NOT EXISTS vector; diff --git a/k8s/postgres/service.yaml b/k8s/postgres/service.yaml new file mode 100644 index 000000000..7e40d1fcf --- /dev/null +++ b/k8s/postgres/service.yaml @@ -0,0 +1,15 @@ +# Headless Service — required by the postgres StatefulSet for stable pod DNS. +# With clusterIP: None, DNS for 'postgres' resolves directly to the pod IP. +# For a single-replica StatefulSet this is transparent to clients. +apiVersion: v1 +kind: Service +metadata: + name: postgres + namespace: honcho +spec: + clusterIP: None + selector: + app: postgres + ports: + - port: 5432 + targetPort: 5432 diff --git a/k8s/postgres/statefulset.yaml b/k8s/postgres/statefulset.yaml new file mode 100644 index 000000000..eea7d3440 --- /dev/null +++ b/k8s/postgres/statefulset.yaml @@ -0,0 +1,105 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: postgres + namespace: honcho +spec: + serviceName: postgres + # Single replica is intentional — HA PostgreSQL requires an operator + # (e.g., CloudNativePG) that is beyond the scope of these sample manifests. + # For production, use a managed database service instead. + replicas: 1 + selector: + matchLabels: + app: postgres + template: + metadata: + labels: + app: postgres + spec: + # postgres pods don't call the Kubernetes API. + automountServiceAccountToken: false + containers: + - name: postgres + image: pgvector/pgvector:pg15 + # Explicit pull policy — specific tags are cached after first pull. + imagePullPolicy: IfNotPresent + ports: + - containerPort: 5432 + env: + - name: POSTGRES_USER + value: "postgres" + - name: POSTGRES_DB + value: "postgres" + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: honcho-secrets + key: POSTGRES_PASSWORD + # Store data in a subdirectory so the mount point itself can be + # owned by root while pgdata is owned by the postgres user. + - name: PGDATA + value: /var/lib/postgresql/data/pgdata + volumeMounts: + - name: postgres-data + mountPath: /var/lib/postgresql/data + - name: init-scripts + mountPath: /docker-entrypoint-initdb.d + readOnly: true + # startupProbe — absorbs the slow initial DB init before + # readiness/liveness probes take over (up to 5 min grace). + startupProbe: + exec: + command: ["pg_isready", "-U", "postgres"] + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 5 + failureThreshold: 60 + # readinessProbe — checks that postgres is accepting connections. + # Probe type: exec (full connection check). 
+ readinessProbe: + exec: + command: ["pg_isready", "-U", "postgres"] + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + # livenessProbe — checks that the TCP port is open (lighter weight + # than pg_isready; a different probe type avoids the identical-probe + # anti-pattern and avoids triggering restarts on query-level issues). + livenessProbe: + tcpSocket: + port: 5432 + initialDelaySeconds: 30 + periodSeconds: 15 + timeoutSeconds: 5 + failureThreshold: 3 + resources: + requests: + memory: "256Mi" + cpu: "250m" + ephemeral-storage: "100Mi" + limits: + memory: "1Gi" + cpu: "1000m" + ephemeral-storage: "500Mi" + # securityContext is intentionally omitted for the postgres container. + # The official postgres image starts as root and uses gosu/su-exec to + # drop to the 'postgres' user after initialising the data directory — + # a pattern that requires CAP_SETUID/CAP_SETGID. Dropping ALL + # capabilities breaks the init sequence, and setting runAsNonRoot or + # readOnlyRootFilesystem without corresponding volume mounts would also + # fail. For production, use a managed PostgreSQL service or a Kubernetes + # operator (e.g. CloudNativePG) that handles hardening at the operator + # level. Security at the network level is enforced by NetworkPolicies. + volumes: + - name: init-scripts + configMap: + name: postgres-init + volumeClaimTemplates: + - metadata: + name: postgres-data + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 10Gi diff --git a/k8s/redis/service.yaml b/k8s/redis/service.yaml new file mode 100644 index 000000000..4c3d9efbc --- /dev/null +++ b/k8s/redis/service.yaml @@ -0,0 +1,13 @@ +# Headless Service — required by the redis StatefulSet for stable pod DNS. +apiVersion: v1 +kind: Service +metadata: + name: redis + namespace: honcho +spec: + clusterIP: None + selector: + app: redis + ports: + - port: 6379 + targetPort: 6379 diff --git a/k8s/redis/statefulset.yaml b/k8s/redis/statefulset.yaml new file mode 100644 index 000000000..52bd3cb52 --- /dev/null +++ b/k8s/redis/statefulset.yaml @@ -0,0 +1,79 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: redis + namespace: honcho +spec: + serviceName: redis + # Single replica is intentional — Redis clustering/sentinel requires + # additional configuration beyond the scope of these sample manifests. + # For production, use a managed cache service instead. + replicas: 1 + selector: + matchLabels: + app: redis + template: + metadata: + labels: + app: redis + spec: + # redis pods don't call the Kubernetes API. + automountServiceAccountToken: false + containers: + - name: redis + image: redis:8.2 + imagePullPolicy: IfNotPresent + ports: + - containerPort: 6379 + # Enable persistence: snapshot every 60 s if at least 1 key changed. + command: ["redis-server", "--save", "60", "1", "--loglevel", "warning"] + volumeMounts: + - name: redis-data + mountPath: /data + # startupProbe — handles the brief period before redis is ready. + startupProbe: + exec: + command: ["redis-cli", "ping"] + initialDelaySeconds: 5 + periodSeconds: 3 + timeoutSeconds: 3 + failureThreshold: 20 + # readinessProbe — verifies redis is accepting commands. + # Probe type: exec (full PING/PONG roundtrip). + readinessProbe: + exec: + command: ["redis-cli", "ping"] + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 3 + # livenessProbe — checks that the TCP port is open. + # Different probe type from readiness to avoid the identical-probe + # anti-pattern: TCP open != redis accepting commands. 
+ livenessProbe: + tcpSocket: + port: 6379 + initialDelaySeconds: 15 + periodSeconds: 15 + timeoutSeconds: 3 + failureThreshold: 3 + resources: + requests: + memory: "128Mi" + cpu: "100m" + ephemeral-storage: "100Mi" + limits: + memory: "512Mi" + cpu: "500m" + ephemeral-storage: "500Mi" + # securityContext is intentionally omitted for the redis container. + # Some redis image versions start as root. For production, use a + # managed cache service. Network-level security is enforced by + # NetworkPolicies. + volumeClaimTemplates: + - metadata: + name: redis-data + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 2Gi diff --git a/k8s/secrets.yaml.example b/k8s/secrets.yaml.example new file mode 100644 index 000000000..721aed1cb --- /dev/null +++ b/k8s/secrets.yaml.example @@ -0,0 +1,42 @@ +# Honcho Kubernetes Secrets Template +# +# INSTRUCTIONS: +# 1. Copy this file to secrets.yaml (k8s/secrets.yaml is gitignored) +# 2. Replace every placeholder value below with real values +# 3. Apply once before deploying: kubectl apply -f k8s/secrets.yaml +# 4. NEVER commit k8s/secrets.yaml to version control +# +# POSTGRES_PASSWORD and the password embedded in DB_CONNECTION_URI MUST match. +# Generate AUTH_JWT_SECRET with: openssl rand -hex 32 +# or: uv run python scripts/generate_jwt_secret.py + +apiVersion: v1 +kind: Secret +metadata: + name: honcho-secrets + namespace: honcho +type: Opaque +stringData: + # ── PostgreSQL ────────────────────────────────────────────────────────────── + # Used by the postgres StatefulSet. Must match the password in DB_CONNECTION_URI. + POSTGRES_PASSWORD: "change-me-strong-password" + + # ── Application database connection ──────────────────────────────────────── + # The hostname 'postgres' resolves via the headless Service 'postgres' + # (clusterIP: None) directly to the pod's IP. + # Replace 'change-me-strong-password' with the same value as POSTGRES_PASSWORD. + DB_CONNECTION_URI: "postgresql+psycopg://postgres:change-me-strong-password@postgres:5432/postgres" + + # ── API authentication ────────────────────────────────────────────────────── + # Must be set when AUTH_USE_AUTH=true (the default in k8s/configmap.yaml). + # Generate: openssl rand -hex 32 + AUTH_JWT_SECRET: "change-me-generate-with-openssl-rand-hex-32" + + # ── LLM provider API keys ─────────────────────────────────────────────────── + # Uncomment and fill in the key(s) for the provider(s) you use. + # Keys that are omitted entirely will not be injected as environment variables. + # Do not leave these as empty strings — empty strings ARE injected and may + # confuse provider SDKs that treat them as valid (but invalid) credentials. + #LLM_ANTHROPIC_API_KEY: "sk-ant-..." + #LLM_GEMINI_API_KEY: "AIza..." + #LLM_OPENAI_API_KEY: "sk-..." diff --git a/tests/test_k8s_manifests.py b/tests/test_k8s_manifests.py new file mode 100644 index 000000000..42a73b5d4 --- /dev/null +++ b/tests/test_k8s_manifests.py @@ -0,0 +1,539 @@ +"""Unit tests for k8s manifest correctness and security posture. + +These tests render the Kustomize output and assert structural and security +properties without requiring a running Kubernetes cluster. They run in the +standard pytest suite alongside the application tests. + +Requirements: kubectl must be on PATH (kubectl kustomize is built-in since v1.14). 
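+
+Run them standalone with (same invocation as in k8s/README.md):
+    pytest tests/test_k8s_manifests.py --noconftest -p no:xdist --override-ini="addopts=" -v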
+""" + +import subprocess +from pathlib import Path +from typing import Any + +import pytest +import yaml + +# --------------------------------------------------------------------------- +# Fixture: render kustomize output once per session +# --------------------------------------------------------------------------- + +K8S_DIR = Path(__file__).parent.parent / "k8s" + + +@pytest.fixture(scope="session") +def manifests() -> list[dict[str, Any]]: + """Render `kubectl kustomize k8s/` and return parsed YAML documents.""" + try: + result = subprocess.run( + ["kubectl", "kustomize", str(K8S_DIR)], + capture_output=True, + text=True, + timeout=30, + ) + except FileNotFoundError: + pytest.skip("kubectl not found on PATH — skipping k8s manifest tests") + except subprocess.TimeoutExpired: + pytest.fail("kubectl kustomize timed out after 30 s") + assert result.returncode == 0, ( + f"kubectl kustomize failed:\n{result.stderr}" + ) + docs = list(yaml.safe_load_all(result.stdout)) + return [d for d in docs if d is not None] + + +def _by_kind_name( + manifests: list[dict[str, Any]], kind: str, name: str +) -> dict[str, Any]: + """Return a single manifest by kind and metadata.name.""" + matches = [ + m + for m in manifests + if m.get("kind") == kind and m.get("metadata", {}).get("name") == name + ] + assert len(matches) == 1, f"Expected exactly 1 {kind}/{name}, got {len(matches)}" + return matches[0] + + +# --------------------------------------------------------------------------- +# Rendering smoke test +# --------------------------------------------------------------------------- + + +def test_kustomize_renders_without_error(manifests: list[dict[str, Any]]): + """Kustomize must produce at least the core resource types.""" + kinds = {m["kind"] for m in manifests} + expected = { + "Namespace", + "ConfigMap", + "NetworkPolicy", + "StatefulSet", + "Service", + "Deployment", + "HorizontalPodAutoscaler", + "PodDisruptionBudget", + } + assert expected.issubset(kinds), f"Missing kinds: {expected - kinds}" + + +def test_all_resources_in_honcho_namespace(manifests: list[dict[str, Any]]): + """Every namespaced resource must be in the 'honcho' namespace.""" + # These kinds don't carry a namespace. 
+ cluster_scoped = {"Namespace"} + for m in manifests: + if m["kind"] in cluster_scoped: + continue + ns = m.get("metadata", {}).get("namespace") + assert ns == "honcho", ( + f"{m['kind']}/{m['metadata']['name']} has namespace '{ns}', expected 'honcho'" + ) + + +# --------------------------------------------------------------------------- +# Network policy tests +# --------------------------------------------------------------------------- + + +def test_default_deny_ingress_policy_exists(manifests: list[dict[str, Any]]): + """A default-deny NetworkPolicy must select all pods and allow no ingress.""" + policy = _by_kind_name(manifests, "NetworkPolicy", "default-deny-ingress") + assert policy["spec"]["podSelector"] == {}, ( + "default-deny must select all pods (empty podSelector)" + ) + assert "Ingress" in policy["spec"]["policyTypes"] + assert policy["spec"].get("ingress", []) == [], ( + "default-deny policy must not contain any ingress allow rules" + ) + + +def test_postgres_network_policy_restricts_access(manifests: list[dict[str, Any]]): + """Only honcho-api and honcho-deriver pods may reach postgres — no other sources.""" + policy = _by_kind_name(manifests, "NetworkPolicy", "allow-postgres-from-honcho") + assert policy["spec"].get("podSelector", {}).get("matchLabels") == {"app": "postgres"}, ( + "allow-postgres-from-honcho must select postgres pods" + ) + ingress_rules = policy["spec"].get("ingress", []) + assert len(ingress_rules) == 1, ( + f"allow-postgres-from-honcho must have exactly 1 ingress rule, got {len(ingress_rules)}" + ) + allowed_labels = { + frozenset(src.get("podSelector", {}).get("matchLabels", {}).items()) + for src in ingress_rules[0]["from"] + } + assert allowed_labels == { + frozenset({"app": "honcho-api"}.items()), + frozenset({"app": "honcho-deriver"}.items()), + }, f"postgres ingress sources must be exactly api+deriver, got: {allowed_labels}" + ports = {p["port"] for p in ingress_rules[0]["ports"]} + assert ports == {5432}, f"postgres ingress must allow only port 5432, got: {ports}" + + +def test_redis_network_policy_restricts_access(manifests: list[dict[str, Any]]): + """Only honcho-api and honcho-deriver pods may reach redis — no other sources.""" + policy = _by_kind_name(manifests, "NetworkPolicy", "allow-redis-from-honcho") + assert policy["spec"].get("podSelector", {}).get("matchLabels") == {"app": "redis"}, ( + "allow-redis-from-honcho must select redis pods" + ) + ingress_rules = policy["spec"].get("ingress", []) + assert len(ingress_rules) == 1, ( + f"allow-redis-from-honcho must have exactly 1 ingress rule, got {len(ingress_rules)}" + ) + allowed_labels = { + frozenset(src.get("podSelector", {}).get("matchLabels", {}).items()) + for src in ingress_rules[0]["from"] + } + assert allowed_labels == { + frozenset({"app": "honcho-api"}.items()), + frozenset({"app": "honcho-deriver"}.items()), + }, f"redis ingress sources must be exactly api+deriver, got: {allowed_labels}" + ports = {p["port"] for p in ingress_rules[0]["ports"]} + assert ports == {6379}, f"redis ingress must allow only port 6379, got: {ports}" + + +def test_four_network_policies_present(manifests: list[dict[str, Any]]): + """Exactly 4 NetworkPolicy resources must be present.""" + policies = [m for m in manifests if m["kind"] == "NetworkPolicy"] + names = {m["metadata"]["name"] for m in policies} + assert names == { + "default-deny-ingress", + "allow-api-ingress", + "allow-postgres-from-honcho", + "allow-redis-from-honcho", + } + + +def test_allow_api_ingress_policy_semantics(manifests: 
list[dict[str, Any]]): + """allow-api-ingress must target API pods and allow exactly port 8000/TCP.""" + policy = _by_kind_name(manifests, "NetworkPolicy", "allow-api-ingress") + deployment = _by_kind_name(manifests, "Deployment", "honcho-api") + + # Policy podSelector must match the Deployment's pod template labels. + pod_labels = deployment["spec"]["template"]["metadata"]["labels"] + assert policy["spec"]["podSelector"].get("matchLabels") == pod_labels, ( + f"allow-api-ingress podSelector {policy['spec']['podSelector']} " + f"must match API pod template labels {pod_labels}" + ) + + # Ingress must have exactly one rule allowing exactly port 8000/TCP. + ingress_rules = policy["spec"]["ingress"] + assert len(ingress_rules) == 1, ( + f"allow-api-ingress must have exactly 1 ingress rule, got {len(ingress_rules)}" + ) + ingress_ports = ingress_rules[0]["ports"] + assert len(ingress_ports) == 1, ( + f"allow-api-ingress ingress rule must specify exactly 1 port, got {ingress_ports}" + ) + entry = ingress_ports[0] + assert entry.get("port") == 8000, ( + f"allow-api-ingress must allow port 8000, got {entry.get('port')}" + ) + assert entry.get("protocol", "TCP") == "TCP", ( + f"allow-api-ingress must use TCP protocol, got {entry.get('protocol')}" + ) + + +# --------------------------------------------------------------------------- +# StatefulSet headless Service tests +# --------------------------------------------------------------------------- + + +def test_postgres_service_is_headless(manifests: list[dict[str, Any]]): + """The postgres Service must be headless (clusterIP: None) for the StatefulSet.""" + svc = _by_kind_name(manifests, "Service", "postgres") + assert svc["spec"].get("clusterIP") == "None", ( + "postgres Service must be headless (clusterIP: None)" + ) + + +def test_redis_service_is_headless(manifests: list[dict[str, Any]]): + """The redis Service must be headless (clusterIP: None) for the StatefulSet.""" + svc = _by_kind_name(manifests, "Service", "redis") + assert svc["spec"].get("clusterIP") == "None", ( + "redis Service must be headless (clusterIP: None)" + ) + + +def test_postgres_statefulset_servicename_matches_service( + manifests: list[dict[str, Any]], +): + """StatefulSet.spec.serviceName must match the headless Service name.""" + sts = _by_kind_name(manifests, "StatefulSet", "postgres") + assert sts["spec"]["serviceName"] == "postgres" + + +def test_redis_statefulset_servicename_matches_service( + manifests: list[dict[str, Any]], +): + sts = _by_kind_name(manifests, "StatefulSet", "redis") + assert sts["spec"]["serviceName"] == "redis" + + +# --------------------------------------------------------------------------- +# HPA / Deployment replica interaction +# --------------------------------------------------------------------------- + + +def test_api_deployment_has_no_static_replicas(manifests: list[dict[str, Any]]): + """The HPA-managed Deployment must not declare a static replica count. + + A static replicas field overrides the HPA on every kubectl apply. 
+ """ + deployment = _by_kind_name(manifests, "Deployment", "honcho-api") + assert "replicas" not in deployment["spec"], ( + "honcho-api Deployment must not set replicas — the HPA manages this" + ) + + +def test_hpa_targets_api_deployment(manifests: list[dict[str, Any]]): + hpa = _by_kind_name(manifests, "HorizontalPodAutoscaler", "honcho-api") + ref = hpa["spec"]["scaleTargetRef"] + assert ref["kind"] == "Deployment" + assert ref["name"] == "honcho-api" + assert hpa["spec"]["minReplicas"] == 1, ( + f"HPA minReplicas must be 1, got {hpa['spec']['minReplicas']}" + ) + assert hpa["spec"]["maxReplicas"] == 5, ( + f"HPA maxReplicas must be 5, got {hpa['spec']['maxReplicas']}" + ) + cpu_metrics = [ + m for m in hpa["spec"]["metrics"] + if m.get("type") == "Resource" + and m.get("resource", {}).get("name") == "cpu" + ] + assert cpu_metrics, "HPA must define a CPU Resource metric" + utilization = cpu_metrics[0]["resource"]["target"].get("averageUtilization") + assert utilization == 70, ( + f"HPA CPU target averageUtilization must be 70, got {utilization}" + ) + + +def test_pdb_selects_api_pods(manifests: list[dict[str, Any]]): + pdb = _by_kind_name(manifests, "PodDisruptionBudget", "honcho-api") + assert pdb["spec"]["selector"]["matchLabels"] == {"app": "honcho-api"} + # maxUnavailable: 1 allows node drains even at minReplicas=1; minAvailable: 1 + # would deadlock when only one replica is running. + assert "maxUnavailable" in pdb["spec"] and "minAvailable" not in pdb["spec"], ( + "PDB must not define both maxUnavailable and minAvailable; " + "use maxUnavailable: 1 only" + ) + assert pdb["spec"].get("maxUnavailable") == 1, ( + "PDB must use maxUnavailable: 1 to avoid deadlock with HPA minReplicas: 1" + ) + + +# --------------------------------------------------------------------------- +# Deriver — no HTTP probe, correct command +# --------------------------------------------------------------------------- + + +def test_deriver_has_no_http_probes(manifests: list[dict[str, Any]]): + """The deriver is a queue worker, not an HTTP server. + + Neither a liveness nor a readiness HTTP probe must be present — the + deriver has no HTTP server to probe. Health is managed by the restart + policy (restartPolicy: Always, the Deployment default). 
+ """ + deployment = _by_kind_name(manifests, "Deployment", "honcho-deriver") + container = next( + c + for c in deployment["spec"]["template"]["spec"]["containers"] + if c["name"] == "deriver" + ) + assert "livenessProbe" not in container, ( + "deriver must not have a livenessProbe — it is not an HTTP server" + ) + readiness = container.get("readinessProbe", {}) + assert "httpGet" not in readiness, ( + "deriver must not have an HTTP readinessProbe — it is not an HTTP server" + ) + + +def test_deriver_runs_correct_command(manifests: list[dict[str, Any]]): + deployment = _by_kind_name(manifests, "Deployment", "honcho-deriver") + container = next( + c + for c in deployment["spec"]["template"]["spec"]["containers"] + if c["name"] == "deriver" + ) + assert container["command"] == ["/app/.venv/bin/python", "-m", "src.deriver"] + + +# --------------------------------------------------------------------------- +# API — differentiated probe types +# --------------------------------------------------------------------------- + + +def test_api_readiness_probe_is_http(manifests: list[dict[str, Any]]): + """Readiness probe must use httpGet to verify the app is serving requests.""" + deployment = _by_kind_name(manifests, "Deployment", "honcho-api") + container = next( + c + for c in deployment["spec"]["template"]["spec"]["containers"] + if c["name"] == "api" + ) + readiness = container["readinessProbe"] + assert "httpGet" in readiness, "readinessProbe must use httpGet" + assert readiness["httpGet"]["path"] == "/openapi.json" + + +def test_api_liveness_probe_is_tcp(manifests: list[dict[str, Any]]): + """Liveness probe must use tcpSocket (different type from readiness). + + Using different probe types prevents the identical-probe anti-pattern: + port open (liveness) != app serving responses (readiness). 
+ """ + deployment = _by_kind_name(manifests, "Deployment", "honcho-api") + container = next( + c + for c in deployment["spec"]["template"]["spec"]["containers"] + if c["name"] == "api" + ) + liveness = container["livenessProbe"] + assert "tcpSocket" in liveness, "livenessProbe must use tcpSocket" + assert "httpGet" not in liveness, ( + "livenessProbe must not use httpGet (would be identical to readinessProbe)" + ) + + +# --------------------------------------------------------------------------- +# Security context tests +# --------------------------------------------------------------------------- + + +def _get_container(deployment: dict[str, Any], name: str) -> dict[str, Any]: + containers = ( + deployment["spec"]["template"]["spec"].get("containers", []) + + deployment["spec"]["template"]["spec"].get("initContainers", []) + ) + return next(c for c in containers if c["name"] == name) + + +def test_api_container_runs_as_non_root(manifests: list[dict[str, Any]]): + deployment = _by_kind_name(manifests, "Deployment", "honcho-api") + # Pod-level + pod_sc = deployment["spec"]["template"]["spec"].get("securityContext", {}) + assert pod_sc.get("runAsNonRoot") is True, "Pod-level runAsNonRoot must be true" + # Container-level + container = _get_container(deployment, "api") + assert container["securityContext"].get("runAsNonRoot") is True + + +def test_api_container_no_privilege_escalation(manifests: list[dict[str, Any]]): + deployment = _by_kind_name(manifests, "Deployment", "honcho-api") + container = _get_container(deployment, "api") + assert container["securityContext"].get("allowPrivilegeEscalation") is False + + +def test_api_container_drops_all_capabilities(manifests: list[dict[str, Any]]): + deployment = _by_kind_name(manifests, "Deployment", "honcho-api") + container = _get_container(deployment, "api") + drop = container["securityContext"].get("capabilities", {}).get("drop", []) + assert "ALL" in drop + + +def test_api_container_readonly_root_filesystem(manifests: list[dict[str, Any]]): + deployment = _by_kind_name(manifests, "Deployment", "honcho-api") + container = _get_container(deployment, "api") + assert container["securityContext"].get("readOnlyRootFilesystem") is True + + +def test_api_pod_has_seccomp_profile(manifests: list[dict[str, Any]]): + deployment = _by_kind_name(manifests, "Deployment", "honcho-api") + pod_sc = deployment["spec"]["template"]["spec"].get("securityContext", {}) + assert pod_sc.get("seccompProfile", {}).get("type") == "RuntimeDefault" + + +def test_deriver_container_security_mirrors_api(manifests: list[dict[str, Any]]): + """Deriver must have the same security posture as the API.""" + deployment = _by_kind_name(manifests, "Deployment", "honcho-deriver") + pod_sc = deployment["spec"]["template"]["spec"].get("securityContext", {}) + assert pod_sc.get("runAsNonRoot") is True + assert pod_sc.get("seccompProfile", {}).get("type") == "RuntimeDefault" + container = _get_container(deployment, "deriver") + sc = container["securityContext"] + assert sc.get("runAsNonRoot") is True + assert sc.get("allowPrivilegeEscalation") is False + assert sc.get("readOnlyRootFilesystem") is True + assert "ALL" in sc.get("capabilities", {}).get("drop", []) + + +def test_init_containers_run_as_nobody(manifests: list[dict[str, Any]]): + """busybox init containers must run as UID/GID 65534 (nobody/nogroup).""" + for deployment_name in ("honcho-api", "honcho-deriver"): + deployment = _by_kind_name(manifests, "Deployment", deployment_name) + init_containers = 
deployment["spec"]["template"]["spec"].get( + "initContainers", [] + ) + assert len(init_containers) >= 2, ( + f"{deployment_name} must have at least 2 init containers" + ) + for ic in init_containers: + sc = ic.get("securityContext", {}) + assert sc.get("runAsUser") == 65534, ( + f"Init container '{ic['name']}' in {deployment_name} must run as UID 65534" + ) + assert sc.get("runAsGroup") == 65534, ( + f"Init container '{ic['name']}' in {deployment_name} must run as GID 65534" + ) + + +def test_all_workloads_no_service_account_token(manifests: list[dict[str, Any]]): + """No workload pod should mount a service account token.""" + workloads = [ + ("Deployment", "honcho-api"), + ("Deployment", "honcho-deriver"), + ("StatefulSet", "postgres"), + ("StatefulSet", "redis"), + ] + for kind, name in workloads: + workload = _by_kind_name(manifests, kind, name) + assert ( + workload["spec"]["template"]["spec"].get("automountServiceAccountToken") + is False + ), f"{kind}/{name} must set automountServiceAccountToken: false" + + +# --------------------------------------------------------------------------- +# Secrets vs ConfigMap split +# --------------------------------------------------------------------------- + + +def test_configmap_contains_no_secret_keys(manifests: list[dict[str, Any]]): + """Sensitive values must not appear in the ConfigMap.""" + cm = _by_kind_name(manifests, "ConfigMap", "honcho-config") + data = cm.get("data", {}) + forbidden = { + "DB_CONNECTION_URI", + "AUTH_JWT_SECRET", + "POSTGRES_PASSWORD", + "LLM_ANTHROPIC_API_KEY", + "LLM_GEMINI_API_KEY", + "LLM_OPENAI_API_KEY", + } + leaked = forbidden & set(data.keys()) + assert not leaked, f"Secret key(s) found in ConfigMap: {leaked}" + + +def test_db_uri_comes_from_secret(manifests: list[dict[str, Any]]): + """DB_CONNECTION_URI env var must be sourced from a Secret, not a ConfigMap.""" + for deployment_name in ("honcho-api", "honcho-deriver"): + deployment = _by_kind_name(manifests, "Deployment", deployment_name) + container = next( + c + for c in deployment["spec"]["template"]["spec"]["containers"] + if c["name"] in ("api", "deriver") + ) + db_env = next( + e for e in container["env"] if e["name"] == "DB_CONNECTION_URI" + ) + assert "secretKeyRef" in db_env.get("valueFrom", {}), ( + f"DB_CONNECTION_URI in {deployment_name} must come from a secretKeyRef" + ) + + +# --------------------------------------------------------------------------- +# Readiness/liveness probe differentiation for StatefulSets +# --------------------------------------------------------------------------- + + +def test_postgres_probes_use_different_types(manifests: list[dict[str, Any]]): + """postgres readiness (exec) and liveness (tcpSocket) must differ.""" + sts = _by_kind_name(manifests, "StatefulSet", "postgres") + container = sts["spec"]["template"]["spec"]["containers"][0] + assert "exec" in container["readinessProbe"] + assert "tcpSocket" in container["livenessProbe"] + + +def test_redis_probes_use_different_types(manifests: list[dict[str, Any]]): + """redis readiness (exec) and liveness (tcpSocket) must differ.""" + sts = _by_kind_name(manifests, "StatefulSet", "redis") + container = sts["spec"]["template"]["spec"]["containers"][0] + assert "exec" in container["readinessProbe"] + assert "tcpSocket" in container["livenessProbe"] + + +# --------------------------------------------------------------------------- +# Resource limits +# --------------------------------------------------------------------------- + + +def _assert_has_resource_limits(container: 
dict[str, Any], label: str): + resources = container.get("resources", {}) + assert "limits" in resources, f"{label}: missing resource limits" + assert "requests" in resources, f"{label}: missing resource requests" + for kind in ("limits", "requests"): + for field in ("memory", "cpu", "ephemeral-storage"): + assert field in resources[kind], ( + f"{label}: missing resources.{kind}.{field}" + ) + + +def test_all_containers_have_resource_limits(manifests: list[dict[str, Any]]): + """Every container (including init containers) must declare resource limits.""" + for m in manifests: + if m["kind"] not in ("Deployment", "StatefulSet"): + continue + name = m["metadata"]["name"] + spec = m["spec"]["template"]["spec"] + for c in spec.get("containers", []) + spec.get("initContainers", []): + _assert_has_resource_limits(c, f"{name}/{c['name']}")
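+
+
+# ---------------------------------------------------------------------------
+# Illustrative extra check (a sketch, not referenced elsewhere in this PR):
+# every configMapKeyRef in the API and Deriver Deployments must point at a
+# key that actually exists in the honcho-config ConfigMap, so a typo in
+# either file fails fast at test time instead of at pod startup.
+# ---------------------------------------------------------------------------
+
+
+def test_configmap_key_refs_resolve(manifests: list[dict[str, Any]]):
+    cm = _by_kind_name(manifests, "ConfigMap", "honcho-config")
+    data_keys = set(cm.get("data", {}).keys())
+    for deployment_name in ("honcho-api", "honcho-deriver"):
+        deployment = _by_kind_name(manifests, "Deployment", deployment_name)
+        for container in deployment["spec"]["template"]["spec"]["containers"]:
+            for env_var in container.get("env", []):
+                ref = env_var.get("valueFrom", {}).get("configMapKeyRef")
+                if ref and ref.get("name") == "honcho-config":
+                    assert ref["key"] in data_keys, (
+                        f"{deployment_name}/{container['name']} env "
+                        f"{env_var['name']} references missing ConfigMap key "
+                        f"{ref['key']}"
+                    )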