From e72beb546b94a98ddd695c17ac69321dfcbe01d3 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Tue, 31 Mar 2026 12:57:18 +0530 Subject: [PATCH 01/40] feat(k3s): add ops-mgmt cluster configs and tooling Salvage useful infrastructure from feat/k3s-rancher onto a clean branch aligned with the Universe platform direction. Adds: - SOPS+age config for application secrets encryption - ops-mgmt cluster configs (security hardening, Rancher backup schedule) - DO inventory fix (add id attribute) - Ansible config vault password comment --- .sops.yaml | 3 ++ ansible/ansible.cfg | 2 + ansible/inventory/digitalocean.yml | 1 + k3s/justfile | 27 ++++++++++ k3s/ops-mgmt/.gitignore | 5 ++ k3s/ops-mgmt/README.md | 52 +++++++++++++++++++ .../apps/rancher/backup-schedule.yaml | 18 +++++++ .../cluster/security/audit-policy.yaml | 38 ++++++++++++++ .../cluster/security/pss-admission.yaml | 31 +++++++++++ 9 files changed, 177 insertions(+) create mode 100644 .sops.yaml create mode 100644 k3s/justfile create mode 100644 k3s/ops-mgmt/.gitignore create mode 100644 k3s/ops-mgmt/README.md create mode 100644 k3s/ops-mgmt/apps/rancher/backup-schedule.yaml create mode 100644 k3s/ops-mgmt/cluster/security/audit-policy.yaml create mode 100644 k3s/ops-mgmt/cluster/security/pss-admission.yaml diff --git a/.sops.yaml b/.sops.yaml new file mode 100644 index 000000000..589ca98e9 --- /dev/null +++ b/.sops.yaml @@ -0,0 +1,3 @@ +creation_rules: + - path_regex: \.secrets\.env$ + age: "age1dj2tkgtplys5whp0rnw8kd4ell9m6jgfac5d8m8nprmgap70047sgfjtfr" diff --git a/ansible/ansible.cfg b/ansible/ansible.cfg index 682945304..e16c12d56 100644 --- a/ansible/ansible.cfg +++ b/ansible/ansible.cfg @@ -8,6 +8,8 @@ inventory = ./inventory home = ./.ansible collections_path = ./.ansible/collections:./roles roles_path = ./.ansible/roles:./roles +# Vault password: pass via CLI flag when needed +# ansible-playbook ... 
--vault-password-file <(op read "op://Service-Automation/Ansible-Vault-Password/Ansible-Vault-Password") [inventory] enable_plugins = yaml, ini, toml, community.general.linode, community.digitalocean.digitalocean diff --git a/ansible/inventory/digitalocean.yml b/ansible/inventory/digitalocean.yml index c9f0eeb36..464576de6 100644 --- a/ansible/inventory/digitalocean.yml +++ b/ansible/inventory/digitalocean.yml @@ -2,6 +2,7 @@ plugin: community.digitalocean.digitalocean api_token: "{{ lookup('ansible.builtin.env', 'DO_API_TOKEN') }}" attributes: + - id - name - tags - networks diff --git a/k3s/justfile b/k3s/justfile new file mode 100644 index 000000000..d89aef3ac --- /dev/null +++ b/k3s/justfile @@ -0,0 +1,27 @@ +set shell := ["bash", "-cu"] + +# Show available recipes +default: + @just --list + +# Encrypt app secrets for a given app path (e.g., just encrypt ops-backoffice/apps/appsmith) +encrypt path: + sops --encrypt {{path}}/manifests/base/secrets/.secrets.env \ + > {{path}}/manifests/base/secrets/.secrets.enc.env + @echo "Encrypted: {{path}}/manifests/base/secrets/.secrets.enc.env" + +# Decrypt app secrets for a given app path +decrypt path: + sops --decrypt {{path}}/manifests/base/secrets/.secrets.enc.env \ + > {{path}}/manifests/base/secrets/.secrets.env + @echo "Decrypted: {{path}}/manifests/base/secrets/.secrets.env" + +# Deploy an app (decrypt + apply) +deploy cluster app: + #!/usr/bin/env bash + set -eu + cd {{cluster}} + export $(cat .env | xargs) + just decrypt apps/{{app}} + kubectl apply -k apps/{{app}}/manifests/base/ + echo "Deployed {{app}} to {{cluster}}" diff --git a/k3s/ops-mgmt/.gitignore b/k3s/ops-mgmt/.gitignore new file mode 100644 index 000000000..e5b010291 --- /dev/null +++ b/k3s/ops-mgmt/.gitignore @@ -0,0 +1,5 @@ +.kubeconfig.yaml +**/secrets/.secrets.env +**/secrets/tls.crt +**/secrets/tls.key +!**/secrets/.secrets.enc.env diff --git a/k3s/ops-mgmt/README.md b/k3s/ops-mgmt/README.md new file mode 100644 index 000000000..b6835e24e --- 
/dev/null +++ b/k3s/ops-mgmt/README.md @@ -0,0 +1,52 @@ +# ops-mgmt k3s Cluster + +Rancher management cluster for provisioning and managing downstream Kubernetes clusters. + +## Specifications + +- **Node**: 1x s-4vcpu-8gb (DigitalOcean, nyc3) +- **Pod CIDR**: 10.40.0.0/16 +- **Service CIDR**: 10.41.0.0/16 +- **Tailscale hostname**: ops-k3s-mgmt-subnet + +## Quick Access + +```bash +cd k3s/ops-mgmt && export $(cat .env | xargs) +kubectl get nodes +``` + +## Deployment + +Everything is managed by a single Ansible playbook (8 plays): + +```bash +cd ansible/ +ansible-playbook -i inventory/digitalocean.yml play-k3s--ops-mgmt.yml \ + -e variable_host=mgmt_k3s \ + --vault-password-file <(op read "op://Service-Automation/Ansible-Vault-Password/Ansible-Vault-Password") +``` + +The playbook handles: k3s install, security hardening (secrets-encryption, PSS, audit logging), +cert-manager, Rancher, rancher-backup + schedule, Tailscale operator + Connector, +kubeconfig fetch, and DO firewall lockdown. + +Prerequisites: VM provisioned with Tailscale installed, Ansible Vault populated. + +## Re-runs + +After first run, the DO firewall restricts SSH to Tailscale only. 
Re-run via Tailscale IP: + +```bash +ansible-playbook -i inventory/digitalocean.yml play-k3s--ops-mgmt.yml \ + -e variable_host=mgmt_k3s \ + -e ansible_host=<tailscale-ip> \ + --vault-password-file <(op read "op://Service-Automation/Ansible-Vault-Password/Ansible-Vault-Password") +``` + +## Disaster Recovery + +- **rancher-backup operator** takes snapshots every 6 hours to DO Spaces (`net.freecodecamp.ops-k3s-backups/rancher-backup`) +- **etcd snapshots** every 6 hours to DO Spaces (`net.freecodecamp.ops-k3s-backups/etcd/ops-mgmt`) +- Downstream clusters continue operating independently if ops-mgmt is lost +- Restore: deploy fresh k3s + Rancher, then `kubectl apply -f` a Restore CR pointing to the backup diff --git a/k3s/ops-mgmt/apps/rancher/backup-schedule.yaml b/k3s/ops-mgmt/apps/rancher/backup-schedule.yaml new file mode 100644 index 000000000..1e9955d6e --- /dev/null +++ b/k3s/ops-mgmt/apps/rancher/backup-schedule.yaml @@ -0,0 +1,18 @@ +# Recurring Rancher backup to DO Spaces +# Apply: kubectl apply -f backup-schedule.yaml +apiVersion: resources.cattle.io/v1 +kind: Backup +metadata: + name: rancher-scheduled-backup +spec: + resourceSetName: rancher-resource-set + retentionCount: 20 + schedule: "0 */6 * * *" + storageLocation: + s3: + bucketName: net.freecodecamp.ops-k3s-backups + folder: rancher-backup + region: nyc3 + endpoint: nyc3.digitaloceanspaces.com + credentialSecretName: rancher-backup-s3-creds + credentialSecretNamespace: cattle-resources-system diff --git a/k3s/ops-mgmt/cluster/security/audit-policy.yaml b/k3s/ops-mgmt/cluster/security/audit-policy.yaml new file mode 100644 index 000000000..4318218bd --- /dev/null +++ b/k3s/ops-mgmt/cluster/security/audit-policy.yaml @@ -0,0 +1,38 @@ +# Kubernetes API audit policy +# Copied to /var/lib/rancher/k3s/server/audit-policy.yaml by Ansible +# +# Phase 1: minimal — log secret access and anonymous requests only +# Phase 2: expand to full request/response logging for sensitive resources +apiVersion: audit.k8s.io/v1 +kind: 
Policy +rules: + # Log secret read/write at Metadata level + - level: Metadata + resources: + - group: "" + resources: ["secrets"] + + # Log anonymous/unauthenticated requests + - level: Metadata + users: ["system:anonymous"] + + # Log RBAC changes + - level: Metadata + resources: + - group: "rbac.authorization.k8s.io" + resources: + ["clusterroles", "clusterrolebindings", "roles", "rolebindings"] + + # Skip noisy read-only system requests + - level: None + users: ["system:kube-proxy"] + - level: None + resources: + - group: "" + resources: ["endpoints", "services", "services/status"] + verbs: ["get", "watch", "list"] + + # Default: log everything else at Metadata + - level: Metadata + omitStages: + - "RequestReceived" diff --git a/k3s/ops-mgmt/cluster/security/pss-admission.yaml b/k3s/ops-mgmt/cluster/security/pss-admission.yaml new file mode 100644 index 000000000..0cc18a71f --- /dev/null +++ b/k3s/ops-mgmt/cluster/security/pss-admission.yaml @@ -0,0 +1,31 @@ +# Pod Security Standards admission configuration +# Copied to /var/lib/rancher/k3s/server/pss.yaml by Ansible +# +# - baseline: enforced (blocks privileged containers, host networking, hostPath) +# - restricted: audit + warn only (logs violations, does not block) +# - System namespaces exempted (Rancher, cert-manager, Longhorn, Tailscale need elevated privileges) +apiVersion: apiserver.config.k8s.io/v1 +kind: AdmissionConfiguration +plugins: + - name: PodSecurity + configuration: + apiVersion: pod-security.admission.config.k8s.io/v1 + kind: PodSecurityConfiguration + defaults: + enforce: "baseline" + enforce-version: "latest" + audit: "restricted" + audit-version: "latest" + warn: "restricted" + warn-version: "latest" + exemptions: + namespaces: + - kube-system + - cattle-system + - cattle-fleet-system + - cattle-fleet-local-system + - cattle-resources-system + - cattle-provisioning-capi-system + - cert-manager + - longhorn-system + - tailscale From b0fae18983772b9a4985d1a988db7c1aa7c80a98 Mon Sep 17 
00:00:00 2001 From: Mrugesh Mohapatra Date: Wed, 1 Apr 2026 12:22:39 +0530 Subject: [PATCH 02/40] feat(k3s): add gxy-management galaxy configs and Day 0 spike infrastructure Universe Day 0 spike: provision the first galaxy (gxy-management) with Cilium CNI. Adds: - Ansible role: cilium (Helm-based CNI install on any k3s cluster) - play-k3s--galaxy.yml: semi-generic 6-play playbook for any Universe galaxy, composing roles with security hardening and etcd S3 backups - k3s/gxy-management/: cluster configs (Cilium values, PSS, audit policy), app manifests (Windmill, ArgoCD, Zot) - Pod CIDR 10.1.0.0/16, Service CIDR 10.11.0.0/16 (ADR-009) - Traefik for Day 0 ingress, Cilium Gateway API evaluation later - Internal services via NodePort on Tailscale IPs (ADR-009) - local-path storage (ADR-008: no Longhorn, Ceph on bare metal later) Naming convention: ops-gxy-* hybrid (ops- prefix for infra resources, gxy- for logical galaxy naming). --- ansible/play-k3s--galaxy.yml | 281 ++++++++++++++++++ ansible/roles/cilium/defaults/main.yml | 7 + ansible/roles/cilium/tasks/main.yml | 66 ++++ k3s/README.md | 70 ++++- k3s/gxy-management/.gitignore | 7 + k3s/gxy-management/README.md | 37 +++ .../apps/argocd/charts/argo-cd/values.yaml | 60 ++++ .../argocd/manifests/base/kustomization.yaml | 14 + .../apps/argocd/manifests/base/namespace.yaml | 4 + .../argocd/manifests/base/secrets/.gitignore | 1 + .../apps/windmill/charts/windmill/values.yaml | 74 +++++ .../manifests/base/kustomization.yaml | 14 + .../windmill/manifests/base/namespace.yaml | 4 + .../manifests/base/secrets/.gitignore | 3 + .../apps/zot/charts/zot/values.yaml | 90 ++++++ .../zot/manifests/base/kustomization.yaml | 14 + .../apps/zot/manifests/base/namespace.yaml | 4 + .../zot/manifests/base/secrets/.gitignore | 1 + k3s/gxy-management/cluster/cilium/values.yaml | 34 +++ .../cluster/security/audit-policy.yaml | 38 +++ .../cluster/security/pss-admission.yaml | 31 ++ 21 files changed, 838 insertions(+), 16 deletions(-) create mode 
100644 ansible/play-k3s--galaxy.yml create mode 100644 ansible/roles/cilium/defaults/main.yml create mode 100644 ansible/roles/cilium/tasks/main.yml create mode 100644 k3s/gxy-management/.gitignore create mode 100644 k3s/gxy-management/README.md create mode 100644 k3s/gxy-management/apps/argocd/charts/argo-cd/values.yaml create mode 100644 k3s/gxy-management/apps/argocd/manifests/base/kustomization.yaml create mode 100644 k3s/gxy-management/apps/argocd/manifests/base/namespace.yaml create mode 100644 k3s/gxy-management/apps/argocd/manifests/base/secrets/.gitignore create mode 100644 k3s/gxy-management/apps/windmill/charts/windmill/values.yaml create mode 100644 k3s/gxy-management/apps/windmill/manifests/base/kustomization.yaml create mode 100644 k3s/gxy-management/apps/windmill/manifests/base/namespace.yaml create mode 100644 k3s/gxy-management/apps/windmill/manifests/base/secrets/.gitignore create mode 100644 k3s/gxy-management/apps/zot/charts/zot/values.yaml create mode 100644 k3s/gxy-management/apps/zot/manifests/base/kustomization.yaml create mode 100644 k3s/gxy-management/apps/zot/manifests/base/namespace.yaml create mode 100644 k3s/gxy-management/apps/zot/manifests/base/secrets/.gitignore create mode 100644 k3s/gxy-management/cluster/cilium/values.yaml create mode 100644 k3s/gxy-management/cluster/security/audit-policy.yaml create mode 100644 k3s/gxy-management/cluster/security/pss-admission.yaml diff --git a/ansible/play-k3s--galaxy.yml b/ansible/play-k3s--galaxy.yml new file mode 100644 index 000000000..090b0babb --- /dev/null +++ b/ansible/play-k3s--galaxy.yml @@ -0,0 +1,281 @@ +--- +# Deploy k3s HA galaxy cluster with Cilium CNI +# +# Provisions any Universe galaxy cluster. All nodes are control-plane (HA). +# Uses Cilium as CNI (flannel/kube-proxy disabled). Tailscale on nodes for +# SSH/kubectl access (NOT the K8s operator — ADR-009). 
+# +# Prerequisites (manual, one-time per galaxy): +# - 3x Ubuntu VMs on DigitalOcean with VPC attached (eth1) +# - Tailscale installed and connected on all nodes (play-tailscale--*.yml) +# - Ansible Vault password in 1Password +# - Vault secrets populated in vars/vault-k3s.yml (encrypted) +# - Cluster config directory: k3s//cluster/ +# +# Usage: +# cd ansible/ # direnv loads .env + activates venv +# ansible-playbook -i inventory/digitalocean.yml play-k3s--galaxy.yml \ +# -e variable_host=gxy_mgmt_k3s \ +# -e galaxy_name=gxy-management \ +# --vault-password-file <(op read "op://Service-Automation/Ansible-Vault-Password/Ansible-Vault-Password") +# +# What this playbook does (6 plays): +# 1. Validate prerequisites (VPC, Tailscale, vault secrets) +# 2. Prepare system (k3s prerequisites + security configs) +# 3. Deploy k3s server (Cilium flags, etcd S3 backups, security hardening) +# 4. Configure ingress (Traefik + Gateway API CRDs) +# 5. Install Cilium CNI +# 6. Fetch kubeconfig (copy to local machine, replace server IP with Tailscale IP) + +# Play 1: Validate prerequisites +- name: "K3s {{ galaxy_name | default('galaxy') }} - Validate prerequisites" + hosts: "{{ variable_host }}" + gather_facts: true + become: true + vars_files: + - vars/vault-k3s.yml + vars: + galaxy_name: "gxy-management" + + tasks: + - name: Validate vault secrets loaded + assert: + that: + - vault_do_spaces_access_key is defined + - vault_do_spaces_access_key | length > 0 + - vault_do_spaces_secret_key is defined + - vault_do_spaces_secret_key | length > 0 + fail_msg: "Vault secrets missing. Run: ansible-vault edit vars/vault-k3s.yml" + + - name: Validate VPC interface exists (eth1) + assert: + that: + - ansible_eth1 is defined + - ansible_eth1.ipv4 is defined + - ansible_eth1.ipv4.address is defined + fail_msg: "VPC interface eth1 not found. Ensure VM is attached to DO VPC." 
+ + - name: Validate Tailscale is connected + assert: + that: + - ansible_tailscale0 is defined + - ansible_tailscale0.ipv4 is defined + - ansible_tailscale0.ipv4.address is defined + fail_msg: "Tailscale interface not found. Ensure Tailscale is installed and connected." + + - name: Validate VPC IP is in expected range + assert: + that: + - ansible_eth1.ipv4.address | regex_search('^10\.') + fail_msg: "VPC IP {{ ansible_eth1.ipv4.address }} not in 10.x.x.x range." + + - name: Set network facts + set_fact: + vpc_ip: "{{ ansible_eth1.ipv4.address }}" + tailscale_ip: "{{ ansible_tailscale0.ipv4.address }}" + + - name: Display network configuration + debug: + msg: "{{ inventory_hostname }}: VPC={{ vpc_ip }}, Tailscale={{ tailscale_ip }}" + + - name: Build k3s_cluster group + group_by: + key: k3s_cluster + + - name: Build server group + group_by: + key: server + +# Play 2: System prerequisites +- name: "K3s {{ galaxy_name | default('galaxy') }} - System prerequisites" + hosts: k3s_cluster + gather_facts: true + become: true + vars: + galaxy_name: "gxy-management" + cluster_config_dir: "{{ playbook_dir }}/../k3s/{{ galaxy_name }}" + + pre_tasks: + - name: Ensure k3s config directory exists + file: + path: /etc/rancher/k3s + state: directory + mode: "0755" + owner: root + group: root + + - name: Copy PSS admission config + copy: + src: "{{ cluster_config_dir }}/cluster/security/pss-admission.yaml" + dest: /etc/rancher/k3s/pss-admission.yaml + mode: "0600" + + - name: Copy audit policy + copy: + src: "{{ cluster_config_dir }}/cluster/security/audit-policy.yaml" + dest: /etc/rancher/k3s/audit-policy.yaml + mode: "0600" + + roles: + - role: k3s.orchestration.prereq + +# Play 3: Deploy k3s server +- name: "K3s {{ galaxy_name | default('galaxy') }} - Deploy k3s server" + hosts: server + gather_facts: true + become: true + vars_files: + - vars/vault-k3s.yml + vars: + galaxy_name: "gxy-management" + k3s_version: "v1.34.5+k3s1" + cluster_cidr: "10.1.0.0/16" + service_cidr: 
"10.11.0.0/16" + etcd_s3_endpoint: "nyc3.digitaloceanspaces.com" + etcd_s3_bucket: "net.freecodecamp.ops-k3s-backups" + etcd_s3_folder: "etcd/{{ galaxy_name }}" + etcd_s3_region: "nyc3" + etcd_snapshot_schedule: "0 */6 * * *" + etcd_snapshot_retention: 20 + api_endpoint: "{{ hostvars[groups['server'][0]]['vpc_ip'] }}" + extra_server_args: >- + --node-ip={{ hostvars[inventory_hostname]['vpc_ip'] }} + --advertise-address={{ hostvars[inventory_hostname]['vpc_ip'] }} + --tls-san={{ hostvars[inventory_hostname]['vpc_ip'] }} + --tls-san={{ hostvars[inventory_hostname]['tailscale_ip'] }} + --flannel-backend=none + --disable-network-policy + --disable-kube-proxy + --disable=servicelb + --secrets-encryption + --protect-kernel-defaults + --cluster-cidr={{ cluster_cidr }} + --service-cidr={{ service_cidr }} + --kube-apiserver-arg=admission-control-config-file=/etc/rancher/k3s/pss-admission.yaml + --kube-apiserver-arg=audit-log-path=/var/log/k3s/audit.log + --kube-apiserver-arg=audit-policy-file=/etc/rancher/k3s/audit-policy.yaml + --etcd-s3 + --etcd-s3-endpoint={{ etcd_s3_endpoint }} + --etcd-s3-bucket={{ etcd_s3_bucket }} + --etcd-s3-folder={{ etcd_s3_folder }} + --etcd-s3-region={{ etcd_s3_region }} + --etcd-snapshot-schedule-cron={{ etcd_snapshot_schedule }} + --etcd-snapshot-retention={{ etcd_snapshot_retention }} + server_group: server + pre_tasks: + - name: Create k3s audit log directory + file: + path: /var/log/k3s + state: directory + mode: "0750" + owner: root + group: root + + - name: Write k3s service environment (S3 credentials) + copy: + content: | + AWS_ACCESS_KEY_ID={{ vault_do_spaces_access_key }} + AWS_SECRET_ACCESS_KEY={{ vault_do_spaces_secret_key }} + dest: /etc/systemd/system/k3s.service.env + mode: "0600" + owner: root + group: root + no_log: true + + roles: + - role: k3s.orchestration.k3s_server + +# Play 4: Configure ingress +- name: "K3s {{ galaxy_name | default('galaxy') }} - Configure ingress" + hosts: server[0] + gather_facts: false + become: true 
+ vars: + gateway_api_version: "v1.5.1" + + tasks: + - name: Apply Traefik HelmChartConfig + copy: + src: "{{ playbook_dir }}/../k3s/shared/traefik-config.yaml" + dest: /var/lib/rancher/k3s/server/manifests/traefik-config.yaml + mode: "0600" + + - name: Install Gateway API CRDs + command: > + k3s kubectl apply -f + https://github.com/kubernetes-sigs/gateway-api/releases/download/{{ gateway_api_version }}/standard-install.yaml + register: gateway_result + changed_when: "'created' in gateway_result.stdout or 'configured' in gateway_result.stdout" + + - name: Wait for all nodes ready + command: k3s kubectl wait --for=condition=Ready nodes --all --timeout=300s + changed_when: false + + - name: Display cluster status + command: k3s kubectl get nodes -o wide + register: cluster_status + changed_when: false + + - name: Cluster ready + debug: + msg: "{{ cluster_status.stdout_lines }}" + +# Play 5: Install Cilium +- name: "K3s {{ galaxy_name | default('galaxy') }} - Install Cilium" + hosts: server[0] + gather_facts: false + become: true + vars: + galaxy_name: "gxy-management" + cilium_cluster_id: 1 + cluster_config_dir: "{{ playbook_dir }}/../k3s/{{ galaxy_name }}" + roles: + - role: cilium + vars: + cilium_cluster_name: "{{ galaxy_name }}" + cilium_values_file: "{{ cluster_config_dir }}/cluster/cilium/values.yaml" + cilium_k8s_service_host: "{{ hostvars[groups['server'][0]]['vpc_ip'] }}" + +# Play 6: Fetch kubeconfig +- name: "K3s {{ galaxy_name | default('galaxy') }} - Fetch kubeconfig" + hosts: server[0] + gather_facts: false + become: true + vars: + galaxy_name: "gxy-management" + cluster_config_dir: "{{ playbook_dir }}/../k3s/{{ galaxy_name }}" + + tasks: + - name: Read kubeconfig from server + slurp: + src: /etc/rancher/k3s/k3s.yaml + register: kubeconfig_raw + + - name: Write kubeconfig locally (replace server IP with Tailscale IP) + copy: + content: "{{ kubeconfig_raw.content | b64decode | regex_replace('127\\.0\\.0\\.1', 
hostvars[inventory_hostname]['tailscale_ip']) }}" + dest: "{{ cluster_config_dir }}/.kubeconfig.yaml" + mode: "0600" + delegate_to: localhost + become: false + + - name: Verify kubectl connectivity + command: kubectl get nodes + environment: + KUBECONFIG: "{{ cluster_config_dir }}/.kubeconfig.yaml" + register: kubectl_result + changed_when: false + delegate_to: localhost + become: false + + - name: Display final status + debug: + msg: + - "=== {{ galaxy_name }} cluster ready ===" + - "Kubeconfig: k3s/{{ galaxy_name }}/.kubeconfig.yaml" + - "Nodes: {{ kubectl_result.stdout }}" + - "" + - "Next steps:" + - " 1. Verify Cilium: kubectl exec -n kube-system ds/cilium -c cilium-agent -- cilium status" + - " 2. Verify etcd snapshots: ssh root@<node-ip> k3s etcd-snapshot list" + - " 3. Deploy apps: kubectl apply -k k3s/{{ galaxy_name }}/apps/<app>/manifests/base/" diff --git a/ansible/roles/cilium/defaults/main.yml b/ansible/roles/cilium/defaults/main.yml new file mode 100644 index 000000000..6f07551d9 --- /dev/null +++ b/ansible/roles/cilium/defaults/main.yml @@ -0,0 +1,7 @@ +--- +cilium_version: "1.19.2" +cilium_cluster_name: "" +cilium_cluster_id: "" +cilium_values_file: "" +cilium_k8s_service_host: "" +cilium_k8s_service_port: "6443" diff --git a/ansible/roles/cilium/tasks/main.yml b/ansible/roles/cilium/tasks/main.yml new file mode 100644 index 000000000..492465997 --- /dev/null +++ b/ansible/roles/cilium/tasks/main.yml @@ -0,0 +1,66 @@ +--- +- name: Validate required Cilium variables + ansible.builtin.assert: + that: + - cilium_cluster_name | length > 0 + - cilium_cluster_id | string | length > 0 + - cilium_values_file | length > 0 + - cilium_k8s_service_host | length > 0 + fail_msg: >- + Required variables missing. Set cilium_cluster_name, cilium_cluster_id, + cilium_values_file, and cilium_k8s_service_host. 
+ +- name: Copy Cilium values file to server + ansible.builtin.copy: + src: "{{ cilium_values_file }}" + dest: /tmp/cilium-values.yaml + mode: "0600" + +- name: Add Cilium Helm repo + kubernetes.core.helm_repository: + name: cilium + repo_url: https://helm.cilium.io/ + +- name: Install Cilium + kubernetes.core.helm: + name: cilium + chart_ref: cilium/cilium + chart_version: "{{ cilium_version }}" + release_namespace: kube-system + update_repo_cache: true + values_files: + - /tmp/cilium-values.yaml + set_values: + - value: "k8sServiceHost={{ cilium_k8s_service_host }}" + - value: "k8sServicePort={{ cilium_k8s_service_port }}" + wait: true + timeout: "5m0s" + +- name: Clean up Cilium values file + ansible.builtin.file: + path: /tmp/cilium-values.yaml + state: absent + +- name: Wait for Cilium agent DaemonSet rollout + ansible.builtin.command: + cmd: kubectl -n kube-system rollout status daemonset/cilium --timeout=180s + changed_when: false + +- name: Wait for Cilium operator Deployment rollout + ansible.builtin.command: + cmd: >- + kubectl -n kube-system rollout status + deployment/cilium-operator --timeout=180s + changed_when: false + +- name: Verify Cilium status + ansible.builtin.command: + cmd: >- + kubectl -n kube-system exec ds/cilium + -c cilium-agent -- cilium status --brief + register: cilium_status + changed_when: false + +- name: Display Cilium status + ansible.builtin.debug: + msg: "{{ cilium_status.stdout_lines }}" diff --git a/k3s/README.md b/k3s/README.md index 1eefa6a18..9d083d947 100644 --- a/k3s/README.md +++ b/k3s/README.md @@ -4,15 +4,19 @@ Self-hosted k3s clusters on DigitalOcean. 
## Clusters -| Cluster | Purpose | Apps | -| -------------------- | -------------- | ----------------- | -| ops-backoffice-tools | Internal tools | Appsmith, Outline | +| Cluster | Purpose | Apps | +| -------------------- | ----------------- | --------------------- | +| ops-backoffice-tools | Internal tools | Appsmith, Outline | +| gxy-management | Universe platform | Windmill, ArgoCD, Zot | ## Quick Access ```bash # Tools cluster cd k3s/ops-backoffice-tools && export KUBECONFIG=$(pwd)/.kubeconfig.yaml + +# Galaxy management cluster +cd k3s/gxy-management && export KUBECONFIG=$(pwd)/.kubeconfig.yaml ``` ## Structure @@ -20,6 +24,16 @@ cd k3s/ops-backoffice-tools && export KUBECONFIG=$(pwd)/.kubeconfig.yaml ``` k3s/ ├── archive/ # Archived configs (historical reference) +├── gxy-management/ +│ ├── apps/ +│ │ ├── argocd/ +│ │ ├── windmill/ +│ │ └── zot/ +│ └── cluster/ +│ ├── cilium/ +│ ├── longhorn/ +│ ├── security/ +│ └── tailscale/ ├── ops-backoffice-tools/ │ ├── apps/ │ │ ├── appsmith/ @@ -45,9 +59,10 @@ k3s/ ### Droplets -| Cluster | Name Pattern | Count | Specs | Tags | -| ------- | ------------------------ | ----- | ------------------ | -------------- | -| tools | ops-vm-tools-k3s-nyc3-0X | 3 | 4 vCPU, 8GB, 160GB | k3s, tools_k3s | +| Cluster | Name Pattern | Count | Specs | Tags | +| -------------- | --------------------------- | ----- | ------------------ | ------------------- | +| tools | ops-vm-tools-k3s-nyc3-0X | 3 | 4 vCPU, 8GB, 160GB | k3s, tools_k3s | +| gxy-management | ops-vm-gxy-mgmt-k3s-nyc3-0X | 3 | 4 vCPU, 8GB, 160GB | k3s, \_gxy-mgmt-k3s | ### Load Balancer @@ -62,13 +77,17 @@ k3s/ ```bash cd ansible -# Deploy cluster +# Deploy tools cluster uv run ansible-playbook -i inventory/digitalocean.yml play-k3s--cluster.yml \ -e variable_host=tools_k3s -# Longhorn storage +# Longhorn storage (tools) uv run ansible-playbook -i inventory/digitalocean.yml play-k3s--longhorn.yml \ -e variable_host=tools_k3s + +# Deploy gxy-management galaxy +uv run 
ansible-playbook -i inventory/digitalocean.yml play-k3s--galaxy.yml \ + -e variable_host=gxy_mgmt_k3s ``` --- @@ -97,10 +116,11 @@ See `tailscale/README.md` (repo root) for device inventory. ## DNS (Cloudflare) -| Record | Type | Value | -| ------------------------- | ---- | -------- | -| appsmith.freecodecamp.net | A | tools LB | -| outline.freecodecamp.net | A | tools LB | +| Record | Type | Value | +| ------------------------- | ---- | ----------------- | +| appsmith.freecodecamp.net | A | tools LB | +| outline.freecodecamp.net | A | tools LB | +| windmill.freecodecamp.net | A | gxy-management LB | --- @@ -142,11 +162,29 @@ Internet → Cloudflare → DO LB → Traefik (NodePort) → Gateway API → App | Appsmith | 1 | 10Gi | Embedded | | Outline | 1 | 10Gi + 10Gi | PostgreSQL sidecar | +### gxy-management + +``` +Internet → Cloudflare → DO LB → Traefik (NodePort) → Gateway API → Windmill + │ +Tailscale ──────────────────────────────────────────────────→├── ArgoCD + └── Zot + +CNI: Cilium Storage: Longhorn (2 replicas) SSH/kubectl: Tailscale +``` + +| App | Replicas | Access | Notes | +| -------- | ------------------- | -------------- | ---------- | +| Windmill | 1 server, 2 workers | Public (HTTPS) | | +| ArgoCD | 1 (single replica) | Tailscale-only | | +| Zot | 1 (single replica) | Tailscale-only | S3 backend | + --- ## Playbooks Reference -| Playbook | Purpose | -| ---------------------- | ------------------------ | -| play-k3s--cluster.yml | Deploy k3s HA cluster | -| play-k3s--longhorn.yml | Install Longhorn storage | +| Playbook | Purpose | +| ---------------------- | ----------------------------------------------------- | +| play-k3s--cluster.yml | Deploy k3s HA cluster | +| play-k3s--longhorn.yml | Install Longhorn storage | +| play-k3s--galaxy.yml | Deploy any Universe galaxy (K3s + Cilium + Tailscale) | diff --git a/k3s/gxy-management/.gitignore b/k3s/gxy-management/.gitignore new file mode 100644 index 000000000..c1dde3ec9 --- /dev/null +++ 
b/k3s/gxy-management/.gitignore @@ -0,0 +1,7 @@ +# Kubeconfig +.kubeconfig.yaml + +# Secrets (managed locally, not in git) +apps/*/manifests/base/secrets/.secrets.env +apps/*/manifests/base/secrets/tls.crt +apps/*/manifests/base/secrets/tls.key diff --git a/k3s/gxy-management/README.md b/k3s/gxy-management/README.md new file mode 100644 index 000000000..eb404c4ea --- /dev/null +++ b/k3s/gxy-management/README.md @@ -0,0 +1,37 @@ +# gxy-management + +First Universe galaxy. Control plane brain — manages all galaxies. + +## Specifications + +- **Nodes**: 3× DigitalOcean s-8vcpu-16gb (nyc3) +- **CNI**: Cilium (eBPF, Hubble observability) +- **Pod CIDR**: 10.1.0.0/16 +- **Service CIDR**: 10.11.0.0/16 +- **Storage**: local-path (K3s default) +- **Ingress**: Traefik (Day 0), Cilium Gateway API (target) + +## Applications + +| App | Purpose | Access | +| -------- | --------------------- | ------------------------------- | +| Windmill | Workflow engine | NodePort 30080 via Tailscale IP | +| ArgoCD | GitOps (all galaxies) | NodePort 30443 via Tailscale IP | +| Zot | Container registry | NodePort 30500 via Tailscale IP | + +## Quick Access + +```bash +cd k3s/gxy-management && export KUBECONFIG=$(pwd)/.kubeconfig.yaml +kubectl get nodes +``` + +## Deploy + +```bash +cd ansible +uv run ansible-playbook -i inventory/digitalocean.yml play-k3s--galaxy.yml \ + -e variable_host=gxy_mgmt_k3s \ + -e galaxy_name=gxy-management \ + --vault-password-file <(op read "op://Service-Automation/Ansible-Vault-Password/Ansible-Vault-Password") +``` diff --git a/k3s/gxy-management/apps/argocd/charts/argo-cd/values.yaml b/k3s/gxy-management/apps/argocd/charts/argo-cd/values.yaml new file mode 100644 index 000000000..bf5f15655 --- /dev/null +++ b/k3s/gxy-management/apps/argocd/charts/argo-cd/values.yaml @@ -0,0 +1,60 @@ +# Argo CD Helm values for gxy-management cluster +# Chart: argo-cd (https://argoproj.github.io/argo-helm) +# Non-HA, Tailscale-only access (no public ingress) + +# -- Single-replica 
(non-HA) deployment for 8GB nodes +controller: + replicas: 1 + resources: + requests: + cpu: 250m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + +server: + replicas: 1 + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 256Mi + +repoServer: + replicas: 1 + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 256Mi + +redis: + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 128Mi + +# -- Disable DEX (external SSO not needed) +dex: + enabled: false + +# -- Disable notifications controller (not needed) +notifications: + enabled: false + +# -- Disable ApplicationSet controller (not needed initially) +applicationSet: + enabled: false + +configs: + params: + # TLS terminated at Tailscale level + "server.insecure": true diff --git a/k3s/gxy-management/apps/argocd/manifests/base/kustomization.yaml b/k3s/gxy-management/apps/argocd/manifests/base/kustomization.yaml new file mode 100644 index 000000000..b77a5d497 --- /dev/null +++ b/k3s/gxy-management/apps/argocd/manifests/base/kustomization.yaml @@ -0,0 +1,14 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: argocd + +resources: + - namespace.yaml + +secretGenerator: + - name: argocd-secrets + type: Opaque + envs: + - secrets/.secrets.env + options: + disableNameSuffixHash: true diff --git a/k3s/gxy-management/apps/argocd/manifests/base/namespace.yaml b/k3s/gxy-management/apps/argocd/manifests/base/namespace.yaml new file mode 100644 index 000000000..a040f2ba5 --- /dev/null +++ b/k3s/gxy-management/apps/argocd/manifests/base/namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: argocd diff --git a/k3s/gxy-management/apps/argocd/manifests/base/secrets/.gitignore b/k3s/gxy-management/apps/argocd/manifests/base/secrets/.gitignore new file mode 100644 index 000000000..9551e2ae8 --- /dev/null +++ b/k3s/gxy-management/apps/argocd/manifests/base/secrets/.gitignore @@ -0,0 
+1 @@ +.secrets.env diff --git a/k3s/gxy-management/apps/windmill/charts/windmill/values.yaml b/k3s/gxy-management/apps/windmill/charts/windmill/values.yaml new file mode 100644 index 000000000..2147b69c1 --- /dev/null +++ b/k3s/gxy-management/apps/windmill/charts/windmill/values.yaml @@ -0,0 +1,74 @@ +# Windmill Helm chart values +# Chart: windmill/windmill +# Repo: https://windmill-labs.github.io/windmill-helm-charts/ +# Source: https://github.com/windmill-labs/windmill-helm-charts + +windmill: + baseDomain: windmill.freecodecamp.net + baseProtocol: https + appReplicas: 1 + extraReplicas: 1 + databaseUrl: postgres://postgres:windmill@windmill-postgresql/windmill?sslmode=disable + + app: + resources: + requests: + memory: "512Mi" + limits: + memory: "2Gi" + + workerGroups: + - name: "default" + controller: "Deployment" + replicas: 2 + privileged: true + podSecurityContext: + runAsUser: 0 + runAsNonRoot: false + resources: + requests: + memory: "512Mi" + limits: + memory: "2Gi" + + - name: "native" + controller: "Deployment" + replicas: 1 + privileged: false + podSecurityContext: + runAsUser: 0 + runAsNonRoot: false + resources: + requests: + memory: "256Mi" + limits: + memory: "1Gi" + extraEnv: + - name: "NATIVE_MODE" + value: "true" + - name: "SLEEP_QUEUE" + value: "200" + + indexer: + enabled: true + resources: + requests: + memory: "256Mi" + limits: + memory: "2Gi" + +postgresql: + enabled: true + auth: + postgresUser: postgres + postgresPassword: windmill + database: windmill + persistence: + enabled: true + size: 10Gi + +ingress: + enabled: false + +enterprise: + enabled: false diff --git a/k3s/gxy-management/apps/windmill/manifests/base/kustomization.yaml b/k3s/gxy-management/apps/windmill/manifests/base/kustomization.yaml new file mode 100644 index 000000000..b5d91fce4 --- /dev/null +++ b/k3s/gxy-management/apps/windmill/manifests/base/kustomization.yaml @@ -0,0 +1,14 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: windmill 
+ +resources: + - namespace.yaml + +secretGenerator: + - name: windmill-secrets + type: Opaque + envs: + - secrets/.secrets.env + options: + disableNameSuffixHash: true diff --git a/k3s/gxy-management/apps/windmill/manifests/base/namespace.yaml b/k3s/gxy-management/apps/windmill/manifests/base/namespace.yaml new file mode 100644 index 000000000..5e1d9ca40 --- /dev/null +++ b/k3s/gxy-management/apps/windmill/manifests/base/namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: windmill diff --git a/k3s/gxy-management/apps/windmill/manifests/base/secrets/.gitignore b/k3s/gxy-management/apps/windmill/manifests/base/secrets/.gitignore new file mode 100644 index 000000000..d05f4f5da --- /dev/null +++ b/k3s/gxy-management/apps/windmill/manifests/base/secrets/.gitignore @@ -0,0 +1,3 @@ +.secrets.env +tls.crt +tls.key diff --git a/k3s/gxy-management/apps/zot/charts/zot/values.yaml b/k3s/gxy-management/apps/zot/charts/zot/values.yaml new file mode 100644 index 000000000..8d3d4748e --- /dev/null +++ b/k3s/gxy-management/apps/zot/charts/zot/values.yaml @@ -0,0 +1,90 @@ +# Zot OCI Registry Helm values for gxy-management cluster +# Chart: zot (https://github.com/project-zot/helm-charts) +# Chart version: 0.1.104, Image: v2.1.15 +# Tailscale-only access (no public ingress) + +replicaCount: 1 + +image: + repository: ghcr.io/project-zot/zot + pullPolicy: IfNotPresent + tag: "v2.1.15" + +service: + type: ClusterIP + port: 5000 + +# No public ingress — Tailscale-only access via separate Service manifest +ingress: + enabled: false + +# Mount config.json from ConfigMap +mountConfig: true +configFiles: + config.json: |- + { + "storage": { + "rootDirectory": "/var/lib/registry", + "storageDriver": { + "name": "s3", + "regionendpoint": "https://nyc3.digitaloceanspaces.com", + "region": "nyc3", + "bucket": "CHANGEME", + "secure": true, + "skipverify": false, + "forcepathstyle": true + } + }, + "http": { + "address": "0.0.0.0", + "port": "5000", + "auth": { + 
"htpasswd": { + "path": "/secret/htpasswd" + } + } + }, + "log": { + "level": "info" + } + } + +# Mount htpasswd from Secret (managed via kustomize secretGenerator) +mountSecret: true +secretFiles: + # Placeholder — actual htpasswd managed via kustomize secrets/.secrets.env + # Generate entries with: htpasswd -nbBC 10 + htpasswd: "" + +# S3 credentials injected as environment variables +# The S3 driver picks up AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY +# automatically when accesskey/secretkey are omitted from config.json +env: + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: zot-secrets + key: S3_ACCESS_KEY + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: zot-secrets + key: S3_SECRET_KEY + +# Longhorn PVC for local cache +persistence: true +pvc: + create: true + accessModes: + - ReadWriteOnce + storage: 8Gi + storageClassName: local-path + +# Probes +httpGet: + scheme: HTTP + port: 5000 +startupProbe: + initialDelaySeconds: 5 + periodSeconds: 10 + failureThreshold: 3 diff --git a/k3s/gxy-management/apps/zot/manifests/base/kustomization.yaml b/k3s/gxy-management/apps/zot/manifests/base/kustomization.yaml new file mode 100644 index 000000000..e8b1710ed --- /dev/null +++ b/k3s/gxy-management/apps/zot/manifests/base/kustomization.yaml @@ -0,0 +1,14 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: zot + +resources: + - namespace.yaml + +secretGenerator: + - name: zot-secrets + type: Opaque + envs: + - secrets/.secrets.env + options: + disableNameSuffixHash: true diff --git a/k3s/gxy-management/apps/zot/manifests/base/namespace.yaml b/k3s/gxy-management/apps/zot/manifests/base/namespace.yaml new file mode 100644 index 000000000..017091887 --- /dev/null +++ b/k3s/gxy-management/apps/zot/manifests/base/namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: zot diff --git a/k3s/gxy-management/apps/zot/manifests/base/secrets/.gitignore 
b/k3s/gxy-management/apps/zot/manifests/base/secrets/.gitignore new file mode 100644 index 000000000..9551e2ae8 --- /dev/null +++ b/k3s/gxy-management/apps/zot/manifests/base/secrets/.gitignore @@ -0,0 +1 @@ +.secrets.env diff --git a/k3s/gxy-management/cluster/cilium/values.yaml b/k3s/gxy-management/cluster/cilium/values.yaml new file mode 100644 index 000000000..0ca4cdf32 --- /dev/null +++ b/k3s/gxy-management/cluster/cilium/values.yaml @@ -0,0 +1,34 @@ +cluster: + name: "gxy-management" + id: 1 + +# Set at deploy time via Ansible --set flags +# k8sServiceHost: +k8sServicePort: "6443" + +kubeProxyReplacement: true + +ipam: + operator: + clusterPoolIPv4PodCIDRList: + - "10.1.0.0/16" + +operator: + replicas: 1 + +hubble: + enabled: true + relay: + enabled: true + ui: + enabled: false + +gatewayAPI: + enabled: false + +resources: + requests: + cpu: 100m + memory: 256Mi + limits: + memory: 512Mi diff --git a/k3s/gxy-management/cluster/security/audit-policy.yaml b/k3s/gxy-management/cluster/security/audit-policy.yaml new file mode 100644 index 000000000..4318218bd --- /dev/null +++ b/k3s/gxy-management/cluster/security/audit-policy.yaml @@ -0,0 +1,38 @@ +# Kubernetes API audit policy +# Copied to /var/lib/rancher/k3s/server/audit-policy.yaml by Ansible +# +# Phase 1: minimal — log secret access and anonymous requests only +# Phase 2: expand to full request/response logging for sensitive resources +apiVersion: audit.k8s.io/v1 +kind: Policy +rules: + # Log secret read/write at Metadata level + - level: Metadata + resources: + - group: "" + resources: ["secrets"] + + # Log anonymous/unauthenticated requests + - level: Metadata + users: ["system:anonymous"] + + # Log RBAC changes + - level: Metadata + resources: + - group: "rbac.authorization.k8s.io" + resources: + ["clusterroles", "clusterrolebindings", "roles", "rolebindings"] + + # Skip noisy read-only system requests + - level: None + users: ["system:kube-proxy"] + - level: None + resources: + - group: "" + resources: 
["endpoints", "services", "services/status"] + verbs: ["get", "watch", "list"] + + # Default: log everything else at Metadata + - level: Metadata + omitStages: + - "RequestReceived" diff --git a/k3s/gxy-management/cluster/security/pss-admission.yaml b/k3s/gxy-management/cluster/security/pss-admission.yaml new file mode 100644 index 000000000..0cc18a71f --- /dev/null +++ b/k3s/gxy-management/cluster/security/pss-admission.yaml @@ -0,0 +1,31 @@ +# Pod Security Standards admission configuration +# Copied to /var/lib/rancher/k3s/server/pss.yaml by Ansible +# +# - baseline: enforced (blocks privileged containers, host networking, hostPath) +# - restricted: audit + warn only (logs violations, does not block) +# - System namespaces exempted (Rancher, cert-manager, Longhorn, Tailscale need elevated privileges) +apiVersion: apiserver.config.k8s.io/v1 +kind: AdmissionConfiguration +plugins: + - name: PodSecurity + configuration: + apiVersion: pod-security.admission.config.k8s.io/v1 + kind: PodSecurityConfiguration + defaults: + enforce: "baseline" + enforce-version: "latest" + audit: "restricted" + audit-version: "latest" + warn: "restricted" + warn-version: "latest" + exemptions: + namespaces: + - kube-system + - cattle-system + - cattle-fleet-system + - cattle-fleet-local-system + - cattle-resources-system + - cattle-provisioning-capi-system + - cert-manager + - longhorn-system + - tailscale From a564bd640d7e27fe6fc029c181cab6f8e11f2326 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Wed, 1 Apr 2026 23:09:55 +0530 Subject: [PATCH 03/40] refactor: consolidate justfiles into root justfile Merge ansible, k3s, and terraform justfiles into a single root justfile with grouped recipes. All recipes run from repo root. Groups: secrets, k3s, ansible, terraform. Uses uv run for ansible commands since direnv does not activate inside just subprocesses. 
--- ansible/justfile | 63 ----------------- justfile | 169 +++++++++++++++++++++++++++++++++++++++++++++ k3s/justfile | 27 -------- terraform/justfile | 68 ------------------ 4 files changed, 169 insertions(+), 158 deletions(-) delete mode 100644 ansible/justfile create mode 100644 justfile delete mode 100644 k3s/justfile delete mode 100644 terraform/justfile diff --git a/ansible/justfile b/ansible/justfile deleted file mode 100644 index dc61cc942..000000000 --- a/ansible/justfile +++ /dev/null @@ -1,63 +0,0 @@ -set shell := ["bash", "-cu"] - -venv := ".venv" -INVENTORY := env("INVENTORY", "linode.yml") - -# Show available recipes -default: - @just --list - @echo "" - @echo "UV-BASED WORKFLOW:" - @echo " 1. just install # Install ansible + deps with uv" - @echo " 2. direnv allow # Auto-activate venv (one-time)" - @echo " 3. just test # Test connection" - @echo " 4. ansible-playbook ... # Run playbooks" - -# Install ansible and dependencies using uv -install: - #!/usr/bin/env bash - set -eu - if ! command -v uv >/dev/null 2>&1; then - echo "ERROR: uv not found. Please install uv first:" - echo " curl -LsSf https://astral.sh/uv/install.sh | sh" - exit 1 - fi - uv sync - source {{venv}}/bin/activate && ansible-galaxy install -r requirements.yml - -# Remove virtual environment and ansible directories -[confirm("This will delete the virtual environment and .ansible directory. Continue?")] -clean: - rm -rf {{venv}} .ansible - -# Test connection to random VM (set INVENTORY env var, default: linode.yml) -test: - #!/usr/bin/env bash - set -eu - echo "Counting VMs in inventory..." - if ! command -v ansible >/dev/null 2>&1; then - echo "ERROR: ansible not found - did you source the venv?" - echo "Run: source {{venv}}/bin/activate" - exit 1 - fi - if ! command -v jq >/dev/null 2>&1; then - echo "ERROR: jq not found - please install jq" - exit 1 - fi - VM_COUNT=$(ansible-inventory -i inventory/{{INVENTORY}} --list 2>/dev/null | jq -r '._meta.hostvars | keys | length') - if [ $? 
-ne 0 ]; then - echo "ERROR: Failed to parse inventory" - exit 1 - fi - echo "Found $VM_COUNT VMs in inventory" - if [ "$VM_COUNT" -eq 0 ]; then - echo "ERROR: No VMs found in inventory" - exit 1 - fi - RANDOM_INDEX=$(( RANDOM % VM_COUNT )) - echo "Testing connection to VM at index $RANDOM_INDEX..." - if ! ansible -i inventory/{{INVENTORY}} "all[$RANDOM_INDEX]" -m ping --one-line -v; then - echo "ERROR: Connection test failed" - exit 1 - fi - echo "SUCCESS: Connection test passed" diff --git a/justfile b/justfile new file mode 100644 index 000000000..75fceb81b --- /dev/null +++ b/justfile @@ -0,0 +1,169 @@ +set shell := ["bash", "-cu"] + +ansible_vault := "uv run --project ansible ansible-vault" +vault_password := "--vault-password-file <(op read \"op://Service-Automation/Ansible-Vault-Password/Ansible-Vault-Password\")" + +# Show available recipes +default: + @just --list + +# --------------------------------------------------------------------------- +# Secrets +# --------------------------------------------------------------------------- + +# Encrypt a secret +[group('secrets')] +secret-encrypt name: + {{ansible_vault}} encrypt {{vault_password}} secrets/{{name}}/.env + +# Decrypt a secret to stdout +[group('secrets')] +secret-decrypt name: + {{ansible_vault}} decrypt --output - {{vault_password}} secrets/{{name}}/.env + +# Decrypt a secret to a file +[group('secrets')] +secret-decrypt-to name dest: + {{ansible_vault}} decrypt --output {{dest}} {{vault_password}} secrets/{{name}}/.env + +# View a secret +[group('secrets')] +secret-view name: + {{ansible_vault}} view {{vault_password}} secrets/{{name}}/.env + +# Edit a secret +[group('secrets')] +secret-edit name: + {{ansible_vault}} edit {{vault_password}} secrets/{{name}}/.env + +# Encrypt all unencrypted .env files in secrets/ +[group('secrets')] +secret-encrypt-all: + #!/usr/bin/env bash + set -eu + for f in secrets/*/.env; do + [ -f "$f" ] || continue + if ! 
head -1 "$f" | grep -q '^\$ANSIBLE_VAULT'; then + echo "Encrypting $f" + {{ansible_vault}} encrypt {{vault_password}} "$f" + else + echo "Already encrypted: $f" + fi + done + +# Verify all encrypted secrets are readable +[group('secrets')] +secret-verify-all: + #!/usr/bin/env bash + set -eu + for f in secrets/*/.env; do + [ -f "$f" ] || continue + echo -n "$f: " + {{ansible_vault}} view {{vault_password}} "$f" > /dev/null 2>&1 && echo "OK" || echo "FAILED" + done + +# --------------------------------------------------------------------------- +# K8s / K3s +# --------------------------------------------------------------------------- + +# Deploy a K8s app (decrypt secrets → apply → clean up) +[group('k3s')] +deploy cluster app: + #!/usr/bin/env bash + set -eu + SECRETS_SRC="secrets/{{app}}/.env" + SECRETS_DST="k3s/{{cluster}}/apps/{{app}}/manifests/base/secrets/.secrets.env" + + if [ ! -f "$SECRETS_SRC" ]; then + echo "Error: $SECRETS_SRC not found" + echo "Create it: cp secrets/{{app}}/.env.sample secrets/{{app}}/.env && just secret-encrypt {{app}}" + exit 1 + fi + + {{ansible_vault}} decrypt --output "$SECRETS_DST" {{vault_password}} "$SECRETS_SRC" + trap 'rm -f "$SECRETS_DST"' EXIT + + cd k3s/{{cluster}} + export KUBECONFIG="$(pwd)/.kubeconfig.yaml" + kubectl apply -k apps/{{app}}/manifests/base/ + echo "Deployed {{app}} to {{cluster}}" + +# --------------------------------------------------------------------------- +# Ansible +# --------------------------------------------------------------------------- + +# Install ansible and dependencies +[group('ansible')] +ansible-install: + cd ansible && uv sync && uv run ansible-galaxy install -r requirements.yml + +# Test connection to a random VM +[group('ansible')] +ansible-test inventory="linode.yml": + #!/usr/bin/env bash + set -eu + cd ansible + VM_COUNT=$(uv run ansible-inventory -i inventory/{{inventory}} --list 2>/dev/null | jq -r '._meta.hostvars | keys | length') + echo "Found $VM_COUNT VMs" + [ "$VM_COUNT" -eq 
0 ] && echo "No VMs found" && exit 1 + RANDOM_INDEX=$(( RANDOM % VM_COUNT )) + uv run ansible -i inventory/{{inventory}} "all[$RANDOM_INDEX]" -m ping --one-line -v + +# --------------------------------------------------------------------------- +# Terraform +# --------------------------------------------------------------------------- + +# List all Terraform workspaces +[group('terraform')] +tf-list: + @find terraform -name ".terraform.lock.hcl" -exec dirname {} \; | sort + +# Format Terraform files +[group('terraform')] +tf-format: + #!/usr/bin/env bash + set -eu + for ws in $(find terraform -name ".terraform.lock.hcl" -exec dirname {} \;); do + echo "Formatting $ws" + terraform -chdir=$ws fmt + done + +# Validate Terraform configurations +[group('terraform')] +tf-validate: + #!/usr/bin/env bash + set -eu + for ws in $(find terraform -name ".terraform.lock.hcl" -exec dirname {} \;); do + echo "Validating $ws" + terraform -chdir=$ws validate + done + +# Initialize Terraform workspaces +[group('terraform')] +tf-init: + #!/usr/bin/env bash + set -eu + for ws in $(find terraform -name ".terraform.lock.hcl" -exec dirname {} \;); do + echo "Initializing $ws" + terraform -chdir=$ws init + done + +# Initialize and upgrade Terraform workspaces +[group('terraform')] +tf-init-upgrade: + #!/usr/bin/env bash + set -eu + for ws in $(find terraform -name ".terraform.lock.hcl" -exec dirname {} \;); do + echo "Upgrading $ws" + terraform -chdir=$ws init -upgrade + done + +# Plan all Terraform workspaces +[group('terraform')] +tf-plan: + #!/usr/bin/env bash + set -eu + for ws in $(find terraform -name ".terraform.lock.hcl" -exec dirname {} \;); do + echo "Planning $ws" + terraform -chdir=$ws plan + done diff --git a/k3s/justfile b/k3s/justfile deleted file mode 100644 index d89aef3ac..000000000 --- a/k3s/justfile +++ /dev/null @@ -1,27 +0,0 @@ -set shell := ["bash", "-cu"] - -# Show available recipes -default: - @just --list - -# Encrypt app secrets for a given app path (e.g., just 
encrypt ops-backoffice/apps/appsmith) -encrypt path: - sops --encrypt {{path}}/manifests/base/secrets/.secrets.env \ - > {{path}}/manifests/base/secrets/.secrets.enc.env - @echo "Encrypted: {{path}}/manifests/base/secrets/.secrets.enc.env" - -# Decrypt app secrets for a given app path -decrypt path: - sops --decrypt {{path}}/manifests/base/secrets/.secrets.enc.env \ - > {{path}}/manifests/base/secrets/.secrets.env - @echo "Decrypted: {{path}}/manifests/base/secrets/.secrets.env" - -# Deploy an app (decrypt + apply) -deploy cluster app: - #!/usr/bin/env bash - set -eu - cd {{cluster}} - export $(cat .env | xargs) - just decrypt apps/{{app}} - kubectl apply -k apps/{{app}}/manifests/base/ - echo "Deployed {{app}} to {{cluster}}" diff --git a/terraform/justfile b/terraform/justfile deleted file mode 100644 index 0c0a90e80..000000000 --- a/terraform/justfile +++ /dev/null @@ -1,68 +0,0 @@ -# Find all directories containing .terraform.lock.hcl files -workspaces := `find . -name ".terraform.lock.hcl" -exec dirname {} \; | tr '\n' ' '` - -# Show available recipes -default: - @just --list - -# Format Terraform files in all workspaces -format: - @echo "Formatting Terraform files in all workspaces..." - @for workspace in {{workspaces}}; do \ - echo "Formatting $workspace"; \ - terraform -chdir=$workspace fmt; \ - done - @echo "Formatting complete." - -# List all detected Terraform workspaces -list-workspaces: - @echo "Detected Terraform workspaces:" - @for workspace in {{workspaces}}; do \ - echo " $workspace"; \ - done - -# Validate Terraform configurations in all workspaces -validate: - @echo "Validating Terraform configurations in all workspaces..." - @for workspace in {{workspaces}}; do \ - echo "Validating $workspace"; \ - terraform -chdir=$workspace validate; \ - done - @echo "Validation complete." - -# Initialize Terraform in all workspaces -init: - @echo "Initializing Terraform in all workspaces..." 
- @for workspace in {{workspaces}}; do \ - echo "Initializing $workspace"; \ - terraform -chdir=$workspace init; \ - done - @echo "Initialization complete." - -# Initialize and upgrade Terraform in all workspaces -init-upgrade: - @echo "Initializing and upgrading Terraform in all workspaces..." - @for workspace in {{workspaces}}; do \ - echo "Initializing and upgrading $workspace"; \ - terraform -chdir=$workspace init -upgrade; \ - done - @echo "Initialization and upgrade complete." - -# Run Terraform plan in all workspaces -plan: - @echo "Running Terraform plan in all workspaces..." - @for workspace in {{workspaces}}; do \ - echo "Planning $workspace"; \ - terraform -chdir=$workspace plan; \ - done - @echo "Planning complete." - -# Remove Terraform cache files from all workspaces -[confirm("This will delete .terraform, tfstate, and lock files. Continue?")] -clean: - @echo "Cleaning Terraform cache files from all workspaces..." - @for workspace in {{workspaces}}; do \ - echo "Cleaning $workspace"; \ - rm -rf $workspace/.terraform $workspace/*.tfstate* $workspace/.terraform.lock.hcl; \ - done - @echo "Cleaning complete." From 4ebcc24837188f7a14aa7664e905cf78d3ebff7e Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Wed, 1 Apr 2026 23:10:17 +0530 Subject: [PATCH 04/40] feat: consolidate secrets management with ansible-vault Single secrets/ directory with ansible-vault encryption replaces scattered .secrets.env files and SOPS+age. - secrets//.env.sample: plaintext templates (git tracked) - secrets//.env: ansible-vault encrypted (gitignored, shared via 1Password) - Remove .sops.yaml (SOPS replaced by ansible-vault) - Update .gitignore to whitelist secrets/ samples only - Add secrets/.gitignore to exclude encrypted .env files - Add secrets/README.md documenting the approach Encrypted .env files exist for: global, do-legacy, do-universe, ansible, appsmith, outline. New apps (windmill, argocd, zot) have samples only until deployed. 
--- .gitignore | 6 +++ .sops.yaml | 3 -- k3s/gxy-management/.gitignore | 5 ++- secrets/.gitignore | 6 +++ secrets/README.md | 65 +++++++++++++++++++++++++++++++++ secrets/ansible/.env.sample | 4 ++ secrets/appsmith/.env.sample | 25 +++++++++++++ secrets/argocd/.env.sample | 9 +++++ secrets/do-legacy/.env.sample | 1 + secrets/do-universe/.env.sample | 3 ++ secrets/global/.env.sample | 2 + secrets/outline/.env.sample | 38 +++++++++++++++++++ secrets/windmill/.env.sample | 17 +++++++++ secrets/zot/.env.sample | 17 +++++++++ 14 files changed, 197 insertions(+), 4 deletions(-) delete mode 100644 .sops.yaml create mode 100644 secrets/.gitignore create mode 100644 secrets/README.md create mode 100644 secrets/ansible/.env.sample create mode 100644 secrets/appsmith/.env.sample create mode 100644 secrets/argocd/.env.sample create mode 100644 secrets/do-legacy/.env.sample create mode 100644 secrets/do-universe/.env.sample create mode 100644 secrets/global/.env.sample create mode 100644 secrets/outline/.env.sample create mode 100644 secrets/windmill/.env.sample create mode 100644 secrets/zot/.env.sample diff --git a/.gitignore b/.gitignore index 73b3f2a3a..f225add8e 100644 --- a/.gitignore +++ b/.gitignore @@ -50,6 +50,12 @@ ansible/inventory/hosts *.env.* .envrc + +# Allow secrets/ directory (ansible-vault encrypted, safe in public repo) +!secrets/**/.env +!secrets/**/.env.sample +!secrets/**/README.md + .kubeconfig.yaml *.crt *.key diff --git a/.sops.yaml b/.sops.yaml deleted file mode 100644 index 589ca98e9..000000000 --- a/.sops.yaml +++ /dev/null @@ -1,3 +0,0 @@ -creation_rules: - - path_regex: \.secrets\.env$ - age: "age1dj2tkgtplys5whp0rnw8kd4ell9m6jgfac5d8m8nprmgap70047sgfjtfr" diff --git a/k3s/gxy-management/.gitignore b/k3s/gxy-management/.gitignore index c1dde3ec9..4757c806f 100644 --- a/k3s/gxy-management/.gitignore +++ b/k3s/gxy-management/.gitignore @@ -1,7 +1,10 @@ +# Environment (direnv) +.env + # Kubeconfig .kubeconfig.yaml -# Secrets (managed locally, not in 
git) +# Decrypted secrets (temporary, generated by just deploy) apps/*/manifests/base/secrets/.secrets.env apps/*/manifests/base/secrets/tls.crt apps/*/manifests/base/secrets/tls.key diff --git a/secrets/.gitignore b/secrets/.gitignore new file mode 100644 index 000000000..7ba0a59d5 --- /dev/null +++ b/secrets/.gitignore @@ -0,0 +1,6 @@ +# Ignore encrypted .env files (not tracked, shared via 1Password) +*/.env + +# Track samples and docs +!*/.env.sample +!README.md diff --git a/secrets/README.md b/secrets/README.md new file mode 100644 index 000000000..ac4285fe1 --- /dev/null +++ b/secrets/README.md @@ -0,0 +1,65 @@ +# Secrets + +All secrets in one place. Encrypted with `ansible-vault`. One password unlocks everything. + +## Layout + +``` +secrets// + .env # ansible-vault encrypted (git tracked) + .env.sample # plaintext template (git tracked) +``` + +## Setup + +Get the vault password from a team member, then: + +```bash +echo 'your-vault-password' > ~/.ansible-vault-password +chmod 600 ~/.ansible-vault-password +``` + +Or use 1Password: + +```bash +--vault-password-file <(op read "op://Service-Automation/Ansible-Vault-Password/Ansible-Vault-Password") +``` + +## Commands + +```bash +# View +ansible-vault view secrets//.env + +# Edit +ansible-vault edit secrets//.env + +# Create new +cp secrets//.env.sample secrets//.env +# fill in values +ansible-vault encrypt secrets//.env + +# Decrypt to stdout +ansible-vault decrypt --output - secrets//.env +``` + +## Deploy a K8s app + +```bash +cd k3s +just deploy +``` + +## Directories + +| Directory | Purpose | +| -------------- | ----------------------------------------------- | +| `global/` | Shared tokens (Cloudflare, Linode) | +| `do-legacy/` | Legacy DO team API token | +| `do-universe/` | Universe DO team API token + Spaces credentials | +| `ansible/` | Playbook runtime secrets (S3, Tailscale OAuth) | +| `appsmith/` | Appsmith app secrets | +| `outline/` | Outline app secrets | +| `windmill/` | Windmill app secrets | 
+| `argocd/` | ArgoCD app secrets | +| `zot/` | Zot registry secrets (S3, htpasswd) | diff --git a/secrets/ansible/.env.sample b/secrets/ansible/.env.sample new file mode 100644 index 000000000..5111e620e --- /dev/null +++ b/secrets/ansible/.env.sample @@ -0,0 +1,4 @@ +vault_do_spaces_access_key= +vault_do_spaces_secret_key= +vault_tailscale_oauth_client_id= +vault_tailscale_oauth_client_secret= diff --git a/secrets/appsmith/.env.sample b/secrets/appsmith/.env.sample new file mode 100644 index 000000000..ba985af17 --- /dev/null +++ b/secrets/appsmith/.env.sample @@ -0,0 +1,25 @@ +# Appsmith secrets for appsmith.freecodecamp.net +# All values must be unquoted + +# ============================================================================= +# REQUIRED - Core +# ============================================================================= +# Generate with: openssl rand -hex 32 +APPSMITH_ENCRYPTION_PASSWORD= +APPSMITH_ENCRYPTION_SALT= +APPSMITH_SUPERVISOR_PASSWORD= + +# MongoDB Atlas connection string (unquoted) +APPSMITH_DB_URL=mongodb+srv://USER:PASS@xxx.yyy.mongodb.net/appsmith?retryWrites=true&w=majority&authSource=admin + +# ============================================================================= +# Optional +# ============================================================================= +APPSMITH_DISABLE_TELEMETRY=true +APPSMITH_DISABLE_INTERCOM=true + +# ============================================================================= +# REQUIRED - Google OAuth SSO +# ============================================================================= +# Appsmith doesn't support Google OAuth SSO configs from this file. Please configure +# them in the Appsmith UI. 
diff --git a/secrets/argocd/.env.sample b/secrets/argocd/.env.sample new file mode 100644 index 000000000..349b0a343 --- /dev/null +++ b/secrets/argocd/.env.sample @@ -0,0 +1,9 @@ +# ArgoCD secrets for ops-k3s-gxy-mgmt-argocd.batfish-ray.ts.net +# All values must be unquoted + +# ============================================================================= +# REQUIRED - Admin Password +# ============================================================================= +# Generate bcrypt hash with: argocd account bcrypt --password +# Or use: htpasswd -nbBC 10 "" | tr -d ':\n' | sed 's/$2y/$2a/' +ARGOCD_ADMIN_PASSWORD= diff --git a/secrets/do-legacy/.env.sample b/secrets/do-legacy/.env.sample new file mode 100644 index 000000000..92c79eb93 --- /dev/null +++ b/secrets/do-legacy/.env.sample @@ -0,0 +1 @@ +DO_API_TOKEN= diff --git a/secrets/do-universe/.env.sample b/secrets/do-universe/.env.sample new file mode 100644 index 000000000..2e44f405b --- /dev/null +++ b/secrets/do-universe/.env.sample @@ -0,0 +1,3 @@ +DO_API_TOKEN= +DO_SPACES_ACCESS_KEY= +DO_SPACES_SECRET_KEY= diff --git a/secrets/global/.env.sample b/secrets/global/.env.sample new file mode 100644 index 000000000..7ef0883b8 --- /dev/null +++ b/secrets/global/.env.sample @@ -0,0 +1,2 @@ +CLOUDFLARE_API_TOKEN= +LINODE_API_TOKEN= diff --git a/secrets/outline/.env.sample b/secrets/outline/.env.sample new file mode 100644 index 000000000..352e77d79 --- /dev/null +++ b/secrets/outline/.env.sample @@ -0,0 +1,38 @@ +# Outline secrets for outline.freecodecamp.net +# All values must be unquoted +# Generate secrets with: openssl rand -hex 32 + +# ============================================================================= +# REQUIRED - Application Secrets +# ============================================================================= +SECRET_KEY= +UTILS_SECRET= + +# ============================================================================= +# REQUIRED - Database (internal, don't change unless you know what you're 
doing) +# ============================================================================= +POSTGRES_USER=outline +POSTGRES_PASSWORD= +POSTGRES_DB=outline + +# ============================================================================= +# REQUIRED - Google OAuth +# ============================================================================= +# Create at: https://console.cloud.google.com/apis/credentials +# Authorized redirect URI: https://outline.freecodecamp.net/auth/google.callback +GOOGLE_CLIENT_ID= +GOOGLE_CLIENT_SECRET= + +# ============================================================================= +# OPTIONAL - GitHub Integration (PR/Issue previews) +# ============================================================================= +# Create GitHub App at: https://github.com/settings/apps +# Callback URL: https://outline.freecodecamp.net/api/github.callback +# Enable "Request user authorization (OAuth) during installation" +# Disable webhooks (not needed for previews) +# Base64 encode private key: openssl base64 -in private-key.pem -out key-b64.txt +GITHUB_CLIENT_ID= +GITHUB_CLIENT_SECRET= +GITHUB_APP_NAME= +GITHUB_APP_ID= +GITHUB_APP_PRIVATE_KEY= diff --git a/secrets/windmill/.env.sample b/secrets/windmill/.env.sample new file mode 100644 index 000000000..8479435c8 --- /dev/null +++ b/secrets/windmill/.env.sample @@ -0,0 +1,17 @@ +# Windmill secrets for windmill.freecodecamp.net +# All values must be unquoted + +# ============================================================================= +# REQUIRED - Database +# ============================================================================= +# PostgreSQL connection string for Windmill +# For embedded PostgreSQL, match the credentials in the Helm values +# For external PostgreSQL, use the managed database connection string +DATABASE_URL=postgres://postgres:windmill@windmill-postgresql/windmill?sslmode=disable + +# ============================================================================= +# REQUIRED - 
Admin +# ============================================================================= +# Initial admin email and password (used on first setup) +WINDMILL_ADMIN_EMAIL=admin@freecodecamp.org +WINDMILL_ADMIN_PASSWORD= diff --git a/secrets/zot/.env.sample b/secrets/zot/.env.sample new file mode 100644 index 000000000..934b6c728 --- /dev/null +++ b/secrets/zot/.env.sample @@ -0,0 +1,17 @@ +# Zot OCI Registry secrets for ops-k3s-gxy-mgmt-registry.batfish-ray.ts.net +# All values must be unquoted + +# ============================================================================= +# REQUIRED - S3 Storage (DigitalOcean Spaces) +# ============================================================================= +# DO Spaces access credentials +# Create at: https://cloud.digitalocean.com/account/api/spaces +S3_ACCESS_KEY= +S3_SECRET_KEY= + +# ============================================================================= +# REQUIRED - Registry Authentication +# ============================================================================= +# htpasswd entries for registry auth (one per line, newline-separated) +# Generate with: htpasswd -nbBC 10 +HTPASSWD= From 5810c792ffdbd1b6dcc22aeb6940ffcdaa7af6af Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Wed, 1 Apr 2026 23:28:17 +0530 Subject: [PATCH 05/40] feat: add direnv hierarchy and secrets bootstrap workflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move from per-directory .envrc (ansible/) to root .envrc with per-cluster overrides via direnv source_env hierarchy. 
- Root .envrc: loads .env (global tokens) + adds ansible venv to PATH - k3s//.envrc: inherits root, loads cluster-specific .env (DO_API_TOKEN + KUBECONFIG) - just secret-bootstrap: decrypts global tokens to root .env - just secret-bootstrap-cluster: decrypts team tokens to cluster .env - Rename do-legacy → do-primary - Update .gitignore: track .envrc files, whitelist secrets/ samples --- .envrc | 5 ++++ .gitignore | 11 ++++++--- justfile | 24 +++++++++++++++++++ k3s/gxy-management/.envrc | 2 ++ k3s/ops-backoffice-tools/.envrc | 2 ++ secrets/README.md | 2 +- secrets/{do-legacy => do-primary}/.env.sample | 0 7 files changed, 42 insertions(+), 4 deletions(-) create mode 100644 .envrc create mode 100644 k3s/gxy-management/.envrc create mode 100644 k3s/ops-backoffice-tools/.envrc rename secrets/{do-legacy => do-primary}/.env.sample (100%) diff --git a/.envrc b/.envrc new file mode 100644 index 000000000..9efcf7736 --- /dev/null +++ b/.envrc @@ -0,0 +1,5 @@ +dotenv_if_exists .env + +if [ -d ansible/.venv ]; then + PATH_add ansible/.venv/bin +fi diff --git a/.gitignore b/.gitignore index f225add8e..0b54bc880 100644 --- a/.gitignore +++ b/.gitignore @@ -50,11 +50,16 @@ ansible/inventory/hosts *.env.* .envrc +# Allow .envrc files (direnv config, not secrets) +!.envrc +!k3s/**/.envrc -# Allow secrets/ directory (ansible-vault encrypted, safe in public repo) -!secrets/**/.env +# Allow secrets/ samples and docs (NOT encrypted .env files) !secrets/**/.env.sample -!secrets/**/README.md +!secrets/.gitignore +!secrets/README.md + + .kubeconfig.yaml *.crt diff --git a/justfile b/justfile index 75fceb81b..09e738c37 100644 --- a/justfile +++ b/justfile @@ -11,6 +11,30 @@ default: # Secrets # --------------------------------------------------------------------------- +# Bootstrap root .env (global tokens only — Cloudflare, Linode) +[group('secrets')] +secret-bootstrap: + #!/usr/bin/env bash + set -eu + SRC="secrets/global/.env" + [ -f "$SRC" ] || { echo "Error: $SRC not found. 
Get it from 1Password."; exit 1; } + {{ansible_vault}} decrypt --output .env {{vault_password}} "$SRC" + echo "Bootstrapped .env (global tokens)" + echo "Run: direnv allow" + +# Bootstrap a cluster .env (DO_API_TOKEN + KUBECONFIG) +[group('secrets')] +secret-bootstrap-cluster cluster team: + #!/usr/bin/env bash + set -eu + SRC="secrets/do-{{team}}/.env" + DEST="k3s/{{cluster}}/.env" + [ -f "$SRC" ] || { echo "Error: $SRC not found. Get it from 1Password."; exit 1; } + {{ansible_vault}} decrypt --output - {{vault_password}} "$SRC" > "$DEST" + echo "KUBECONFIG=.kubeconfig.yaml" >> "$DEST" + echo "Bootstrapped $DEST (team: {{team}})" + echo "Run: cd k3s/{{cluster}} && direnv allow" + # Encrypt a secret [group('secrets')] secret-encrypt name: diff --git a/k3s/gxy-management/.envrc b/k3s/gxy-management/.envrc new file mode 100644 index 000000000..f3b9e1705 --- /dev/null +++ b/k3s/gxy-management/.envrc @@ -0,0 +1,2 @@ +source_env ../../.envrc +dotenv_if_exists .env diff --git a/k3s/ops-backoffice-tools/.envrc b/k3s/ops-backoffice-tools/.envrc new file mode 100644 index 000000000..f3b9e1705 --- /dev/null +++ b/k3s/ops-backoffice-tools/.envrc @@ -0,0 +1,2 @@ +source_env ../../.envrc +dotenv_if_exists .env diff --git a/secrets/README.md b/secrets/README.md index ac4285fe1..8d30f00ea 100644 --- a/secrets/README.md +++ b/secrets/README.md @@ -55,7 +55,7 @@ just deploy | Directory | Purpose | | -------------- | ----------------------------------------------- | | `global/` | Shared tokens (Cloudflare, Linode) | -| `do-legacy/` | Legacy DO team API token | +| `do-primary/` | Primary DO team API token | | `do-universe/` | Universe DO team API token + Spaces credentials | | `ansible/` | Playbook runtime secrets (S3, Tailscale OAuth) | | `appsmith/` | Appsmith app secrets | diff --git a/secrets/do-legacy/.env.sample b/secrets/do-primary/.env.sample similarity index 100% rename from secrets/do-legacy/.env.sample rename to secrets/do-primary/.env.sample From 
61370732c297bc370cac5dd0e843ad41ae397df4 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Thu, 2 Apr 2026 21:58:48 +0530 Subject: [PATCH 06/40] fix: move scratchpad --- .gitignore | 6 +++++- {__scratchpad__ => .scratchpad}/.gitkeep | 0 {__scratchpad__ => .scratchpad}/README.md | 0 3 files changed, 5 insertions(+), 1 deletion(-) rename {__scratchpad__ => .scratchpad}/.gitkeep (100%) rename {__scratchpad__ => .scratchpad}/README.md (100%) diff --git a/.gitignore b/.gitignore index 0b54bc880..3ee927145 100644 --- a/.gitignore +++ b/.gitignore @@ -36,7 +36,7 @@ terraform.rc .vscode/ # Ignore User-specific temporary files -__scratchpad__/ +.scratchpad # Ignore generated files manifest.json @@ -72,3 +72,7 @@ secrets.overrides.yaml o11y/defaults/ .beads-credential-key + +# Beads / Dolt files (added by bd init) +.dolt/ +*.db diff --git a/__scratchpad__/.gitkeep b/.scratchpad/.gitkeep similarity index 100% rename from __scratchpad__/.gitkeep rename to .scratchpad/.gitkeep diff --git a/__scratchpad__/README.md b/.scratchpad/README.md similarity index 100% rename from __scratchpad__/README.md rename to .scratchpad/README.md From c9c1b4e33f9197d49c7346f6aba800fbcf39fe53 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Thu, 2 Apr 2026 22:04:48 +0530 Subject: [PATCH 07/40] fix: move archive --- .../ansible/play-k3s--clickhouse.yml | 0 .../2026-03-observability-teardown/ansible/play-o11y--vector.yml | 0 .../ansible/vector-template/vector.yaml.j2 | 0 .../dashboards/clickhouse-monitoring.json | 0 .../dashboards/nginx-access-logs.json | 0 .../2026-03-observability-teardown/grafana-nginx-dashboard.md | 0 .../2026-03-observability-teardown/nginx-logs-schema.md | 0 .../ops-backoffice-tools/grafana/charts/grafana/values.yaml | 0 .../ops-backoffice-tools/grafana/manifests/base/gateway.yaml | 0 .../ops-backoffice-tools/grafana/manifests/base/httproutes.yaml | 0 .../grafana/manifests/base/kustomization.yaml | 0 .../ops-backoffice-tools/grafana/manifests/base/namespace.yaml | 0 
.../grafana/manifests/base/secrets/.gitignore | 0 .../grafana/manifests/base/secrets/.secrets.env.sample | 0 .../ops-backoffice-tools/n8n/manifests/base/gateway.yaml | 0 .../ops-backoffice-tools/n8n/manifests/base/httproutes.yaml | 0 .../ops-backoffice-tools/n8n/manifests/base/kustomization.yaml | 0 .../n8n/manifests/base/n8n-main-deployment.yaml | 0 .../ops-backoffice-tools/n8n/manifests/base/n8n-main-service.yaml | 0 .../n8n/manifests/base/n8n-worker-deployment.yaml | 0 .../ops-backoffice-tools/n8n/manifests/base/namespace.yaml | 0 .../n8n/manifests/base/postgres-deployment.yaml | 0 .../ops-backoffice-tools/n8n/manifests/base/postgres-service.yaml | 0 .../ops-backoffice-tools/n8n/manifests/base/pvc.yaml | 0 .../ops-backoffice-tools/n8n/manifests/base/redis-deployment.yaml | 0 .../ops-backoffice-tools/n8n/manifests/base/redis-service.yaml | 0 .../ops-backoffice-tools/n8n/manifests/base/secrets/.gitignore | 0 .../prometheus/charts/kube-prometheus-stack/values.yaml | 0 .../prometheus/manifests/base/kustomization.yaml | 0 .../prometheus/manifests/base/longhorn-servicemonitor.yaml | 0 .../ops-backoffice-tools/prometheus/manifests/base/namespace.yaml | 0 .../prometheus/manifests/base/secrets/.gitignore | 0 .../prometheus/manifests/base/tailscale-ingress.yaml | 0 .../2026-03-observability-teardown/ops-logs-clickhouse/.gitignore | 0 .../apps/clickhouse/manifests/base/clickhouse-installation.yaml | 0 .../apps/clickhouse/manifests/base/clickhouse-keeper.yaml | 0 .../apps/clickhouse/manifests/base/kustomization.yaml | 0 .../apps/clickhouse/manifests/base/namespace.yaml | 0 .../apps/clickhouse/manifests/base/secrets/.gitignore | 0 .../clickhouse/manifests/base/secrets/users-secret.yaml.sample | 0 .../apps/clickhouse/manifests/base/service-tailscale.yaml | 0 .../apps/clickhouse/schemas/002-logs-nginx-stg.sql | 0 .../apps/clickhouse/schemas/003-logs-nginx-prd.sql | 0 .../cluster/charts/tailscale-operator/values.yaml | 0 
.../ops-logs-clickhouse/cluster/charts/traefik/values.yaml | 0 .../ops-logs-clickhouse/cluster/tailscale/README.md | 0 .../ops-logs-clickhouse/cluster/tailscale/operator-values.yaml | 0 .../2026-03-observability-teardown/teardown-runbook.md | 0 48 files changed, 0 insertions(+), 0 deletions(-) rename {archive => .archive}/2026-03-observability-teardown/ansible/play-k3s--clickhouse.yml (100%) rename {archive => .archive}/2026-03-observability-teardown/ansible/play-o11y--vector.yml (100%) rename {archive => .archive}/2026-03-observability-teardown/ansible/vector-template/vector.yaml.j2 (100%) rename {archive => .archive}/2026-03-observability-teardown/dashboards/clickhouse-monitoring.json (100%) rename {archive => .archive}/2026-03-observability-teardown/dashboards/nginx-access-logs.json (100%) rename {archive => .archive}/2026-03-observability-teardown/grafana-nginx-dashboard.md (100%) rename {archive => .archive}/2026-03-observability-teardown/nginx-logs-schema.md (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-backoffice-tools/grafana/charts/grafana/values.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/gateway.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/httproutes.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/kustomization.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/namespace.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/secrets/.gitignore (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/secrets/.secrets.env.sample (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/gateway.yaml 
(100%) rename {archive => .archive}/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/httproutes.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/kustomization.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-main-deployment.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-main-service.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-worker-deployment.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/namespace.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/postgres-deployment.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/postgres-service.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/pvc.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/redis-deployment.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/redis-service.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/secrets/.gitignore (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-backoffice-tools/prometheus/charts/kube-prometheus-stack/values.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/kustomization.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/longhorn-servicemonitor.yaml (100%) rename {archive => 
.archive}/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/namespace.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/secrets/.gitignore (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/tailscale-ingress.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-logs-clickhouse/.gitignore (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/clickhouse-installation.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/clickhouse-keeper.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/kustomization.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/namespace.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/secrets/.gitignore (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/secrets/users-secret.yaml.sample (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/service-tailscale.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/schemas/002-logs-nginx-stg.sql (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/schemas/003-logs-nginx-prd.sql (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-logs-clickhouse/cluster/charts/tailscale-operator/values.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-logs-clickhouse/cluster/charts/traefik/values.yaml (100%) rename 
{archive => .archive}/2026-03-observability-teardown/ops-logs-clickhouse/cluster/tailscale/README.md (100%) rename {archive => .archive}/2026-03-observability-teardown/ops-logs-clickhouse/cluster/tailscale/operator-values.yaml (100%) rename {archive => .archive}/2026-03-observability-teardown/teardown-runbook.md (100%) diff --git a/archive/2026-03-observability-teardown/ansible/play-k3s--clickhouse.yml b/.archive/2026-03-observability-teardown/ansible/play-k3s--clickhouse.yml similarity index 100% rename from archive/2026-03-observability-teardown/ansible/play-k3s--clickhouse.yml rename to .archive/2026-03-observability-teardown/ansible/play-k3s--clickhouse.yml diff --git a/archive/2026-03-observability-teardown/ansible/play-o11y--vector.yml b/.archive/2026-03-observability-teardown/ansible/play-o11y--vector.yml similarity index 100% rename from archive/2026-03-observability-teardown/ansible/play-o11y--vector.yml rename to .archive/2026-03-observability-teardown/ansible/play-o11y--vector.yml diff --git a/archive/2026-03-observability-teardown/ansible/vector-template/vector.yaml.j2 b/.archive/2026-03-observability-teardown/ansible/vector-template/vector.yaml.j2 similarity index 100% rename from archive/2026-03-observability-teardown/ansible/vector-template/vector.yaml.j2 rename to .archive/2026-03-observability-teardown/ansible/vector-template/vector.yaml.j2 diff --git a/archive/2026-03-observability-teardown/dashboards/clickhouse-monitoring.json b/.archive/2026-03-observability-teardown/dashboards/clickhouse-monitoring.json similarity index 100% rename from archive/2026-03-observability-teardown/dashboards/clickhouse-monitoring.json rename to .archive/2026-03-observability-teardown/dashboards/clickhouse-monitoring.json diff --git a/archive/2026-03-observability-teardown/dashboards/nginx-access-logs.json b/.archive/2026-03-observability-teardown/dashboards/nginx-access-logs.json similarity index 100% rename from 
archive/2026-03-observability-teardown/dashboards/nginx-access-logs.json rename to .archive/2026-03-observability-teardown/dashboards/nginx-access-logs.json diff --git a/archive/2026-03-observability-teardown/grafana-nginx-dashboard.md b/.archive/2026-03-observability-teardown/grafana-nginx-dashboard.md similarity index 100% rename from archive/2026-03-observability-teardown/grafana-nginx-dashboard.md rename to .archive/2026-03-observability-teardown/grafana-nginx-dashboard.md diff --git a/archive/2026-03-observability-teardown/nginx-logs-schema.md b/.archive/2026-03-observability-teardown/nginx-logs-schema.md similarity index 100% rename from archive/2026-03-observability-teardown/nginx-logs-schema.md rename to .archive/2026-03-observability-teardown/nginx-logs-schema.md diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/charts/grafana/values.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/charts/grafana/values.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/charts/grafana/values.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/charts/grafana/values.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/gateway.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/gateway.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/gateway.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/gateway.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/httproutes.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/httproutes.yaml similarity index 100% rename from 
archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/httproutes.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/httproutes.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/kustomization.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/kustomization.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/kustomization.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/kustomization.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/namespace.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/namespace.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/namespace.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/namespace.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/secrets/.gitignore b/.archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/secrets/.gitignore similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/secrets/.gitignore rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/secrets/.gitignore diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/secrets/.secrets.env.sample b/.archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/secrets/.secrets.env.sample similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/secrets/.secrets.env.sample rename to 
.archive/2026-03-observability-teardown/ops-backoffice-tools/grafana/manifests/base/secrets/.secrets.env.sample diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/gateway.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/gateway.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/gateway.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/gateway.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/httproutes.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/httproutes.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/httproutes.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/httproutes.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/kustomization.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/kustomization.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/kustomization.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/kustomization.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-main-deployment.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-main-deployment.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-main-deployment.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-main-deployment.yaml diff --git 
a/archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-main-service.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-main-service.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-main-service.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-main-service.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-worker-deployment.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-worker-deployment.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-worker-deployment.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/n8n-worker-deployment.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/namespace.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/namespace.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/namespace.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/namespace.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/postgres-deployment.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/postgres-deployment.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/postgres-deployment.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/postgres-deployment.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/postgres-service.yaml 
b/.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/postgres-service.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/postgres-service.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/postgres-service.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/pvc.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/pvc.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/pvc.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/pvc.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/redis-deployment.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/redis-deployment.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/redis-deployment.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/redis-deployment.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/redis-service.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/redis-service.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/redis-service.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/redis-service.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/secrets/.gitignore b/.archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/secrets/.gitignore similarity index 100% rename from 
archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/secrets/.gitignore rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/n8n/manifests/base/secrets/.gitignore diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/charts/kube-prometheus-stack/values.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/charts/kube-prometheus-stack/values.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/charts/kube-prometheus-stack/values.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/charts/kube-prometheus-stack/values.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/kustomization.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/kustomization.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/kustomization.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/kustomization.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/longhorn-servicemonitor.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/longhorn-servicemonitor.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/longhorn-servicemonitor.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/longhorn-servicemonitor.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/namespace.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/namespace.yaml similarity index 100% rename from 
archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/namespace.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/namespace.yaml diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/secrets/.gitignore b/.archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/secrets/.gitignore similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/secrets/.gitignore rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/secrets/.gitignore diff --git a/archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/tailscale-ingress.yaml b/.archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/tailscale-ingress.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/tailscale-ingress.yaml rename to .archive/2026-03-observability-teardown/ops-backoffice-tools/prometheus/manifests/base/tailscale-ingress.yaml diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/.gitignore b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/.gitignore similarity index 100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/.gitignore rename to .archive/2026-03-observability-teardown/ops-logs-clickhouse/.gitignore diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/clickhouse-installation.yaml b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/clickhouse-installation.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/clickhouse-installation.yaml rename to 
.archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/clickhouse-installation.yaml diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/clickhouse-keeper.yaml b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/clickhouse-keeper.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/clickhouse-keeper.yaml rename to .archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/clickhouse-keeper.yaml diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/kustomization.yaml b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/kustomization.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/kustomization.yaml rename to .archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/kustomization.yaml diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/namespace.yaml b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/namespace.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/namespace.yaml rename to .archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/namespace.yaml diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/secrets/.gitignore b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/secrets/.gitignore similarity index 100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/secrets/.gitignore rename to 
.archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/secrets/.gitignore diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/secrets/users-secret.yaml.sample b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/secrets/users-secret.yaml.sample similarity index 100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/secrets/users-secret.yaml.sample rename to .archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/secrets/users-secret.yaml.sample diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/service-tailscale.yaml b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/service-tailscale.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/service-tailscale.yaml rename to .archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/manifests/base/service-tailscale.yaml diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/schemas/002-logs-nginx-stg.sql b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/schemas/002-logs-nginx-stg.sql similarity index 100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/schemas/002-logs-nginx-stg.sql rename to .archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/schemas/002-logs-nginx-stg.sql diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/schemas/003-logs-nginx-prd.sql b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/schemas/003-logs-nginx-prd.sql similarity index 100% rename from 
archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/schemas/003-logs-nginx-prd.sql rename to .archive/2026-03-observability-teardown/ops-logs-clickhouse/apps/clickhouse/schemas/003-logs-nginx-prd.sql diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/charts/tailscale-operator/values.yaml b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/charts/tailscale-operator/values.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/charts/tailscale-operator/values.yaml rename to .archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/charts/tailscale-operator/values.yaml diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/charts/traefik/values.yaml b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/charts/traefik/values.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/charts/traefik/values.yaml rename to .archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/charts/traefik/values.yaml diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/tailscale/README.md b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/tailscale/README.md similarity index 100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/tailscale/README.md rename to .archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/tailscale/README.md diff --git a/archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/tailscale/operator-values.yaml b/.archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/tailscale/operator-values.yaml similarity index 100% rename from archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/tailscale/operator-values.yaml rename to 
.archive/2026-03-observability-teardown/ops-logs-clickhouse/cluster/tailscale/operator-values.yaml diff --git a/archive/2026-03-observability-teardown/teardown-runbook.md b/.archive/2026-03-observability-teardown/teardown-runbook.md similarity index 100% rename from archive/2026-03-observability-teardown/teardown-runbook.md rename to .archive/2026-03-observability-teardown/teardown-runbook.md From b5fc35b126290c6ed5ce76d1552734fced18fcfc Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Thu, 2 Apr 2026 22:14:06 +0530 Subject: [PATCH 08/40] feat(gxy-management): align Day 0 config with spike-plan and ADRs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resolve all config discrepancies between feat/k3s-universe branch and the Universe spike-plan/ADR decisions for the gxy-management cluster. Changes: - Create galaxy-specific traefik-config.yaml (LoadBalancer via ServiceLB) instead of modifying the shared config used by ops-backoffice-tools - Re-enable ServiceLB in playbook, update traefik source to galaxy path - Fix region alignment: nyc3 → fra1 for etcd S3 backups and Zot registry - New FRA1 buckets: universe-backups (etcd) and universe-registry (Zot) - Clean up PSS admission: remove stale cattle-*/longhorn/cert-manager, add cilium and windmill namespace exemptions - Add Gateway API resources (Gateway + HTTPRoute) for Windmill, ArgoCD, and Zot matching the ops-backoffice-tools pattern - Add TLS secret samples for Cloudflare origin certificates - Update ArgoCD/Zot comments to Cloudflare Access model - Add deployment runbook with pre/post ClickOps checklists - Update k3s/README.md with correct specs, region, and architecture --- ansible/play-k3s--galaxy.yml | 10 +-- k3s/README.md | 41 +++++----- k3s/gxy-management/.gitignore | 1 + k3s/gxy-management/README.md | 50 +++++++++++-- .../apps/argocd/charts/argo-cd/values.yaml | 4 +- .../apps/argocd/manifests/base/gateway.yaml | 26 +++++++ 
.../argocd/manifests/base/httproutes.yaml | 75 +++++++++++++++++++ .../argocd/manifests/base/kustomization.yaml | 2 + .../argocd/manifests/base/secrets/.gitignore | 2 + .../manifests/base/secrets/tls.yaml.sample | 11 +++ .../apps/windmill/manifests/base/gateway.yaml | 26 +++++++ .../windmill/manifests/base/httproutes.yaml | 75 +++++++++++++++++++ .../manifests/base/kustomization.yaml | 2 + .../manifests/base/secrets/tls.yaml.sample | 11 +++ .../apps/zot/charts/zot/values.yaml | 12 +-- .../apps/zot/manifests/base/gateway.yaml | 26 +++++++ .../apps/zot/manifests/base/httproutes.yaml | 75 +++++++++++++++++++ .../zot/manifests/base/kustomization.yaml | 2 + .../zot/manifests/base/secrets/.gitignore | 2 + .../manifests/base/secrets/tls.yaml.sample | 11 +++ .../cluster/security/pss-admission.yaml | 11 +-- .../cluster/traefik-config.yaml | 55 ++++++++++++++ 22 files changed, 484 insertions(+), 46 deletions(-) create mode 100644 k3s/gxy-management/apps/argocd/manifests/base/gateway.yaml create mode 100644 k3s/gxy-management/apps/argocd/manifests/base/httproutes.yaml create mode 100644 k3s/gxy-management/apps/argocd/manifests/base/secrets/tls.yaml.sample create mode 100644 k3s/gxy-management/apps/windmill/manifests/base/gateway.yaml create mode 100644 k3s/gxy-management/apps/windmill/manifests/base/httproutes.yaml create mode 100644 k3s/gxy-management/apps/windmill/manifests/base/secrets/tls.yaml.sample create mode 100644 k3s/gxy-management/apps/zot/manifests/base/gateway.yaml create mode 100644 k3s/gxy-management/apps/zot/manifests/base/httproutes.yaml create mode 100644 k3s/gxy-management/apps/zot/manifests/base/secrets/tls.yaml.sample create mode 100644 k3s/gxy-management/cluster/traefik-config.yaml diff --git a/ansible/play-k3s--galaxy.yml b/ansible/play-k3s--galaxy.yml index 090b0babb..7ada06914 100644 --- a/ansible/play-k3s--galaxy.yml +++ b/ansible/play-k3s--galaxy.yml @@ -131,10 +131,10 @@ k3s_version: "v1.34.5+k3s1" cluster_cidr: "10.1.0.0/16" service_cidr: 
"10.11.0.0/16" - etcd_s3_endpoint: "nyc3.digitaloceanspaces.com" - etcd_s3_bucket: "net.freecodecamp.ops-k3s-backups" + etcd_s3_endpoint: "fra1.digitaloceanspaces.com" + etcd_s3_bucket: "net.freecodecamp.universe-backups" etcd_s3_folder: "etcd/{{ galaxy_name }}" - etcd_s3_region: "nyc3" + etcd_s3_region: "fra1" etcd_snapshot_schedule: "0 */6 * * *" etcd_snapshot_retention: 20 api_endpoint: "{{ hostvars[groups['server'][0]]['vpc_ip'] }}" @@ -146,7 +146,6 @@ --flannel-backend=none --disable-network-policy --disable-kube-proxy - --disable=servicelb --secrets-encryption --protect-kernel-defaults --cluster-cidr={{ cluster_cidr }} @@ -191,12 +190,13 @@ gather_facts: false become: true vars: + galaxy_name: "gxy-management" gateway_api_version: "v1.5.1" tasks: - name: Apply Traefik HelmChartConfig copy: - src: "{{ playbook_dir }}/../k3s/shared/traefik-config.yaml" + src: "{{ playbook_dir }}/../k3s/{{ galaxy_name }}/cluster/traefik-config.yaml" dest: /var/lib/rancher/k3s/server/manifests/traefik-config.yaml mode: "0600" diff --git a/k3s/README.md b/k3s/README.md index 9d083d947..88835c2d7 100644 --- a/k3s/README.md +++ b/k3s/README.md @@ -59,10 +59,10 @@ k3s/ ### Droplets -| Cluster | Name Pattern | Count | Specs | Tags | -| -------------- | --------------------------- | ----- | ------------------ | ------------------- | -| tools | ops-vm-tools-k3s-nyc3-0X | 3 | 4 vCPU, 8GB, 160GB | k3s, tools_k3s | -| gxy-management | ops-vm-gxy-mgmt-k3s-nyc3-0X | 3 | 4 vCPU, 8GB, 160GB | k3s, \_gxy-mgmt-k3s | +| Cluster | Name Pattern | Count | Specs | Tags | +| -------------- | --------------------------- | ----- | ------------------- | ------------------- | +| tools | ops-vm-tools-k3s-nyc3-0X | 3 | 4 vCPU, 8GB, 160GB | k3s, tools_k3s | +| gxy-management | ops-vm-gxy-mgmt-k3s-fra1-0X | 3 | 8 vCPU, 16GB, 320GB | k3s, \_gxy-mgmt-k3s | ### Load Balancer @@ -116,11 +116,13 @@ See `tailscale/README.md` (repo root) for device inventory. 
## DNS (Cloudflare) -| Record | Type | Value | -| ------------------------- | ---- | ----------------- | -| appsmith.freecodecamp.net | A | tools LB | -| outline.freecodecamp.net | A | tools LB | -| windmill.freecodecamp.net | A | gxy-management LB | +| Record | Type | Value | +| ------------------------- | ---- | ----------------------- | +| appsmith.freecodecamp.net | A | tools LB | +| outline.freecodecamp.net | A | tools LB | +| windmill.freecodecamp.net | A | gxy-management node IPs | +| argocd.freecodecamp.net | A | gxy-management node IPs | +| registry.freecodecamp.net | A | gxy-management node IPs | --- @@ -165,19 +167,20 @@ Internet → Cloudflare → DO LB → Traefik (NodePort) → Gateway API → App ### gxy-management ``` -Internet → Cloudflare → DO LB → Traefik (NodePort) → Gateway API → Windmill - │ -Tailscale ──────────────────────────────────────────────────→├── ArgoCD - └── Zot +Internet → Cloudflare → Node Public IPs → Traefik (ServiceLB) → Gateway API → Apps + (Access) │ + ┌─────────────┼─────────────┐ + ↓ ↓ ↓ + Windmill ArgoCD Zot -CNI: Cilium Storage: Longhorn (2 replicas) SSH/kubectl: Tailscale +CNI: Cilium Storage: local-path SSH/kubectl: Tailscale ``` -| App | Replicas | Access | Notes | -| -------- | ------------------- | -------------- | ---------- | -| Windmill | 1 server, 2 workers | Public (HTTPS) | | -| ArgoCD | 1 (single replica) | Tailscale-only | | -| Zot | 1 (single replica) | Tailscale-only | S3 backend | +| App | Replicas | Access | Notes | +| -------- | ------------------- | ----------------- | ---------- | +| Windmill | 1 server, 2 workers | Cloudflare Access | | +| ArgoCD | 1 (single replica) | Cloudflare Access | | +| Zot | 1 (single replica) | Cloudflare Access | S3 backend | --- diff --git a/k3s/gxy-management/.gitignore b/k3s/gxy-management/.gitignore index 4757c806f..ae8010926 100644 --- a/k3s/gxy-management/.gitignore +++ b/k3s/gxy-management/.gitignore @@ -8,3 +8,4 @@ apps/*/manifests/base/secrets/.secrets.env 
apps/*/manifests/base/secrets/tls.crt apps/*/manifests/base/secrets/tls.key +apps/*/manifests/base/secrets/tls.yaml diff --git a/k3s/gxy-management/README.md b/k3s/gxy-management/README.md index eb404c4ea..899a5070b 100644 --- a/k3s/gxy-management/README.md +++ b/k3s/gxy-management/README.md @@ -4,7 +4,7 @@ First Universe galaxy. Control plane brain — manages all galaxies. ## Specifications -- **Nodes**: 3× DigitalOcean s-8vcpu-16gb (nyc3) +- **Nodes**: 3× DigitalOcean s-8vcpu-16gb (FRA1) - **CNI**: Cilium (eBPF, Hubble observability) - **Pod CIDR**: 10.1.0.0/16 - **Service CIDR**: 10.11.0.0/16 @@ -13,11 +13,11 @@ First Universe galaxy. Control plane brain — manages all galaxies. ## Applications -| App | Purpose | Access | -| -------- | --------------------- | ------------------------------- | -| Windmill | Workflow engine | NodePort 30080 via Tailscale IP | -| ArgoCD | GitOps (all galaxies) | NodePort 30443 via Tailscale IP | -| Zot | Container registry | NodePort 30500 via Tailscale IP | +| App | Purpose | Access | +| -------- | --------------------- | ----------------------------------------- | +| Windmill | Workflow engine | windmill.freecodecamp.net (all staff) | +| ArgoCD | GitOps (all galaxies) | argocd.freecodecamp.net (platform team) | +| Zot | Container registry | registry.freecodecamp.net (platform team) | ## Quick Access @@ -35,3 +35,41 @@ uv run ansible-playbook -i inventory/digitalocean.yml play-k3s--galaxy.yml \ -e galaxy_name=gxy-management \ --vault-password-file <(op read "op://Service-Automation/Ansible-Vault-Password/Ansible-Vault-Password") ``` + +## Deployment Runbook + +### Pre-deployment (ClickOps) + +1. Create 3x DO droplets (s-8vcpu-16gb) in FRA1 -- attach to VPC, configure firewall (80, 443, 6443 from VPC, 22 from Tailscale) +2. Create DO Spaces bucket `net.freecodecamp.universe-backups` in FRA1 (etcd snapshots) +3. Create DO Spaces bucket `net.freecodecamp.universe-registry` in FRA1 (Zot images) +4. Install Tailscale on all 3 nodes +5. 
Create Cloudflare origin certificate for `*.freecodecamp.net` (15-year, RSA) +6. Populate ansible-vault secrets (`vars/vault-k3s.yml`) +7. Populate app secrets (decrypt samples, fill values, encrypt) + +### Helm Installations + +After playbook completes, before app deploy: + +```bash +helm install argocd argo-cd --repo https://argoproj.github.io/argo-helm -n argocd -f charts/argo-cd/values.yaml +helm install windmill windmill --repo https://windmill-labs.github.io/windmill-helm-charts/ -n windmill -f charts/windmill/values.yaml +helm install zot zot --repo https://zotregistry.dev/helm-charts/ -n zot -f charts/zot/values.yaml +``` + +**IMPORTANT: Helm release names must be exactly `argocd`, `windmill`, `zot`** -- the Gateway API HTTPRoute resources reference service names derived from these release names. + +### Post-deployment (ClickOps) + +1. Create DNS A records (proxied) for windmill/argocd/registry.freecodecamp.net pointing to all 3 node public IPs +2. Create Cloudflare Access policies for each service +3. Apply TLS secrets: `kubectl create secret tls -tls-cloudflare --cert=tls.crt --key=tls.key -n ` +4. Apply kustomize manifests: `kubectl apply -k apps//manifests/base/ -n ` + +### Smoke Tests + +1. `kubectl get nodes` -- all 3 Ready +2. `cilium status` -- all green +3. `curl -H "Host: windmill.freecodecamp.net" https:// -k` +4. 
Verify Cloudflare Access gate diff --git a/k3s/gxy-management/apps/argocd/charts/argo-cd/values.yaml b/k3s/gxy-management/apps/argocd/charts/argo-cd/values.yaml index bf5f15655..d6b16ee27 100644 --- a/k3s/gxy-management/apps/argocd/charts/argo-cd/values.yaml +++ b/k3s/gxy-management/apps/argocd/charts/argo-cd/values.yaml @@ -1,6 +1,6 @@ # Argo CD Helm values for gxy-management cluster # Chart: argo-cd (https://argoproj.github.io/argo-helm) -# Non-HA, Tailscale-only access (no public ingress) +# Non-HA, Cloudflare Access-gated (platform team only) # -- Single-replica (non-HA) deployment for 8GB nodes controller: @@ -56,5 +56,5 @@ applicationSet: configs: params: - # TLS terminated at Tailscale level + # TLS terminated at Traefik (Cloudflare origin cert) "server.insecure": true diff --git a/k3s/gxy-management/apps/argocd/manifests/base/gateway.yaml b/k3s/gxy-management/apps/argocd/manifests/base/gateway.yaml new file mode 100644 index 000000000..5c6257384 --- /dev/null +++ b/k3s/gxy-management/apps/argocd/manifests/base/gateway.yaml @@ -0,0 +1,26 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: argocd-gateway + namespace: argocd +spec: + gatewayClassName: traefik + listeners: + - name: websecure + protocol: HTTPS + port: 8443 + hostname: argocd.freecodecamp.net + tls: + mode: Terminate + certificateRefs: + - name: argocd-tls-cloudflare + allowedRoutes: + namespaces: + from: Same + - name: web + protocol: HTTP + port: 8000 + hostname: argocd.freecodecamp.net + allowedRoutes: + namespaces: + from: Same diff --git a/k3s/gxy-management/apps/argocd/manifests/base/httproutes.yaml b/k3s/gxy-management/apps/argocd/manifests/base/httproutes.yaml new file mode 100644 index 000000000..6759e2500 --- /dev/null +++ b/k3s/gxy-management/apps/argocd/manifests/base/httproutes.yaml @@ -0,0 +1,75 @@ +--- +apiVersion: traefik.io/v1alpha1 +kind: Middleware +metadata: + name: secure-headers + namespace: argocd +spec: + headers: + customRequestHeaders: + 
X-Forwarded-Proto: "https" + +--- +apiVersion: traefik.io/v1alpha1 +kind: Middleware +metadata: + name: redirect-https + namespace: argocd +spec: + redirectScheme: + scheme: https + permanent: true + +--- +# HTTP to HTTPS redirect +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: http-redirect + namespace: argocd +spec: + parentRefs: + - name: argocd-gateway + namespace: argocd + sectionName: web + hostnames: + - argocd.freecodecamp.net + rules: + - filters: + - type: ExtensionRef + extensionRef: + group: traefik.io + kind: Middleware + name: redirect-https + backendRefs: + - name: argocd-server + port: 80 + +--- +# Main ArgoCD route +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: argocd-route + namespace: argocd +spec: + parentRefs: + - name: argocd-gateway + namespace: argocd + sectionName: websecure + hostnames: + - argocd.freecodecamp.net + rules: + - matches: + - path: + type: PathPrefix + value: / + filters: + - type: ExtensionRef + extensionRef: + group: traefik.io + kind: Middleware + name: secure-headers + backendRefs: + - name: argocd-server + port: 80 diff --git a/k3s/gxy-management/apps/argocd/manifests/base/kustomization.yaml b/k3s/gxy-management/apps/argocd/manifests/base/kustomization.yaml index b77a5d497..5331e33f4 100644 --- a/k3s/gxy-management/apps/argocd/manifests/base/kustomization.yaml +++ b/k3s/gxy-management/apps/argocd/manifests/base/kustomization.yaml @@ -4,6 +4,8 @@ namespace: argocd resources: - namespace.yaml + - gateway.yaml + - httproutes.yaml secretGenerator: - name: argocd-secrets diff --git a/k3s/gxy-management/apps/argocd/manifests/base/secrets/.gitignore b/k3s/gxy-management/apps/argocd/manifests/base/secrets/.gitignore index 9551e2ae8..d05f4f5da 100644 --- a/k3s/gxy-management/apps/argocd/manifests/base/secrets/.gitignore +++ b/k3s/gxy-management/apps/argocd/manifests/base/secrets/.gitignore @@ -1 +1,3 @@ .secrets.env +tls.crt +tls.key diff --git 
a/k3s/gxy-management/apps/argocd/manifests/base/secrets/tls.yaml.sample b/k3s/gxy-management/apps/argocd/manifests/base/secrets/tls.yaml.sample new file mode 100644 index 000000000..9674a94b9 --- /dev/null +++ b/k3s/gxy-management/apps/argocd/manifests/base/secrets/tls.yaml.sample @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Secret +metadata: + name: argocd-tls-cloudflare + namespace: argocd +type: kubernetes.io/tls +data: + # Base64-encoded Cloudflare origin certificate (*.freecodecamp.net) + tls.crt: + # Base64-encoded private key + tls.key: diff --git a/k3s/gxy-management/apps/windmill/manifests/base/gateway.yaml b/k3s/gxy-management/apps/windmill/manifests/base/gateway.yaml new file mode 100644 index 000000000..cb80f00f0 --- /dev/null +++ b/k3s/gxy-management/apps/windmill/manifests/base/gateway.yaml @@ -0,0 +1,26 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: windmill-gateway + namespace: windmill +spec: + gatewayClassName: traefik + listeners: + - name: websecure + protocol: HTTPS + port: 8443 + hostname: windmill.freecodecamp.net + tls: + mode: Terminate + certificateRefs: + - name: windmill-tls-cloudflare + allowedRoutes: + namespaces: + from: Same + - name: web + protocol: HTTP + port: 8000 + hostname: windmill.freecodecamp.net + allowedRoutes: + namespaces: + from: Same diff --git a/k3s/gxy-management/apps/windmill/manifests/base/httproutes.yaml b/k3s/gxy-management/apps/windmill/manifests/base/httproutes.yaml new file mode 100644 index 000000000..30891ff4e --- /dev/null +++ b/k3s/gxy-management/apps/windmill/manifests/base/httproutes.yaml @@ -0,0 +1,75 @@ +--- +apiVersion: traefik.io/v1alpha1 +kind: Middleware +metadata: + name: secure-headers + namespace: windmill +spec: + headers: + customRequestHeaders: + X-Forwarded-Proto: "https" + +--- +apiVersion: traefik.io/v1alpha1 +kind: Middleware +metadata: + name: redirect-https + namespace: windmill +spec: + redirectScheme: + scheme: https + permanent: true + +--- +# HTTP to HTTPS 
redirect +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: http-redirect + namespace: windmill +spec: + parentRefs: + - name: windmill-gateway + namespace: windmill + sectionName: web + hostnames: + - windmill.freecodecamp.net + rules: + - filters: + - type: ExtensionRef + extensionRef: + group: traefik.io + kind: Middleware + name: redirect-https + backendRefs: + - name: windmill-app + port: 8000 + +--- +# Main Windmill route +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: windmill-route + namespace: windmill +spec: + parentRefs: + - name: windmill-gateway + namespace: windmill + sectionName: websecure + hostnames: + - windmill.freecodecamp.net + rules: + - matches: + - path: + type: PathPrefix + value: / + filters: + - type: ExtensionRef + extensionRef: + group: traefik.io + kind: Middleware + name: secure-headers + backendRefs: + - name: windmill-app + port: 8000 diff --git a/k3s/gxy-management/apps/windmill/manifests/base/kustomization.yaml b/k3s/gxy-management/apps/windmill/manifests/base/kustomization.yaml index b5d91fce4..e4ac5cfe4 100644 --- a/k3s/gxy-management/apps/windmill/manifests/base/kustomization.yaml +++ b/k3s/gxy-management/apps/windmill/manifests/base/kustomization.yaml @@ -4,6 +4,8 @@ namespace: windmill resources: - namespace.yaml + - gateway.yaml + - httproutes.yaml secretGenerator: - name: windmill-secrets diff --git a/k3s/gxy-management/apps/windmill/manifests/base/secrets/tls.yaml.sample b/k3s/gxy-management/apps/windmill/manifests/base/secrets/tls.yaml.sample new file mode 100644 index 000000000..bfa5bc863 --- /dev/null +++ b/k3s/gxy-management/apps/windmill/manifests/base/secrets/tls.yaml.sample @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Secret +metadata: + name: windmill-tls-cloudflare + namespace: windmill +type: kubernetes.io/tls +data: + # Base64-encoded Cloudflare origin certificate (*.freecodecamp.net) + tls.crt: + # Base64-encoded private key + tls.key: diff --git 
a/k3s/gxy-management/apps/zot/charts/zot/values.yaml b/k3s/gxy-management/apps/zot/charts/zot/values.yaml index 8d3d4748e..3deda55b8 100644 --- a/k3s/gxy-management/apps/zot/charts/zot/values.yaml +++ b/k3s/gxy-management/apps/zot/charts/zot/values.yaml @@ -1,7 +1,7 @@ # Zot OCI Registry Helm values for gxy-management cluster # Chart: zot (https://github.com/project-zot/helm-charts) # Chart version: 0.1.104, Image: v2.1.15 -# Tailscale-only access (no public ingress) +# Cloudflare Access-gated (platform team + Woodpecker CI) replicaCount: 1 @@ -14,7 +14,7 @@ service: type: ClusterIP port: 5000 -# No public ingress — Tailscale-only access via separate Service manifest +# Routing managed via Gateway API (see manifests/base/gateway.yaml) ingress: enabled: false @@ -27,9 +27,9 @@ configFiles: "rootDirectory": "/var/lib/registry", "storageDriver": { "name": "s3", - "regionendpoint": "https://nyc3.digitaloceanspaces.com", - "region": "nyc3", - "bucket": "CHANGEME", + "regionendpoint": "https://fra1.digitaloceanspaces.com", + "region": "fra1", + "bucket": "net.freecodecamp.universe-registry", "secure": true, "skipverify": false, "forcepathstyle": true @@ -71,7 +71,7 @@ env: name: zot-secrets key: S3_SECRET_KEY -# Longhorn PVC for local cache +# Local PVC for registry cache persistence: true pvc: create: true diff --git a/k3s/gxy-management/apps/zot/manifests/base/gateway.yaml b/k3s/gxy-management/apps/zot/manifests/base/gateway.yaml new file mode 100644 index 000000000..1ddce28ac --- /dev/null +++ b/k3s/gxy-management/apps/zot/manifests/base/gateway.yaml @@ -0,0 +1,26 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: zot-gateway + namespace: zot +spec: + gatewayClassName: traefik + listeners: + - name: websecure + protocol: HTTPS + port: 8443 + hostname: registry.freecodecamp.net + tls: + mode: Terminate + certificateRefs: + - name: zot-tls-cloudflare + allowedRoutes: + namespaces: + from: Same + - name: web + protocol: HTTP + port: 8000 + 
hostname: registry.freecodecamp.net + allowedRoutes: + namespaces: + from: Same diff --git a/k3s/gxy-management/apps/zot/manifests/base/httproutes.yaml b/k3s/gxy-management/apps/zot/manifests/base/httproutes.yaml new file mode 100644 index 000000000..e8f0b32e2 --- /dev/null +++ b/k3s/gxy-management/apps/zot/manifests/base/httproutes.yaml @@ -0,0 +1,75 @@ +--- +apiVersion: traefik.io/v1alpha1 +kind: Middleware +metadata: + name: secure-headers + namespace: zot +spec: + headers: + customRequestHeaders: + X-Forwarded-Proto: "https" + +--- +apiVersion: traefik.io/v1alpha1 +kind: Middleware +metadata: + name: redirect-https + namespace: zot +spec: + redirectScheme: + scheme: https + permanent: true + +--- +# HTTP to HTTPS redirect +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: http-redirect + namespace: zot +spec: + parentRefs: + - name: zot-gateway + namespace: zot + sectionName: web + hostnames: + - registry.freecodecamp.net + rules: + - filters: + - type: ExtensionRef + extensionRef: + group: traefik.io + kind: Middleware + name: redirect-https + backendRefs: + - name: zot + port: 5000 + +--- +# Main Zot route +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: zot-route + namespace: zot +spec: + parentRefs: + - name: zot-gateway + namespace: zot + sectionName: websecure + hostnames: + - registry.freecodecamp.net + rules: + - matches: + - path: + type: PathPrefix + value: / + filters: + - type: ExtensionRef + extensionRef: + group: traefik.io + kind: Middleware + name: secure-headers + backendRefs: + - name: zot + port: 5000 diff --git a/k3s/gxy-management/apps/zot/manifests/base/kustomization.yaml b/k3s/gxy-management/apps/zot/manifests/base/kustomization.yaml index e8b1710ed..f111666cd 100644 --- a/k3s/gxy-management/apps/zot/manifests/base/kustomization.yaml +++ b/k3s/gxy-management/apps/zot/manifests/base/kustomization.yaml @@ -4,6 +4,8 @@ namespace: zot resources: - namespace.yaml + - gateway.yaml + - 
httproutes.yaml secretGenerator: - name: zot-secrets diff --git a/k3s/gxy-management/apps/zot/manifests/base/secrets/.gitignore b/k3s/gxy-management/apps/zot/manifests/base/secrets/.gitignore index 9551e2ae8..d05f4f5da 100644 --- a/k3s/gxy-management/apps/zot/manifests/base/secrets/.gitignore +++ b/k3s/gxy-management/apps/zot/manifests/base/secrets/.gitignore @@ -1 +1,3 @@ .secrets.env +tls.crt +tls.key diff --git a/k3s/gxy-management/apps/zot/manifests/base/secrets/tls.yaml.sample b/k3s/gxy-management/apps/zot/manifests/base/secrets/tls.yaml.sample new file mode 100644 index 000000000..e017111c4 --- /dev/null +++ b/k3s/gxy-management/apps/zot/manifests/base/secrets/tls.yaml.sample @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Secret +metadata: + name: zot-tls-cloudflare + namespace: zot +type: kubernetes.io/tls +data: + # Base64-encoded Cloudflare origin certificate (*.freecodecamp.net) + tls.crt: + # Base64-encoded private key + tls.key: diff --git a/k3s/gxy-management/cluster/security/pss-admission.yaml b/k3s/gxy-management/cluster/security/pss-admission.yaml index 0cc18a71f..9d6baaa3d 100644 --- a/k3s/gxy-management/cluster/security/pss-admission.yaml +++ b/k3s/gxy-management/cluster/security/pss-admission.yaml @@ -3,7 +3,7 @@ # # - baseline: enforced (blocks privileged containers, host networking, hostPath) # - restricted: audit + warn only (logs violations, does not block) -# - System namespaces exempted (Rancher, cert-manager, Longhorn, Tailscale need elevated privileges) +# - System namespaces exempted (Cilium, Windmill workers, and Tailscale need elevated privileges) apiVersion: apiserver.config.k8s.io/v1 kind: AdmissionConfiguration plugins: @@ -21,11 +21,6 @@ plugins: exemptions: namespaces: - kube-system - - cattle-system - - cattle-fleet-system - - cattle-fleet-local-system - - cattle-resources-system - - cattle-provisioning-capi-system - - cert-manager - - longhorn-system + - cilium + - windmill - tailscale diff --git 
a/k3s/gxy-management/cluster/traefik-config.yaml b/k3s/gxy-management/cluster/traefik-config.yaml new file mode 100644 index 000000000..aceab7f77 --- /dev/null +++ b/k3s/gxy-management/cluster/traefik-config.yaml @@ -0,0 +1,55 @@ +# Traefik HelmChartConfig for gxy-management galaxy +# Reference: https://docs.k3s.io/networking/networking-services#customizing-the-traefik-helm-chart +# +# This file is copied to /var/lib/rancher/k3s/server/manifests/traefik-config.yaml +# by Ansible play-k3s--galaxy.yml + +apiVersion: helm.cattle.io/v1 +kind: HelmChartConfig +metadata: + name: traefik + namespace: kube-system +spec: + valuesContent: |- + # Enable Gateway API provider + providers: + kubernetesGateway: + enabled: true + + # Enable Gateway provisioning + gateway: + enabled: true + + # Use LoadBalancer with ServiceLB (Klipper) to bind host ports 80/443 + service: + type: LoadBalancer + + ports: + web: {} + websecure: + tls: + enabled: true + transport: + respondingTimeouts: + readTimeout: "0s" + writeTimeout: "0s" + idleTimeout: "180s" + + # Logging + logs: + general: + level: INFO + access: + enabled: true + + # Security context + securityContext: + capabilities: + drop: [ALL] + add: [NET_BIND_SERVICE] + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 65532 + + podSecurityContext: + fsGroup: 65532 From 2332a1c4ae3965d816b340ce4dbfb42e2da67cff Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Thu, 2 Apr 2026 23:00:30 +0530 Subject: [PATCH 09/40] =?UTF-8?q?feat(k8s):=20add=20kubeconform=20manifest?= =?UTF-8?q?=20validation=20=E2=80=94=20local=20+=20CI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add kubeconform as a K8s manifest schema validator: - justfile: k8s-validate recipe validates all manifests under k3s/ and k8s/ against K8s 1.30.0 schemas + datreeio/CRDs-catalog for CRDs (Gateway API, Traefik, Longhorn, HelmChartConfig) - CI: k8s--validate.yml workflow runs just k8s-validate on push/PR to main, 
installs kubeconform v0.7.0 + just via curl Non-manifest YAML (values.yaml, kustomization.yaml, kubeconfig, PSS admission, audit policy, samples) excluded via filename patterns. --- .github/workflows/k8s--validate.yml | 35 +++++++++++++++++++++++++++++ justfile | 23 +++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 .github/workflows/k8s--validate.yml diff --git a/.github/workflows/k8s--validate.yml b/.github/workflows/k8s--validate.yml new file mode 100644 index 000000000..afeb168a9 --- /dev/null +++ b/.github/workflows/k8s--validate.yml @@ -0,0 +1,35 @@ +name: K8s -- Manifest Validation + +on: + push: + branches: + - main + pull_request: + branches: + - main + workflow_dispatch: + +permissions: + contents: read + +jobs: + validate: + name: K8s -- Manifest Validation + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + + - name: Install just + run: | + curl --proto '=https' --tlsv1.2 -sSf https://just.systems/install.sh | bash -s -- --to /usr/local/bin + just --version + + - name: Install kubeconform + run: | + curl -sL https://github.com/yannh/kubeconform/releases/download/v0.7.0/kubeconform-linux-amd64.tar.gz | tar xz -C /usr/local/bin + kubeconform -v + + - name: Validate K8s manifests + run: just k8s-validate diff --git a/justfile b/justfile index 09e738c37..94d959af8 100644 --- a/justfile +++ b/justfile @@ -2,6 +2,7 @@ set shell := ["bash", "-cu"] ansible_vault := "uv run --project ansible ansible-vault" vault_password := "--vault-password-file <(op read \"op://Service-Automation/Ansible-Vault-Password/Ansible-Vault-Password\")" +crds_schema := 'https://raw.githubusercontent.com/datreeio/CRDs-catalog/main/{{.Group}}/{{.ResourceKind}}_{{.ResourceAPIVersion}}.json' # Show available recipes default: @@ -112,6 +113,28 @@ deploy cluster app: kubectl apply -k apps/{{app}}/manifests/base/ echo "Deployed {{app}} to {{cluster}}" +# Validate K8s manifests with kubeconform 
+[group('k3s')] +k8s-validate: + kubeconform \ + -summary \ + -output text \ + -strict \ + -ignore-missing-schemas \ + -kubernetes-version 1.30.0 \ + -schema-location default \ + -schema-location '{{crds_schema}}' \ + -ignore-filename-pattern 'kustomization\.yaml' \ + -ignore-filename-pattern '\.kubeconfig\.yaml' \ + -ignore-filename-pattern 'values\.yaml' \ + -ignore-filename-pattern 'operator-values\.yaml' \ + -ignore-filename-pattern 'pnpm-lock\.yaml' \ + -ignore-filename-pattern 'pss-admission\.yaml' \ + -ignore-filename-pattern 'audit-policy\.yaml' \ + -ignore-filename-pattern '\.sample' \ + -ignore-filename-pattern 'node_modules' \ + k3s/ k8s/ + # --------------------------------------------------------------------------- # Ansible # --------------------------------------------------------------------------- From 0619242b17cecd8baf8d13e0eb2477bbd472c015 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Thu, 2 Apr 2026 23:04:40 +0530 Subject: [PATCH 10/40] fix(k8s): exclude JSON and dashboards from kubeconform validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add ignore patterns for .json files and dashboards/ directory — Grafana dashboard outputs, package.json, and tsconfig.json are not K8s manifests. 
--- justfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/justfile b/justfile index 94d959af8..74aa1c020 100644 --- a/justfile +++ b/justfile @@ -133,6 +133,8 @@ k8s-validate: -ignore-filename-pattern 'audit-policy\.yaml' \ -ignore-filename-pattern '\.sample' \ -ignore-filename-pattern 'node_modules' \ + -ignore-filename-pattern '\.json' \ + -ignore-filename-pattern 'dashboards/' \ k3s/ k8s/ # --------------------------------------------------------------------------- From 9c902c18b73e3465b8a6e9e9ef6737f87545d199 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Fri, 3 Apr 2026 00:59:49 +0530 Subject: [PATCH 11/40] feat(cloud-init): update config for Ubuntu 24.04 --- cloud-init/basic.yml | 41 +++++++++++++++++++++++++++++++++-------- cloud-init/docker.yml | 32 ++++++++++++++++++++++++++++++-- 2 files changed, 63 insertions(+), 10 deletions(-) diff --git a/cloud-init/basic.yml b/cloud-init/basic.yml index e4d6dc37f..863148259 100644 --- a/cloud-init/basic.yml +++ b/cloud-init/basic.yml @@ -1,26 +1,51 @@ #cloud-config +package_update: true +package_upgrade: true +package_reboot_if_required: true +packages: + - apt-transport-https + - ca-certificates + - curl + - gnupg-agent + - software-properties-common + - fail2ban users: - name: freecodecamp groups: - sudo - - docker shell: /bin/bash sudo: "ALL=(ALL) NOPASSWD:ALL" ssh_import_id: - gh:camperbot - raisedadead +write_files: + - path: /etc/fail2ban/jail.local + content: | + [sshd] + enabled = true + port = ssh + filter = sshd + maxretry = 5 + bantime = 3600 + findtime = 600 + owner: root:root + permissions: "0644" + - path: /etc/ssh/sshd_config.d/99-hardening.conf + content: | + PermitRootLogin no + PasswordAuthentication no + PubkeyAuthentication yes + AllowUsers freecodecamp + owner: root:root + permissions: "0644" runcmd: - # Configure sshd - - | - sed -i -e '/^PermitRootLogin/s/^.*$/PermitRootLogin no/' /etc/ssh/sshd_config - sed -i -e '/^PasswordAuthentication/s/^.*$/PasswordAuthentication no/' 
/etc/ssh/sshd_config - sed -i -e '/^PubkeyAuthentication/s/^.*$/PubkeyAuthentication yes/' /etc/ssh/sshd_config - sed -i -e '$aAllowUsers freecodecamp' /etc/ssh/sshd_config + - systemctl enable fail2ban + - systemctl start fail2ban # :-----------------------: WARNING :-----------------------: # # This next line should be the last command in the list, # because it involves restarting the ssh service. # # :-----------------------: WARNING :-----------------------: - - systemctl restart sshd + - systemctl restart ssh || systemctl restart sshd || true final_message: "Setup complete" diff --git a/cloud-init/docker.yml b/cloud-init/docker.yml index ec3eeec04..6d32ded8a 100644 --- a/cloud-init/docker.yml +++ b/cloud-init/docker.yml @@ -3,13 +3,13 @@ package_update: true package_upgrade: true package_reboot_if_required: true packages: - - unattended-upgrades - apt-transport-https - ca-certificates - curl - gnupg-agent - software-properties-common - lsb-release + - fail2ban users: - name: freecodecamp groups: @@ -20,8 +20,27 @@ users: ssh_import_id: - gh:camperbot - raisedadead +write_files: + - path: /etc/fail2ban/jail.local + content: | + [sshd] + enabled = true + port = ssh + filter = sshd + maxretry = 5 + bantime = 3600 + findtime = 600 + owner: root:root + permissions: "0644" + - path: /etc/ssh/sshd_config.d/99-hardening.conf + content: | + PermitRootLogin no + PasswordAuthentication no + PubkeyAuthentication yes + AllowUsers freecodecamp + owner: root:root + permissions: "0644" runcmd: - # This will install docker on the virtual machine and add the freeCodeCamp user to docker usergroup - sudo mkdir -p /etc/apt/keyrings - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg @@ -35,4 +54,13 @@ runcmd: - sudo systemctl start docker - sudo systemctl enable docker - sudo usermod -aG docker freecodecamp + - systemctl enable fail2ban + - systemctl start fail2ban + # :-----------------------: WARNING 
:-----------------------: + # + # This next line should be the last command in the list, + # because it involves restarting the ssh service. + # + # :-----------------------: WARNING :-----------------------: + - systemctl restart ssh || systemctl restart sshd || true final_message: "Setup complete" From 6ac1504f98f5007c0eeb8bddefe661ba9832c1d4 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Sat, 4 Apr 2026 13:03:13 +0530 Subject: [PATCH 12/40] refactor: migrate secrets from ansible-vault to sops+age Move all encrypted secrets and samples to a private infra-secrets sibling repo. Replace ansible-vault with sops+age encryption. - Add use_sops function to root .envrc for transparent decryption - Update cluster .envrc files to load team-specific DO tokens - Replace justfile ansible-vault recipes with sops equivalents - Update deploy recipe to use sops -d - Add galaxy-play recipe for k3s playbook with sops decrypt - Remove secrets/ directory (migrated to private repo) - Update ansible.cfg and galaxy playbook references --- .envrc | 19 +++++++ .gitignore | 5 +- ansible/ansible.cfg | 4 +- ansible/play-k3s--galaxy.yml | 10 +--- justfile | 98 ++++++++++----------------------- k3s/gxy-management/.envrc | 5 ++ k3s/ops-backoffice-tools/.envrc | 5 ++ secrets/.gitignore | 6 -- secrets/README.md | 65 ---------------------- secrets/ansible/.env.sample | 4 -- secrets/appsmith/.env.sample | 25 --------- secrets/argocd/.env.sample | 9 --- secrets/do-primary/.env.sample | 1 - secrets/do-universe/.env.sample | 3 - secrets/global/.env.sample | 2 - secrets/outline/.env.sample | 38 ------------- secrets/windmill/.env.sample | 17 ------ secrets/zot/.env.sample | 17 ------ 18 files changed, 64 insertions(+), 269 deletions(-) delete mode 100644 secrets/.gitignore delete mode 100644 secrets/README.md delete mode 100644 secrets/ansible/.env.sample delete mode 100644 secrets/appsmith/.env.sample delete mode 100644 secrets/argocd/.env.sample delete mode 100644 
secrets/do-primary/.env.sample delete mode 100644 secrets/do-universe/.env.sample delete mode 100644 secrets/global/.env.sample delete mode 100644 secrets/outline/.env.sample delete mode 100644 secrets/windmill/.env.sample delete mode 100644 secrets/zot/.env.sample diff --git a/.envrc b/.envrc index 9efcf7736..9db352fed 100644 --- a/.envrc +++ b/.envrc @@ -1,3 +1,22 @@ +SECRETS_DIR="${SECRETS_DIR:-$(expand_path ../infra-secrets)}" + +use_sops() { + local path="$1" + local type="${2:-dotenv}" + if [ -f "$path" ]; then + eval "$(sops -d --input-type "$type" --output-type "$type" "$path" 2>/dev/null \ + | direnv dotenv bash /dev/stdin)" + watch_file "$path" + fi +} + +if [ -d "$SECRETS_DIR" ]; then + use_sops "$SECRETS_DIR/global/.env.enc" +else + log_error "infra-secrets repo not found at $SECRETS_DIR" + log_error "Clone it: git clone git@github.com:freeCodeCamp/infra-secrets.git ../infra-secrets" +fi + dotenv_if_exists .env if [ -d ansible/.venv ]; then diff --git a/.gitignore b/.gitignore index 3ee927145..3aac1057a 100644 --- a/.gitignore +++ b/.gitignore @@ -54,10 +54,7 @@ ansible/inventory/hosts !.envrc !k3s/**/.envrc -# Allow secrets/ samples and docs (NOT encrypted .env files) -!secrets/**/.env.sample -!secrets/.gitignore -!secrets/README.md +# Secrets live in the private infra-secrets repo (sibling clone) diff --git a/ansible/ansible.cfg b/ansible/ansible.cfg index e16c12d56..d28e0756a 100644 --- a/ansible/ansible.cfg +++ b/ansible/ansible.cfg @@ -8,8 +8,8 @@ inventory = ./inventory home = ./.ansible collections_path = ./.ansible/collections:./roles roles_path = ./.ansible/roles:./roles -# Vault password: pass via CLI flag when needed -# ansible-playbook ... 
--vault-password-file <(op read "op://Service-Automation/Ansible-Vault-Password/Ansible-Vault-Password") +# Secrets managed via sops+age in the infra-secrets private repo +# Env vars loaded via direnv; vault vars via community.sops collection when needed [inventory] enable_plugins = yaml, ini, toml, community.general.linode, community.digitalocean.digitalocean diff --git a/ansible/play-k3s--galaxy.yml b/ansible/play-k3s--galaxy.yml index 7ada06914..f6a36f8c7 100644 --- a/ansible/play-k3s--galaxy.yml +++ b/ansible/play-k3s--galaxy.yml @@ -8,16 +8,12 @@ # Prerequisites (manual, one-time per galaxy): # - 3x Ubuntu VMs on DigitalOcean with VPC attached (eth1) # - Tailscale installed and connected on all nodes (play-tailscale--*.yml) -# - Ansible Vault password in 1Password -# - Vault secrets populated in vars/vault-k3s.yml (encrypted) +# - Vault secrets decrypted: sops -d $SECRETS_DIR/ansible/vault-k3s.yaml.enc > vars/vault-k3s.yml # - Cluster config directory: k3s//cluster/ # # Usage: # cd ansible/ # direnv loads .env + activates venv -# ansible-playbook -i inventory/digitalocean.yml play-k3s--galaxy.yml \ -# -e variable_host=gxy_mgmt_k3s \ -# -e galaxy_name=gxy-management \ -# --vault-password-file <(op read "op://Service-Automation/Ansible-Vault-Password/Ansible-Vault-Password") +# just galaxy-play gxy-management gxy_mgmt_k3s # # What this playbook does (6 plays): # 1. Validate prerequisites (VPC, Tailscale, vault secrets) @@ -45,7 +41,7 @@ - vault_do_spaces_access_key | length > 0 - vault_do_spaces_secret_key is defined - vault_do_spaces_secret_key | length > 0 - fail_msg: "Vault secrets missing. Run: ansible-vault edit vars/vault-k3s.yml" + fail_msg: "Vault secrets missing. 
Run: sops -d $SECRETS_DIR/ansible/vault-k3s.yaml.enc > vars/vault-k3s.yml" - name: Validate VPC interface exists (eth1) assert: diff --git a/justfile b/justfile index 74aa1c020..470daa964 100644 --- a/justfile +++ b/justfile @@ -1,7 +1,7 @@ set shell := ["bash", "-cu"] -ansible_vault := "uv run --project ansible ansible-vault" -vault_password := "--vault-password-file <(op read \"op://Service-Automation/Ansible-Vault-Password/Ansible-Vault-Password\")" +secrets_dir := env("SECRETS_DIR", justfile_directory() + "/../infra-secrets") +sops_config := secrets_dir + "/.sops.yaml" crds_schema := 'https://raw.githubusercontent.com/datreeio/CRDs-catalog/main/{{.Group}}/{{.ResourceKind}}_{{.ResourceAPIVersion}}.json' # Show available recipes @@ -9,82 +9,31 @@ default: @just --list # --------------------------------------------------------------------------- -# Secrets +# Secrets (sops + age — stored in infra-secrets private repo) # --------------------------------------------------------------------------- -# Bootstrap root .env (global tokens only — Cloudflare, Linode) -[group('secrets')] -secret-bootstrap: - #!/usr/bin/env bash - set -eu - SRC="secrets/global/.env" - [ -f "$SRC" ] || { echo "Error: $SRC not found. Get it from 1Password."; exit 1; } - {{ansible_vault}} decrypt --output .env {{vault_password}} "$SRC" - echo "Bootstrapped .env (global tokens)" - echo "Run: direnv allow" - -# Bootstrap a cluster .env (DO_API_TOKEN + KUBECONFIG) -[group('secrets')] -secret-bootstrap-cluster cluster team: - #!/usr/bin/env bash - set -eu - SRC="secrets/do-{{team}}/.env" - DEST="k3s/{{cluster}}/.env" - [ -f "$SRC" ] || { echo "Error: $SRC not found. 
Get it from 1Password."; exit 1; } - {{ansible_vault}} decrypt --output - {{vault_password}} "$SRC" > "$DEST" - echo "KUBECONFIG=.kubeconfig.yaml" >> "$DEST" - echo "Bootstrapped $DEST (team: {{team}})" - echo "Run: cd k3s/{{cluster}} && direnv allow" - -# Encrypt a secret -[group('secrets')] -secret-encrypt name: - {{ansible_vault}} encrypt {{vault_password}} secrets/{{name}}/.env - -# Decrypt a secret to stdout -[group('secrets')] -secret-decrypt name: - {{ansible_vault}} decrypt --output - {{vault_password}} secrets/{{name}}/.env - -# Decrypt a secret to a file -[group('secrets')] -secret-decrypt-to name dest: - {{ansible_vault}} decrypt --output {{dest}} {{vault_password}} secrets/{{name}}/.env - # View a secret [group('secrets')] secret-view name: - {{ansible_vault}} view {{vault_password}} secrets/{{name}}/.env + sops -d --input-type dotenv --output-type dotenv "{{secrets_dir}}/{{name}}/.env.enc" # Edit a secret [group('secrets')] secret-edit name: - {{ansible_vault}} edit {{vault_password}} secrets/{{name}}/.env - -# Encrypt all unencrypted .env files in secrets/ -[group('secrets')] -secret-encrypt-all: - #!/usr/bin/env bash - set -eu - for f in secrets/*/.env; do - [ -f "$f" ] || continue - if ! 
head -1 "$f" | grep -q '^\$ANSIBLE_VAULT'; then - echo "Encrypting $f" - {{ansible_vault}} encrypt {{vault_password}} "$f" - else - echo "Already encrypted: $f" - fi - done + sops "{{secrets_dir}}/{{name}}/.env.enc" # Verify all encrypted secrets are readable [group('secrets')] secret-verify-all: #!/usr/bin/env bash set -eu - for f in secrets/*/.env; do - [ -f "$f" ] || continue + for f in $(find "{{secrets_dir}}" -name '*.enc' -type f | sort); do echo -n "$f: " - {{ansible_vault}} view {{vault_password}} "$f" > /dev/null 2>&1 && echo "OK" || echo "FAILED" + case "$f" in + *.env.enc) sops -d --input-type dotenv --output-type dotenv "$f" > /dev/null 2>&1 ;; + *.yaml.enc|*.yml.enc) sops -d --input-type yaml --output-type yaml "$f" > /dev/null 2>&1 ;; + *) sops -d "$f" > /dev/null 2>&1 ;; + esac && echo "OK" || echo "FAILED" done # --------------------------------------------------------------------------- @@ -96,16 +45,12 @@ secret-verify-all: deploy cluster app: #!/usr/bin/env bash set -eu - SECRETS_SRC="secrets/{{app}}/.env" + SECRETS_SRC="{{secrets_dir}}/k3s/{{cluster}}/{{app}}.secrets.env.enc" SECRETS_DST="k3s/{{cluster}}/apps/{{app}}/manifests/base/secrets/.secrets.env" - if [ ! 
-f "$SECRETS_SRC" ]; then - echo "Error: $SECRETS_SRC not found" - echo "Create it: cp secrets/{{app}}/.env.sample secrets/{{app}}/.env && just secret-encrypt {{app}}" - exit 1 - fi + [ -f "$SECRETS_SRC" ] || { echo "Error: $SECRETS_SRC not found"; exit 1; } - {{ansible_vault}} decrypt --output "$SECRETS_DST" {{vault_password}} "$SECRETS_SRC" + sops -d --input-type dotenv --output-type dotenv "$SECRETS_SRC" > "$SECRETS_DST" trap 'rm -f "$SECRETS_DST"' EXIT cd k3s/{{cluster}} @@ -141,6 +86,21 @@ k8s-validate: # Ansible # --------------------------------------------------------------------------- +# Run galaxy playbook (decrypt vault → run → clean up) +[group('ansible')] +galaxy-play galaxy_name host: + #!/usr/bin/env bash + set -eu + VAULT_SRC="{{secrets_dir}}/ansible/vault-k3s.yaml.enc" + VAULT_DST="ansible/vars/vault-k3s.yml" + [ -f "$VAULT_SRC" ] || { echo "Error: $VAULT_SRC not found"; exit 1; } + sops -d --input-type yaml --output-type yaml "$VAULT_SRC" > "$VAULT_DST" + trap 'rm -f "$VAULT_DST"' EXIT + cd ansible + uv run ansible-playbook -i inventory/digitalocean.yml play-k3s--galaxy.yml \ + -e variable_host={{host}} \ + -e galaxy_name={{galaxy_name}} + # Install ansible and dependencies [group('ansible')] ansible-install: diff --git a/k3s/gxy-management/.envrc b/k3s/gxy-management/.envrc index f3b9e1705..08bb22dfc 100644 --- a/k3s/gxy-management/.envrc +++ b/k3s/gxy-management/.envrc @@ -1,2 +1,7 @@ source_env ../../.envrc + +if [ -d "$SECRETS_DIR" ]; then + use_sops "$SECRETS_DIR/do-universe/.env.enc" +fi + dotenv_if_exists .env diff --git a/k3s/ops-backoffice-tools/.envrc b/k3s/ops-backoffice-tools/.envrc index f3b9e1705..bafa883b9 100644 --- a/k3s/ops-backoffice-tools/.envrc +++ b/k3s/ops-backoffice-tools/.envrc @@ -1,2 +1,7 @@ source_env ../../.envrc + +if [ -d "$SECRETS_DIR" ]; then + use_sops "$SECRETS_DIR/do-primary/.env.enc" +fi + dotenv_if_exists .env diff --git a/secrets/.gitignore b/secrets/.gitignore deleted file mode 100644 index 
7ba0a59d5..000000000 --- a/secrets/.gitignore +++ /dev/null @@ -1,6 +0,0 @@ -# Ignore encrypted .env files (not tracked, shared via 1Password) -*/.env - -# Track samples and docs -!*/.env.sample -!README.md diff --git a/secrets/README.md b/secrets/README.md deleted file mode 100644 index 8d30f00ea..000000000 --- a/secrets/README.md +++ /dev/null @@ -1,65 +0,0 @@ -# Secrets - -All secrets in one place. Encrypted with `ansible-vault`. One password unlocks everything. - -## Layout - -``` -secrets// - .env # ansible-vault encrypted (git tracked) - .env.sample # plaintext template (git tracked) -``` - -## Setup - -Get the vault password from a team member, then: - -```bash -echo 'your-vault-password' > ~/.ansible-vault-password -chmod 600 ~/.ansible-vault-password -``` - -Or use 1Password: - -```bash ---vault-password-file <(op read "op://Service-Automation/Ansible-Vault-Password/Ansible-Vault-Password") -``` - -## Commands - -```bash -# View -ansible-vault view secrets//.env - -# Edit -ansible-vault edit secrets//.env - -# Create new -cp secrets//.env.sample secrets//.env -# fill in values -ansible-vault encrypt secrets//.env - -# Decrypt to stdout -ansible-vault decrypt --output - secrets//.env -``` - -## Deploy a K8s app - -```bash -cd k3s -just deploy -``` - -## Directories - -| Directory | Purpose | -| -------------- | ----------------------------------------------- | -| `global/` | Shared tokens (Cloudflare, Linode) | -| `do-primary/` | Primary DO team API token | -| `do-universe/` | Universe DO team API token + Spaces credentials | -| `ansible/` | Playbook runtime secrets (S3, Tailscale OAuth) | -| `appsmith/` | Appsmith app secrets | -| `outline/` | Outline app secrets | -| `windmill/` | Windmill app secrets | -| `argocd/` | ArgoCD app secrets | -| `zot/` | Zot registry secrets (S3, htpasswd) | diff --git a/secrets/ansible/.env.sample b/secrets/ansible/.env.sample deleted file mode 100644 index 5111e620e..000000000 --- a/secrets/ansible/.env.sample +++ /dev/null 
@@ -1,4 +0,0 @@ -vault_do_spaces_access_key= -vault_do_spaces_secret_key= -vault_tailscale_oauth_client_id= -vault_tailscale_oauth_client_secret= diff --git a/secrets/appsmith/.env.sample b/secrets/appsmith/.env.sample deleted file mode 100644 index ba985af17..000000000 --- a/secrets/appsmith/.env.sample +++ /dev/null @@ -1,25 +0,0 @@ -# Appsmith secrets for appsmith.freecodecamp.net -# All values must be unquoted - -# ============================================================================= -# REQUIRED - Core -# ============================================================================= -# Generate with: openssl rand -hex 32 -APPSMITH_ENCRYPTION_PASSWORD= -APPSMITH_ENCRYPTION_SALT= -APPSMITH_SUPERVISOR_PASSWORD= - -# MongoDB Atlas connection string (unquoted) -APPSMITH_DB_URL=mongodb+srv://USER:PASS@xxx.yyy.mongodb.net/appsmith?retryWrites=true&w=majority&authSource=admin - -# ============================================================================= -# Optional -# ============================================================================= -APPSMITH_DISABLE_TELEMETRY=true -APPSMITH_DISABLE_INTERCOM=true - -# ============================================================================= -# REQUIRED - Google OAuth SSO -# ============================================================================= -# Appsmith doesn't support Google OAuth SSO configs from this file. Please configure -# them in the Appsmith UI. 
diff --git a/secrets/argocd/.env.sample b/secrets/argocd/.env.sample deleted file mode 100644 index 349b0a343..000000000 --- a/secrets/argocd/.env.sample +++ /dev/null @@ -1,9 +0,0 @@ -# ArgoCD secrets for ops-k3s-gxy-mgmt-argocd.batfish-ray.ts.net -# All values must be unquoted - -# ============================================================================= -# REQUIRED - Admin Password -# ============================================================================= -# Generate bcrypt hash with: argocd account bcrypt --password -# Or use: htpasswd -nbBC 10 "" | tr -d ':\n' | sed 's/$2y/$2a/' -ARGOCD_ADMIN_PASSWORD= diff --git a/secrets/do-primary/.env.sample b/secrets/do-primary/.env.sample deleted file mode 100644 index 92c79eb93..000000000 --- a/secrets/do-primary/.env.sample +++ /dev/null @@ -1 +0,0 @@ -DO_API_TOKEN= diff --git a/secrets/do-universe/.env.sample b/secrets/do-universe/.env.sample deleted file mode 100644 index 2e44f405b..000000000 --- a/secrets/do-universe/.env.sample +++ /dev/null @@ -1,3 +0,0 @@ -DO_API_TOKEN= -DO_SPACES_ACCESS_KEY= -DO_SPACES_SECRET_KEY= diff --git a/secrets/global/.env.sample b/secrets/global/.env.sample deleted file mode 100644 index 7ef0883b8..000000000 --- a/secrets/global/.env.sample +++ /dev/null @@ -1,2 +0,0 @@ -CLOUDFLARE_API_TOKEN= -LINODE_API_TOKEN= diff --git a/secrets/outline/.env.sample b/secrets/outline/.env.sample deleted file mode 100644 index 352e77d79..000000000 --- a/secrets/outline/.env.sample +++ /dev/null @@ -1,38 +0,0 @@ -# Outline secrets for outline.freecodecamp.net -# All values must be unquoted -# Generate secrets with: openssl rand -hex 32 - -# ============================================================================= -# REQUIRED - Application Secrets -# ============================================================================= -SECRET_KEY= -UTILS_SECRET= - -# ============================================================================= -# REQUIRED - Database (internal, don't change unless 
you know what you're doing) -# ============================================================================= -POSTGRES_USER=outline -POSTGRES_PASSWORD= -POSTGRES_DB=outline - -# ============================================================================= -# REQUIRED - Google OAuth -# ============================================================================= -# Create at: https://console.cloud.google.com/apis/credentials -# Authorized redirect URI: https://outline.freecodecamp.net/auth/google.callback -GOOGLE_CLIENT_ID= -GOOGLE_CLIENT_SECRET= - -# ============================================================================= -# OPTIONAL - GitHub Integration (PR/Issue previews) -# ============================================================================= -# Create GitHub App at: https://github.com/settings/apps -# Callback URL: https://outline.freecodecamp.net/api/github.callback -# Enable "Request user authorization (OAuth) during installation" -# Disable webhooks (not needed for previews) -# Base64 encode private key: openssl base64 -in private-key.pem -out key-b64.txt -GITHUB_CLIENT_ID= -GITHUB_CLIENT_SECRET= -GITHUB_APP_NAME= -GITHUB_APP_ID= -GITHUB_APP_PRIVATE_KEY= diff --git a/secrets/windmill/.env.sample b/secrets/windmill/.env.sample deleted file mode 100644 index 8479435c8..000000000 --- a/secrets/windmill/.env.sample +++ /dev/null @@ -1,17 +0,0 @@ -# Windmill secrets for windmill.freecodecamp.net -# All values must be unquoted - -# ============================================================================= -# REQUIRED - Database -# ============================================================================= -# PostgreSQL connection string for Windmill -# For embedded PostgreSQL, match the credentials in the Helm values -# For external PostgreSQL, use the managed database connection string -DATABASE_URL=postgres://postgres:windmill@windmill-postgresql/windmill?sslmode=disable - -# 
============================================================================= -# REQUIRED - Admin -# ============================================================================= -# Initial admin email and password (used on first setup) -WINDMILL_ADMIN_EMAIL=admin@freecodecamp.org -WINDMILL_ADMIN_PASSWORD= diff --git a/secrets/zot/.env.sample b/secrets/zot/.env.sample deleted file mode 100644 index 934b6c728..000000000 --- a/secrets/zot/.env.sample +++ /dev/null @@ -1,17 +0,0 @@ -# Zot OCI Registry secrets for ops-k3s-gxy-mgmt-registry.batfish-ray.ts.net -# All values must be unquoted - -# ============================================================================= -# REQUIRED - S3 Storage (DigitalOcean Spaces) -# ============================================================================= -# DO Spaces access credentials -# Create at: https://cloud.digitalocean.com/account/api/spaces -S3_ACCESS_KEY= -S3_SECRET_KEY= - -# ============================================================================= -# REQUIRED - Registry Authentication -# ============================================================================= -# htpasswd entries for registry auth (one per line, newline-separated) -# Generate with: htpasswd -nbBC 10 -HTPASSWD= From ab0f8002462f0a350462301a917abc78827b8ae5 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Sat, 4 Apr 2026 13:55:54 +0530 Subject: [PATCH 13/40] chore: add tailscale justfile recipes and update gxy-management README - Add tailscale-install and tailscale-up convenience recipes - Update README to reference just recipes instead of raw ansible commands - Remove stale ansible-vault references from deployment runbook - Add --create-namespace to Helm install commands --- justfile | 12 +++++++++++ k3s/gxy-management/README.md | 42 ++++++++++++++++++++++-------------- 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/justfile b/justfile index 470daa964..4b46d50b2 100644 --- a/justfile +++ b/justfile @@ -101,6 +101,18 @@ 
galaxy-play galaxy_name host: -e variable_host={{host}} \ -e galaxy_name={{galaxy_name}} +# Install Tailscale on hosts +[group('ansible')] +tailscale-install host inventory="digitalocean.yml": + cd ansible && uv run ansible-playbook -i inventory/{{inventory}} play-tailscale--0-install.yml \ + -e variable_host={{host}} + +# Connect hosts to Tailscale network (with SSH) +[group('ansible')] +tailscale-up host inventory="digitalocean.yml": + cd ansible && uv run ansible-playbook -i inventory/{{inventory}} play-tailscale--1b-up-with-ssh.yml \ + -e variable_host={{host}} + # Install ansible and dependencies [group('ansible')] ansible-install: diff --git a/k3s/gxy-management/README.md b/k3s/gxy-management/README.md index 899a5070b..67eb4bd62 100644 --- a/k3s/gxy-management/README.md +++ b/k3s/gxy-management/README.md @@ -22,18 +22,14 @@ First Universe galaxy. Control plane brain — manages all galaxies. ## Quick Access ```bash -cd k3s/gxy-management && export KUBECONFIG=$(pwd)/.kubeconfig.yaml +cd k3s/gxy-management # direnv loads KUBECONFIG + DO_API_TOKEN kubectl get nodes ``` ## Deploy ```bash -cd ansible -uv run ansible-playbook -i inventory/digitalocean.yml play-k3s--galaxy.yml \ - -e variable_host=gxy_mgmt_k3s \ - -e galaxy_name=gxy-management \ - --vault-password-file <(op read "op://Service-Automation/Ansible-Vault-Password/Ansible-Vault-Password") +just galaxy-play gxy-management gxy_mgmt_k3s ``` ## Deployment Runbook @@ -43,29 +39,43 @@ uv run ansible-playbook -i inventory/digitalocean.yml play-k3s--galaxy.yml \ 1. Create 3x DO droplets (s-8vcpu-16gb) in FRA1 -- attach to VPC, configure firewall (80, 443, 6443 from VPC, 22 from Tailscale) 2. Create DO Spaces bucket `net.freecodecamp.universe-backups` in FRA1 (etcd snapshots) 3. Create DO Spaces bucket `net.freecodecamp.universe-registry` in FRA1 (Zot images) -4. Install Tailscale on all 3 nodes +4. Install Tailscale: `just tailscale-install gxy_mgmt_k3s` then `just tailscale-up gxy_mgmt_k3s` 5. 
Create Cloudflare origin certificate for `*.freecodecamp.net` (15-year, RSA) -6. Populate ansible-vault secrets (`vars/vault-k3s.yml`) -7. Populate app secrets (decrypt samples, fill values, encrypt) +6. Populate app secrets in infra-secrets repo (see samples in each app directory) + +### K3s Bootstrap + +```bash +just galaxy-play gxy-management gxy_mgmt_k3s +``` + +Deploys k3s HA cluster with Cilium CNI, Traefik ingress, etcd S3 backups, and fetches kubeconfig. ### Helm Installations -After playbook completes, before app deploy: +After playbook completes: ```bash -helm install argocd argo-cd --repo https://argoproj.github.io/argo-helm -n argocd -f charts/argo-cd/values.yaml -helm install windmill windmill --repo https://windmill-labs.github.io/windmill-helm-charts/ -n windmill -f charts/windmill/values.yaml -helm install zot zot --repo https://zotregistry.dev/helm-charts/ -n zot -f charts/zot/values.yaml +cd k3s/gxy-management +helm install argocd argo-cd --repo https://argoproj.github.io/argo-helm -n argocd --create-namespace -f apps/argocd/charts/argo-cd/values.yaml +helm install windmill windmill --repo https://windmill-labs.github.io/windmill-helm-charts/ -n windmill --create-namespace -f apps/windmill/charts/windmill/values.yaml +helm install zot zot --repo https://zotregistry.dev/helm-charts/ -n zot --create-namespace -f apps/zot/charts/zot/values.yaml ``` -**IMPORTANT: Helm release names must be exactly `argocd`, `windmill`, `zot`** -- the Gateway API HTTPRoute resources reference service names derived from these release names. +Release names MUST be exactly `argocd`, `windmill`, `zot` -- HTTPRoute manifests reference service names derived from these. + +### App Secrets and Manifests + +```bash +just deploy gxy-management argocd +just deploy gxy-management windmill +just deploy gxy-management zot +``` ### Post-deployment (ClickOps) 1. Create DNS A records (proxied) for windmill/argocd/registry.freecodecamp.net pointing to all 3 node public IPs 2. 
Create Cloudflare Access policies for each service -3. Apply TLS secrets: `kubectl create secret tls -tls-cloudflare --cert=tls.crt --key=tls.key -n ` -4. Apply kustomize manifests: `kubectl apply -k apps//manifests/base/ -n ` ### Smoke Tests From 820068148d6f77ab080edae97b624bb152407493 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Sat, 4 Apr 2026 14:25:30 +0530 Subject: [PATCH 14/40] chore: add spike status doc, kubeconfig-sync recipe, fix deploy for TLS - Add SPIKE-STATUS.md with full research, decisions, progress, and next steps - Add kubeconfig-sync recipe to decrypt kubeconfig from infra-secrets - Update deploy recipe to handle TLS certs alongside app secrets - Add DO_API_TOKEN guard to galaxy-play recipe --- SPIKE-STATUS.md | 244 ++++++++++++++++++++++++++++++++++++++++++++++++ justfile | 49 ++++++++-- 2 files changed, 283 insertions(+), 10 deletions(-) create mode 100644 SPIKE-STATUS.md diff --git a/SPIKE-STATUS.md b/SPIKE-STATUS.md new file mode 100644 index 000000000..b44b5f0e4 --- /dev/null +++ b/SPIKE-STATUS.md @@ -0,0 +1,244 @@ +# Universe gxy-management Spike Status + +Status as of 2026-04-04. This document captures all research, decisions, and progress for the first Universe galaxy cluster deployment. + +## Spike Goal + +Deploy gxy-management — the first Universe galaxy cluster — on DigitalOcean FRA1 with Cilium CNI, Traefik ingress, and three core services (Windmill, ArgoCD, Zot). + +Day 0 deliverable: Windmill accessible to all staff at windmill.freecodecamp.net. 
+ +## Architecture Decisions (from Universe ADRs) + +| ADR | Decision | Impact on This Spike | +| ----------------------------- | ---------------------------------------------- | -------------------------------------------------- | +| 001 - Infrastructure Topology | K3s, 4 galaxies planned | gxy-management is first, 3-node HA | +| 002 - IaC Tooling | OpenTofu + Ansible | Using Ansible for bootstrap, TF migration separate | +| 005 - GitOps | ArgoCD multi-cluster | Installed on gxy-management, manages all galaxies | +| 008 - Data Storage | Rook-Ceph (later), local-path Day 0 | No Longhorn, K3s default storage | +| 009 - Networking | Cilium CNI, Cloudflare TLS, Tailscale SSH only | No cert-manager, origin certs from CF | +| 010 - Secrets | SOPS + age Phase 1, OpenBao Phase 2 | SOPS + age implemented in private repo | +| 011 - Security | Pin by SHA, PSS, audit logging | PSS + audit policy in cluster config | +| 015 - Observability | VictoriaMetrics + ClickHouse + HyperDX | Not in this spike, future galaxy | + +## Infrastructure Provisioned + +### DigitalOcean (Universe Account, FRA1) + +| Resource | Details | Status | +| ------------- | --------------------------------------------------------------- | ------ | +| VPC | `gxy-management-vpc`, 10.110.0.0/20, FRA1 | Done | +| Firewall | `universe-firewall` (80, 443, 6443 from VPC, 22 from Tailscale) | Done | +| Droplet 1 | `gxy-vm-mgmt-k3s-1`, s-8vcpu-16gb, 104.248.36.250 | Done | +| Droplet 2 | `gxy-vm-mgmt-k3s-2`, s-8vcpu-16gb, 134.122.69.214 | Done | +| Droplet 3 | `gxy-vm-mgmt-k3s-3`, s-8vcpu-16gb, 104.248.40.237 | Done | +| Spaces bucket | `net.freecodecamp.universe-backups` (etcd snapshots) | Done | +| Spaces bucket | `net.freecodecamp.universe-registry` (Zot images) | Done | + +Tag: `gxy-mgmt-k3s` → Ansible inventory group: `gxy_mgmt_k3s` + +### Cloud-init + +All droplets use `cloud-init/basic.yml` which provides: + +- Package updates/upgrades +- fail2ban (5 retries, 3600s ban) +- SSH hardening via 
`/etc/ssh/sshd_config.d/99-hardening.conf` (no root login, no passwords, pubkey only) +- `freecodecamp` user with sudo NOPASSWD + GitHub SSH key import +- Uses `ssh.service` (Ubuntu 24.04 naming, with `sshd` fallback) + +### Cluster Specs + +| Setting | Value | +| ------------ | -------------------------------------------------------- | +| K3s version | v1.34.5+k3s1 | +| CNI | Cilium (eBPF, Hubble enabled, kube-proxy replacement) | +| Pod CIDR | 10.1.0.0/16 | +| Service CIDR | 10.11.0.0/16 | +| Ingress | Traefik via ServiceLB (Klipper), ports 80/443 | +| Storage | local-path (K3s default) | +| etcd backups | Every 6h → DO Spaces (net.freecodecamp.universe-backups) | +| Security | Secrets encryption, PSS admission, audit logging | + +## Secrets Architecture + +### What Changed + +Migrated from ansible-vault (single shared password, whole-file encryption in public repo) to sops+age (per-person keys, value-level encryption in private repo). + +Commit: `6ac1504 refactor: migrate secrets from ansible-vault to sops+age` + +### How It Works + +``` +infra-secrets (private repo) infra (public repo) +───────────────────────── ────────────────────── + +global/.env.enc ──── direnv ───────────→ env: LINODE_API_TOKEN, TAILSCALE_AUTH_KEY, + HCP_CLIENT_ID, CLOUDFLARE_*, GRAFANA_* + +do-primary/.env.enc ── direnv ─────────→ env: DO_API_TOKEN (ops-backoffice-tools) +do-universe/.env.enc ── direnv ────────→ env: DO_API_TOKEN (gxy-management) + +ansible/vault-k3s.yaml.enc + └── just galaxy-play ── sops -d ────→ ansible/vars/vault-k3s.yml (temp, deleted after) + +k3s//kubeconfig.yaml.enc + └── just kubeconfig-sync ── sops -d → k3s//.kubeconfig.yaml (persists) + +k3s//.secrets.env.enc +k3s//.tls.crt.enc k3s//apps//.../secrets/ +k3s//.tls.key.enc ├── .secrets.env (temp) + └── just deploy ── sops -d ─────────→ ├── tls.crt (temp) + └── tls.key (temp) + (all deleted after kubectl apply) +``` + +### direnv Hierarchy + +| Directory | What Loads | +| --------------------------- | 
----------------------------------------------------------- | +| `infra/` (root) | Global tokens (Linode, Tailscale, HCP, Cloudflare, Grafana) | +| `k3s/gxy-management/` | Above + DO_API_TOKEN (universe account) + KUBECONFIG | +| `k3s/ops-backoffice-tools/` | Above + DO_API_TOKEN (primary account) + KUBECONFIG | + +### Key Files + +- `infra/.envrc` — defines `use_sops()` function, loads global tokens, adds ansible venv to PATH +- `infra/k3s//.envrc` — sources parent, loads cluster-specific DO token, sets KUBECONFIG +- `infra-secrets/.sops.yaml` — creation rules with age public keys +- `~/.config/sops/age/keys.txt` — your age private key (from your password manager) + +### infra-secrets File Inventory + +``` +22 encrypted files (.enc) +16 sample files (.sample) + +global/.env.enc — Linode, Tailscale, HCP, Cloudflare, Grafana Cloud tokens +do-primary/.env.enc — Primary DO team API token +do-universe/.env.enc — Universe DO team API token +ansible/vault-k3s.yaml.enc — DO Spaces creds, Tailscale OAuth (YAML format) +appsmith/.env.enc — Appsmith app secrets +outline/.env.enc — Outline app secrets + +k3s/ops-backoffice-tools/ + kubeconfig.yaml.enc — Cluster kubeconfig + appsmith.secrets.env.enc — Appsmith deployed secrets + appsmith.tls.crt.enc — Appsmith Cloudflare origin cert + appsmith.tls.key.enc — Appsmith origin private key + outline.secrets.env.enc — Outline deployed secrets + outline.tls.crt.enc — Outline Cloudflare origin cert + outline.tls.key.enc — Outline origin private key + +k8s/o11y/ + kubeconfig.yaml.enc — o11y cluster kubeconfig + o11y.secrets.env.enc — o11y deployed secrets + o11y.tls.crt.enc — o11y Cloudflare origin cert + o11y.tls.key.enc — o11y origin private key + +docker/oldeworld/oncall.env.enc — Oncall stack secrets +scratchpad/ — dev.env.enc, org.env.enc, sample.env.enc +``` + +## justfile Recipes + +| Recipe | Purpose | Requires | +| -------------------------------------- | ----------------------------------- | --------------------------------- 
| +| `just secret-verify-all` | Verify all secrets decrypt | age key | +| `just secret-view ` | View a secret | age key | +| `just secret-edit ` | Edit a secret in $EDITOR | age key | +| `just kubeconfig-sync ` | Decrypt kubeconfig (run once) | age key | +| `just tailscale-install [inv]` | Install Tailscale on hosts | DO_API_TOKEN via direnv | +| `just tailscale-up [inv]` | Connect hosts to Tailscale | DO_API_TOKEN + TAILSCALE_AUTH_KEY | +| `just galaxy-play [inv]` | Bootstrap k3s cluster | DO_API_TOKEN + vault vars | +| `just deploy ` | Deploy app to cluster | KUBECONFIG via direnv | +| `just k8s-validate` | Validate manifests with kubeconform | — | +| `just ansible-install` | Install ansible + dependencies | — | +| `just ansible-test [inv]` | Ping a random VM | API token for inventory | + +## What's Done + +- [x] DigitalOcean infrastructure (VPC, firewall, 3 droplets, 2 Spaces buckets) +- [x] Cloud-init hardening (fail2ban, SSH, user creation) tested on OrbStack + deployed +- [x] Secrets migration: ansible-vault → sops+age in private infra-secrets repo +- [x] direnv wiring: root + cluster .envrc files with use_sops +- [x] justfile recipes: secrets, deploy, galaxy-play, tailscale, kubeconfig-sync +- [x] gxy-management cluster configs (Cilium values, security policies, Traefik config) +- [x] App manifests (ArgoCD, Windmill, Zot — kustomization, gateway, httproutes) +- [x] Helm chart values (ArgoCD, Windmill, Zot) +- [x] Documentation (infra-secrets README wiring doc, gxy-management README runbook) + +## What's Next + +| # | Task | Blocked By | Notes | +| --- | ----------------------------- | ---------- | --------------------------------------------------------------------------- | +| 1 | Install Tailscale on 3 nodes | — | `just tailscale-install gxy_mgmt_k3s` then `just tailscale-up gxy_mgmt_k3s` | +| 2 | Cloudflare origin certificate | #1 | \*.freecodecamp.net, 15-year RSA, encrypt to infra-secrets | +| 3 | Populate app secrets | — | argocd, windmill, zot 
.secrets.env from samples | +| 4 | Run K3s galaxy playbook | #1, #3 | `just galaxy-play gxy-management gxy_mgmt_k3s` | +| 5 | Install ArgoCD via Helm | #4 | `helm install argocd ...` from gxy-management dir | +| 6 | Install Windmill via Helm | #4 | `helm install windmill ...` | +| 7 | Install Zot via Helm | #4 | `helm install zot ...` | +| 8 | DNS + Cloudflare Access | #5, #6, #7 | A records + Access policies (ClickOps) | +| 9 | Smoke tests | #8 | nodes Ready, Cilium green, curl endpoints, Access gate | +| 10 | Commit infra-secrets repo | — | Push to GitHub | + +Unblocked right now: #1 (Tailscale), #3 (app secrets), #10 (infra-secrets commit). + +## Existing Infrastructure (Unchanged) + +### ops-backoffice-tools (live, 101 days uptime) + +- 3 nodes: ops-vm-tools-k3s-nyc3-{01,02,03}, k3s v1.32.11 +- Apps: Appsmith (1 pod), Outline (3 containers) +- Storage: Longhorn v1.10.1 (31 pods) +- Ingress: Traefik v3.5.1 +- Network: Tailscale operator +- Helm: longhorn, tailscale-operator, traefik, traefik-crd + +### What Was Archived (this branch) + +Observability stack torn down and moved to `.archive/2026-03-observability-teardown/`: + +- ops-logs-clickhouse cluster (3 droplets) +- Grafana, Prometheus, Vector from ops-backoffice-tools +- Savings: ~$231/month + +### Branch History + +``` +feat/k3s-universe (13 commits ahead of main) + +ab0f800 chore: add tailscale justfile recipes and update gxy-management README +6ac1504 refactor: migrate secrets from ansible-vault to sops+age +9c902c1 feat(cloud-init): update config for Ubuntu 24.04 +0619242 fix(k8s): exclude JSON and dashboards from kubeconform validation +2332a1c feat(k8s): add kubeconform manifest validation — local + CI +b5fc35b feat(gxy-management): align Day 0 config with spike-plan and ADRs +c9c1b4e fix: move archive +6137073 fix: move scratchpad +5810c79 feat: add direnv hierarchy and secrets bootstrap workflow +4ebcc24 feat: consolidate secrets management with ansible-vault +a564bd6 refactor: consolidate justfiles 
into root justfile +b0fae18 feat(k3s): add gxy-management galaxy configs and Day 0 spike infrastructure +e72beb5 feat(k3s): add ops-mgmt cluster configs and tooling +``` + +## Errors and Fixes (for Future Reference) + +| Issue | Root Cause | Fix | +| ---------------------------------------------------- | ----------------------------------------- | --------------------------------------------------------- | +| cloud-init heredoc syntax error | runcmd `\|` strings don't support heredoc | Moved to write_files section | +| `systemctl restart sshd` fails on Ubuntu 24.04 | Service renamed to `ssh.service` | `ssh \|\| sshd \|\| true` fallback | +| SSH hardening sed had no effect | Ubuntu 24.04 ships commented defaults | Drop-in file at sshd_config.d/99-hardening.conf | +| sops `path_regex: .*\.enc$` didn't match input files | Regex matches input path, not output | Changed to `.*` (match all) | +| sops `dotenv` format failed on YAML file | ansible vars are YAML, not dotenv | Renamed to `.yaml.enc`, format detection in verify recipe | +| direnv `$(dirname "$0")` empty | Not available in direnv context | Use `expand_path ../infra-secrets` | + +## Open Questions + +- **Helm chart versions**: Need to verify latest stable for ArgoCD, Windmill, Zot before install +- **Cloudflare Access policies**: Exact group/email configuration TBD +- **Windmill DB**: Using embedded SQLite or external PostgreSQL? 
(ADR-008 says CNPG later) +- **TLS for gxy-management apps**: Need to create Cloudflare origin cert for \*.freecodecamp.net diff --git a/justfile b/justfile index 4b46d50b2..a226914de 100644 --- a/justfile +++ b/justfile @@ -40,18 +40,45 @@ secret-verify-all: # K8s / K3s # --------------------------------------------------------------------------- -# Deploy a K8s app (decrypt secrets → apply → clean up) +# Decrypt kubeconfig from infra-secrets to cluster dir (run once after clone) +[group('k3s')] +kubeconfig-sync cluster: + #!/usr/bin/env bash + set -eu + SRC="{{secrets_dir}}/k3s/{{cluster}}/kubeconfig.yaml.enc" + DST="k3s/{{cluster}}/.kubeconfig.yaml" + [ -f "$SRC" ] || { echo "Error: $SRC not found (cluster not yet bootstrapped?)"; exit 1; } + sops -d --input-type yaml --output-type yaml "$SRC" > "$DST" + chmod 600 "$DST" + echo "Synced kubeconfig → $DST" + +# Deploy a K8s app (decrypt secrets + TLS → apply → clean up) [group('k3s')] deploy cluster app: #!/usr/bin/env bash set -eu - SECRETS_SRC="{{secrets_dir}}/k3s/{{cluster}}/{{app}}.secrets.env.enc" - SECRETS_DST="k3s/{{cluster}}/apps/{{app}}/manifests/base/secrets/.secrets.env" - - [ -f "$SECRETS_SRC" ] || { echo "Error: $SECRETS_SRC not found"; exit 1; } - - sops -d --input-type dotenv --output-type dotenv "$SECRETS_SRC" > "$SECRETS_DST" - trap 'rm -f "$SECRETS_DST"' EXIT + ENC_DIR="{{secrets_dir}}/k3s/{{cluster}}" + APP_SECRETS="k3s/{{cluster}}/apps/{{app}}/manifests/base/secrets" + CLEANUP="" + + # Decrypt app secrets (.secrets.env) + if [ -f "$ENC_DIR/{{app}}.secrets.env.enc" ]; then + sops -d --input-type dotenv --output-type dotenv "$ENC_DIR/{{app}}.secrets.env.enc" > "$APP_SECRETS/.secrets.env" + CLEANUP="$APP_SECRETS/.secrets.env" + fi + + # Decrypt TLS cert + key + if [ -f "$ENC_DIR/{{app}}.tls.crt.enc" ]; then + sops -d "$ENC_DIR/{{app}}.tls.crt.enc" > "$APP_SECRETS/tls.crt" + CLEANUP="$CLEANUP $APP_SECRETS/tls.crt" + fi + if [ -f "$ENC_DIR/{{app}}.tls.key.enc" ]; then + sops -d 
"$ENC_DIR/{{app}}.tls.key.enc" > "$APP_SECRETS/tls.key" + CLEANUP="$CLEANUP $APP_SECRETS/tls.key" + fi + + [ -n "$CLEANUP" ] || { echo "Error: no secrets found for {{app}} in $ENC_DIR"; exit 1; } + trap "rm -f $CLEANUP" EXIT cd k3s/{{cluster}} export KUBECONFIG="$(pwd)/.kubeconfig.yaml" @@ -87,17 +114,19 @@ k8s-validate: # --------------------------------------------------------------------------- # Run galaxy playbook (decrypt vault → run → clean up) +# Must be run from a cluster dir (e.g., cd k3s/gxy-management) so DO_API_TOKEN is loaded via direnv [group('ansible')] -galaxy-play galaxy_name host: +galaxy-play galaxy_name host inventory="digitalocean.yml": #!/usr/bin/env bash set -eu + [ -n "${DO_API_TOKEN:-}" ] || { echo "Error: DO_API_TOKEN not set. Run from cluster dir (cd k3s/{{galaxy_name}})"; exit 1; } VAULT_SRC="{{secrets_dir}}/ansible/vault-k3s.yaml.enc" VAULT_DST="ansible/vars/vault-k3s.yml" [ -f "$VAULT_SRC" ] || { echo "Error: $VAULT_SRC not found"; exit 1; } sops -d --input-type yaml --output-type yaml "$VAULT_SRC" > "$VAULT_DST" trap 'rm -f "$VAULT_DST"' EXIT cd ansible - uv run ansible-playbook -i inventory/digitalocean.yml play-k3s--galaxy.yml \ + uv run ansible-playbook -i inventory/{{inventory}} play-k3s--galaxy.yml \ -e variable_host={{host}} \ -e galaxy_name={{galaxy_name}} From f9ce6096aa2a505f149ecded85762872a981a779 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Sat, 4 Apr 2026 20:39:51 +0530 Subject: [PATCH 15/40] refactor: overhaul justfile and align playbook with sops+direnv MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Justfile: 18 → 12 recipes (parametric, no special-case orchestration) - Add generic `play` recipe replacing tailscale-install/tailscale-up - Add `helm-upgrade` recipe with convention-based chart discovery - Add parametric `tf` recipe replacing 5 separate terraform recipes - Fix `secret-view` format auto-detection (was hardcoded dotenv) - Parameterize `k8s-validate` K8s 
version (was hardcoded 1.30.0) - Remove `galaxy-play` — playbook reads env vars via direnv now Playbook: galaxy reads DO_SPACES_* from env instead of vault file - Replace vars_files with lookup('env', ...) in play-k3s--galaxy.yml - Add DO_SPACES_ACCESS_KEY/SECRET_KEY to do-universe/.env.enc - Delete ansible/vault-k3s.yaml.enc from infra-secrets Docs: replace raw commands with just recipes in all READMEs - Add repo files for helm-upgrade chart discovery convention --- SPIKE-STATUS.md | 59 ++++--- ansible/play-k3s--galaxy.yml | 28 ++- justfile | 159 +++++++----------- k3s/README.md | 15 +- k3s/gxy-management/README.md | 15 +- .../apps/argocd/charts/argo-cd/repo | 1 + .../apps/windmill/charts/windmill/repo | 1 + k3s/gxy-management/apps/zot/charts/zot/repo | 1 + k3s/ops-backoffice-tools/README.md | 4 +- 9 files changed, 121 insertions(+), 162 deletions(-) create mode 100644 k3s/gxy-management/apps/argocd/charts/argo-cd/repo create mode 100644 k3s/gxy-management/apps/windmill/charts/windmill/repo create mode 100644 k3s/gxy-management/apps/zot/charts/zot/repo diff --git a/SPIKE-STATUS.md b/SPIKE-STATUS.md index b44b5f0e4..78de36eab 100644 --- a/SPIKE-STATUS.md +++ b/SPIKE-STATUS.md @@ -78,10 +78,8 @@ global/.env.enc ──── direnv ───────────→ env: L HCP_CLIENT_ID, CLOUDFLARE_*, GRAFANA_* do-primary/.env.enc ── direnv ─────────→ env: DO_API_TOKEN (ops-backoffice-tools) -do-universe/.env.enc ── direnv ────────→ env: DO_API_TOKEN (gxy-management) - -ansible/vault-k3s.yaml.enc - └── just galaxy-play ── sops -d ────→ ansible/vars/vault-k3s.yml (temp, deleted after) +do-universe/.env.enc ── direnv ────────→ env: DO_API_TOKEN, DO_SPACES_ACCESS_KEY, + DO_SPACES_SECRET_KEY (gxy-management) k3s//kubeconfig.yaml.enc └── just kubeconfig-sync ── sops -d → k3s//.kubeconfig.yaml (persists) @@ -143,19 +141,20 @@ scratchpad/ — dev.env.enc, org.env.enc, sample.env.enc ## justfile Recipes -| Recipe | Purpose | Requires | -| -------------------------------------- | 
----------------------------------- | --------------------------------- | -| `just secret-verify-all` | Verify all secrets decrypt | age key | -| `just secret-view ` | View a secret | age key | -| `just secret-edit ` | Edit a secret in $EDITOR | age key | -| `just kubeconfig-sync ` | Decrypt kubeconfig (run once) | age key | -| `just tailscale-install [inv]` | Install Tailscale on hosts | DO_API_TOKEN via direnv | -| `just tailscale-up [inv]` | Connect hosts to Tailscale | DO_API_TOKEN + TAILSCALE_AUTH_KEY | -| `just galaxy-play [inv]` | Bootstrap k3s cluster | DO_API_TOKEN + vault vars | -| `just deploy ` | Deploy app to cluster | KUBECONFIG via direnv | -| `just k8s-validate` | Validate manifests with kubeconform | — | -| `just ansible-install` | Install ansible + dependencies | — | -| `just ansible-test [inv]` | Ping a random VM | API token for inventory | +| Recipe | Purpose | Requires | +| ----------------------------------- | ----------------------------------- | --------------------- | +| `just secret-verify-all` | Verify all secrets decrypt | age key | +| `just secret-view ` | View a secret (auto-detects format) | age key | +| `just secret-edit ` | Edit a secret in $EDITOR | age key | +| `just kubeconfig-sync ` | Decrypt kubeconfig (run once) | age key | +| `just play [inv]` | Run any ansible playbook | API token via direnv | +| `just deploy ` | Deploy app (secrets + TLS → apply) | KUBECONFIG via direnv | +| `just helm-upgrade ` | Install/upgrade Helm chart | KUBECONFIG via direnv | +| `just k8s-validate [version]` | Validate manifests with kubeconform | — | +| `just ansible-install` | Install ansible + dependencies | — | +| `just tf [workspace]` | Run terraform (selective or all) | API tokens via direnv | +| `just tf-fmt` | Format all terraform files | — | +| `just tf-list` | List terraform workspaces | — | ## What's Done @@ -163,7 +162,7 @@ scratchpad/ — dev.env.enc, org.env.enc, sample.env.enc - [x] Cloud-init hardening (fail2ban, SSH, user creation) 
tested on OrbStack + deployed - [x] Secrets migration: ansible-vault → sops+age in private infra-secrets repo - [x] direnv wiring: root + cluster .envrc files with use_sops -- [x] justfile recipes: secrets, deploy, galaxy-play, tailscale, kubeconfig-sync +- [x] justfile recipes: secrets, deploy, play, helm-upgrade, kubeconfig-sync, tf - [x] gxy-management cluster configs (Cilium values, security policies, Traefik config) - [x] App manifests (ArgoCD, Windmill, Zot — kustomization, gateway, httproutes) - [x] Helm chart values (ArgoCD, Windmill, Zot) @@ -171,18 +170,18 @@ scratchpad/ — dev.env.enc, org.env.enc, sample.env.enc ## What's Next -| # | Task | Blocked By | Notes | -| --- | ----------------------------- | ---------- | --------------------------------------------------------------------------- | -| 1 | Install Tailscale on 3 nodes | — | `just tailscale-install gxy_mgmt_k3s` then `just tailscale-up gxy_mgmt_k3s` | -| 2 | Cloudflare origin certificate | #1 | \*.freecodecamp.net, 15-year RSA, encrypt to infra-secrets | -| 3 | Populate app secrets | — | argocd, windmill, zot .secrets.env from samples | -| 4 | Run K3s galaxy playbook | #1, #3 | `just galaxy-play gxy-management gxy_mgmt_k3s` | -| 5 | Install ArgoCD via Helm | #4 | `helm install argocd ...` from gxy-management dir | -| 6 | Install Windmill via Helm | #4 | `helm install windmill ...` | -| 7 | Install Zot via Helm | #4 | `helm install zot ...` | -| 8 | DNS + Cloudflare Access | #5, #6, #7 | A records + Access policies (ClickOps) | -| 9 | Smoke tests | #8 | nodes Ready, Cilium green, curl endpoints, Access gate | -| 10 | Commit infra-secrets repo | — | Push to GitHub | +| # | Task | Blocked By | Notes | +| --- | ----------------------------- | ---------- | ----------------------------------------------------------------------------------------------------- | +| 1 | Install Tailscale on 3 nodes | — | `just play tailscale--0-install gxy_mgmt_k3s` then `just play tailscale--1b-up-with-ssh gxy_mgmt_k3s` | 
+| 2 | Cloudflare origin certificate | #1 | \*.freecodecamp.net, 15-year RSA, encrypt to infra-secrets | +| 3 | Populate app secrets | — | argocd, windmill, zot .secrets.env from samples | +| 4 | Run K3s galaxy playbook | #1, #3 | `just play k3s--galaxy gxy_mgmt_k3s` | +| 5 | Install ArgoCD via Helm | #4 | `just helm-upgrade gxy-management argocd` | +| 6 | Install Windmill via Helm | #4 | `just helm-upgrade gxy-management windmill` | +| 7 | Install Zot via Helm | #4 | `just helm-upgrade gxy-management zot` | +| 8 | DNS + Cloudflare Access | #5, #6, #7 | A records + Access policies (ClickOps) | +| 9 | Smoke tests | #8 | nodes Ready, Cilium green, curl endpoints, Access gate | +| 10 | Commit infra-secrets repo | — | Push to GitHub | Unblocked right now: #1 (Tailscale), #3 (app secrets), #10 (infra-secrets commit). diff --git a/ansible/play-k3s--galaxy.yml b/ansible/play-k3s--galaxy.yml index f6a36f8c7..ad250d834 100644 --- a/ansible/play-k3s--galaxy.yml +++ b/ansible/play-k3s--galaxy.yml @@ -8,12 +8,12 @@ # Prerequisites (manual, one-time per galaxy): # - 3x Ubuntu VMs on DigitalOcean with VPC attached (eth1) # - Tailscale installed and connected on all nodes (play-tailscale--*.yml) -# - Vault secrets decrypted: sops -d $SECRETS_DIR/ansible/vault-k3s.yaml.enc > vars/vault-k3s.yml +# - Env vars loaded via direnv (cd into cluster dir first) # - Cluster config directory: k3s//cluster/ # # Usage: -# cd ansible/ # direnv loads .env + activates venv -# just galaxy-play gxy-management gxy_mgmt_k3s +# cd k3s/gxy-management # direnv loads DO_API_TOKEN, DO_SPACES_*, etc. +# just play k3s--galaxy gxy_mgmt_k3s # # What this playbook does (6 plays): # 1. 
Validate prerequisites (VPC, Tailscale, vault secrets) @@ -28,20 +28,18 @@ hosts: "{{ variable_host }}" gather_facts: true become: true - vars_files: - - vars/vault-k3s.yml vars: galaxy_name: "gxy-management" + do_spaces_access_key: "{{ lookup('env', 'DO_SPACES_ACCESS_KEY') }}" + do_spaces_secret_key: "{{ lookup('env', 'DO_SPACES_SECRET_KEY') }}" tasks: - - name: Validate vault secrets loaded + - name: Validate env vars loaded (cd into cluster dir for direnv) assert: that: - - vault_do_spaces_access_key is defined - - vault_do_spaces_access_key | length > 0 - - vault_do_spaces_secret_key is defined - - vault_do_spaces_secret_key | length > 0 - fail_msg: "Vault secrets missing. Run: sops -d $SECRETS_DIR/ansible/vault-k3s.yaml.enc > vars/vault-k3s.yml" + - do_spaces_access_key | length > 0 + - do_spaces_secret_key | length > 0 + fail_msg: "DO_SPACES_ACCESS_KEY/SECRET_KEY not set. cd into cluster dir first (direnv loads them)." - name: Validate VPC interface exists (eth1) assert: @@ -120,10 +118,10 @@ hosts: server gather_facts: true become: true - vars_files: - - vars/vault-k3s.yml vars: galaxy_name: "gxy-management" + do_spaces_access_key: "{{ lookup('env', 'DO_SPACES_ACCESS_KEY') }}" + do_spaces_secret_key: "{{ lookup('env', 'DO_SPACES_SECRET_KEY') }}" k3s_version: "v1.34.5+k3s1" cluster_cidr: "10.1.0.0/16" service_cidr: "10.11.0.0/16" @@ -169,8 +167,8 @@ - name: Write k3s service environment (S3 credentials) copy: content: | - AWS_ACCESS_KEY_ID={{ vault_do_spaces_access_key }} - AWS_SECRET_ACCESS_KEY={{ vault_do_spaces_secret_key }} + AWS_ACCESS_KEY_ID={{ do_spaces_access_key }} + AWS_SECRET_ACCESS_KEY={{ do_spaces_secret_key }} dest: /etc/systemd/system/k3s.service.env mode: "0600" owner: root diff --git a/justfile b/justfile index a226914de..04d8fe715 100644 --- a/justfile +++ b/justfile @@ -1,7 +1,6 @@ set shell := ["bash", "-cu"] secrets_dir := env("SECRETS_DIR", justfile_directory() + "/../infra-secrets") -sops_config := secrets_dir + "/.sops.yaml" 
crds_schema := 'https://raw.githubusercontent.com/datreeio/CRDs-catalog/main/{{.Group}}/{{.ResourceKind}}_{{.ResourceAPIVersion}}.json' # Show available recipes @@ -12,12 +11,20 @@ default: # Secrets (sops + age — stored in infra-secrets private repo) # --------------------------------------------------------------------------- -# View a secret +# View a decrypted secret (auto-detects format from extension) [group('secrets')] secret-view name: - sops -d --input-type dotenv --output-type dotenv "{{secrets_dir}}/{{name}}/.env.enc" - -# Edit a secret + #!/usr/bin/env bash + set -eu + FILE=$(find "{{secrets_dir}}/{{name}}" -name '*.enc' -type f | head -1) + [ -f "$FILE" ] || { echo "Error: no .enc file in {{secrets_dir}}/{{name}}/"; exit 1; } + case "$FILE" in + *.env.enc) sops -d --input-type dotenv --output-type dotenv "$FILE" ;; + *.yaml.enc|*.yml.enc) sops -d --input-type yaml --output-type yaml "$FILE" ;; + *) sops -d "$FILE" ;; + esac + +# Edit a secret in $EDITOR [group('secrets')] secret-edit name: sops "{{secrets_dir}}/{{name}}/.env.enc" @@ -30,9 +37,9 @@ secret-verify-all: for f in $(find "{{secrets_dir}}" -name '*.enc' -type f | sort); do echo -n "$f: " case "$f" in - *.env.enc) sops -d --input-type dotenv --output-type dotenv "$f" > /dev/null 2>&1 ;; + *.env.enc) sops -d --input-type dotenv --output-type dotenv "$f" > /dev/null 2>&1 ;; *.yaml.enc|*.yml.enc) sops -d --input-type yaml --output-type yaml "$f" > /dev/null 2>&1 ;; - *) sops -d "$f" > /dev/null 2>&1 ;; + *) sops -d "$f" > /dev/null 2>&1 ;; esac && echo "OK" || echo "FAILED" done @@ -40,7 +47,7 @@ secret-verify-all: # K8s / K3s # --------------------------------------------------------------------------- -# Decrypt kubeconfig from infra-secrets to cluster dir (run once after clone) +# Decrypt kubeconfig from infra-secrets (run once after clone) [group('k3s')] kubeconfig-sync cluster: #!/usr/bin/env bash @@ -52,7 +59,7 @@ kubeconfig-sync cluster: chmod 600 "$DST" echo "Synced kubeconfig → $DST" -# 
Deploy a K8s app (decrypt secrets + TLS → apply → clean up) +# Deploy app (decrypt secrets + TLS → kustomize apply → cleanup) [group('k3s')] deploy cluster app: #!/usr/bin/env bash @@ -61,13 +68,10 @@ deploy cluster app: APP_SECRETS="k3s/{{cluster}}/apps/{{app}}/manifests/base/secrets" CLEANUP="" - # Decrypt app secrets (.secrets.env) if [ -f "$ENC_DIR/{{app}}.secrets.env.enc" ]; then sops -d --input-type dotenv --output-type dotenv "$ENC_DIR/{{app}}.secrets.env.enc" > "$APP_SECRETS/.secrets.env" CLEANUP="$APP_SECRETS/.secrets.env" fi - - # Decrypt TLS cert + key if [ -f "$ENC_DIR/{{app}}.tls.crt.enc" ]; then sops -d "$ENC_DIR/{{app}}.tls.crt.enc" > "$APP_SECRETS/tls.crt" CLEANUP="$CLEANUP $APP_SECRETS/tls.crt" @@ -85,15 +89,36 @@ deploy cluster app: kubectl apply -k apps/{{app}}/manifests/base/ echo "Deployed {{app}} to {{cluster}}" +# Install or upgrade a Helm chart for a cluster app +[group('k3s')] +helm-upgrade cluster app: + #!/usr/bin/env bash + set -eu + cd k3s/{{cluster}} + export KUBECONFIG="$(pwd)/.kubeconfig.yaml" + CHART_DIR=$(find "apps/{{app}}/charts" -maxdepth 1 -mindepth 1 -type d | head -1) + [ -d "$CHART_DIR" ] || { echo "Error: no chart dir in apps/{{app}}/charts/"; exit 1; } + CHART_NAME=$(basename "$CHART_DIR") + VALUES="$CHART_DIR/values.yaml" + [ -f "$VALUES" ] || { echo "Error: $VALUES not found"; exit 1; } + REPO_FILE="$CHART_DIR/repo" + [ -f "$REPO_FILE" ] || { echo "Error: $REPO_FILE not found (one line: chart repo URL)"; exit 1; } + REPO_URL=$(cat "$REPO_FILE") + echo "Installing {{app}} (chart: $CHART_NAME) from $REPO_URL" + helm upgrade --install {{app}} "$CHART_NAME" \ + --repo "$REPO_URL" \ + -n {{app}} --create-namespace \ + -f "$VALUES" + # Validate K8s manifests with kubeconform [group('k3s')] -k8s-validate: +k8s-validate version="1.32.0": kubeconform \ -summary \ -output text \ -strict \ -ignore-missing-schemas \ - -kubernetes-version 1.30.0 \ + -kubernetes-version {{version}} \ -schema-location default \ -schema-location 
'{{crds_schema}}' \ -ignore-filename-pattern 'kustomization\.yaml' \ @@ -113,107 +138,47 @@ k8s-validate: # Ansible # --------------------------------------------------------------------------- -# Run galaxy playbook (decrypt vault → run → clean up) -# Must be run from a cluster dir (e.g., cd k3s/gxy-management) so DO_API_TOKEN is loaded via direnv -[group('ansible')] -galaxy-play galaxy_name host inventory="digitalocean.yml": - #!/usr/bin/env bash - set -eu - [ -n "${DO_API_TOKEN:-}" ] || { echo "Error: DO_API_TOKEN not set. Run from cluster dir (cd k3s/{{galaxy_name}})"; exit 1; } - VAULT_SRC="{{secrets_dir}}/ansible/vault-k3s.yaml.enc" - VAULT_DST="ansible/vars/vault-k3s.yml" - [ -f "$VAULT_SRC" ] || { echo "Error: $VAULT_SRC not found"; exit 1; } - sops -d --input-type yaml --output-type yaml "$VAULT_SRC" > "$VAULT_DST" - trap 'rm -f "$VAULT_DST"' EXIT - cd ansible - uv run ansible-playbook -i inventory/{{inventory}} play-k3s--galaxy.yml \ - -e variable_host={{host}} \ - -e galaxy_name={{galaxy_name}} - -# Install Tailscale on hosts -[group('ansible')] -tailscale-install host inventory="digitalocean.yml": - cd ansible && uv run ansible-playbook -i inventory/{{inventory}} play-tailscale--0-install.yml \ - -e variable_host={{host}} - -# Connect hosts to Tailscale network (with SSH) +# Run any ansible playbook [group('ansible')] -tailscale-up host inventory="digitalocean.yml": - cd ansible && uv run ansible-playbook -i inventory/{{inventory}} play-tailscale--1b-up-with-ssh.yml \ +play playbook host inv="digitalocean.yml": + cd ansible && uv run ansible-playbook -i inventory/{{inv}} play-{{playbook}}.yml \ -e variable_host={{host}} -# Install ansible and dependencies +# Install ansible dependencies [group('ansible')] ansible-install: cd ansible && uv sync && uv run ansible-galaxy install -r requirements.yml -# Test connection to a random VM -[group('ansible')] -ansible-test inventory="linode.yml": - #!/usr/bin/env bash - set -eu - cd ansible - VM_COUNT=$(uv run 
ansible-inventory -i inventory/{{inventory}} --list 2>/dev/null | jq -r '._meta.hostvars | keys | length') - echo "Found $VM_COUNT VMs" - [ "$VM_COUNT" -eq 0 ] && echo "No VMs found" && exit 1 - RANDOM_INDEX=$(( RANDOM % VM_COUNT )) - uv run ansible -i inventory/{{inventory}} "all[$RANDOM_INDEX]" -m ping --one-line -v - # --------------------------------------------------------------------------- # Terraform # --------------------------------------------------------------------------- -# List all Terraform workspaces +# Run terraform on one or all workspaces [group('terraform')] -tf-list: - @find terraform -name ".terraform.lock.hcl" -exec dirname {} \; | sort - -# Format Terraform files -[group('terraform')] -tf-format: +tf cmd workspace="all": #!/usr/bin/env bash set -eu - for ws in $(find terraform -name ".terraform.lock.hcl" -exec dirname {} \;); do - echo "Formatting $ws" - terraform -chdir=$ws fmt - done - -# Validate Terraform configurations -[group('terraform')] -tf-validate: - #!/usr/bin/env bash - set -eu - for ws in $(find terraform -name ".terraform.lock.hcl" -exec dirname {} \;); do - echo "Validating $ws" - terraform -chdir=$ws validate - done - -# Initialize Terraform workspaces -[group('terraform')] -tf-init: - #!/usr/bin/env bash - set -eu - for ws in $(find terraform -name ".terraform.lock.hcl" -exec dirname {} \;); do - echo "Initializing $ws" - terraform -chdir=$ws init - done + if [ "{{workspace}}" = "all" ]; then + for ws in $(find terraform -name ".terraform.lock.hcl" -exec dirname {} \; | sort); do + echo "==> $ws: terraform {{cmd}}" + terraform -chdir=$ws {{cmd}} + done + else + ws="terraform/{{workspace}}" + [ -d "$ws" ] || { echo "Error: $ws not found"; exit 1; } + terraform -chdir=$ws {{cmd}} + fi -# Initialize and upgrade Terraform workspaces +# Format all terraform files [group('terraform')] -tf-init-upgrade: +tf-fmt: #!/usr/bin/env bash set -eu - for ws in $(find terraform -name ".terraform.lock.hcl" -exec dirname {} \;); do - echo 
"Upgrading $ws" - terraform -chdir=$ws init -upgrade + for ws in $(find terraform -name ".terraform.lock.hcl" -exec dirname {} \; | sort); do + terraform -chdir=$ws fmt done -# Plan all Terraform workspaces +# List terraform workspaces [group('terraform')] -tf-plan: - #!/usr/bin/env bash - set -eu - for ws in $(find terraform -name ".terraform.lock.hcl" -exec dirname {} \;); do - echo "Planning $ws" - terraform -chdir=$ws plan - done +tf-list: + @find terraform -name ".terraform.lock.hcl" -exec dirname {} \; | sort diff --git a/k3s/README.md b/k3s/README.md index 88835c2d7..c6a70123d 100644 --- a/k3s/README.md +++ b/k3s/README.md @@ -75,19 +75,14 @@ k3s/ ## Ansible Deployment ```bash -cd ansible - # Deploy tools cluster -uv run ansible-playbook -i inventory/digitalocean.yml play-k3s--cluster.yml \ - -e variable_host=tools_k3s +just play k3s--cluster tools_k3s # Longhorn storage (tools) -uv run ansible-playbook -i inventory/digitalocean.yml play-k3s--longhorn.yml \ - -e variable_host=tools_k3s +just play k3s--longhorn tools_k3s -# Deploy gxy-management galaxy -uv run ansible-playbook -i inventory/digitalocean.yml play-k3s--galaxy.yml \ - -e variable_host=gxy_mgmt_k3s +# Deploy gxy-management galaxy (decrypts vault vars automatically) +just play k3s--galaxy gxy_mgmt_k3s ``` --- @@ -137,7 +132,7 @@ kubectl port-forward -n longhorn-system svc/longhorn-frontend 8080:80 ### Update Apps ```bash -kubectl apply -k apps//manifests/base/ +just deploy ``` --- diff --git a/k3s/gxy-management/README.md b/k3s/gxy-management/README.md index 67eb4bd62..e77fefc82 100644 --- a/k3s/gxy-management/README.md +++ b/k3s/gxy-management/README.md @@ -29,7 +29,7 @@ kubectl get nodes ## Deploy ```bash -just galaxy-play gxy-management gxy_mgmt_k3s +just play k3s--galaxy gxy_mgmt_k3s ``` ## Deployment Runbook @@ -39,14 +39,14 @@ just galaxy-play gxy-management gxy_mgmt_k3s 1. 
Create 3x DO droplets (s-8vcpu-16gb) in FRA1 -- attach to VPC, configure firewall (80, 443, 6443 from VPC, 22 from Tailscale) 2. Create DO Spaces bucket `net.freecodecamp.universe-backups` in FRA1 (etcd snapshots) 3. Create DO Spaces bucket `net.freecodecamp.universe-registry` in FRA1 (Zot images) -4. Install Tailscale: `just tailscale-install gxy_mgmt_k3s` then `just tailscale-up gxy_mgmt_k3s` +4. Install Tailscale: `just play tailscale--0-install gxy_mgmt_k3s` then `just play tailscale--1b-up-with-ssh gxy_mgmt_k3s` 5. Create Cloudflare origin certificate for `*.freecodecamp.net` (15-year, RSA) 6. Populate app secrets in infra-secrets repo (see samples in each app directory) ### K3s Bootstrap ```bash -just galaxy-play gxy-management gxy_mgmt_k3s +just play k3s--galaxy gxy_mgmt_k3s ``` Deploys k3s HA cluster with Cilium CNI, Traefik ingress, etcd S3 backups, and fetches kubeconfig. @@ -56,13 +56,12 @@ Deploys k3s HA cluster with Cilium CNI, Traefik ingress, etcd S3 backups, and fe After playbook completes: ```bash -cd k3s/gxy-management -helm install argocd argo-cd --repo https://argoproj.github.io/argo-helm -n argocd --create-namespace -f apps/argocd/charts/argo-cd/values.yaml -helm install windmill windmill --repo https://windmill-labs.github.io/windmill-helm-charts/ -n windmill --create-namespace -f apps/windmill/charts/windmill/values.yaml -helm install zot zot --repo https://zotregistry.dev/helm-charts/ -n zot --create-namespace -f apps/zot/charts/zot/values.yaml +just helm-upgrade gxy-management argocd +just helm-upgrade gxy-management windmill +just helm-upgrade gxy-management zot ``` -Release names MUST be exactly `argocd`, `windmill`, `zot` -- HTTPRoute manifests reference service names derived from these. +Release names match the app directory names. The recipe reads the chart repo URL from `charts//repo.txt` and the values from `charts//values.yaml`. 
### App Secrets and Manifests diff --git a/k3s/gxy-management/apps/argocd/charts/argo-cd/repo b/k3s/gxy-management/apps/argocd/charts/argo-cd/repo new file mode 100644 index 000000000..fba634c54 --- /dev/null +++ b/k3s/gxy-management/apps/argocd/charts/argo-cd/repo @@ -0,0 +1 @@ +https://argoproj.github.io/argo-helm diff --git a/k3s/gxy-management/apps/windmill/charts/windmill/repo b/k3s/gxy-management/apps/windmill/charts/windmill/repo new file mode 100644 index 000000000..9ff3dfef4 --- /dev/null +++ b/k3s/gxy-management/apps/windmill/charts/windmill/repo @@ -0,0 +1 @@ +https://windmill-labs.github.io/windmill-helm-charts/ diff --git a/k3s/gxy-management/apps/zot/charts/zot/repo b/k3s/gxy-management/apps/zot/charts/zot/repo new file mode 100644 index 000000000..6329dc84a --- /dev/null +++ b/k3s/gxy-management/apps/zot/charts/zot/repo @@ -0,0 +1 @@ +https://zotregistry.dev/helm-charts/ diff --git a/k3s/ops-backoffice-tools/README.md b/k3s/ops-backoffice-tools/README.md index 4b9c27ed5..e0fb10204 100644 --- a/k3s/ops-backoffice-tools/README.md +++ b/k3s/ops-backoffice-tools/README.md @@ -45,8 +45,8 @@ helm upgrade tailscale-operator tailscale/tailscale-operator \ All apps use Kustomize: ```bash -# Deploy -kubectl apply -k apps//manifests/base/ +# Deploy (decrypts secrets + TLS, applies, cleans up) +just deploy ops-backoffice-tools # Check kubectl get all -n From 9851893e48448d81e8b53f08b32f009a060d65b7 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Sat, 4 Apr 2026 23:30:09 +0530 Subject: [PATCH 16/40] =?UTF-8?q?fix:=20address=20review=20findings=20?= =?UTF-8?q?=E2=80=94=203=20critical,=2010=20warning,=206=20suggestion?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Critical: - Fix cilium_cluster_id assertion (int has no length, use string filter) - Quote etcd snapshot cron schedule to survive systemd word-splitting - Add TLS secretGenerator to argocd/windmill/zot kustomization.yaml Warning: - .envrc: replace silent 
2>/dev/null with log_error on sops failure - justfile deploy: set trap incrementally after each sops decrypt - justfile kubeconfig-sync: umask 077 before writing kubeconfig - Add Helm install task to cilium role (was missing on remote hosts) - CI workflow: pass explicit k8s version to kubeconform - Parameterize galaxy_name in playbook (was hardcoded 6 times) - Remove redundant tf-fmt recipe (just tf fmt works) - Fix ops-mgmt README: remove raw commands and stale vault refs - Fix ssh_import_id: add gh: prefix to raisedadead in cloud-init Suggestion: - Remove redundant .envrc patterns from .gitignore - Remove orphaned comment and blank lines from .gitignore - Trim gxy-management .gitignore to non-redundant patterns only - Fix repo.txt → repo in gxy-management README - Remove dead cilium namespace from PSS exemptions - Fix stale destination path comments in security configs --- .envrc | 8 ++++++-- .github/workflows/k8s--validate.yml | 2 +- .gitignore | 9 --------- ansible/play-k3s--galaxy.yml | 14 +++++++------- ansible/roles/cilium/tasks/main.yml | 8 +++++++- cloud-init/basic.yml | 2 +- cloud-init/docker.yml | 2 +- justfile | 14 ++++---------- k3s/README.md | 4 +--- k3s/gxy-management/.gitignore | 9 --------- k3s/gxy-management/README.md | 2 +- .../apps/argocd/manifests/base/kustomization.yaml | 7 +++++++ .../windmill/manifests/base/kustomization.yaml | 7 +++++++ .../apps/zot/manifests/base/kustomization.yaml | 7 +++++++ .../cluster/security/audit-policy.yaml | 2 +- .../cluster/security/pss-admission.yaml | 6 +++--- k3s/ops-mgmt/README.md | 12 +++--------- k3s/ops-mgmt/cluster/security/audit-policy.yaml | 2 +- k3s/ops-mgmt/cluster/security/pss-admission.yaml | 2 +- 19 files changed, 59 insertions(+), 60 deletions(-) diff --git a/.envrc b/.envrc index 9db352fed..072f6caea 100644 --- a/.envrc +++ b/.envrc @@ -4,8 +4,12 @@ use_sops() { local path="$1" local type="${2:-dotenv}" if [ -f "$path" ]; then - eval "$(sops -d --input-type "$type" --output-type "$type" "$path" 
2>/dev/null \ - | direnv dotenv bash /dev/stdin)" + local decrypted + decrypted=$(sops -d --input-type "$type" --output-type "$type" "$path" 2>&1) || { + log_error "sops decrypt failed for $path" + return + } + eval "$(echo "$decrypted" | direnv dotenv bash /dev/stdin)" watch_file "$path" fi } diff --git a/.github/workflows/k8s--validate.yml b/.github/workflows/k8s--validate.yml index afeb168a9..d18849cdb 100644 --- a/.github/workflows/k8s--validate.yml +++ b/.github/workflows/k8s--validate.yml @@ -32,4 +32,4 @@ jobs: kubeconform -v - name: Validate K8s manifests - run: just k8s-validate + run: just k8s-validate 1.32.0 diff --git a/.gitignore b/.gitignore index 3aac1057a..35f623d08 100644 --- a/.gitignore +++ b/.gitignore @@ -48,15 +48,6 @@ ansible/inventory/hosts # Secrets *.env *.env.* -.envrc - -# Allow .envrc files (direnv config, not secrets) -!.envrc -!k3s/**/.envrc - -# Secrets live in the private infra-secrets repo (sibling clone) - - .kubeconfig.yaml *.crt diff --git a/ansible/play-k3s--galaxy.yml b/ansible/play-k3s--galaxy.yml index ad250d834..d75e47218 100644 --- a/ansible/play-k3s--galaxy.yml +++ b/ansible/play-k3s--galaxy.yml @@ -29,7 +29,7 @@ gather_facts: true become: true vars: - galaxy_name: "gxy-management" + galaxy_name: "{{ galaxy_name | default('gxy-management') }}" do_spaces_access_key: "{{ lookup('env', 'DO_SPACES_ACCESS_KEY') }}" do_spaces_secret_key: "{{ lookup('env', 'DO_SPACES_SECRET_KEY') }}" @@ -86,7 +86,7 @@ gather_facts: true become: true vars: - galaxy_name: "gxy-management" + galaxy_name: "{{ galaxy_name | default('gxy-management') }}" cluster_config_dir: "{{ playbook_dir }}/../k3s/{{ galaxy_name }}" pre_tasks: @@ -119,7 +119,7 @@ gather_facts: true become: true vars: - galaxy_name: "gxy-management" + galaxy_name: "{{ galaxy_name | default('gxy-management') }}" do_spaces_access_key: "{{ lookup('env', 'DO_SPACES_ACCESS_KEY') }}" do_spaces_secret_key: "{{ lookup('env', 'DO_SPACES_SECRET_KEY') }}" k3s_version: "v1.34.5+k3s1" @@ -152,7 
+152,7 @@ --etcd-s3-bucket={{ etcd_s3_bucket }} --etcd-s3-folder={{ etcd_s3_folder }} --etcd-s3-region={{ etcd_s3_region }} - --etcd-snapshot-schedule-cron={{ etcd_snapshot_schedule }} + --etcd-snapshot-schedule-cron="{{ etcd_snapshot_schedule }}" --etcd-snapshot-retention={{ etcd_snapshot_retention }} server_group: server pre_tasks: @@ -184,7 +184,7 @@ gather_facts: false become: true vars: - galaxy_name: "gxy-management" + galaxy_name: "{{ galaxy_name | default('gxy-management') }}" gateway_api_version: "v1.5.1" tasks: @@ -220,7 +220,7 @@ gather_facts: false become: true vars: - galaxy_name: "gxy-management" + galaxy_name: "{{ galaxy_name | default('gxy-management') }}" cilium_cluster_id: 1 cluster_config_dir: "{{ playbook_dir }}/../k3s/{{ galaxy_name }}" roles: @@ -236,7 +236,7 @@ gather_facts: false become: true vars: - galaxy_name: "gxy-management" + galaxy_name: "{{ galaxy_name | default('gxy-management') }}" cluster_config_dir: "{{ playbook_dir }}/../k3s/{{ galaxy_name }}" tasks: diff --git a/ansible/roles/cilium/tasks/main.yml b/ansible/roles/cilium/tasks/main.yml index 492465997..38775a0f7 100644 --- a/ansible/roles/cilium/tasks/main.yml +++ b/ansible/roles/cilium/tasks/main.yml @@ -3,7 +3,7 @@ ansible.builtin.assert: that: - cilium_cluster_name | length > 0 - - cilium_cluster_id | length > 0 + - cilium_cluster_id | string | length > 0 - cilium_values_file | length > 0 - cilium_k8s_service_host | length > 0 fail_msg: >- @@ -16,6 +16,12 @@ dest: /tmp/cilium-values.yaml mode: "0600" +- name: Install Helm + ansible.builtin.shell: + cmd: set -o pipefail && curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash + executable: /bin/bash + creates: /usr/local/bin/helm + - name: Add Cilium Helm repo kubernetes.core.helm_repository: name: cilium diff --git a/cloud-init/basic.yml b/cloud-init/basic.yml index 863148259..5d37707f6 100644 --- a/cloud-init/basic.yml +++ b/cloud-init/basic.yml @@ -17,7 +17,7 @@ users: sudo: "ALL=(ALL) 
NOPASSWD:ALL" ssh_import_id: - gh:camperbot - - raisedadead + - gh:raisedadead write_files: - path: /etc/fail2ban/jail.local content: | diff --git a/cloud-init/docker.yml b/cloud-init/docker.yml index 6d32ded8a..0653cbedc 100644 --- a/cloud-init/docker.yml +++ b/cloud-init/docker.yml @@ -19,7 +19,7 @@ users: sudo: "ALL=(ALL) NOPASSWD:ALL" ssh_import_id: - gh:camperbot - - raisedadead + - gh:raisedadead write_files: - path: /etc/fail2ban/jail.local content: | diff --git a/justfile b/justfile index 04d8fe715..5f74985c9 100644 --- a/justfile +++ b/justfile @@ -55,6 +55,7 @@ kubeconfig-sync cluster: SRC="{{secrets_dir}}/k3s/{{cluster}}/kubeconfig.yaml.enc" DST="k3s/{{cluster}}/.kubeconfig.yaml" [ -f "$SRC" ] || { echo "Error: $SRC not found (cluster not yet bootstrapped?)"; exit 1; } + umask 077 sops -d --input-type yaml --output-type yaml "$SRC" > "$DST" chmod 600 "$DST" echo "Synced kubeconfig → $DST" @@ -71,18 +72,20 @@ deploy cluster app: if [ -f "$ENC_DIR/{{app}}.secrets.env.enc" ]; then sops -d --input-type dotenv --output-type dotenv "$ENC_DIR/{{app}}.secrets.env.enc" > "$APP_SECRETS/.secrets.env" CLEANUP="$APP_SECRETS/.secrets.env" + trap "rm -f $CLEANUP" EXIT fi if [ -f "$ENC_DIR/{{app}}.tls.crt.enc" ]; then sops -d "$ENC_DIR/{{app}}.tls.crt.enc" > "$APP_SECRETS/tls.crt" CLEANUP="$CLEANUP $APP_SECRETS/tls.crt" + trap "rm -f $CLEANUP" EXIT fi if [ -f "$ENC_DIR/{{app}}.tls.key.enc" ]; then sops -d "$ENC_DIR/{{app}}.tls.key.enc" > "$APP_SECRETS/tls.key" CLEANUP="$CLEANUP $APP_SECRETS/tls.key" + trap "rm -f $CLEANUP" EXIT fi [ -n "$CLEANUP" ] || { echo "Error: no secrets found for {{app}} in $ENC_DIR"; exit 1; } - trap "rm -f $CLEANUP" EXIT cd k3s/{{cluster}} export KUBECONFIG="$(pwd)/.kubeconfig.yaml" @@ -169,15 +172,6 @@ tf cmd workspace="all": terraform -chdir=$ws {{cmd}} fi -# Format all terraform files -[group('terraform')] -tf-fmt: - #!/usr/bin/env bash - set -eu - for ws in $(find terraform -name ".terraform.lock.hcl" -exec dirname {} \; | sort); do - 
terraform -chdir=$ws fmt - done - # List terraform workspaces [group('terraform')] tf-list: diff --git a/k3s/README.md b/k3s/README.md index c6a70123d..577c27827 100644 --- a/k3s/README.md +++ b/k3s/README.md @@ -31,9 +31,7 @@ k3s/ │ │ └── zot/ │ └── cluster/ │ ├── cilium/ -│ ├── longhorn/ -│ ├── security/ -│ └── tailscale/ +│ └── security/ ├── ops-backoffice-tools/ │ ├── apps/ │ │ ├── appsmith/ diff --git a/k3s/gxy-management/.gitignore b/k3s/gxy-management/.gitignore index ae8010926..7220cce0a 100644 --- a/k3s/gxy-management/.gitignore +++ b/k3s/gxy-management/.gitignore @@ -1,11 +1,2 @@ -# Environment (direnv) -.env - -# Kubeconfig -.kubeconfig.yaml - # Decrypted secrets (temporary, generated by just deploy) -apps/*/manifests/base/secrets/.secrets.env -apps/*/manifests/base/secrets/tls.crt -apps/*/manifests/base/secrets/tls.key apps/*/manifests/base/secrets/tls.yaml diff --git a/k3s/gxy-management/README.md b/k3s/gxy-management/README.md index e77fefc82..44cd3cc05 100644 --- a/k3s/gxy-management/README.md +++ b/k3s/gxy-management/README.md @@ -61,7 +61,7 @@ just helm-upgrade gxy-management windmill just helm-upgrade gxy-management zot ``` -Release names match the app directory names. The recipe reads the chart repo URL from `charts//repo.txt` and the values from `charts//values.yaml`. +Release names match the app directory names. The recipe reads the chart repo URL from `charts//repo` and the values from `charts//values.yaml`. 
### App Secrets and Manifests diff --git a/k3s/gxy-management/apps/argocd/manifests/base/kustomization.yaml b/k3s/gxy-management/apps/argocd/manifests/base/kustomization.yaml index 5331e33f4..e8cab94e3 100644 --- a/k3s/gxy-management/apps/argocd/manifests/base/kustomization.yaml +++ b/k3s/gxy-management/apps/argocd/manifests/base/kustomization.yaml @@ -14,3 +14,10 @@ secretGenerator: - secrets/.secrets.env options: disableNameSuffixHash: true + - name: argocd-tls-cloudflare + type: kubernetes.io/tls + files: + - tls.crt=secrets/tls.crt + - tls.key=secrets/tls.key + options: + disableNameSuffixHash: true diff --git a/k3s/gxy-management/apps/windmill/manifests/base/kustomization.yaml b/k3s/gxy-management/apps/windmill/manifests/base/kustomization.yaml index e4ac5cfe4..b6e81a5fa 100644 --- a/k3s/gxy-management/apps/windmill/manifests/base/kustomization.yaml +++ b/k3s/gxy-management/apps/windmill/manifests/base/kustomization.yaml @@ -14,3 +14,10 @@ secretGenerator: - secrets/.secrets.env options: disableNameSuffixHash: true + - name: windmill-tls-cloudflare + type: kubernetes.io/tls + files: + - tls.crt=secrets/tls.crt + - tls.key=secrets/tls.key + options: + disableNameSuffixHash: true diff --git a/k3s/gxy-management/apps/zot/manifests/base/kustomization.yaml b/k3s/gxy-management/apps/zot/manifests/base/kustomization.yaml index f111666cd..69a92e59a 100644 --- a/k3s/gxy-management/apps/zot/manifests/base/kustomization.yaml +++ b/k3s/gxy-management/apps/zot/manifests/base/kustomization.yaml @@ -14,3 +14,10 @@ secretGenerator: - secrets/.secrets.env options: disableNameSuffixHash: true + - name: zot-tls-cloudflare + type: kubernetes.io/tls + files: + - tls.crt=secrets/tls.crt + - tls.key=secrets/tls.key + options: + disableNameSuffixHash: true diff --git a/k3s/gxy-management/cluster/security/audit-policy.yaml b/k3s/gxy-management/cluster/security/audit-policy.yaml index 4318218bd..7243d0578 100644 --- a/k3s/gxy-management/cluster/security/audit-policy.yaml +++ 
b/k3s/gxy-management/cluster/security/audit-policy.yaml @@ -1,5 +1,5 @@ # Kubernetes API audit policy -# Copied to /var/lib/rancher/k3s/server/audit-policy.yaml by Ansible +# Copied to /etc/rancher/k3s/audit-policy.yaml by Ansible # # Phase 1: minimal — log secret access and anonymous requests only # Phase 2: expand to full request/response logging for sensitive resources diff --git a/k3s/gxy-management/cluster/security/pss-admission.yaml b/k3s/gxy-management/cluster/security/pss-admission.yaml index 9d6baaa3d..4f02cc90a 100644 --- a/k3s/gxy-management/cluster/security/pss-admission.yaml +++ b/k3s/gxy-management/cluster/security/pss-admission.yaml @@ -1,9 +1,10 @@ # Pod Security Standards admission configuration -# Copied to /var/lib/rancher/k3s/server/pss.yaml by Ansible +# Copied to /etc/rancher/k3s/pss-admission.yaml by Ansible # # - baseline: enforced (blocks privileged containers, host networking, hostPath) # - restricted: audit + warn only (logs violations, does not block) -# - System namespaces exempted (Cilium, Windmill workers, and Tailscale need elevated privileges) +# - System namespaces exempted (Windmill workers and Tailscale need elevated privileges; +# Cilium installs into kube-system which is already exempted) apiVersion: apiserver.config.k8s.io/v1 kind: AdmissionConfiguration plugins: @@ -21,6 +22,5 @@ plugins: exemptions: namespaces: - kube-system - - cilium - windmill - tailscale diff --git a/k3s/ops-mgmt/README.md b/k3s/ops-mgmt/README.md index b6835e24e..a85f90810 100644 --- a/k3s/ops-mgmt/README.md +++ b/k3s/ops-mgmt/README.md @@ -21,27 +21,21 @@ kubectl get nodes Everything is managed by a single Ansible playbook (8 plays): ```bash -cd ansible/ -ansible-playbook -i inventory/digitalocean.yml play-k3s--ops-mgmt.yml \ - -e variable_host=mgmt_k3s \ - --vault-password-file <(op read "op://Service-Automation/Ansible-Vault-Password/Ansible-Vault-Password") +just play k3s--ops-mgmt mgmt_k3s ``` The playbook handles: k3s install, security hardening 
(secrets-encryption, PSS, audit logging), cert-manager, Rancher, rancher-backup + schedule, Tailscale operator + Connector, kubeconfig fetch, and DO firewall lockdown. -Prerequisites: VM provisioned with Tailscale installed, Ansible Vault populated. +Prerequisites: VM provisioned with Tailscale installed, secrets populated in infra-secrets repo. ## Re-runs After first run, the DO firewall restricts SSH to Tailscale only. Re-run via Tailscale IP: ```bash -ansible-playbook -i inventory/digitalocean.yml play-k3s--ops-mgmt.yml \ - -e variable_host=mgmt_k3s \ - -e ansible_host= \ - --vault-password-file <(op read "op://Service-Automation/Ansible-Vault-Password/Ansible-Vault-Password") +just play k3s--ops-mgmt mgmt_k3s -e ansible_host= ``` ## Disaster Recovery diff --git a/k3s/ops-mgmt/cluster/security/audit-policy.yaml b/k3s/ops-mgmt/cluster/security/audit-policy.yaml index 4318218bd..7243d0578 100644 --- a/k3s/ops-mgmt/cluster/security/audit-policy.yaml +++ b/k3s/ops-mgmt/cluster/security/audit-policy.yaml @@ -1,5 +1,5 @@ # Kubernetes API audit policy -# Copied to /var/lib/rancher/k3s/server/audit-policy.yaml by Ansible +# Copied to /etc/rancher/k3s/audit-policy.yaml by Ansible # # Phase 1: minimal — log secret access and anonymous requests only # Phase 2: expand to full request/response logging for sensitive resources diff --git a/k3s/ops-mgmt/cluster/security/pss-admission.yaml b/k3s/ops-mgmt/cluster/security/pss-admission.yaml index 0cc18a71f..52e5f679b 100644 --- a/k3s/ops-mgmt/cluster/security/pss-admission.yaml +++ b/k3s/ops-mgmt/cluster/security/pss-admission.yaml @@ -1,5 +1,5 @@ # Pod Security Standards admission configuration -# Copied to /var/lib/rancher/k3s/server/pss.yaml by Ansible +# Copied to /etc/rancher/k3s/pss-admission.yaml by Ansible # # - baseline: enforced (blocks privileged containers, host networking, hostPath) # - restricted: audit + warn only (logs violations, does not block) From c73215dbb1e3f434c25feecff9e67ed3f72020e4 Mon Sep 17 00:00:00 
2001 From: Mrugesh Mohapatra Date: Sun, 5 Apr 2026 00:12:28 +0530 Subject: [PATCH 17/40] fix: move Windmill credentials to secret values overlay - Remove hardcoded postgresPassword and databaseUrl from public values.yaml - Update helm-upgrade recipe to overlay secret values from infra-secrets - Secret values file pattern: .values.yaml.enc (sops-encrypted YAML) - Decrypted to temp file at install time, deleted after helm upgrade --- justfile | 16 ++++++++++++++-- .../apps/windmill/charts/windmill/values.yaml | 5 ++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/justfile b/justfile index 5f74985c9..fedbae0a7 100644 --- a/justfile +++ b/justfile @@ -92,7 +92,7 @@ deploy cluster app: kubectl apply -k apps/{{app}}/manifests/base/ echo "Deployed {{app}} to {{cluster}}" -# Install or upgrade a Helm chart for a cluster app +# Install or upgrade a Helm chart (overlays secret values from infra-secrets if present) [group('k3s')] helm-upgrade cluster app: #!/usr/bin/env bash @@ -107,11 +107,23 @@ helm-upgrade cluster app: REPO_FILE="$CHART_DIR/repo" [ -f "$REPO_FILE" ] || { echo "Error: $REPO_FILE not found (one line: chart repo URL)"; exit 1; } REPO_URL=$(cat "$REPO_FILE") + + HELM_ARGS="-f $VALUES" + CLEANUP="" + SECRET_VALUES="{{secrets_dir}}/k3s/{{cluster}}/{{app}}.values.yaml.enc" + if [ -f "$SECRET_VALUES" ]; then + TMPVALS=$(mktemp) + sops -d --input-type yaml --output-type yaml "$SECRET_VALUES" > "$TMPVALS" + HELM_ARGS="$HELM_ARGS -f $TMPVALS" + CLEANUP="$TMPVALS" + trap "rm -f $CLEANUP" EXIT + fi + echo "Installing {{app}} (chart: $CHART_NAME) from $REPO_URL" helm upgrade --install {{app}} "$CHART_NAME" \ --repo "$REPO_URL" \ -n {{app}} --create-namespace \ - -f "$VALUES" + $HELM_ARGS # Validate K8s manifests with kubeconform [group('k3s')] diff --git a/k3s/gxy-management/apps/windmill/charts/windmill/values.yaml b/k3s/gxy-management/apps/windmill/charts/windmill/values.yaml index 2147b69c1..5d9e5c034 100644 --- 
a/k3s/gxy-management/apps/windmill/charts/windmill/values.yaml +++ b/k3s/gxy-management/apps/windmill/charts/windmill/values.yaml @@ -8,7 +8,7 @@ windmill: baseProtocol: https appReplicas: 1 extraReplicas: 1 - databaseUrl: postgres://postgres:windmill@windmill-postgresql/windmill?sslmode=disable + # databaseUrl set via secret values overlay (infra-secrets) app: resources: @@ -60,8 +60,7 @@ windmill: postgresql: enabled: true auth: - postgresUser: postgres - postgresPassword: windmill + # Credentials set via secret values overlay (infra-secrets) database: windmill persistence: enabled: true From 3decdca96690c619a1ebb2164c8574ee05dd9ff5 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Sun, 5 Apr 2026 00:13:28 +0530 Subject: [PATCH 18/40] docs: update SPIKE-STATUS.md with reviewed deployment plan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Mark completed items (Tailscale, TLS certs, code review, justfile overhaul) - Replace flat task list with phased plan (A→E) - Document secrets → Helm flow (public values + secret overlay) - Deploy sequentially: cluster → Windmill → ArgoCD → Zot - Each phase has verify step before proceeding --- SPIKE-STATUS.md | 85 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 70 insertions(+), 15 deletions(-) diff --git a/SPIKE-STATUS.md b/SPIKE-STATUS.md index 78de36eab..ddc253795 100644 --- a/SPIKE-STATUS.md +++ b/SPIKE-STATUS.md @@ -165,25 +165,80 @@ scratchpad/ — dev.env.enc, org.env.enc, sample.env.enc - [x] justfile recipes: secrets, deploy, play, helm-upgrade, kubeconfig-sync, tf - [x] gxy-management cluster configs (Cilium values, security policies, Traefik config) - [x] App manifests (ArgoCD, Windmill, Zot — kustomization, gateway, httproutes) -- [x] Helm chart values (ArgoCD, Windmill, Zot) +- [x] Helm chart values (ArgoCD, Windmill, Zot — credentials stripped to secret overlays) - [x] Documentation (infra-secrets README wiring doc, gxy-management README runbook) +- 
[x] Tailscale installed and connected on all 3 nodes (verified: online, SSH enabled) +- [x] Cloudflare origin certs encrypted for all 3 apps (reused existing wildcard) +- [x] Code review: 3 CRITICALs + 10 WARNINGs + 6 SUGGESTIONs found and fixed +- [x] Justfile overhaul: 18 → 11 parametric recipes, no special-case orchestration + +## Secrets → Helm Flow + +Public values.yaml (structure, resources, flags) are overlaid with secret values from infra-secrets: + +``` +Public: k3s//apps//charts//values.yaml +Secret: infra-secrets/k3s//.values.yaml.enc (optional, sops-encrypted) + +just helm-upgrade → helm upgrade --install -f values.yaml -f /tmp/secret-values.yaml → cleanup +``` + +Apps that only need K8s Secrets (ArgoCD, Zot) use `just deploy` which decrypts `.secrets.env` + TLS. ## What's Next -| # | Task | Blocked By | Notes | -| --- | ----------------------------- | ---------- | ----------------------------------------------------------------------------------------------------- | -| 1 | Install Tailscale on 3 nodes | — | `just play tailscale--0-install gxy_mgmt_k3s` then `just play tailscale--1b-up-with-ssh gxy_mgmt_k3s` | -| 2 | Cloudflare origin certificate | #1 | \*.freecodecamp.net, 15-year RSA, encrypt to infra-secrets | -| 3 | Populate app secrets | — | argocd, windmill, zot .secrets.env from samples | -| 4 | Run K3s galaxy playbook | #1, #3 | `just play k3s--galaxy gxy_mgmt_k3s` | -| 5 | Install ArgoCD via Helm | #4 | `just helm-upgrade gxy-management argocd` | -| 6 | Install Windmill via Helm | #4 | `just helm-upgrade gxy-management windmill` | -| 7 | Install Zot via Helm | #4 | `just helm-upgrade gxy-management zot` | -| 8 | DNS + Cloudflare Access | #5, #6, #7 | A records + Access policies (ClickOps) | -| 9 | Smoke tests | #8 | nodes Ready, Cilium green, curl endpoints, Access gate | -| 10 | Commit infra-secrets repo | — | Push to GitHub | - -Unblocked right now: #1 (Tailscale), #3 (app secrets), #10 (infra-secrets commit). +Deploy sequentially. 
Verify each before moving to the next. + +### Phase A: Bootstrap Cluster + +| # | Task | Status | Command / Notes | +| --- | ----------------------------- | ------ | ---------------------------------------------------------- | +| A1 | Populate Windmill secrets | TODO | Create `windmill.values.yaml.enc` (PG password, DB URL) | +| A2 | Populate Windmill app secrets | TODO | Create `windmill.secrets.env.enc` (admin email + password) | +| A3 | Run K3s galaxy playbook | TODO | `just play k3s--galaxy gxy_mgmt_k3s` (from cluster dir) | +| A4 | Verify cluster health | TODO | 3 nodes Ready, Cilium green, Traefik running, Gateway CRDs | +| A5 | Encrypt kubeconfig | TODO | sops encrypt to infra-secrets | + +### Phase B: Windmill (Day 0 Deliverable) + +| # | Task | Status | Command / Notes | +| --- | --------------------- | ------ | ----------------------------------------------------- | +| B1 | Install Windmill Helm | TODO | `just helm-upgrade gxy-management windmill` | +| B2 | Verify pods ready | TODO | `kubectl get pods -n windmill` | +| B3 | Deploy manifests | TODO | `just deploy gxy-management windmill` (Gateway + TLS) | +| B4 | Cloudflare DNS | TODO | ClickOps: A records (proxied) → 3 node public IPs | +| B5 | Cloudflare Access | TODO | ClickOps: email OTP gate, all staff | +| B6 | Smoke test | TODO | curl + browser, verify Access gate | + +### Phase C: ArgoCD (Platform Team) + +| # | Task | Status | Command / Notes | +| --- | ----------------------- | ------ | ------------------------------------------------------- | +| C1 | Populate ArgoCD secrets | TODO | Create `argocd.secrets.env.enc` (bcrypt admin password) | +| C2 | Install ArgoCD Helm | TODO | `just helm-upgrade gxy-management argocd` | +| C3 | Deploy manifests | TODO | `just deploy gxy-management argocd` | +| C4 | DNS + Access | TODO | ClickOps: argocd.freecodecamp.net, platform team only | +| C5 | Verify | TODO | Login, verify dashboard | + +### Phase D: Zot (Platform Team) + +| # | Task | Status | Command / 
Notes | +| --- | -------------------- | ------ | ------------------------------------------------------- | +| D1 | Populate Zot secrets | TODO | Create `zot.secrets.env.enc` (S3 creds, htpasswd) | +| D2 | Install Zot Helm | TODO | `just helm-upgrade gxy-management zot` | +| D3 | Deploy manifests | TODO | `just deploy gxy-management zot` | +| D4 | DNS + Access | TODO | ClickOps: registry.freecodecamp.net, platform team only | +| D5 | Verify | TODO | Push/pull test image | + +### Phase E: Cleanup + +| # | Task | Status | Notes | +| --- | ---------------------- | ------ | ---------------------------------------------- | +| E1 | Commit infra-secrets | TODO | Push to GitHub | +| E2 | Remove SPIKE-STATUS.md | TODO | Absorb permanent decisions into cluster README | +| E3 | Clean up stale files | TODO | Orphaned samples, archive cruft | + +Unblocked now: A1, A2 (populate secrets). ## Existing Infrastructure (Unchanged) From df3a0933b3b1211711a1d2121484a4e045e9eb17 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Sun, 5 Apr 2026 01:19:04 +0530 Subject: [PATCH 19/40] fix: add CIS sysctl prereqs and improve play recipe - Add kubelet kernel parameters to galaxy playbook Play 2 pre_tasks (vm.overcommit_memory, vm.panic_on_oom, kernel.panic, kernel.panic_on_oops) Required by --protect-kernel-defaults per k3s CIS hardening guide - Remove galaxy_name from play-level vars (pass via -e, fail-safe assert) - Add *args passthrough to play recipe for extra ansible-playbook flags - Add per-run logging via tee to ansible/.ansible/logs/ --- ansible/play-k3s--galaxy.yml | 25 +++++++++++++++++++------ justfile | 14 ++++++++++---- 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/ansible/play-k3s--galaxy.yml b/ansible/play-k3s--galaxy.yml index d75e47218..6fa05ff53 100644 --- a/ansible/play-k3s--galaxy.yml +++ b/ansible/play-k3s--galaxy.yml @@ -29,11 +29,17 @@ gather_facts: true become: true vars: - galaxy_name: "{{ galaxy_name | default('gxy-management') }}" 
do_spaces_access_key: "{{ lookup('env', 'DO_SPACES_ACCESS_KEY') }}" do_spaces_secret_key: "{{ lookup('env', 'DO_SPACES_SECRET_KEY') }}" tasks: + - name: Validate galaxy_name is provided + assert: + that: + - galaxy_name is defined + - galaxy_name | length > 0 + fail_msg: "galaxy_name not set. Pass via: -e galaxy_name=gxy-management" + - name: Validate env vars loaded (cd into cluster dir for direnv) assert: that: @@ -86,10 +92,21 @@ gather_facts: true become: true vars: - galaxy_name: "{{ galaxy_name | default('gxy-management') }}" cluster_config_dir: "{{ playbook_dir }}/../k3s/{{ galaxy_name }}" pre_tasks: + - name: Set kubelet kernel parameters (required for --protect-kernel-defaults) + ansible.posix.sysctl: + name: "{{ item.key }}" + value: "{{ item.value }}" + sysctl_file: /etc/sysctl.d/90-kubelet.conf + reload: true + loop: + - { key: vm.overcommit_memory, value: "1" } + - { key: vm.panic_on_oom, value: "0" } + - { key: kernel.panic, value: "10" } + - { key: kernel.panic_on_oops, value: "1" } + - name: Ensure k3s config directory exists file: path: /etc/rancher/k3s @@ -119,7 +136,6 @@ gather_facts: true become: true vars: - galaxy_name: "{{ galaxy_name | default('gxy-management') }}" do_spaces_access_key: "{{ lookup('env', 'DO_SPACES_ACCESS_KEY') }}" do_spaces_secret_key: "{{ lookup('env', 'DO_SPACES_SECRET_KEY') }}" k3s_version: "v1.34.5+k3s1" @@ -184,7 +200,6 @@ gather_facts: false become: true vars: - galaxy_name: "{{ galaxy_name | default('gxy-management') }}" gateway_api_version: "v1.5.1" tasks: @@ -220,7 +235,6 @@ gather_facts: false become: true vars: - galaxy_name: "{{ galaxy_name | default('gxy-management') }}" cilium_cluster_id: 1 cluster_config_dir: "{{ playbook_dir }}/../k3s/{{ galaxy_name }}" roles: @@ -236,7 +250,6 @@ gather_facts: false become: true vars: - galaxy_name: "{{ galaxy_name | default('gxy-management') }}" cluster_config_dir: "{{ playbook_dir }}/../k3s/{{ galaxy_name }}" tasks: diff --git a/justfile b/justfile index 
fedbae0a7..389b53fce 100644 --- a/justfile +++ b/justfile @@ -153,11 +153,17 @@ k8s-validate version="1.32.0": # Ansible # --------------------------------------------------------------------------- -# Run any ansible playbook +# Run any ansible playbook (logs to ansible/.ansible/logs/) [group('ansible')] -play playbook host inv="digitalocean.yml": - cd ansible && uv run ansible-playbook -i inventory/{{inv}} play-{{playbook}}.yml \ - -e variable_host={{host}} +[positional-arguments] +play playbook host *args: + #!/usr/bin/env bash + set -eu + mkdir -p ansible/.ansible/logs + LOGFILE="$(pwd)/ansible/.ansible/logs/$(date +%Y%m%d-%H%M%S)-{{playbook}}.log" + cd ansible && uv run ansible-playbook -i inventory/digitalocean.yml play-{{playbook}}.yml \ + -e variable_host={{host}} {{args}} 2>&1 | tee "$LOGFILE" + echo "Log: $LOGFILE" # Install ansible dependencies [group('ansible')] From 7711fe66bcef1c2a53058dd0b9d36efff717832f Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Sun, 5 Apr 2026 01:48:07 +0530 Subject: [PATCH 20/40] refactor: extract galaxy playbook config to Ansible group_vars MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Create inventory/group_vars/gxy_mgmt_k3s.yml with all galaxy-specific config (CIDRs, k3s version, Cilium ID, etcd S3 bucket, Gateway API version) - Strip all hardcoded values from play-k3s--galaxy.yml — now a generic orchestrator - Add comprehensive assert block validating all required group_vars before execution - Fix service.env clobbering: replace copy with lineinfile to preserve K3S_TOKEN - Restore VPC IP range validation that was dropped during earlier refactors - Add cron quoting comment to prevent future regressions To add a new galaxy: create a group_vars file. No playbook editing needed. 
--- SPIKE-STATUS.md | 47 +++--- ansible/inventory/group_vars/gxy_mgmt_k3s.yml | 25 ++++ ansible/play-k3s--galaxy.yml | 139 ++++++++---------- 3 files changed, 115 insertions(+), 96 deletions(-) create mode 100644 ansible/inventory/group_vars/gxy_mgmt_k3s.yml diff --git a/SPIKE-STATUS.md b/SPIKE-STATUS.md index ddc253795..2515f8686 100644 --- a/SPIKE-STATUS.md +++ b/SPIKE-STATUS.md @@ -191,13 +191,13 @@ Deploy sequentially. Verify each before moving to the next. ### Phase A: Bootstrap Cluster -| # | Task | Status | Command / Notes | -| --- | ----------------------------- | ------ | ---------------------------------------------------------- | -| A1 | Populate Windmill secrets | TODO | Create `windmill.values.yaml.enc` (PG password, DB URL) | -| A2 | Populate Windmill app secrets | TODO | Create `windmill.secrets.env.enc` (admin email + password) | -| A3 | Run K3s galaxy playbook | TODO | `just play k3s--galaxy gxy_mgmt_k3s` (from cluster dir) | -| A4 | Verify cluster health | TODO | 3 nodes Ready, Cilium green, Traefik running, Gateway CRDs | -| A5 | Encrypt kubeconfig | TODO | sops encrypt to infra-secrets | +| # | Task | Status | Command / Notes | +| --- | ------------------------------- | ------ | -------------------------------------------------------------------- | +| A1 | Populate Windmill secrets | DONE | `windmill.values.yaml.enc` + `windmill.secrets.env.enc` | +| A2 | Refactor playbook to group_vars | BLOCK | Playbook has hardcoded values — needs proper Ansible design | +| A3 | Run K3s galaxy playbook | BLOCK | Failed: missing CIS sysctls (sysctl fix added, needs rerun after A2) | +| A4 | Verify cluster health | TODO | 3 nodes Ready, Cilium green, Traefik running, Gateway CRDs | +| A5 | Encrypt kubeconfig | TODO | sops encrypt to infra-secrets | ### Phase B: Windmill (Day 0 Deliverable) @@ -281,18 +281,29 @@ e72beb5 feat(k3s): add ops-mgmt cluster configs and tooling ## Errors and Fixes (for Future Reference) -| Issue | Root Cause | Fix | -| 
---------------------------------------------------- | ----------------------------------------- | --------------------------------------------------------- | -| cloud-init heredoc syntax error | runcmd `\|` strings don't support heredoc | Moved to write_files section | -| `systemctl restart sshd` fails on Ubuntu 24.04 | Service renamed to `ssh.service` | `ssh \|\| sshd \|\| true` fallback | -| SSH hardening sed had no effect | Ubuntu 24.04 ships commented defaults | Drop-in file at sshd_config.d/99-hardening.conf | -| sops `path_regex: .*\.enc$` didn't match input files | Regex matches input path, not output | Changed to `.*` (match all) | -| sops `dotenv` format failed on YAML file | ansible vars are YAML, not dotenv | Renamed to `.yaml.enc`, format detection in verify recipe | -| direnv `$(dirname "$0")` empty | Not available in direnv context | Use `expand_path ../infra-secrets` | +| Issue | Root Cause | Fix | +| ----------------------------------------------------- | ------------------------------------------------------------------------------ | --------------------------------------------------------- | +| cloud-init heredoc syntax error | runcmd `\|` strings don't support heredoc | Moved to write_files section | +| `systemctl restart sshd` fails on Ubuntu 24.04 | Service renamed to `ssh.service` | `ssh \|\| sshd \|\| true` fallback | +| SSH hardening sed had no effect | Ubuntu 24.04 ships commented defaults | Drop-in file at sshd_config.d/99-hardening.conf | +| sops `path_regex: .*\.enc$` didn't match input files | Regex matches input path, not output | Changed to `.*` (match all) | +| sops `dotenv` format failed on YAML file | ansible vars are YAML, not dotenv | Renamed to `.yaml.enc`, format detection in verify recipe | +| direnv `$(dirname "$0")` empty | Not available in direnv context | Use `expand_path ../infra-secrets` | +| k3s fails: `invalid kernel flag vm/overcommit_memory` | `--protect-kernel-defaults` requires CIS sysctls, prereq role doesn't set 
them | Add sysctl pre_task (90-kubelet.conf) before k3s starts | +| Ansible `galaxy_name` recursive template loop | `vars: galaxy_name: "{{ galaxy_name \| default(...) }}"` self-references | Remove from vars, pass via `-e`, add assert validation | +| Galaxy playbook hardcodes all environment values | CIDRs, versions, bucket names, Cilium ID in playbook not group_vars | **BLOCKING** — refactor to group_vars before re-run | ## Open Questions -- **Helm chart versions**: Need to verify latest stable for ArgoCD, Windmill, Zot before install +- **Playbook refactor**: Move all galaxy-specific config to `inventory/group_vars/.yml`. Playbook must be generic. +- **Helm chart versions**: Pin versions — Windmill 4.0.124, ArgoCD 9.4.17, Zot 0.1.104 - **Cloudflare Access policies**: Exact group/email configuration TBD -- **Windmill DB**: Using embedded SQLite or external PostgreSQL? (ADR-008 says CNPG later) -- **TLS for gxy-management apps**: Need to create Cloudflare origin cert for \*.freecodecamp.net +- **Windmill DB**: Embedded PostgreSQL for Day 0 (ADR-008: CNPG later) + +## Lessons Learned + +1. **Use tool primitives.** Ansible has group_vars, roles, templates — use them. Don't write shell scripts disguised as YAML. +2. **Research before coding.** The `--protect-kernel-defaults` failure was predictable from the k3s hardening guide. Read the docs first. +3. **The spike IS production.** No shortcuts, no "fix later." Every line of code must be production-quality. +4. **Aggressive review.** Every change gets independent hostile review before presenting. The review that caught 3 CRITICALs was correct — but more should have been caught at design time. +5. **Validate before suggesting.** Never tell the operator to "try it" without dry-run, syntax check, or verified expansion. 
diff --git a/ansible/inventory/group_vars/gxy_mgmt_k3s.yml b/ansible/inventory/group_vars/gxy_mgmt_k3s.yml new file mode 100644 index 000000000..0d6bcfb14 --- /dev/null +++ b/ansible/inventory/group_vars/gxy_mgmt_k3s.yml @@ -0,0 +1,25 @@ +--- +# gxy-management galaxy configuration +# Applied automatically when targeting the gxy_mgmt_k3s inventory group + +galaxy_name: gxy-management + +# k3s +k3s_version: v1.34.5+k3s1 + +# Networking (ADR-009: non-overlapping per galaxy) +cluster_cidr: "10.1.0.0/16" +service_cidr: "10.11.0.0/16" + +# Cilium CNI +cilium_cluster_id: 1 + +# Gateway API +gateway_api_version: v1.5.1 + +# etcd S3 backups (DO Spaces) +etcd_s3_endpoint: fra1.digitaloceanspaces.com +etcd_s3_bucket: net.freecodecamp.universe-backups +etcd_s3_region: fra1 +etcd_snapshot_schedule: "0 */6 * * *" +etcd_snapshot_retention: 20 diff --git a/ansible/play-k3s--galaxy.yml b/ansible/play-k3s--galaxy.yml index 6fa05ff53..6cbe94d0c 100644 --- a/ansible/play-k3s--galaxy.yml +++ b/ansible/play-k3s--galaxy.yml @@ -1,30 +1,20 @@ --- # Deploy k3s HA galaxy cluster with Cilium CNI # -# Provisions any Universe galaxy cluster. All nodes are control-plane (HA). -# Uses Cilium as CNI (flannel/kube-proxy disabled). Tailscale on nodes for -# SSH/kubectl access (NOT the K8s operator — ADR-009). +# Generic playbook for any Universe galaxy. All galaxy-specific config +# lives in inventory/group_vars/.yml — not in this file. # -# Prerequisites (manual, one-time per galaxy): -# - 3x Ubuntu VMs on DigitalOcean with VPC attached (eth1) -# - Tailscale installed and connected on all nodes (play-tailscale--*.yml) +# Prerequisites: +# - VMs provisioned with VPC (eth1) and Tailscale connected # - Env vars loaded via direnv (cd into cluster dir first) -# - Cluster config directory: k3s//cluster/ +# - Group vars populated for the target inventory group # # Usage: -# cd k3s/gxy-management # direnv loads DO_API_TOKEN, DO_SPACES_*, etc. 
-# just play k3s--galaxy gxy_mgmt_k3s -# -# What this playbook does (6 plays): -# 1. Validate prerequisites (VPC, Tailscale, vault secrets) -# 2. Prepare system (k3s prerequisites + security configs) -# 3. Deploy k3s server (Cilium flags, etcd S3 backups, security hardening) -# 4. Configure ingress (Traefik + Gateway API CRDs) -# 5. Install Cilium CNI -# 6. Fetch kubeconfig (copy to local machine, replace server IP with Tailscale IP) +# cd k3s/ +# just play k3s--galaxy # Play 1: Validate prerequisites -- name: "K3s {{ galaxy_name | default('galaxy') }} - Validate prerequisites" +- name: "K3s {{ galaxy_name | default('UNKNOWN') }} - Validate prerequisites" hosts: "{{ variable_host }}" gather_facts: true become: true @@ -33,19 +23,28 @@ do_spaces_secret_key: "{{ lookup('env', 'DO_SPACES_SECRET_KEY') }}" tasks: - - name: Validate galaxy_name is provided + - name: Validate required group_vars are set assert: that: - - galaxy_name is defined - - galaxy_name | length > 0 - fail_msg: "galaxy_name not set. Pass via: -e galaxy_name=gxy-management" + - galaxy_name is defined and galaxy_name | length > 0 + - k3s_version is defined and k3s_version | length > 0 + - cluster_cidr is defined and cluster_cidr | length > 0 + - service_cidr is defined and service_cidr | length > 0 + - cilium_cluster_id is defined and cilium_cluster_id | string | length > 0 + - gateway_api_version is defined and gateway_api_version | length > 0 + - etcd_s3_endpoint is defined and etcd_s3_endpoint | length > 0 + - etcd_s3_bucket is defined and etcd_s3_bucket | length > 0 + - etcd_s3_region is defined and etcd_s3_region | length > 0 + - etcd_snapshot_schedule is defined and etcd_snapshot_schedule | length > 0 + - etcd_snapshot_retention is defined + fail_msg: "Missing group_vars. 
Populate inventory/group_vars/{{ variable_host }}.yml" - - name: Validate env vars loaded (cd into cluster dir for direnv) + - name: Validate env vars loaded assert: that: - do_spaces_access_key | length > 0 - do_spaces_secret_key | length > 0 - fail_msg: "DO_SPACES_ACCESS_KEY/SECRET_KEY not set. cd into cluster dir first (direnv loads them)." + fail_msg: "DO_SPACES_ACCESS_KEY/SECRET_KEY not set. cd into cluster dir first." - name: Validate VPC interface exists (eth1) assert: @@ -53,7 +52,13 @@ - ansible_eth1 is defined - ansible_eth1.ipv4 is defined - ansible_eth1.ipv4.address is defined - fail_msg: "VPC interface eth1 not found. Ensure VM is attached to DO VPC." + fail_msg: "VPC interface eth1 not found." + + - name: Validate VPC IP is in expected range + assert: + that: + - ansible_eth1.ipv4.address | regex_search('^10\.') + fail_msg: "VPC IP {{ ansible_eth1.ipv4.address }} not in 10.x.x.x range." - name: Validate Tailscale is connected assert: @@ -61,33 +66,29 @@ - ansible_tailscale0 is defined - ansible_tailscale0.ipv4 is defined - ansible_tailscale0.ipv4.address is defined - fail_msg: "Tailscale interface not found. Ensure Tailscale is installed and connected." - - - name: Validate VPC IP is in expected range - assert: - that: - - ansible_eth1.ipv4.address | regex_search('^10\.') - fail_msg: "VPC IP {{ ansible_eth1.ipv4.address }} not in 10.x.x.x range." + fail_msg: "Tailscale not connected." 
- name: Set network facts set_fact: vpc_ip: "{{ ansible_eth1.ipv4.address }}" tailscale_ip: "{{ ansible_tailscale0.ipv4.address }}" - - name: Display network configuration + - name: Display configuration debug: - msg: "{{ inventory_hostname }}: VPC={{ vpc_ip }}, Tailscale={{ tailscale_ip }}" - - - name: Build k3s_cluster group - group_by: - key: k3s_cluster + msg: >- + {{ inventory_hostname }}: VPC={{ vpc_ip }}, Tailscale={{ tailscale_ip }}, + galaxy={{ galaxy_name }}, k3s={{ k3s_version }}, + pods={{ cluster_cidr }}, svc={{ service_cidr }} - - name: Build server group + - name: Build dynamic groups group_by: - key: server + key: "{{ item }}" + loop: + - k3s_cluster + - server # Play 2: System prerequisites -- name: "K3s {{ galaxy_name | default('galaxy') }} - System prerequisites" +- name: "K3s {{ galaxy_name | default('UNKNOWN') }} - System prerequisites" hosts: k3s_cluster gather_facts: true become: true @@ -95,7 +96,7 @@ cluster_config_dir: "{{ playbook_dir }}/../k3s/{{ galaxy_name }}" pre_tasks: - - name: Set kubelet kernel parameters (required for --protect-kernel-defaults) + - name: Set kubelet kernel parameters (CIS hardening) ansible.posix.sysctl: name: "{{ item.key }}" value: "{{ item.value }}" @@ -112,8 +113,6 @@ path: /etc/rancher/k3s state: directory mode: "0755" - owner: root - group: root - name: Copy PSS admission config copy: @@ -131,22 +130,13 @@ - role: k3s.orchestration.prereq # Play 3: Deploy k3s server -- name: "K3s {{ galaxy_name | default('galaxy') }} - Deploy k3s server" +- name: "K3s {{ galaxy_name | default('UNKNOWN') }} - Deploy k3s server" hosts: server gather_facts: true become: true vars: do_spaces_access_key: "{{ lookup('env', 'DO_SPACES_ACCESS_KEY') }}" do_spaces_secret_key: "{{ lookup('env', 'DO_SPACES_SECRET_KEY') }}" - k3s_version: "v1.34.5+k3s1" - cluster_cidr: "10.1.0.0/16" - service_cidr: "10.11.0.0/16" - etcd_s3_endpoint: "fra1.digitaloceanspaces.com" - etcd_s3_bucket: "net.freecodecamp.universe-backups" - etcd_s3_folder: 
"etcd/{{ galaxy_name }}" - etcd_s3_region: "fra1" - etcd_snapshot_schedule: "0 */6 * * *" - etcd_snapshot_retention: 20 api_endpoint: "{{ hostvars[groups['server'][0]]['vpc_ip'] }}" extra_server_args: >- --node-ip={{ hostvars[inventory_hostname]['vpc_ip'] }} @@ -166,41 +156,40 @@ --etcd-s3 --etcd-s3-endpoint={{ etcd_s3_endpoint }} --etcd-s3-bucket={{ etcd_s3_bucket }} - --etcd-s3-folder={{ etcd_s3_folder }} + --etcd-s3-folder=etcd/{{ galaxy_name }} --etcd-s3-region={{ etcd_s3_region }} + # Cron string MUST be double-quoted — systemd word-splits without them --etcd-snapshot-schedule-cron="{{ etcd_snapshot_schedule }}" --etcd-snapshot-retention={{ etcd_snapshot_retention }} server_group: server + pre_tasks: - - name: Create k3s audit log directory + - name: Create audit log directory file: path: /var/log/k3s state: directory mode: "0750" - owner: root - group: root - - name: Write k3s service environment (S3 credentials) - copy: - content: | - AWS_ACCESS_KEY_ID={{ do_spaces_access_key }} - AWS_SECRET_ACCESS_KEY={{ do_spaces_secret_key }} - dest: /etc/systemd/system/k3s.service.env + - name: Write S3 credentials to service environment + ansible.builtin.lineinfile: + path: /etc/systemd/system/k3s.service.env + regexp: "^{{ item.key }}=" + line: "{{ item.key }}={{ item.value }}" + create: true mode: "0600" - owner: root - group: root + loop: + - { key: AWS_ACCESS_KEY_ID, value: "{{ do_spaces_access_key }}" } + - { key: AWS_SECRET_ACCESS_KEY, value: "{{ do_spaces_secret_key }}" } no_log: true roles: - role: k3s.orchestration.k3s_server # Play 4: Configure ingress -- name: "K3s {{ galaxy_name | default('galaxy') }} - Configure ingress" +- name: "K3s {{ galaxy_name | default('UNKNOWN') }} - Configure ingress" hosts: server[0] gather_facts: false become: true - vars: - gateway_api_version: "v1.5.1" tasks: - name: Apply Traefik HelmChartConfig @@ -230,12 +219,11 @@ msg: "{{ cluster_status.stdout_lines }}" # Play 5: Install Cilium -- name: "K3s {{ galaxy_name | 
default('galaxy') }} - Install Cilium" +- name: "K3s {{ galaxy_name | default('UNKNOWN') }} - Install Cilium" hosts: server[0] gather_facts: false become: true vars: - cilium_cluster_id: 1 cluster_config_dir: "{{ playbook_dir }}/../k3s/{{ galaxy_name }}" roles: - role: cilium @@ -245,7 +233,7 @@ cilium_k8s_service_host: "{{ hostvars[groups['server'][0]]['vpc_ip'] }}" # Play 6: Fetch kubeconfig -- name: "K3s {{ galaxy_name | default('galaxy') }} - Fetch kubeconfig" +- name: "K3s {{ galaxy_name | default('UNKNOWN') }} - Fetch kubeconfig" hosts: server[0] gather_facts: false become: true @@ -280,9 +268,4 @@ msg: - "=== {{ galaxy_name }} cluster ready ===" - "Kubeconfig: k3s/{{ galaxy_name }}/.kubeconfig.yaml" - - "Nodes: {{ kubectl_result.stdout }}" - - "" - - "Next steps:" - - " 1. Verify Cilium: kubectl exec -n kube-system ds/cilium -c cilium-agent -- cilium status" - - " 2. Verify etcd snapshots: ssh root@ k3s etcd-snapshot list" - - " 3. Deploy apps: kubectl apply -k k3s/{{ galaxy_name }}/apps//manifests/base/" + - "{{ kubectl_result.stdout }}" From 5bfa405775583b2d0058457e86681c8e7749574f Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Sun, 5 Apr 2026 02:04:33 +0530 Subject: [PATCH 21/40] =?UTF-8?q?fix:=20reorder=20playbook=20=E2=80=94=20i?= =?UTF-8?q?nstall=20Cilium=20before=20waiting=20for=20nodes=20Ready?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Nodes stay NotReady without a CNI. With --flannel-backend=none, the wait must happen AFTER Cilium install, not before. Moved wait + status display from Play 4 to new Play 6 (after Cilium in Play 5). 
Play order: validate → prereqs → k3s server → traefik + CRDs → cilium → verify → kubeconfig --- ansible/play-k3s--galaxy.yml | 39 ++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/ansible/play-k3s--galaxy.yml b/ansible/play-k3s--galaxy.yml index 6cbe94d0c..2ee3ef5da 100644 --- a/ansible/play-k3s--galaxy.yml +++ b/ansible/play-k3s--galaxy.yml @@ -4,6 +4,8 @@ # Generic playbook for any Universe galaxy. All galaxy-specific config # lives in inventory/group_vars/.yml — not in this file. # +# Plays: validate → sysctl + prereqs → k3s server → traefik + gateway CRDs → cilium → verify → fetch kubeconfig +# # Prerequisites: # - VMs provisioned with VPC (eth1) and Tailscale connected # - Env vars loaded via direnv (cd into cluster dir first) @@ -205,20 +207,7 @@ register: gateway_result changed_when: "'created' in gateway_result.stdout or 'configured' in gateway_result.stdout" - - name: Wait for all nodes ready - command: k3s kubectl wait --for=condition=Ready nodes --all --timeout=300s - changed_when: false - - - name: Display cluster status - command: k3s kubectl get nodes -o wide - register: cluster_status - changed_when: false - - - name: Cluster ready - debug: - msg: "{{ cluster_status.stdout_lines }}" - -# Play 5: Install Cilium +# Play 5: Install Cilium (must run before nodes can be Ready) - name: "K3s {{ galaxy_name | default('UNKNOWN') }} - Install Cilium" hosts: server[0] gather_facts: false @@ -232,7 +221,27 @@ cilium_values_file: "{{ cluster_config_dir }}/cluster/cilium/values.yaml" cilium_k8s_service_host: "{{ hostvars[groups['server'][0]]['vpc_ip'] }}" -# Play 6: Fetch kubeconfig +# Play 6: Wait for cluster ready (after Cilium provides the CNI) +- name: "K3s {{ galaxy_name | default('UNKNOWN') }} - Verify cluster" + hosts: server[0] + gather_facts: false + become: true + + tasks: + - name: Wait for all nodes ready + command: k3s kubectl wait --for=condition=Ready nodes --all --timeout=300s + changed_when: false 
+ + - name: Display cluster status + command: k3s kubectl get nodes -o wide + register: cluster_status + changed_when: false + + - name: Cluster ready + debug: + msg: "{{ cluster_status.stdout_lines }}" + +# Play 7: Fetch kubeconfig - name: "K3s {{ galaxy_name | default('UNKNOWN') }} - Fetch kubeconfig" hosts: server[0] gather_facts: false From edf6521efac6fe7acf7b5ad64a2f95a759768761 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Sun, 5 Apr 2026 02:08:30 +0530 Subject: [PATCH 22/40] fix: use variable_host in play names (available at parse time) galaxy_name from group_vars isn't resolved when Ansible parses play names, causing "UNKNOWN" in output. variable_host is passed via -e and available at parse time. --- ansible/play-k3s--galaxy.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ansible/play-k3s--galaxy.yml b/ansible/play-k3s--galaxy.yml index 2ee3ef5da..4840c95c5 100644 --- a/ansible/play-k3s--galaxy.yml +++ b/ansible/play-k3s--galaxy.yml @@ -16,7 +16,7 @@ # just play k3s--galaxy # Play 1: Validate prerequisites -- name: "K3s {{ galaxy_name | default('UNKNOWN') }} - Validate prerequisites" +- name: "K3s {{ variable_host }} - Validate prerequisites" hosts: "{{ variable_host }}" gather_facts: true become: true @@ -90,7 +90,7 @@ - server # Play 2: System prerequisites -- name: "K3s {{ galaxy_name | default('UNKNOWN') }} - System prerequisites" +- name: "K3s {{ variable_host }} - System prerequisites" hosts: k3s_cluster gather_facts: true become: true @@ -132,7 +132,7 @@ - role: k3s.orchestration.prereq # Play 3: Deploy k3s server -- name: "K3s {{ galaxy_name | default('UNKNOWN') }} - Deploy k3s server" +- name: "K3s {{ variable_host }} - Deploy k3s server" hosts: server gather_facts: true become: true @@ -188,7 +188,7 @@ - role: k3s.orchestration.k3s_server # Play 4: Configure ingress -- name: "K3s {{ galaxy_name | default('UNKNOWN') }} - Configure ingress" +- name: "K3s {{ variable_host }} - Configure ingress" hosts: 
server[0] gather_facts: false become: true @@ -208,7 +208,7 @@ changed_when: "'created' in gateway_result.stdout or 'configured' in gateway_result.stdout" # Play 5: Install Cilium (must run before nodes can be Ready) -- name: "K3s {{ galaxy_name | default('UNKNOWN') }} - Install Cilium" +- name: "K3s {{ variable_host }} - Install Cilium" hosts: server[0] gather_facts: false become: true @@ -222,7 +222,7 @@ cilium_k8s_service_host: "{{ hostvars[groups['server'][0]]['vpc_ip'] }}" # Play 6: Wait for cluster ready (after Cilium provides the CNI) -- name: "K3s {{ galaxy_name | default('UNKNOWN') }} - Verify cluster" +- name: "K3s {{ variable_host }} - Verify cluster" hosts: server[0] gather_facts: false become: true @@ -242,7 +242,7 @@ msg: "{{ cluster_status.stdout_lines }}" # Play 7: Fetch kubeconfig -- name: "K3s {{ galaxy_name | default('UNKNOWN') }} - Fetch kubeconfig" +- name: "K3s {{ variable_host }} - Fetch kubeconfig" hosts: server[0] gather_facts: false become: true From e9be94dffb9bf85cda14a29fb72b03ee23005110 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Sun, 5 Apr 2026 02:10:35 +0530 Subject: [PATCH 23/40] fix: set KUBECONFIG for Helm and kubectl in cilium role kubernetes.core.helm defaults to localhost:8080 without KUBECONFIG. k3s places the kubeconfig at /etc/rancher/k3s/k3s.yaml. Added environment block to all helm and kubectl tasks in the cilium role. 
--- ansible/roles/cilium/tasks/main.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ansible/roles/cilium/tasks/main.yml b/ansible/roles/cilium/tasks/main.yml index 38775a0f7..52ba91b8b 100644 --- a/ansible/roles/cilium/tasks/main.yml +++ b/ansible/roles/cilium/tasks/main.yml @@ -26,6 +26,8 @@ kubernetes.core.helm_repository: name: cilium repo_url: https://helm.cilium.io/ + environment: + KUBECONFIG: /etc/rancher/k3s/k3s.yaml - name: Install Cilium kubernetes.core.helm: @@ -41,6 +43,8 @@ - value: "k8sServicePort={{ cilium_k8s_service_port }}" wait: true timeout: "5m0s" + environment: + KUBECONFIG: /etc/rancher/k3s/k3s.yaml - name: Clean up Cilium values file ansible.builtin.file: @@ -50,6 +54,8 @@ - name: Wait for Cilium agent DaemonSet rollout ansible.builtin.command: cmd: kubectl -n kube-system rollout status daemonset/cilium --timeout=180s + environment: + KUBECONFIG: /etc/rancher/k3s/k3s.yaml changed_when: false - name: Wait for Cilium operator Deployment rollout @@ -57,6 +63,8 @@ cmd: >- kubectl -n kube-system rollout status deployment/cilium-operator --timeout=180s + environment: + KUBECONFIG: /etc/rancher/k3s/k3s.yaml changed_when: false - name: Verify Cilium status @@ -64,6 +72,8 @@ cmd: >- kubectl -n kube-system exec ds/cilium -c cilium-agent -- cilium status --brief + environment: + KUBECONFIG: /etc/rancher/k3s/k3s.yaml register: cilium_status changed_when: false From 270a61419246e2202a941bb25e7c3e6288e8eef2 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Sun, 5 Apr 2026 02:22:55 +0530 Subject: [PATCH 24/40] =?UTF-8?q?fix:=20address=20review=20findings=20?= =?UTF-8?q?=E2=80=94=20etcd=20backups,=20Cilium=20idempotency,=20security?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Critical: - Remove YAML comment inside >- folded scalar that leaked into k3s ExecStart (etcd snapshot schedule and retention were silently disabled) - Add kubernetes.core to requirements.yml (Cilium role dependency) 
Warning: - Set hubble.tls.auto.method=cronJob to prevent cert regen on every re-run - Pin Helm version (v3.17.3) in cilium role install task - Add no_log to kubeconfig slurp and copy tasks (admin creds in output) - Fix Gateway CRD changed_when to only match 'created' (not 'configured') - Move cilium values to /etc/rancher/k3s/ instead of /tmp Requires re-run to fix etcd snapshot configuration on live nodes. --- ansible/play-k3s--galaxy.yml | 5 +++-- ansible/requirements.yml | 2 ++ ansible/roles/cilium/defaults/main.yml | 1 + ansible/roles/cilium/tasks/main.yml | 8 ++++---- k3s/gxy-management/cluster/cilium/values.yaml | 3 +++ 5 files changed, 13 insertions(+), 6 deletions(-) diff --git a/ansible/play-k3s--galaxy.yml b/ansible/play-k3s--galaxy.yml index 4840c95c5..b092480d4 100644 --- a/ansible/play-k3s--galaxy.yml +++ b/ansible/play-k3s--galaxy.yml @@ -160,7 +160,6 @@ --etcd-s3-bucket={{ etcd_s3_bucket }} --etcd-s3-folder=etcd/{{ galaxy_name }} --etcd-s3-region={{ etcd_s3_region }} - # Cron string MUST be double-quoted — systemd word-splits without them --etcd-snapshot-schedule-cron="{{ etcd_snapshot_schedule }}" --etcd-snapshot-retention={{ etcd_snapshot_retention }} server_group: server @@ -205,7 +204,7 @@ k3s kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/download/{{ gateway_api_version }}/standard-install.yaml register: gateway_result - changed_when: "'created' in gateway_result.stdout or 'configured' in gateway_result.stdout" + changed_when: "'created' in gateway_result.stdout" # Play 5: Install Cilium (must run before nodes can be Ready) - name: "K3s {{ variable_host }} - Install Cilium" @@ -254,6 +253,7 @@ slurp: src: /etc/rancher/k3s/k3s.yaml register: kubeconfig_raw + no_log: true - name: Write kubeconfig locally (replace server IP with Tailscale IP) copy: @@ -261,6 +261,7 @@ dest: "{{ cluster_config_dir }}/.kubeconfig.yaml" mode: "0600" delegate_to: localhost + no_log: true become: false - name: Verify kubectl connectivity diff 
--git a/ansible/requirements.yml b/ansible/requirements.yml index 89788f790..c5970c72e 100644 --- a/ansible/requirements.yml +++ b/ansible/requirements.yml @@ -8,6 +8,8 @@ collections: version: ">=1.27.0,<2.0.0" - name: ansible.posix version: ">=2.0.0,<3.0.0" + - name: kubernetes.core + version: ">=5.0.0,<7.0.0" - name: grafana.grafana version: ">=5.7.0,<6.0.0" - name: https://github.com/k3s-io/k3s-ansible.git diff --git a/ansible/roles/cilium/defaults/main.yml b/ansible/roles/cilium/defaults/main.yml index 6f07551d9..f63c3b65c 100644 --- a/ansible/roles/cilium/defaults/main.yml +++ b/ansible/roles/cilium/defaults/main.yml @@ -1,5 +1,6 @@ --- cilium_version: "1.19.2" +helm_version: "v3.17.3" cilium_cluster_name: "" cilium_cluster_id: "" cilium_values_file: "" diff --git a/ansible/roles/cilium/tasks/main.yml b/ansible/roles/cilium/tasks/main.yml index 52ba91b8b..8a9fce309 100644 --- a/ansible/roles/cilium/tasks/main.yml +++ b/ansible/roles/cilium/tasks/main.yml @@ -13,12 +13,12 @@ - name: Copy Cilium values file to server ansible.builtin.copy: src: "{{ cilium_values_file }}" - dest: /tmp/cilium-values.yaml + dest: /etc/rancher/k3s/cilium-values.yaml mode: "0600" - name: Install Helm ansible.builtin.shell: - cmd: set -o pipefail && curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash + cmd: set -o pipefail && curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | DESIRED_VERSION={{ helm_version }} bash executable: /bin/bash creates: /usr/local/bin/helm @@ -37,7 +37,7 @@ release_namespace: kube-system update_repo_cache: true values_files: - - /tmp/cilium-values.yaml + - /etc/rancher/k3s/cilium-values.yaml set_values: - value: "k8sServiceHost={{ cilium_k8s_service_host }}" - value: "k8sServicePort={{ cilium_k8s_service_port }}" @@ -48,7 +48,7 @@ - name: Clean up Cilium values file ansible.builtin.file: - path: /tmp/cilium-values.yaml + path: /etc/rancher/k3s/cilium-values.yaml state: absent - name: Wait for Cilium agent 
DaemonSet rollout diff --git a/k3s/gxy-management/cluster/cilium/values.yaml b/k3s/gxy-management/cluster/cilium/values.yaml index 0ca4cdf32..9b66764fd 100644 --- a/k3s/gxy-management/cluster/cilium/values.yaml +++ b/k3s/gxy-management/cluster/cilium/values.yaml @@ -18,6 +18,9 @@ operator: hubble: enabled: true + tls: + auto: + method: cronJob relay: enabled: true ui: From e855771b7b0badcfef68e67230f515690c1ae90a Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Sun, 5 Apr 2026 02:25:23 +0530 Subject: [PATCH 25/40] fix: add KUBECONFIG to gxy-management .envrc direnv now sets KUBECONFIG automatically when you cd into the cluster dir. Uses expand_path to resolve the absolute path to .kubeconfig.yaml. --- k3s/gxy-management/.envrc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/k3s/gxy-management/.envrc b/k3s/gxy-management/.envrc index 08bb22dfc..da3847029 100644 --- a/k3s/gxy-management/.envrc +++ b/k3s/gxy-management/.envrc @@ -4,4 +4,6 @@ if [ -d "$SECRETS_DIR" ]; then use_sops "$SECRETS_DIR/do-universe/.env.enc" fi +export KUBECONFIG="$(expand_path .kubeconfig.yaml)" + dotenv_if_exists .env From deed8d4ba52f21e5ef5e5da78c3123b9c22c9914 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Sun, 5 Apr 2026 02:27:18 +0530 Subject: [PATCH 26/40] =?UTF-8?q?fix:=20remove=20manual=20Gateway=20API=20?= =?UTF-8?q?CRD=20install=20=E2=80=94=20conflicts=20with=20Traefik=20CRD=20?= =?UTF-8?q?chart?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Traefik's bundled traefik-crd Helm chart includes Gateway API CRDs. Manual kubectl apply creates CRDs without Helm ownership labels, causing traefik-crd install to CrashLoopBackOff with "invalid ownership metadata". Removed the manual install task and gateway_api_version variable. 
Existing CRDs must be deleted manually for Traefik to adopt them: kubectl delete crds -l gateway.networking.k8s.io/bundle-version --- ansible/inventory/group_vars/gxy_mgmt_k3s.yml | 3 --- ansible/play-k3s--galaxy.yml | 10 ++-------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/ansible/inventory/group_vars/gxy_mgmt_k3s.yml b/ansible/inventory/group_vars/gxy_mgmt_k3s.yml index 0d6bcfb14..9f402721a 100644 --- a/ansible/inventory/group_vars/gxy_mgmt_k3s.yml +++ b/ansible/inventory/group_vars/gxy_mgmt_k3s.yml @@ -14,9 +14,6 @@ service_cidr: "10.11.0.0/16" # Cilium CNI cilium_cluster_id: 1 -# Gateway API -gateway_api_version: v1.5.1 - # etcd S3 backups (DO Spaces) etcd_s3_endpoint: fra1.digitaloceanspaces.com etcd_s3_bucket: net.freecodecamp.universe-backups diff --git a/ansible/play-k3s--galaxy.yml b/ansible/play-k3s--galaxy.yml index b092480d4..bc15ae166 100644 --- a/ansible/play-k3s--galaxy.yml +++ b/ansible/play-k3s--galaxy.yml @@ -33,7 +33,6 @@ - cluster_cidr is defined and cluster_cidr | length > 0 - service_cidr is defined and service_cidr | length > 0 - cilium_cluster_id is defined and cilium_cluster_id | string | length > 0 - - gateway_api_version is defined and gateway_api_version | length > 0 - etcd_s3_endpoint is defined and etcd_s3_endpoint | length > 0 - etcd_s3_bucket is defined and etcd_s3_bucket | length > 0 - etcd_s3_region is defined and etcd_s3_region | length > 0 @@ -198,13 +197,8 @@ src: "{{ playbook_dir }}/../k3s/{{ galaxy_name }}/cluster/traefik-config.yaml" dest: /var/lib/rancher/k3s/server/manifests/traefik-config.yaml mode: "0600" - - - name: Install Gateway API CRDs - command: > - k3s kubectl apply -f - https://github.com/kubernetes-sigs/gateway-api/releases/download/{{ gateway_api_version }}/standard-install.yaml - register: gateway_result - changed_when: "'created' in gateway_result.stdout" + # Gateway API CRDs are bundled in the Traefik CRD Helm chart (traefik-crd). 
+ # Do NOT install them manually — conflicts with Helm ownership labels. # Play 5: Install Cilium (must run before nodes can be Ready) - name: "K3s {{ variable_host }} - Install Cilium" From c0f2264e92c7bb51205b34d514512e29ab1c446b Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Sun, 5 Apr 2026 10:09:48 +0530 Subject: [PATCH 27/40] feat: add k3s cluster reset playbook Wraps the k3s-uninstall.sh script for all nodes in an inventory group. Removes: k3s, Cilium, etcd data, Helm, service env, audit logs, kubeconfig. Preserves: Tailscale, cloud-init hardening, DO infrastructure, CIS sysctls. Usage: just play k3s--reset gxy_mgmt_k3s --- ansible/play-k3s--reset.yml | 78 +++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 ansible/play-k3s--reset.yml diff --git a/ansible/play-k3s--reset.yml b/ansible/play-k3s--reset.yml new file mode 100644 index 000000000..6ebb9f7f8 --- /dev/null +++ b/ansible/play-k3s--reset.yml @@ -0,0 +1,78 @@ +--- +# Reset k3s cluster — removes k3s, Cilium, etcd data, all k8s state. +# Preserves: Tailscale, cloud-init hardening, DO infrastructure, CIS sysctls. 
+# +# Usage: +# cd k3s/ +# just play k3s--reset + +- name: "K3s {{ variable_host }} - Build groups for reset" + hosts: "{{ variable_host }}" + gather_facts: false + tasks: + - name: Build k3s_cluster group + group_by: + key: k3s_cluster + + - name: Build server group + group_by: + key: server + +- name: "K3s {{ variable_host }} - Reset cluster" + hosts: k3s_cluster + become: true + tasks: + - name: Run k3s uninstall script (server) + when: "'server' in group_names" + ansible.builtin.command: + cmd: k3s-uninstall.sh + removes: /var/lib/rancher/k3s/* + + - name: Run k3s uninstall script (agent) + when: "'agent' in group_names" + ansible.builtin.command: + cmd: k3s-agent-uninstall.sh + removes: /var/lib/rancher/k3s/* + + - name: Remove user kubeconfig + ansible.builtin.file: + path: "~{{ ansible_user }}/.kube/config" + state: absent + + - name: Remove k3s install script + ansible.builtin.file: + path: /usr/local/bin/k3s-install.sh + state: absent + + - name: Remove k3s config directory + ansible.builtin.file: + path: /etc/rancher/k3s + state: absent + + - name: Remove k3s service environment + ansible.builtin.file: + path: /etc/systemd/system/k3s.service.env + state: absent + + - name: Remove Helm binary + ansible.builtin.file: + path: /usr/local/bin/helm + state: absent + + - name: Remove k3s audit logs + ansible.builtin.file: + path: /var/log/k3s + state: absent + +- name: "K3s {{ variable_host }} - Clean up local kubeconfig" + hosts: "{{ variable_host }}" + gather_facts: false + connection: local + become: false + run_once: true + tasks: + - name: Remove local kubeconfig + ansible.builtin.file: + path: "{{ playbook_dir }}/../k3s/{{ galaxy_name }}/.kubeconfig.yaml" + state: absent + when: galaxy_name is defined From e1d2016d3ddb3432215302db4c4d5578b6e57058 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Sun, 5 Apr 2026 10:12:17 +0530 Subject: [PATCH 28/40] refactor: move k3s flags from extra_server_args to server_config_yaml MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Static cluster config (CNI, CIDRs, hardening, etcd S3) now lives in group_vars as server_config_yaml — written to /etc/rancher/k3s/config.yaml by the k3s-ansible role. Structured YAML, no folded scalar bugs. extra_server_args retains only per-node flags (node-ip, advertise-address, tls-san) that vary by host. Aligns with k3s hardening guide documented format. Added audit log rotation flags (maxage=30, maxbackup=10, maxsize=100) per CIS recommendations. --- ansible/inventory/group_vars/gxy_mgmt_k3s.yml | 41 ++++++++++++++----- ansible/play-k3s--galaxy.yml | 25 +---------- 2 files changed, 32 insertions(+), 34 deletions(-) diff --git a/ansible/inventory/group_vars/gxy_mgmt_k3s.yml b/ansible/inventory/group_vars/gxy_mgmt_k3s.yml index 9f402721a..7a3ea2c62 100644 --- a/ansible/inventory/group_vars/gxy_mgmt_k3s.yml +++ b/ansible/inventory/group_vars/gxy_mgmt_k3s.yml @@ -7,16 +7,37 @@ galaxy_name: gxy-management # k3s k3s_version: v1.34.5+k3s1 -# Networking (ADR-009: non-overlapping per galaxy) -cluster_cidr: "10.1.0.0/16" -service_cidr: "10.11.0.0/16" - # Cilium CNI cilium_cluster_id: 1 -# etcd S3 backups (DO Spaces) -etcd_s3_endpoint: fra1.digitaloceanspaces.com -etcd_s3_bucket: net.freecodecamp.universe-backups -etcd_s3_region: fra1 -etcd_snapshot_schedule: "0 */6 * * *" -etcd_snapshot_retention: 20 +# k3s config.yaml — written to /etc/rancher/k3s/config.yaml +# Keys are hyphenated (matching CLI flags). 
Per k3s docs: +# https://docs.k3s.io/installation/configuration +# https://docs.k3s.io/security/hardening-guide +server_config_yaml: | + # Networking (ADR-009) + flannel-backend: "none" + disable-network-policy: true + disable-kube-proxy: true + cluster-cidr: "10.1.0.0/16" + service-cidr: "10.11.0.0/16" + + # Security (CIS hardening) + protect-kernel-defaults: true + secrets-encryption: true + kube-apiserver-arg: + - "admission-control-config-file=/etc/rancher/k3s/pss-admission.yaml" + - "audit-log-path=/var/log/k3s/audit.log" + - "audit-policy-file=/etc/rancher/k3s/audit-policy.yaml" + - "audit-log-maxage=30" + - "audit-log-maxbackup=10" + - "audit-log-maxsize=100" + + # etcd S3 backups (DO Spaces FRA1) + etcd-s3: true + etcd-s3-endpoint: "fra1.digitaloceanspaces.com" + etcd-s3-bucket: "net.freecodecamp.universe-backups" + etcd-s3-folder: "etcd/gxy-management" + etcd-s3-region: "fra1" + etcd-snapshot-schedule-cron: "0 */6 * * *" + etcd-snapshot-retention: 20 diff --git a/ansible/play-k3s--galaxy.yml b/ansible/play-k3s--galaxy.yml index bc15ae166..488b4b272 100644 --- a/ansible/play-k3s--galaxy.yml +++ b/ansible/play-k3s--galaxy.yml @@ -30,14 +30,8 @@ that: - galaxy_name is defined and galaxy_name | length > 0 - k3s_version is defined and k3s_version | length > 0 - - cluster_cidr is defined and cluster_cidr | length > 0 - - service_cidr is defined and service_cidr | length > 0 + - server_config_yaml is defined and server_config_yaml | length > 0 - cilium_cluster_id is defined and cilium_cluster_id | string | length > 0 - - etcd_s3_endpoint is defined and etcd_s3_endpoint | length > 0 - - etcd_s3_bucket is defined and etcd_s3_bucket | length > 0 - - etcd_s3_region is defined and etcd_s3_region | length > 0 - - etcd_snapshot_schedule is defined and etcd_snapshot_schedule | length > 0 - - etcd_snapshot_retention is defined fail_msg: "Missing group_vars. 
Populate inventory/group_vars/{{ variable_host }}.yml" - name: Validate env vars loaded @@ -144,23 +138,6 @@ --advertise-address={{ hostvars[inventory_hostname]['vpc_ip'] }} --tls-san={{ hostvars[inventory_hostname]['vpc_ip'] }} --tls-san={{ hostvars[inventory_hostname]['tailscale_ip'] }} - --flannel-backend=none - --disable-network-policy - --disable-kube-proxy - --secrets-encryption - --protect-kernel-defaults - --cluster-cidr={{ cluster_cidr }} - --service-cidr={{ service_cidr }} - --kube-apiserver-arg=admission-control-config-file=/etc/rancher/k3s/pss-admission.yaml - --kube-apiserver-arg=audit-log-path=/var/log/k3s/audit.log - --kube-apiserver-arg=audit-policy-file=/etc/rancher/k3s/audit-policy.yaml - --etcd-s3 - --etcd-s3-endpoint={{ etcd_s3_endpoint }} - --etcd-s3-bucket={{ etcd_s3_bucket }} - --etcd-s3-folder=etcd/{{ galaxy_name }} - --etcd-s3-region={{ etcd_s3_region }} - --etcd-snapshot-schedule-cron="{{ etcd_snapshot_schedule }}" - --etcd-snapshot-retention={{ etcd_snapshot_retention }} server_group: server pre_tasks: From 7ac23bd3928015f5f07e3111bf7d8d179507bdaa Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Sun, 5 Apr 2026 10:31:18 +0530 Subject: [PATCH 29/40] fix: address adversarial review findings (W1-W4) - Cilium role: override cluster.name/id via set_values from group_vars - Cilium role: pin Helm install script to release tag, not main branch - Reset playbook: clean up Cilium BPF state in /sys/fs/bpf/cilium - Galaxy playbook: move Traefik config to Play 2 (before k3s starts) - Galaxy playbook: collapse from 7 plays to 6 - Group vars: add tls-san exclusion comment - Galaxy playbook: remove undefined vars from debug output --- ansible/inventory/group_vars/gxy_mgmt_k3s.yml | 1 + ansible/play-k3s--galaxy.yml | 39 +++++++++---------- ansible/play-k3s--reset.yml | 5 +++ ansible/roles/cilium/tasks/main.yml | 4 +- 4 files changed, 27 insertions(+), 22 deletions(-) diff --git a/ansible/inventory/group_vars/gxy_mgmt_k3s.yml 
b/ansible/inventory/group_vars/gxy_mgmt_k3s.yml index 7a3ea2c62..3530428b5 100644 --- a/ansible/inventory/group_vars/gxy_mgmt_k3s.yml +++ b/ansible/inventory/group_vars/gxy_mgmt_k3s.yml @@ -15,6 +15,7 @@ cilium_cluster_id: 1 # https://docs.k3s.io/installation/configuration # https://docs.k3s.io/security/hardening-guide server_config_yaml: | + # Do NOT add tls-san here — managed by the playbook's extra_server_args (per-node) # Networking (ADR-009) flannel-backend: "none" disable-network-policy: true diff --git a/ansible/play-k3s--galaxy.yml b/ansible/play-k3s--galaxy.yml index 488b4b272..f5ae510f3 100644 --- a/ansible/play-k3s--galaxy.yml +++ b/ansible/play-k3s--galaxy.yml @@ -4,7 +4,7 @@ # Generic playbook for any Universe galaxy. All galaxy-specific config # lives in inventory/group_vars/.yml — not in this file. # -# Plays: validate → sysctl + prereqs → k3s server → traefik + gateway CRDs → cilium → verify → fetch kubeconfig +# Plays: validate → sysctl + prereqs + traefik config → k3s server → cilium → verify → fetch kubeconfig # # Prerequisites: # - VMs provisioned with VPC (eth1) and Tailscale connected @@ -72,8 +72,7 @@ debug: msg: >- {{ inventory_hostname }}: VPC={{ vpc_ip }}, Tailscale={{ tailscale_ip }}, - galaxy={{ galaxy_name }}, k3s={{ k3s_version }}, - pods={{ cluster_cidr }}, svc={{ service_cidr }} + galaxy={{ galaxy_name }}, k3s={{ k3s_version }} - name: Build dynamic groups group_by: @@ -121,6 +120,18 @@ dest: /etc/rancher/k3s/audit-policy.yaml mode: "0600" + - name: Ensure k3s manifests directory exists + file: + path: /var/lib/rancher/k3s/server/manifests + state: directory + mode: "0755" + + - name: Copy Traefik HelmChartConfig (before k3s starts) + copy: + src: "{{ cluster_config_dir }}/cluster/traefik-config.yaml" + dest: /var/lib/rancher/k3s/server/manifests/traefik-config.yaml + mode: "0600" + roles: - role: k3s.orchestration.prereq @@ -162,22 +173,8 @@ roles: - role: k3s.orchestration.k3s_server -# Play 4: Configure ingress -- name: "K3s {{ 
variable_host }} - Configure ingress" - hosts: server[0] - gather_facts: false - become: true - - tasks: - - name: Apply Traefik HelmChartConfig - copy: - src: "{{ playbook_dir }}/../k3s/{{ galaxy_name }}/cluster/traefik-config.yaml" - dest: /var/lib/rancher/k3s/server/manifests/traefik-config.yaml - mode: "0600" - # Gateway API CRDs are bundled in the Traefik CRD Helm chart (traefik-crd). - # Do NOT install them manually — conflicts with Helm ownership labels. - -# Play 5: Install Cilium (must run before nodes can be Ready) +# Play 4: Install Cilium (must run before nodes can be Ready) +# Gateway API CRDs are bundled in the Traefik CRD Helm chart — do NOT install manually. - name: "K3s {{ variable_host }} - Install Cilium" hosts: server[0] gather_facts: false @@ -191,7 +188,7 @@ cilium_values_file: "{{ cluster_config_dir }}/cluster/cilium/values.yaml" cilium_k8s_service_host: "{{ hostvars[groups['server'][0]]['vpc_ip'] }}" -# Play 6: Wait for cluster ready (after Cilium provides the CNI) +# Play 5: Wait for cluster ready (after Cilium provides the CNI) - name: "K3s {{ variable_host }} - Verify cluster" hosts: server[0] gather_facts: false @@ -211,7 +208,7 @@ debug: msg: "{{ cluster_status.stdout_lines }}" -# Play 7: Fetch kubeconfig +# Play 6: Fetch kubeconfig - name: "K3s {{ variable_host }} - Fetch kubeconfig" hosts: server[0] gather_facts: false diff --git a/ansible/play-k3s--reset.yml b/ansible/play-k3s--reset.yml index 6ebb9f7f8..192b7229f 100644 --- a/ansible/play-k3s--reset.yml +++ b/ansible/play-k3s--reset.yml @@ -54,6 +54,11 @@ path: /etc/systemd/system/k3s.service.env state: absent + - name: Remove Cilium BPF state + ansible.builtin.command: + cmd: rm -rf /sys/fs/bpf/cilium + removes: /sys/fs/bpf/cilium + - name: Remove Helm binary ansible.builtin.file: path: /usr/local/bin/helm diff --git a/ansible/roles/cilium/tasks/main.yml b/ansible/roles/cilium/tasks/main.yml index 8a9fce309..2cc56d7b9 100644 --- a/ansible/roles/cilium/tasks/main.yml +++ 
b/ansible/roles/cilium/tasks/main.yml @@ -18,7 +18,7 @@ - name: Install Helm ansible.builtin.shell: - cmd: set -o pipefail && curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | DESIRED_VERSION={{ helm_version }} bash + cmd: set -o pipefail && curl https://raw.githubusercontent.com/helm/helm/{{ helm_version }}/scripts/get-helm-3 | DESIRED_VERSION={{ helm_version }} bash executable: /bin/bash creates: /usr/local/bin/helm @@ -39,6 +39,8 @@ values_files: - /etc/rancher/k3s/cilium-values.yaml set_values: + - value: "cluster.name={{ cilium_cluster_name }}" + - value: "cluster.id={{ cilium_cluster_id }}" - value: "k8sServiceHost={{ cilium_k8s_service_host }}" - value: "k8sServicePort={{ cilium_k8s_service_port }}" wait: true From 3d3179f9626f333fcea8920429eeeb14eb16a159 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Sun, 5 Apr 2026 10:57:27 +0530 Subject: [PATCH 30/40] fix: reset local cleanup delegation + kubeconfig context naming - Reset playbook: use delegate_to localhost instead of connection: local (connection: local with remote hosts still uses remote Python interpreter) - Galaxy playbook: rename kubeconfig context/cluster/user from 'default' to galaxy_name so OMP and kubectl context show the actual cluster name --- ansible/play-k3s--galaxy.yml | 10 ++++++++-- ansible/play-k3s--reset.yml | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/ansible/play-k3s--galaxy.yml b/ansible/play-k3s--galaxy.yml index f5ae510f3..313298415 100644 --- a/ansible/play-k3s--galaxy.yml +++ b/ansible/play-k3s--galaxy.yml @@ -223,9 +223,15 @@ register: kubeconfig_raw no_log: true - - name: Write kubeconfig locally (replace server IP with Tailscale IP) + - name: Write kubeconfig locally (replace server IP and context name) copy: - content: "{{ kubeconfig_raw.content | b64decode | regex_replace('127\\.0\\.0\\.1', hostvars[inventory_hostname]['tailscale_ip']) }}" + content: >- + {{ kubeconfig_raw.content | b64decode + | 
regex_replace('127\\.0\\.0\\.1', hostvars[inventory_hostname]['tailscale_ip']) + | regex_replace('name: default', 'name: ' + galaxy_name) + | regex_replace('cluster: default', 'cluster: ' + galaxy_name) + | regex_replace('user: default', 'user: ' + galaxy_name) + | regex_replace('current-context: default', 'current-context: ' + galaxy_name) }} dest: "{{ cluster_config_dir }}/.kubeconfig.yaml" mode: "0600" delegate_to: localhost diff --git a/ansible/play-k3s--reset.yml b/ansible/play-k3s--reset.yml index 192b7229f..a94365cfb 100644 --- a/ansible/play-k3s--reset.yml +++ b/ansible/play-k3s--reset.yml @@ -72,7 +72,6 @@ - name: "K3s {{ variable_host }} - Clean up local kubeconfig" hosts: "{{ variable_host }}" gather_facts: false - connection: local become: false run_once: true tasks: @@ -80,4 +79,5 @@ ansible.builtin.file: path: "{{ playbook_dir }}/../k3s/{{ galaxy_name }}/.kubeconfig.yaml" state: absent + delegate_to: localhost when: galaxy_name is defined From cfba34102dd0ce0be3351590d03a958ab137c8a3 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Sun, 5 Apr 2026 12:23:03 +0530 Subject: [PATCH 31/40] refactor: rewrite galaxy + reset playbooks from standard patterns Galaxy playbook (5 plays, down from 7): - Use server_config_yaml for all static config (k3s hardening guide format) - Use extra_service_envs for S3 creds (role-native mechanism) - Set user_kubectl: false (Play 5 handles kubeconfig correctly) - Document required DO firewall ports in header - Remove Cilium cluster.name/id from values.yaml (set_values is source of truth) Reset playbook: simplified, delegate_to for local cleanup Group vars: remove cluster_context (unused with user_kubectl: false) Park deployment tasks in SPIKE-STATUS.md pending clean redeploy. 
--- SPIKE-STATUS.md | 26 ++-- ansible/inventory/group_vars/gxy_mgmt_k3s.yml | 23 ++-- ansible/play-k3s--galaxy.yml | 114 ++++++------------ ansible/play-k3s--reset.yml | 72 ++++------- k3s/gxy-management/cluster/cilium/values.yaml | 7 +- 5 files changed, 90 insertions(+), 152 deletions(-) diff --git a/SPIKE-STATUS.md b/SPIKE-STATUS.md index 2515f8686..fed13d199 100644 --- a/SPIKE-STATUS.md +++ b/SPIKE-STATUS.md @@ -187,17 +187,21 @@ Apps that only need K8s Secrets (ArgoCD, Zot) use `just deploy` which decrypts ` ## What's Next -Deploy sequentially. Verify each before moving to the next. - -### Phase A: Bootstrap Cluster - -| # | Task | Status | Command / Notes | -| --- | ------------------------------- | ------ | -------------------------------------------------------------------- | -| A1 | Populate Windmill secrets | DONE | `windmill.values.yaml.enc` + `windmill.secrets.env.enc` | -| A2 | Refactor playbook to group_vars | BLOCK | Playbook has hardcoded values — needs proper Ansible design | -| A3 | Run K3s galaxy playbook | BLOCK | Failed: missing CIS sysctls (sysctl fix added, needs rerun after A2) | -| A4 | Verify cluster health | TODO | 3 nodes Ready, Cilium green, Traefik running, Gateway CRDs | -| A5 | Encrypt kubeconfig | TODO | sops encrypt to infra-secrets | +**BLOCKED on playbook rewrite.** The galaxy playbook failed on every run for different reasons (CIS sysctls, folded scalar bugs, Gateway CRD conflicts, KUBECONFIG missing, etcd peer timeout). The playbook needs to be rewritten from scratch following standard k3s-ansible + Cilium patterns. Cluster has been torn down. 
+ +### Parked Tasks (resume after playbooks are solid) + +| Task | Notes | +| ------------------------- | -------------------------------------------- | +| Run galaxy playbook | Needs rewritten playbook | +| Verify cluster health | 3 nodes Ready, Cilium green, Traefik running | +| Encrypt kubeconfig | sops encrypt to infra-secrets | +| Install Windmill Helm | `just helm-upgrade gxy-management windmill` | +| Deploy Windmill manifests | Gateway + TLS via `just deploy` | +| Cloudflare DNS + Access | ClickOps: A records, email OTP gate | +| Smoke test Windmill | curl + browser | +| Commit infra-secrets | Push to GitHub | +| Clean up stale files | Remove SPIKE-STATUS.md, archive cruft | ### Phase B: Windmill (Day 0 Deliverable) diff --git a/ansible/inventory/group_vars/gxy_mgmt_k3s.yml b/ansible/inventory/group_vars/gxy_mgmt_k3s.yml index 3530428b5..33fdea0fa 100644 --- a/ansible/inventory/group_vars/gxy_mgmt_k3s.yml +++ b/ansible/inventory/group_vars/gxy_mgmt_k3s.yml @@ -1,29 +1,26 @@ --- # gxy-management galaxy configuration # Applied automatically when targeting the gxy_mgmt_k3s inventory group +# +# To add a new galaxy: create a new file matching the DO inventory tag. galaxy_name: gxy-management - -# k3s k3s_version: v1.34.5+k3s1 - -# Cilium CNI cilium_cluster_id: 1 -# k3s config.yaml — written to /etc/rancher/k3s/config.yaml -# Keys are hyphenated (matching CLI flags). Per k3s docs: -# https://docs.k3s.io/installation/configuration -# https://docs.k3s.io/security/hardening-guide +# k3s config.yaml — written to /etc/rancher/k3s/config.yaml by the role +# Keys are hyphenated, matching CLI flags. Docs: +# https://docs.k3s.io/installation/configuration +# https://docs.k3s.io/security/hardening-guide +# +# Do NOT add tls-san, token, cluster-init, or server here — +# those are managed by the k3s-ansible role via extra_server_args. 
server_config_yaml: | - # Do NOT add tls-san here — managed by the playbook's extra_server_args (per-node) - # Networking (ADR-009) flannel-backend: "none" disable-network-policy: true disable-kube-proxy: true cluster-cidr: "10.1.0.0/16" service-cidr: "10.11.0.0/16" - - # Security (CIS hardening) protect-kernel-defaults: true secrets-encryption: true kube-apiserver-arg: @@ -33,8 +30,6 @@ server_config_yaml: | - "audit-log-maxage=30" - "audit-log-maxbackup=10" - "audit-log-maxsize=100" - - # etcd S3 backups (DO Spaces FRA1) etcd-s3: true etcd-s3-endpoint: "fra1.digitaloceanspaces.com" etcd-s3-bucket: "net.freecodecamp.universe-backups" diff --git a/ansible/play-k3s--galaxy.yml b/ansible/play-k3s--galaxy.yml index 313298415..4afdf10de 100644 --- a/ansible/play-k3s--galaxy.yml +++ b/ansible/play-k3s--galaxy.yml @@ -4,10 +4,11 @@ # Generic playbook for any Universe galaxy. All galaxy-specific config # lives in inventory/group_vars/.yml — not in this file. # -# Plays: validate → sysctl + prereqs + traefik config → k3s server → cilium → verify → fetch kubeconfig +# Plays: validate → prereqs → k3s server → cilium → verify + kubeconfig # # Prerequisites: # - VMs provisioned with VPC (eth1) and Tailscale connected +# - DO cloud firewall allows: 2379-2380, 4240, 5001, 6443, 8472, 10250 between VPC nodes # - Env vars loaded via direnv (cd into cluster dir first) # - Group vars populated for the target inventory group # @@ -15,8 +16,8 @@ # cd k3s/ # just play k3s--galaxy -# Play 1: Validate prerequisites -- name: "K3s {{ variable_host }} - Validate prerequisites" +# Play 1: Validate prerequisites and build dynamic groups +- name: "K3s {{ variable_host }} - Validate" hosts: "{{ variable_host }}" gather_facts: true become: true @@ -25,7 +26,7 @@ do_spaces_secret_key: "{{ lookup('env', 'DO_SPACES_SECRET_KEY') }}" tasks: - - name: Validate required group_vars are set + - name: Validate required group_vars assert: that: - galaxy_name is defined and galaxy_name | length > 0 @@ 
-41,26 +42,19 @@ - do_spaces_secret_key | length > 0 fail_msg: "DO_SPACES_ACCESS_KEY/SECRET_KEY not set. cd into cluster dir first." - - name: Validate VPC interface exists (eth1) + - name: Validate VPC interface (eth1) assert: that: - ansible_eth1 is defined - ansible_eth1.ipv4 is defined - - ansible_eth1.ipv4.address is defined - fail_msg: "VPC interface eth1 not found." - - - name: Validate VPC IP is in expected range - assert: - that: - ansible_eth1.ipv4.address | regex_search('^10\.') - fail_msg: "VPC IP {{ ansible_eth1.ipv4.address }} not in 10.x.x.x range." + fail_msg: "VPC interface eth1 not found or IP not in 10.x.x.x range." - - name: Validate Tailscale is connected + - name: Validate Tailscale connected assert: that: - ansible_tailscale0 is defined - ansible_tailscale0.ipv4 is defined - - ansible_tailscale0.ipv4.address is defined fail_msg: "Tailscale not connected." - name: Set network facts @@ -70,19 +64,15 @@ - name: Display configuration debug: - msg: >- - {{ inventory_hostname }}: VPC={{ vpc_ip }}, Tailscale={{ tailscale_ip }}, - galaxy={{ galaxy_name }}, k3s={{ k3s_version }} + msg: "{{ inventory_hostname }}: VPC={{ vpc_ip }}, TS={{ tailscale_ip }}, galaxy={{ galaxy_name }}" - name: Build dynamic groups group_by: key: "{{ item }}" - loop: - - k3s_cluster - - server + loop: [k3s_cluster, server] -# Play 2: System prerequisites -- name: "K3s {{ variable_host }} - System prerequisites" +# Play 2: System prerequisites (before k3s starts) +- name: "K3s {{ variable_host }} - Prerequisites" hosts: k3s_cluster gather_facts: true become: true @@ -90,7 +80,7 @@ cluster_config_dir: "{{ playbook_dir }}/../k3s/{{ galaxy_name }}" pre_tasks: - - name: Set kubelet kernel parameters (CIS hardening) + - name: Set CIS kubelet kernel parameters ansible.posix.sysctl: name: "{{ item.key }}" value: "{{ item.value }}" @@ -102,11 +92,15 @@ - { key: kernel.panic, value: "10" } - { key: kernel.panic_on_oops, value: "1" } - - name: Ensure k3s config directory exists + - name: 
Ensure k3s directories exist file: - path: /etc/rancher/k3s + path: "{{ item }}" state: directory mode: "0755" + loop: + - /etc/rancher/k3s + - /var/lib/rancher/k3s/server/manifests + - /var/log/k3s - name: Copy PSS admission config copy: @@ -120,13 +114,7 @@ dest: /etc/rancher/k3s/audit-policy.yaml mode: "0600" - - name: Ensure k3s manifests directory exists - file: - path: /var/lib/rancher/k3s/server/manifests - state: directory - mode: "0755" - - - name: Copy Traefik HelmChartConfig (before k3s starts) + - name: Copy Traefik HelmChartConfig copy: src: "{{ cluster_config_dir }}/cluster/traefik-config.yaml" dest: /var/lib/rancher/k3s/server/manifests/traefik-config.yaml @@ -136,46 +124,27 @@ - role: k3s.orchestration.prereq # Play 3: Deploy k3s server -- name: "K3s {{ variable_host }} - Deploy k3s server" +- name: "K3s {{ variable_host }} - Deploy" hosts: server gather_facts: true become: true vars: - do_spaces_access_key: "{{ lookup('env', 'DO_SPACES_ACCESS_KEY') }}" - do_spaces_secret_key: "{{ lookup('env', 'DO_SPACES_SECRET_KEY') }}" api_endpoint: "{{ hostvars[groups['server'][0]]['vpc_ip'] }}" extra_server_args: >- --node-ip={{ hostvars[inventory_hostname]['vpc_ip'] }} --advertise-address={{ hostvars[inventory_hostname]['vpc_ip'] }} --tls-san={{ hostvars[inventory_hostname]['vpc_ip'] }} --tls-san={{ hostvars[inventory_hostname]['tailscale_ip'] }} + extra_service_envs: + - "AWS_ACCESS_KEY_ID={{ lookup('env', 'DO_SPACES_ACCESS_KEY') }}" + - "AWS_SECRET_ACCESS_KEY={{ lookup('env', 'DO_SPACES_SECRET_KEY') }}" server_group: server - - pre_tasks: - - name: Create audit log directory - file: - path: /var/log/k3s - state: directory - mode: "0750" - - - name: Write S3 credentials to service environment - ansible.builtin.lineinfile: - path: /etc/systemd/system/k3s.service.env - regexp: "^{{ item.key }}=" - line: "{{ item.key }}={{ item.value }}" - create: true - mode: "0600" - loop: - - { key: AWS_ACCESS_KEY_ID, value: "{{ do_spaces_access_key }}" } - - { key: 
AWS_SECRET_ACCESS_KEY, value: "{{ do_spaces_secret_key }}" } - no_log: true - + user_kubectl: false roles: - role: k3s.orchestration.k3s_server -# Play 4: Install Cilium (must run before nodes can be Ready) -# Gateway API CRDs are bundled in the Traefik CRD Helm chart — do NOT install manually. -- name: "K3s {{ variable_host }} - Install Cilium" +# Play 4: Install Cilium CNI +- name: "K3s {{ variable_host }} - Cilium" hosts: server[0] gather_facts: false become: true @@ -188,15 +157,21 @@ cilium_values_file: "{{ cluster_config_dir }}/cluster/cilium/values.yaml" cilium_k8s_service_host: "{{ hostvars[groups['server'][0]]['vpc_ip'] }}" -# Play 5: Wait for cluster ready (after Cilium provides the CNI) -- name: "K3s {{ variable_host }} - Verify cluster" +# Play 5: Verify cluster and fetch kubeconfig +- name: "K3s {{ variable_host }} - Verify" hosts: server[0] gather_facts: false become: true + vars: + cluster_config_dir: "{{ playbook_dir }}/../k3s/{{ galaxy_name }}" tasks: - - name: Wait for all nodes ready + - name: Wait for all nodes Ready command: k3s kubectl wait --for=condition=Ready nodes --all --timeout=300s + register: wait_result + retries: 3 + delay: 30 + until: wait_result.rc == 0 changed_when: false - name: Display cluster status @@ -208,22 +183,13 @@ debug: msg: "{{ cluster_status.stdout_lines }}" -# Play 6: Fetch kubeconfig -- name: "K3s {{ variable_host }} - Fetch kubeconfig" - hosts: server[0] - gather_facts: false - become: true - vars: - cluster_config_dir: "{{ playbook_dir }}/../k3s/{{ galaxy_name }}" - - tasks: - - name: Read kubeconfig from server + - name: Read kubeconfig slurp: src: /etc/rancher/k3s/k3s.yaml register: kubeconfig_raw no_log: true - - name: Write kubeconfig locally (replace server IP and context name) + - name: Write kubeconfig locally copy: content: >- {{ kubeconfig_raw.content | b64decode @@ -235,8 +201,8 @@ dest: "{{ cluster_config_dir }}/.kubeconfig.yaml" mode: "0600" delegate_to: localhost - no_log: true become: false + no_log: 
true - name: Verify kubectl connectivity command: kubectl get nodes @@ -247,7 +213,7 @@ delegate_to: localhost become: false - - name: Display final status + - name: Done debug: msg: - "=== {{ galaxy_name }} cluster ready ===" diff --git a/ansible/play-k3s--reset.yml b/ansible/play-k3s--reset.yml index a94365cfb..8375ff51b 100644 --- a/ansible/play-k3s--reset.yml +++ b/ansible/play-k3s--reset.yml @@ -6,77 +6,55 @@ # cd k3s/ # just play k3s--reset -- name: "K3s {{ variable_host }} - Build groups for reset" +- name: "K3s {{ variable_host }} - Build groups" hosts: "{{ variable_host }}" gather_facts: false tasks: - - name: Build k3s_cluster group + - name: Build dynamic groups group_by: - key: k3s_cluster + key: "{{ item }}" + loop: [k3s_cluster, server] - - name: Build server group - group_by: - key: server - -- name: "K3s {{ variable_host }} - Reset cluster" +- name: "K3s {{ variable_host }} - Reset" hosts: k3s_cluster become: true tasks: - - name: Run k3s uninstall script (server) + - name: Run k3s uninstall (server) when: "'server' in group_names" - ansible.builtin.command: + command: cmd: k3s-uninstall.sh removes: /var/lib/rancher/k3s/* - - name: Run k3s uninstall script (agent) + - name: Run k3s uninstall (agent) when: "'agent' in group_names" - ansible.builtin.command: + command: cmd: k3s-agent-uninstall.sh removes: /var/lib/rancher/k3s/* - - name: Remove user kubeconfig - ansible.builtin.file: - path: "~{{ ansible_user }}/.kube/config" - state: absent - - - name: Remove k3s install script - ansible.builtin.file: - path: /usr/local/bin/k3s-install.sh - state: absent - - - name: Remove k3s config directory - ansible.builtin.file: - path: /etc/rancher/k3s - state: absent - - - name: Remove k3s service environment - ansible.builtin.file: - path: /etc/systemd/system/k3s.service.env + - name: Clean up remaining files + file: + path: "{{ item }}" state: absent - - - name: Remove Cilium BPF state - ansible.builtin.command: + loop: + - "~{{ ansible_user }}/.kube/config" 
+ - /usr/local/bin/k3s-install.sh + - /etc/rancher/k3s + - /etc/systemd/system/k3s.service.env + - /usr/local/bin/helm + - /var/log/k3s + + - name: Clean up Cilium BPF state + command: cmd: rm -rf /sys/fs/bpf/cilium removes: /sys/fs/bpf/cilium - - name: Remove Helm binary - ansible.builtin.file: - path: /usr/local/bin/helm - state: absent - - - name: Remove k3s audit logs - ansible.builtin.file: - path: /var/log/k3s - state: absent - -- name: "K3s {{ variable_host }} - Clean up local kubeconfig" - hosts: "{{ variable_host }}" +- name: "K3s {{ variable_host }} - Clean local kubeconfig" + hosts: server[0] gather_facts: false become: false - run_once: true tasks: - name: Remove local kubeconfig - ansible.builtin.file: + file: path: "{{ playbook_dir }}/../k3s/{{ galaxy_name }}/.kubeconfig.yaml" state: absent delegate_to: localhost diff --git a/k3s/gxy-management/cluster/cilium/values.yaml b/k3s/gxy-management/cluster/cilium/values.yaml index 9b66764fd..02d100f3c 100644 --- a/k3s/gxy-management/cluster/cilium/values.yaml +++ b/k3s/gxy-management/cluster/cilium/values.yaml @@ -1,9 +1,4 @@ -cluster: - name: "gxy-management" - id: 1 - -# Set at deploy time via Ansible --set flags -# k8sServiceHost: +# cluster.name, cluster.id, k8sServiceHost set at deploy time via Ansible --set flags k8sServicePort: "6443" kubeProxyReplacement: true From 7cbd1177eaeffafd9eddfdfbb56d9c84242dde45 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Sun, 5 Apr 2026 12:27:21 +0530 Subject: [PATCH 32/40] chore: remove SPIKE-STATUS.md Operational findings, failure analysis, and deployment plan are now maintained in Universe/spike/field-notes.md (the canonical source). This file accumulated cruft from multiple failed deployment attempts and was redundant with the field notes. 
--- SPIKE-STATUS.md | 313 ------------------------------------------------ 1 file changed, 313 deletions(-) delete mode 100644 SPIKE-STATUS.md diff --git a/SPIKE-STATUS.md b/SPIKE-STATUS.md deleted file mode 100644 index fed13d199..000000000 --- a/SPIKE-STATUS.md +++ /dev/null @@ -1,313 +0,0 @@ -# Universe gxy-management Spike Status - -Status as of 2026-04-04. This document captures all research, decisions, and progress for the first Universe galaxy cluster deployment. - -## Spike Goal - -Deploy gxy-management — the first Universe galaxy cluster — on DigitalOcean FRA1 with Cilium CNI, Traefik ingress, and three core services (Windmill, ArgoCD, Zot). - -Day 0 deliverable: Windmill accessible to all staff at windmill.freecodecamp.net. - -## Architecture Decisions (from Universe ADRs) - -| ADR | Decision | Impact on This Spike | -| ----------------------------- | ---------------------------------------------- | -------------------------------------------------- | -| 001 - Infrastructure Topology | K3s, 4 galaxies planned | gxy-management is first, 3-node HA | -| 002 - IaC Tooling | OpenTofu + Ansible | Using Ansible for bootstrap, TF migration separate | -| 005 - GitOps | ArgoCD multi-cluster | Installed on gxy-management, manages all galaxies | -| 008 - Data Storage | Rook-Ceph (later), local-path Day 0 | No Longhorn, K3s default storage | -| 009 - Networking | Cilium CNI, Cloudflare TLS, Tailscale SSH only | No cert-manager, origin certs from CF | -| 010 - Secrets | SOPS + age Phase 1, OpenBao Phase 2 | SOPS + age implemented in private repo | -| 011 - Security | Pin by SHA, PSS, audit logging | PSS + audit policy in cluster config | -| 015 - Observability | VictoriaMetrics + ClickHouse + HyperDX | Not in this spike, future galaxy | - -## Infrastructure Provisioned - -### DigitalOcean (Universe Account, FRA1) - -| Resource | Details | Status | -| ------------- | --------------------------------------------------------------- | ------ | -| VPC | 
`gxy-management-vpc`, 10.110.0.0/20, FRA1 | Done | -| Firewall | `universe-firewall` (80, 443, 6443 from VPC, 22 from Tailscale) | Done | -| Droplet 1 | `gxy-vm-mgmt-k3s-1`, s-8vcpu-16gb, 104.248.36.250 | Done | -| Droplet 2 | `gxy-vm-mgmt-k3s-2`, s-8vcpu-16gb, 134.122.69.214 | Done | -| Droplet 3 | `gxy-vm-mgmt-k3s-3`, s-8vcpu-16gb, 104.248.40.237 | Done | -| Spaces bucket | `net.freecodecamp.universe-backups` (etcd snapshots) | Done | -| Spaces bucket | `net.freecodecamp.universe-registry` (Zot images) | Done | - -Tag: `gxy-mgmt-k3s` → Ansible inventory group: `gxy_mgmt_k3s` - -### Cloud-init - -All droplets use `cloud-init/basic.yml` which provides: - -- Package updates/upgrades -- fail2ban (5 retries, 3600s ban) -- SSH hardening via `/etc/ssh/sshd_config.d/99-hardening.conf` (no root login, no passwords, pubkey only) -- `freecodecamp` user with sudo NOPASSWD + GitHub SSH key import -- Uses `ssh.service` (Ubuntu 24.04 naming, with `sshd` fallback) - -### Cluster Specs - -| Setting | Value | -| ------------ | -------------------------------------------------------- | -| K3s version | v1.34.5+k3s1 | -| CNI | Cilium (eBPF, Hubble enabled, kube-proxy replacement) | -| Pod CIDR | 10.1.0.0/16 | -| Service CIDR | 10.11.0.0/16 | -| Ingress | Traefik via ServiceLB (Klipper), ports 80/443 | -| Storage | local-path (K3s default) | -| etcd backups | Every 6h → DO Spaces (net.freecodecamp.universe-backups) | -| Security | Secrets encryption, PSS admission, audit logging | - -## Secrets Architecture - -### What Changed - -Migrated from ansible-vault (single shared password, whole-file encryption in public repo) to sops+age (per-person keys, value-level encryption in private repo). 
- -Commit: `6ac1504 refactor: migrate secrets from ansible-vault to sops+age` - -### How It Works - -``` -infra-secrets (private repo) infra (public repo) -───────────────────────── ────────────────────── - -global/.env.enc ──── direnv ───────────→ env: LINODE_API_TOKEN, TAILSCALE_AUTH_KEY, - HCP_CLIENT_ID, CLOUDFLARE_*, GRAFANA_* - -do-primary/.env.enc ── direnv ─────────→ env: DO_API_TOKEN (ops-backoffice-tools) -do-universe/.env.enc ── direnv ────────→ env: DO_API_TOKEN, DO_SPACES_ACCESS_KEY, - DO_SPACES_SECRET_KEY (gxy-management) - -k3s//kubeconfig.yaml.enc - └── just kubeconfig-sync ── sops -d → k3s//.kubeconfig.yaml (persists) - -k3s//.secrets.env.enc -k3s//.tls.crt.enc k3s//apps//.../secrets/ -k3s//.tls.key.enc ├── .secrets.env (temp) - └── just deploy ── sops -d ─────────→ ├── tls.crt (temp) - └── tls.key (temp) - (all deleted after kubectl apply) -``` - -### direnv Hierarchy - -| Directory | What Loads | -| --------------------------- | ----------------------------------------------------------- | -| `infra/` (root) | Global tokens (Linode, Tailscale, HCP, Cloudflare, Grafana) | -| `k3s/gxy-management/` | Above + DO_API_TOKEN (universe account) + KUBECONFIG | -| `k3s/ops-backoffice-tools/` | Above + DO_API_TOKEN (primary account) + KUBECONFIG | - -### Key Files - -- `infra/.envrc` — defines `use_sops()` function, loads global tokens, adds ansible venv to PATH -- `infra/k3s//.envrc` — sources parent, loads cluster-specific DO token, sets KUBECONFIG -- `infra-secrets/.sops.yaml` — creation rules with age public keys -- `~/.config/sops/age/keys.txt` — your age private key (from your password manager) - -### infra-secrets File Inventory - -``` -22 encrypted files (.enc) -16 sample files (.sample) - -global/.env.enc — Linode, Tailscale, HCP, Cloudflare, Grafana Cloud tokens -do-primary/.env.enc — Primary DO team API token -do-universe/.env.enc — Universe DO team API token -ansible/vault-k3s.yaml.enc — DO Spaces creds, Tailscale OAuth (YAML format) 
-appsmith/.env.enc — Appsmith app secrets -outline/.env.enc — Outline app secrets - -k3s/ops-backoffice-tools/ - kubeconfig.yaml.enc — Cluster kubeconfig - appsmith.secrets.env.enc — Appsmith deployed secrets - appsmith.tls.crt.enc — Appsmith Cloudflare origin cert - appsmith.tls.key.enc — Appsmith origin private key - outline.secrets.env.enc — Outline deployed secrets - outline.tls.crt.enc — Outline Cloudflare origin cert - outline.tls.key.enc — Outline origin private key - -k8s/o11y/ - kubeconfig.yaml.enc — o11y cluster kubeconfig - o11y.secrets.env.enc — o11y deployed secrets - o11y.tls.crt.enc — o11y Cloudflare origin cert - o11y.tls.key.enc — o11y origin private key - -docker/oldeworld/oncall.env.enc — Oncall stack secrets -scratchpad/ — dev.env.enc, org.env.enc, sample.env.enc -``` - -## justfile Recipes - -| Recipe | Purpose | Requires | -| ----------------------------------- | ----------------------------------- | --------------------- | -| `just secret-verify-all` | Verify all secrets decrypt | age key | -| `just secret-view ` | View a secret (auto-detects format) | age key | -| `just secret-edit ` | Edit a secret in $EDITOR | age key | -| `just kubeconfig-sync ` | Decrypt kubeconfig (run once) | age key | -| `just play [inv]` | Run any ansible playbook | API token via direnv | -| `just deploy ` | Deploy app (secrets + TLS → apply) | KUBECONFIG via direnv | -| `just helm-upgrade ` | Install/upgrade Helm chart | KUBECONFIG via direnv | -| `just k8s-validate [version]` | Validate manifests with kubeconform | — | -| `just ansible-install` | Install ansible + dependencies | — | -| `just tf [workspace]` | Run terraform (selective or all) | API tokens via direnv | -| `just tf-fmt` | Format all terraform files | — | -| `just tf-list` | List terraform workspaces | — | - -## What's Done - -- [x] DigitalOcean infrastructure (VPC, firewall, 3 droplets, 2 Spaces buckets) -- [x] Cloud-init hardening (fail2ban, SSH, user creation) tested on OrbStack + deployed -- [x] 
Secrets migration: ansible-vault → sops+age in private infra-secrets repo -- [x] direnv wiring: root + cluster .envrc files with use_sops -- [x] justfile recipes: secrets, deploy, play, helm-upgrade, kubeconfig-sync, tf -- [x] gxy-management cluster configs (Cilium values, security policies, Traefik config) -- [x] App manifests (ArgoCD, Windmill, Zot — kustomization, gateway, httproutes) -- [x] Helm chart values (ArgoCD, Windmill, Zot — credentials stripped to secret overlays) -- [x] Documentation (infra-secrets README wiring doc, gxy-management README runbook) -- [x] Tailscale installed and connected on all 3 nodes (verified: online, SSH enabled) -- [x] Cloudflare origin certs encrypted for all 3 apps (reused existing wildcard) -- [x] Code review: 3 CRITICALs + 10 WARNINGs + 6 SUGGESTIONs found and fixed -- [x] Justfile overhaul: 18 → 11 parametric recipes, no special-case orchestration - -## Secrets → Helm Flow - -Public values.yaml (structure, resources, flags) are overlaid with secret values from infra-secrets: - -``` -Public: k3s//apps//charts//values.yaml -Secret: infra-secrets/k3s//.values.yaml.enc (optional, sops-encrypted) - -just helm-upgrade → helm upgrade --install -f values.yaml -f /tmp/secret-values.yaml → cleanup -``` - -Apps that only need K8s Secrets (ArgoCD, Zot) use `just deploy` which decrypts `.secrets.env` + TLS. - -## What's Next - -**BLOCKED on playbook rewrite.** The galaxy playbook failed on every run for different reasons (CIS sysctls, folded scalar bugs, Gateway CRD conflicts, KUBECONFIG missing, etcd peer timeout). The playbook needs to be rewritten from scratch following standard k3s-ansible + Cilium patterns. Cluster has been torn down. 
- -### Parked Tasks (resume after playbooks are solid) - -| Task | Notes | -| ------------------------- | -------------------------------------------- | -| Run galaxy playbook | Needs rewritten playbook | -| Verify cluster health | 3 nodes Ready, Cilium green, Traefik running | -| Encrypt kubeconfig | sops encrypt to infra-secrets | -| Install Windmill Helm | `just helm-upgrade gxy-management windmill` | -| Deploy Windmill manifests | Gateway + TLS via `just deploy` | -| Cloudflare DNS + Access | ClickOps: A records, email OTP gate | -| Smoke test Windmill | curl + browser | -| Commit infra-secrets | Push to GitHub | -| Clean up stale files | Remove SPIKE-STATUS.md, archive cruft | - -### Phase B: Windmill (Day 0 Deliverable) - -| # | Task | Status | Command / Notes | -| --- | --------------------- | ------ | ----------------------------------------------------- | -| B1 | Install Windmill Helm | TODO | `just helm-upgrade gxy-management windmill` | -| B2 | Verify pods ready | TODO | `kubectl get pods -n windmill` | -| B3 | Deploy manifests | TODO | `just deploy gxy-management windmill` (Gateway + TLS) | -| B4 | Cloudflare DNS | TODO | ClickOps: A records (proxied) → 3 node public IPs | -| B5 | Cloudflare Access | TODO | ClickOps: email OTP gate, all staff | -| B6 | Smoke test | TODO | curl + browser, verify Access gate | - -### Phase C: ArgoCD (Platform Team) - -| # | Task | Status | Command / Notes | -| --- | ----------------------- | ------ | ------------------------------------------------------- | -| C1 | Populate ArgoCD secrets | TODO | Create `argocd.secrets.env.enc` (bcrypt admin password) | -| C2 | Install ArgoCD Helm | TODO | `just helm-upgrade gxy-management argocd` | -| C3 | Deploy manifests | TODO | `just deploy gxy-management argocd` | -| C4 | DNS + Access | TODO | ClickOps: argocd.freecodecamp.net, platform team only | -| C5 | Verify | TODO | Login, verify dashboard | - -### Phase D: Zot (Platform Team) - -| # | Task | Status | Command / Notes | -| --- 
| -------------------- | ------ | ------------------------------------------------------- | -| D1 | Populate Zot secrets | TODO | Create `zot.secrets.env.enc` (S3 creds, htpasswd) | -| D2 | Install Zot Helm | TODO | `just helm-upgrade gxy-management zot` | -| D3 | Deploy manifests | TODO | `just deploy gxy-management zot` | -| D4 | DNS + Access | TODO | ClickOps: registry.freecodecamp.net, platform team only | -| D5 | Verify | TODO | Push/pull test image | - -### Phase E: Cleanup - -| # | Task | Status | Notes | -| --- | ---------------------- | ------ | ---------------------------------------------- | -| E1 | Commit infra-secrets | TODO | Push to GitHub | -| E2 | Remove SPIKE-STATUS.md | TODO | Absorb permanent decisions into cluster README | -| E3 | Clean up stale files | TODO | Orphaned samples, archive cruft | - -Unblocked now: A1, A2 (populate secrets). - -## Existing Infrastructure (Unchanged) - -### ops-backoffice-tools (live, 101 days uptime) - -- 3 nodes: ops-vm-tools-k3s-nyc3-{01,02,03}, k3s v1.32.11 -- Apps: Appsmith (1 pod), Outline (3 containers) -- Storage: Longhorn v1.10.1 (31 pods) -- Ingress: Traefik v3.5.1 -- Network: Tailscale operator -- Helm: longhorn, tailscale-operator, traefik, traefik-crd - -### What Was Archived (this branch) - -Observability stack torn down and moved to `.archive/2026-03-observability-teardown/`: - -- ops-logs-clickhouse cluster (3 droplets) -- Grafana, Prometheus, Vector from ops-backoffice-tools -- Savings: ~$231/month - -### Branch History - -``` -feat/k3s-universe (13 commits ahead of main) - -ab0f800 chore: add tailscale justfile recipes and update gxy-management README -6ac1504 refactor: migrate secrets from ansible-vault to sops+age -9c902c1 feat(cloud-init): update config for Ubuntu 24.04 -0619242 fix(k8s): exclude JSON and dashboards from kubeconform validation -2332a1c feat(k8s): add kubeconform manifest validation — local + CI -b5fc35b feat(gxy-management): align Day 0 config with spike-plan and ADRs -c9c1b4e 
fix: move archive -6137073 fix: move scratchpad -5810c79 feat: add direnv hierarchy and secrets bootstrap workflow -4ebcc24 feat: consolidate secrets management with ansible-vault -a564bd6 refactor: consolidate justfiles into root justfile -b0fae18 feat(k3s): add gxy-management galaxy configs and Day 0 spike infrastructure -e72beb5 feat(k3s): add ops-mgmt cluster configs and tooling -``` - -## Errors and Fixes (for Future Reference) - -| Issue | Root Cause | Fix | -| ----------------------------------------------------- | ------------------------------------------------------------------------------ | --------------------------------------------------------- | -| cloud-init heredoc syntax error | runcmd `\|` strings don't support heredoc | Moved to write_files section | -| `systemctl restart sshd` fails on Ubuntu 24.04 | Service renamed to `ssh.service` | `ssh \|\| sshd \|\| true` fallback | -| SSH hardening sed had no effect | Ubuntu 24.04 ships commented defaults | Drop-in file at sshd_config.d/99-hardening.conf | -| sops `path_regex: .*\.enc$` didn't match input files | Regex matches input path, not output | Changed to `.*` (match all) | -| sops `dotenv` format failed on YAML file | ansible vars are YAML, not dotenv | Renamed to `.yaml.enc`, format detection in verify recipe | -| direnv `$(dirname "$0")` empty | Not available in direnv context | Use `expand_path ../infra-secrets` | -| k3s fails: `invalid kernel flag vm/overcommit_memory` | `--protect-kernel-defaults` requires CIS sysctls, prereq role doesn't set them | Add sysctl pre_task (90-kubelet.conf) before k3s starts | -| Ansible `galaxy_name` recursive template loop | `vars: galaxy_name: "{{ galaxy_name \| default(...) 
}}"` self-references | Remove from vars, pass via `-e`, add assert validation | -| Galaxy playbook hardcodes all environment values | CIDRs, versions, bucket names, Cilium ID in playbook not group_vars | **BLOCKING** — refactor to group_vars before re-run | - -## Open Questions - -- **Playbook refactor**: Move all galaxy-specific config to `inventory/group_vars/.yml`. Playbook must be generic. -- **Helm chart versions**: Pin versions — Windmill 4.0.124, ArgoCD 9.4.17, Zot 0.1.104 -- **Cloudflare Access policies**: Exact group/email configuration TBD -- **Windmill DB**: Embedded PostgreSQL for Day 0 (ADR-008: CNPG later) - -## Lessons Learned - -1. **Use tool primitives.** Ansible has group_vars, roles, templates — use them. Don't write shell scripts disguised as YAML. -2. **Research before coding.** The `--protect-kernel-defaults` failure was predictable from the k3s hardening guide. Read the docs first. -3. **The spike IS production.** No shortcuts, no "fix later." Every line of code must be production-quality. -4. **Aggressive review.** Every change gets independent hostile review before presenting. The review that caught 3 CRITICALs was correct — but more should have been caught at design time. -5. **Validate before suggesting.** Never tell the operator to "try it" without dry-run, syntax check, or verified expansion. From 51362a7bf6e973e94b21061c987c5bab15374d51 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Sun, 5 Apr 2026 12:51:07 +0530 Subject: [PATCH 33/40] fix: flush stale Cilium iptables chains in reset playbook k3s-uninstall.sh does not clean Cilium's iptables chains (CILIUM_INPUT, CILIUM_PRE_mangle, etc). These stale rules block inter-node traffic on redeploy, causing etcd peer timeouts. 
--- ansible/play-k3s--reset.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ansible/play-k3s--reset.yml b/ansible/play-k3s--reset.yml index 8375ff51b..04d2e8056 100644 --- a/ansible/play-k3s--reset.yml +++ b/ansible/play-k3s--reset.yml @@ -48,6 +48,13 @@ cmd: rm -rf /sys/fs/bpf/cilium removes: /sys/fs/bpf/cilium + - name: Flush Cilium iptables chains + shell: | + iptables-save | grep -iv cilium | iptables-restore + ip6tables-save | grep -iv cilium | ip6tables-restore + changed_when: true + ignore_errors: true + - name: "K3s {{ variable_host }} - Clean local kubeconfig" hosts: server[0] gather_facts: false From f8944825db4da086bf3f87b12c711b8a77912d0c Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Sun, 5 Apr 2026 13:45:57 +0530 Subject: [PATCH 34/40] =?UTF-8?q?fix:=20reset=20playbook=20=E2=80=94=20add?= =?UTF-8?q?=20gather=5Ffacts,=20fix=20ansible=5Fuser=5Fid,=20verify=20clea?= =?UTF-8?q?n?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gather_facts needed for ansible_user_id in kubeconfig path cleanup. ansible_user is a connection var, not a fact — use ansible_user_id. 
--- ansible/play-k3s--reset.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ansible/play-k3s--reset.yml b/ansible/play-k3s--reset.yml index 04d2e8056..988f061ef 100644 --- a/ansible/play-k3s--reset.yml +++ b/ansible/play-k3s--reset.yml @@ -17,6 +17,7 @@ - name: "K3s {{ variable_host }} - Reset" hosts: k3s_cluster + gather_facts: true become: true tasks: - name: Run k3s uninstall (server) @@ -36,7 +37,7 @@ path: "{{ item }}" state: absent loop: - - "~{{ ansible_user }}/.kube/config" + - "~{{ ansible_user_id }}/.kube/config" - /usr/local/bin/k3s-install.sh - /etc/rancher/k3s - /etc/systemd/system/k3s.service.env From adafa0faab4d449002a55c334fdbe8ef8fc09405 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Sun, 5 Apr 2026 13:47:57 +0530 Subject: [PATCH 35/40] refactor(ansible): rename play-k3s--galaxy to play-k3s--bootstrap --- ansible/{play-k3s--galaxy.yml => play-k3s--bootstrap.yml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename ansible/{play-k3s--galaxy.yml => play-k3s--bootstrap.yml} (100%) diff --git a/ansible/play-k3s--galaxy.yml b/ansible/play-k3s--bootstrap.yml similarity index 100% rename from ansible/play-k3s--galaxy.yml rename to ansible/play-k3s--bootstrap.yml From 44706c7783f4a467e648d8ca0683b50beda45035 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Sun, 5 Apr 2026 15:34:38 +0530 Subject: [PATCH 36/40] fix(cilium): add bpf.masquerade, increase timeout, add retries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add bpf.masquerade: true to Cilium values — moves masquerade to eBPF, fixes etcd peer communication failure caused by k3s iptables save/restore conflicting with Cilium chains (k3s#7736) - Remove installNoConntrackIptablesRules (incompatible with VXLAN tunnel mode) - Increase Helm install timeout from 5m to 10m (first install pulls images) - Add retries to DaemonSet/operator rollout and status verification tasks (k3s API is transiently unavailable after 
Cilium changes network stack) - Update all README references from k3s--galaxy to k3s--bootstrap --- ansible/roles/cilium/tasks/main.yml | 13 ++++++++++++- k3s/README.md | 12 ++++++------ k3s/gxy-management/README.md | 4 ++-- k3s/gxy-management/cluster/cilium/values.yaml | 2 ++ 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/ansible/roles/cilium/tasks/main.yml b/ansible/roles/cilium/tasks/main.yml index 2cc56d7b9..79df3b07d 100644 --- a/ansible/roles/cilium/tasks/main.yml +++ b/ansible/roles/cilium/tasks/main.yml @@ -44,7 +44,7 @@ - value: "k8sServiceHost={{ cilium_k8s_service_host }}" - value: "k8sServicePort={{ cilium_k8s_service_port }}" wait: true - timeout: "5m0s" + timeout: "10m0s" environment: KUBECONFIG: /etc/rancher/k3s/k3s.yaml @@ -59,6 +59,10 @@ environment: KUBECONFIG: /etc/rancher/k3s/k3s.yaml changed_when: false + retries: 3 + delay: 15 + register: cilium_agent_rollout + until: cilium_agent_rollout.rc == 0 - name: Wait for Cilium operator Deployment rollout ansible.builtin.command: @@ -68,6 +72,10 @@ environment: KUBECONFIG: /etc/rancher/k3s/k3s.yaml changed_when: false + retries: 3 + delay: 15 + register: cilium_operator_rollout + until: cilium_operator_rollout.rc == 0 - name: Verify Cilium status ansible.builtin.command: @@ -78,6 +86,9 @@ KUBECONFIG: /etc/rancher/k3s/k3s.yaml register: cilium_status changed_when: false + retries: 5 + delay: 20 + until: cilium_status.rc == 0 - name: Display Cilium status ansible.builtin.debug: diff --git a/k3s/README.md b/k3s/README.md index 577c27827..292e57b9e 100644 --- a/k3s/README.md +++ b/k3s/README.md @@ -80,7 +80,7 @@ just play k3s--cluster tools_k3s just play k3s--longhorn tools_k3s # Deploy gxy-management galaxy (decrypts vault vars automatically) -just play k3s--galaxy gxy_mgmt_k3s +just play k3s--bootstrap gxy_mgmt_k3s ``` --- @@ -179,8 +179,8 @@ CNI: Cilium Storage: local-path SSH/kubectl: Tailscale ## Playbooks Reference -| Playbook | Purpose | -| ---------------------- | 
----------------------------------------------------- | -| play-k3s--cluster.yml | Deploy k3s HA cluster | -| play-k3s--longhorn.yml | Install Longhorn storage | -| play-k3s--galaxy.yml | Deploy any Universe galaxy (K3s + Cilium + Tailscale) | +| Playbook | Purpose | +| ----------------------- | ----------------------------------------------------- | +| play-k3s--cluster.yml | Deploy k3s HA cluster | +| play-k3s--longhorn.yml | Install Longhorn storage | +| play-k3s--bootstrap.yml | Deploy any Universe galaxy (K3s + Cilium + Tailscale) | diff --git a/k3s/gxy-management/README.md b/k3s/gxy-management/README.md index 44cd3cc05..c419a3b39 100644 --- a/k3s/gxy-management/README.md +++ b/k3s/gxy-management/README.md @@ -29,7 +29,7 @@ kubectl get nodes ## Deploy ```bash -just play k3s--galaxy gxy_mgmt_k3s +just play k3s--bootstrap gxy_mgmt_k3s ``` ## Deployment Runbook @@ -46,7 +46,7 @@ just play k3s--galaxy gxy_mgmt_k3s ### K3s Bootstrap ```bash -just play k3s--galaxy gxy_mgmt_k3s +just play k3s--bootstrap gxy_mgmt_k3s ``` Deploys k3s HA cluster with Cilium CNI, Traefik ingress, etcd S3 backups, and fetches kubeconfig. diff --git a/k3s/gxy-management/cluster/cilium/values.yaml b/k3s/gxy-management/cluster/cilium/values.yaml index 02d100f3c..ef3e31c8f 100644 --- a/k3s/gxy-management/cluster/cilium/values.yaml +++ b/k3s/gxy-management/cluster/cilium/values.yaml @@ -2,6 +2,8 @@ k8sServicePort: "6443" kubeProxyReplacement: true +bpf: + masquerade: true ipam: operator: From 44b6277d2145a4bd68f7728c7b099179b87224c3 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Sun, 5 Apr 2026 16:41:47 +0530 Subject: [PATCH 37/40] fix(k3s): disable kube-proxy replacement, fix kubeconfig write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Set kubeProxyReplacement: false in Cilium values — kube-proxy replacement breaks etcd on k3s HA embedded etcd (see field-notes Failure 7). Cilium still provides CNI + network policies + Hubble without it. 
- Re-enable kube-proxy in k3s config (disable-kube-proxy: false) - Fix kubeconfig write: use copy + replace instead of chained Jinja2 regex_replace in folded scalar (was writing 127.0.0.1 instead of Tailscale IP) - Reset playbook: clean /etc/rancher and /var/lib/rancher entirely --- ansible/inventory/group_vars/gxy_mgmt_k3s.yml | 5 +++- ansible/play-k3s--bootstrap.yml | 24 +++++++++++++------ ansible/play-k3s--reset.yml | 3 ++- k3s/gxy-management/cluster/cilium/values.yaml | 4 +--- 4 files changed, 24 insertions(+), 12 deletions(-) diff --git a/ansible/inventory/group_vars/gxy_mgmt_k3s.yml b/ansible/inventory/group_vars/gxy_mgmt_k3s.yml index 33fdea0fa..88a567b8c 100644 --- a/ansible/inventory/group_vars/gxy_mgmt_k3s.yml +++ b/ansible/inventory/group_vars/gxy_mgmt_k3s.yml @@ -18,7 +18,10 @@ cilium_cluster_id: 1 server_config_yaml: | flannel-backend: "none" disable-network-policy: true - disable-kube-proxy: true + # kube-proxy replacement disabled — breaks etcd on k3s HA (see field-notes Failure 7) + # Cilium still provides CNI + network policies + Hubble without it + # Revisit on bare metal where performance matters + disable-kube-proxy: false cluster-cidr: "10.1.0.0/16" service-cidr: "10.11.0.0/16" protect-kernel-defaults: true diff --git a/ansible/play-k3s--bootstrap.yml b/ansible/play-k3s--bootstrap.yml index 4afdf10de..74ca44624 100644 --- a/ansible/play-k3s--bootstrap.yml +++ b/ansible/play-k3s--bootstrap.yml @@ -191,19 +191,29 @@ - name: Write kubeconfig locally copy: - content: >- - {{ kubeconfig_raw.content | b64decode - | regex_replace('127\\.0\\.0\\.1', hostvars[inventory_hostname]['tailscale_ip']) - | regex_replace('name: default', 'name: ' + galaxy_name) - | regex_replace('cluster: default', 'cluster: ' + galaxy_name) - | regex_replace('user: default', 'user: ' + galaxy_name) - | regex_replace('current-context: default', 'current-context: ' + galaxy_name) }} + content: "{{ kubeconfig_raw.content | b64decode }}" dest: "{{ cluster_config_dir 
}}/.kubeconfig.yaml" mode: "0600" delegate_to: localhost become: false no_log: true + - name: Fix kubeconfig server address (use Tailscale IP) + delegate_to: localhost + become: false + ansible.builtin.replace: + path: "{{ cluster_config_dir }}/.kubeconfig.yaml" + regexp: 'https://127\.0\.0\.1:6443' + replace: "https://{{ hostvars[inventory_hostname]['tailscale_ip'] }}:6443" + + - name: Fix kubeconfig context name + delegate_to: localhost + become: false + ansible.builtin.replace: + path: "{{ cluster_config_dir }}/.kubeconfig.yaml" + regexp: '(\s+)(name|cluster|user|current-context): default' + replace: '\1\2: {{ galaxy_name }}' + - name: Verify kubectl connectivity command: kubectl get nodes environment: diff --git a/ansible/play-k3s--reset.yml b/ansible/play-k3s--reset.yml index 988f061ef..365429de5 100644 --- a/ansible/play-k3s--reset.yml +++ b/ansible/play-k3s--reset.yml @@ -39,7 +39,8 @@ loop: - "~{{ ansible_user_id }}/.kube/config" - /usr/local/bin/k3s-install.sh - - /etc/rancher/k3s + - /etc/rancher + - /var/lib/rancher - /etc/systemd/system/k3s.service.env - /usr/local/bin/helm - /var/log/k3s diff --git a/k3s/gxy-management/cluster/cilium/values.yaml b/k3s/gxy-management/cluster/cilium/values.yaml index ef3e31c8f..08fabe673 100644 --- a/k3s/gxy-management/cluster/cilium/values.yaml +++ b/k3s/gxy-management/cluster/cilium/values.yaml @@ -1,9 +1,7 @@ # cluster.name, cluster.id, k8sServiceHost set at deploy time via Ansible --set flags k8sServicePort: "6443" -kubeProxyReplacement: true -bpf: - masquerade: true +kubeProxyReplacement: false ipam: operator: From d47b359d76dc745064a44967462734e777915e97 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Sun, 5 Apr 2026 17:28:55 +0530 Subject: [PATCH 38/40] fix(windmill): remove unused Opaque secretGenerator Windmill Helm chart does not consume a windmill-secrets Opaque secret. Database credentials come from the secret values overlay via Helm. Admin password is set via Windmill UI on first boot. 
Keep only the TLS secretGenerator (referenced by Gateway). --- .../apps/windmill/manifests/base/kustomization.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/k3s/gxy-management/apps/windmill/manifests/base/kustomization.yaml b/k3s/gxy-management/apps/windmill/manifests/base/kustomization.yaml index b6e81a5fa..93ac095f6 100644 --- a/k3s/gxy-management/apps/windmill/manifests/base/kustomization.yaml +++ b/k3s/gxy-management/apps/windmill/manifests/base/kustomization.yaml @@ -8,12 +8,6 @@ resources: - httproutes.yaml secretGenerator: - - name: windmill-secrets - type: Opaque - envs: - - secrets/.secrets.env - options: - disableNameSuffixHash: true - name: windmill-tls-cloudflare type: kubernetes.io/tls files: From bacbf5c5bc4cae67097cfb58d933e631036b16be Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Mon, 6 Apr 2026 00:21:23 +0530 Subject: [PATCH 39/40] fix(cilium): pin devices/MTU, disable metrics-server MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cilium auto-detected tailscale0 (MTU 1280) alongside eth0/eth1 (MTU 1500), setting all pod veths to 1280. This broke cross-node pod-to-pod HTTP (packets exceeded path MTU and were dropped). Pin devices to [eth0, eth1] and MTU to 1500 to exclude tailscale0. Disable metrics-server — pods cannot reach node VPC IPs directly (connection refused, all ports). Services via kube-proxy DNAT and pod-to-pod via VXLAN work fine. Root cause under investigation (Cilium BPF handling of pod-to-host traffic on multi-NIC nodes). Also inline >- folded scalars in Cilium role tasks. 
--- ansible/inventory/group_vars/gxy_mgmt_k3s.yml | 2 ++ ansible/roles/cilium/tasks/main.yml | 8 ++------ k3s/gxy-management/cluster/cilium/values.yaml | 2 ++ 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ansible/inventory/group_vars/gxy_mgmt_k3s.yml b/ansible/inventory/group_vars/gxy_mgmt_k3s.yml index 88a567b8c..382c6e821 100644 --- a/ansible/inventory/group_vars/gxy_mgmt_k3s.yml +++ b/ansible/inventory/group_vars/gxy_mgmt_k3s.yml @@ -18,6 +18,8 @@ cilium_cluster_id: 1 server_config_yaml: | flannel-backend: "none" disable-network-policy: true + disable: + - metrics-server # kube-proxy replacement disabled — breaks etcd on k3s HA (see field-notes Failure 7) # Cilium still provides CNI + network policies + Hubble without it # Revisit on bare metal where performance matters diff --git a/ansible/roles/cilium/tasks/main.yml b/ansible/roles/cilium/tasks/main.yml index 79df3b07d..14ea98c67 100644 --- a/ansible/roles/cilium/tasks/main.yml +++ b/ansible/roles/cilium/tasks/main.yml @@ -66,9 +66,7 @@ - name: Wait for Cilium operator Deployment rollout ansible.builtin.command: - cmd: >- - kubectl -n kube-system rollout status - deployment/cilium-operator --timeout=180s + cmd: kubectl -n kube-system rollout status deployment/cilium-operator --timeout=180s environment: KUBECONFIG: /etc/rancher/k3s/k3s.yaml changed_when: false @@ -79,9 +77,7 @@ - name: Verify Cilium status ansible.builtin.command: - cmd: >- - kubectl -n kube-system exec ds/cilium - -c cilium-agent -- cilium status --brief + cmd: kubectl -n kube-system exec ds/cilium -c cilium-agent -- cilium status --brief environment: KUBECONFIG: /etc/rancher/k3s/k3s.yaml register: cilium_status diff --git a/k3s/gxy-management/cluster/cilium/values.yaml b/k3s/gxy-management/cluster/cilium/values.yaml index 08fabe673..7b5242a24 100644 --- a/k3s/gxy-management/cluster/cilium/values.yaml +++ b/k3s/gxy-management/cluster/cilium/values.yaml @@ -2,6 +2,8 @@ k8sServicePort: "6443" kubeProxyReplacement: false +devices: 
[eth0, eth1] +mtu: 1500 ipam: operator: From 10adb2a2421c4aa2d49646f3fef0a5f16d8d79c9 Mon Sep 17 00:00:00 2001 From: Mrugesh Mohapatra Date: Mon, 6 Apr 2026 00:36:31 +0530 Subject: [PATCH 40/40] fix(k3s): re-enable metrics-server with hostNetwork workaround Pods cannot reach node VPC IPs directly on Cilium multi-NIC nodes (open issue, see field-notes Failure 8b). metrics-server needs kubelet access on nodeIP:10250. Workaround: patch metrics-server deployment in Play 5 to use hostNetwork with --secure-port=4443 (avoids kubelet port conflict). Verified: kubectl top nodes returns data for all 3 nodes. --- ansible/inventory/group_vars/gxy_mgmt_k3s.yml | 2 -- ansible/play-k3s--bootstrap.yml | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/ansible/inventory/group_vars/gxy_mgmt_k3s.yml b/ansible/inventory/group_vars/gxy_mgmt_k3s.yml index 382c6e821..88a567b8c 100644 --- a/ansible/inventory/group_vars/gxy_mgmt_k3s.yml +++ b/ansible/inventory/group_vars/gxy_mgmt_k3s.yml @@ -18,8 +18,6 @@ cilium_cluster_id: 1 server_config_yaml: | flannel-backend: "none" disable-network-policy: true - disable: - - metrics-server # kube-proxy replacement disabled — breaks etcd on k3s HA (see field-notes Failure 7) # Cilium still provides CNI + network policies + Hubble without it # Revisit on bare metal where performance matters diff --git a/ansible/play-k3s--bootstrap.yml b/ansible/play-k3s--bootstrap.yml index 74ca44624..811a5f4be 100644 --- a/ansible/play-k3s--bootstrap.yml +++ b/ansible/play-k3s--bootstrap.yml @@ -223,6 +223,20 @@ delegate_to: localhost become: false + - name: Patch metrics-server for hostNetwork (pod→nodeIP workaround) + command: >- + k3s kubectl -n kube-system patch deploy metrics-server + --type=json + -p='[{"op":"add","path":"/spec/template/spec/hostNetwork","value":true}, + {"op":"replace","path":"/spec/template/spec/containers/0/ports/0/containerPort","value":4443}, + 
{"op":"replace","path":"/spec/template/spec/containers/0/args","value":["--cert-dir=/tmp","--secure-port=4443","--kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname","--kubelet-use-node-status-port","--metric-resolution=15s"]}]' + register: patch_result + changed_when: "'patched' in patch_result.stdout" + failed_when: false + retries: 5 + delay: 10 + until: patch_result.rc == 0 + - name: Done debug: msg: