diff --git a/ix-dev/community/nas-doctor/README.md b/ix-dev/community/nas-doctor/README.md
new file mode 100644
index 00000000000..8478c69c32c
--- /dev/null
+++ b/ix-dev/community/nas-doctor/README.md
@@ -0,0 +1,28 @@
+# NAS Doctor
+
+[NAS Doctor](https://github.com/mcdays94/nas-doctor) is a local diagnostic and monitoring tool for your NAS.
+It runs periodic health checks — analyzing SMART data, disk usage, Docker containers,
+GPU, process CPU usage, network speed, ZFS pools, UPS power, tunnels, and more — then
+surfaces findings with actionable recommendations backed by Backblaze failure rate data.
+
+## Features
+
+- 20+ diagnostic rules with automatic root-cause correlation
+- SMART health with Backblaze failure-rate thresholds (337k+ drives)
+- Top Processes with Docker container attribution (cgroup v1 and v2)
+- GPU monitoring (NVIDIA, Intel, AMD)
+- Network speed test scheduling (Ookla CLI)
+- Service checks (HTTP, TCP, DNS, Ping/ICMP, SMB, NFS, Speed)
+- ZFS pool health, UPS/power monitoring
+- Webhook alerts (Discord, Slack, Gotify, Ntfy)
+- Prometheus metrics endpoint
+- Multi-server fleet monitoring
+
+## Notes
+
+- Runs as root with the `SYS_RAWIO` capability for SMART health monitoring
+- Host PID namespace sharing is enabled so Top Processes can see all host processes
+  and match them to Docker containers via cgroup data
+- `/dev` and `/sys` are mounted read-only for device access and GPU telemetry
+- `/var/log`, `/mnt` and the Docker socket are also mounted read-only by default; each
+  of these mounts can be turned off in the app configuration
diff --git a/ix-dev/community/nas-doctor/app.yaml b/ix-dev/community/nas-doctor/app.yaml
new file mode 100644
index 00000000000..840821ff161
--- /dev/null
+++ b/ix-dev/community/nas-doctor/app.yaml
@@ -0,0 +1,56 @@
+annotations:
+  min_scale_version: 24.10.2.2
+app_version: 0.9.0
+capabilities:
+- description: NAS Doctor requires privileged access for SMART disk health monitoring
+    via smartctl.
+  name: SYS_RAWIO
+categories:
+- monitoring
+changelog_url: https://github.com/mcdays94/nas-doctor/releases
+date_added: '2026-04-10'
+description: Sleep tight knowing your server never does. NAS Doctor is a local diagnostic
+  and monitoring tool for your NAS — analyzing SMART data, disk usage, Docker containers,
+  GPU, process CPU, network speed, ZFS pools, UPS power, tunnels, and more.
+home: https://github.com/mcdays94/nas-doctor
+host_mounts:
+- description: Docker socket for container monitoring
+  host_path: /var/run/docker.sock
+- description: Device nodes for SMART and GPU access
+  host_path: /dev
+- description: Sysfs for GPU telemetry and drive mapping
+  host_path: /sys
+- description: Host logs for system log analysis
+  host_path: /var/log
+- description: Host mounts for per-disk and per-pool space monitoring
+  host_path: /mnt
+icon: https://raw.githubusercontent.com/mcdays94/nas-doctor/main/icons/icon3.png
+keywords:
+- monitoring
+- diagnostics
+- smart
+- health
+- nas
+- dashboard
+- zfs
+- gpu
+lib_version: 2.3.2
+lib_version_hash: ''
+maintainers:
+- email: dev@truenas.com
+  name: truenas
+  url: https://www.truenas.com/
+name: nas-doctor
+run_as_context:
+- description: NAS Doctor requires root access for SMART disk health monitoring via
+    smartctl.
+  gid: 0
+  group_name: root
+  uid: 0
+  user_name: root
+screenshots: []
+sources:
+- https://github.com/mcdays94/nas-doctor
+title: NAS Doctor
+train: community
+version: 1.0.0
diff --git a/ix-dev/community/nas-doctor/ix_values.yaml b/ix-dev/community/nas-doctor/ix_values.yaml
new file mode 100644
index 00000000000..e800a953359
--- /dev/null
+++ b/ix-dev/community/nas-doctor/ix_values.yaml
@@ -0,0 +1,8 @@
+images:
+  image:
+    repository: ghcr.io/mcdays94/nas-doctor
+    tag: "0.9.0"
+
+consts:
+  nas_doctor_container_name: nas-doctor
+  perms_container_name: nas-doctor-perms
diff --git a/ix-dev/community/nas-doctor/questions.yaml b/ix-dev/community/nas-doctor/questions.yaml
new file mode 100644
index 00000000000..e941006e62d
--- /dev/null
+++ b/ix-dev/community/nas-doctor/questions.yaml
@@ -0,0 +1,449 @@
+groups:
+  - name: NAS Doctor Configuration
+    description: Configure NAS Doctor
+  - name: Network Configuration
+    description: Configure Network for NAS Doctor
+  - name: Storage Configuration
+    description: Configure Storage for NAS Doctor
+  - name: Labels Configuration
+    description: Configure Labels for NAS Doctor
+  - name: Resources Configuration
+    description: Configure Resources for NAS Doctor
+
+questions:
+  - variable: nas_doctor
+    label: ""
+    group: NAS Doctor Configuration
+    schema:
+      type: dict
+      attrs:
+        - variable: scan_interval
+          label: Scan Interval
+          description: How often to run diagnostic scans.
+          schema:
+            type: string
+            default: "30m"
+            enum:
+              - value: "30m"
+                description: Every 30 minutes
+              - value: "1h"
+                description: Every hour
+              - value: "3h"
+                description: Every 3 hours
+              - value: "6h"
+                description: Every 6 hours
+              - value: "12h"
+                description: Every 12 hours
+              - value: "24h"
+                description: Every 24 hours
+        - variable: timezone
+          label: Timezone
+          description: Container timezone (e.g. America/New_York, Europe/London).
+          schema:
+            type: string
+            default: "Etc/UTC"
+            $ref:
+              - definitions/timezone
+        - variable: mount_docker_socket
+          label: Mount Docker Socket
+          description: |
+            Mount the Docker socket (read only) to enable container monitoring.
+          schema:
+            type: boolean
+            default: true
+        - variable: mount_host_logs
+          label: Mount Host Logs
+          description: |
+            Mount /var/log (read only) to enable system log analysis (dmesg, syslog errors).
+          schema:
+            type: boolean
+            default: true
+        - variable: mount_host_mnt
+          label: Mount Host Mounts
+          description: |
+            Mount /mnt (read only) to enable per-disk and per-pool space monitoring.
+          schema:
+            type: boolean
+            default: true
+        - variable: mount_dev
+          label: Mount /dev
+          description: |
+            Mount /dev (read only) to enable SMART disk health monitoring and GPU device access.
+            Required for SMART to work on most systems.
+          schema:
+            type: boolean
+            default: true
+        - variable: mount_sys
+          label: Mount /sys
+          description: |
+            Mount /sys (read only) to enable GPU telemetry (NVIDIA/Intel/AMD) and drive mapping.
+            Required for GPU monitoring.
+          schema:
+            type: boolean
+            default: true
+        - variable: additional_envs
+          label: Additional Environment Variables
+          schema:
+            type: list
+            default: []
+            items:
+              - variable: env
+                label: Environment Variable
+                schema:
+                  type: dict
+                  attrs:
+                    - variable: name
+                      label: Name
+                      schema:
+                        type: string
+                        required: true
+                    - variable: value
+                      label: Value
+                      schema:
+                        type: string
+
+  - variable: network
+    label: ""
+    group: Network Configuration
+    schema:
+      type: dict
+      attrs:
+        - variable: web_port
+          label: WebUI Port
+          description: The port for NAS Doctor web dashboard and API.
+          schema:
+            type: dict
+            attrs:
+              - variable: bind_mode
+                label: Port Bind Mode
+                description: |
+                  The port bind mode.
+                  - Publish: The port will be published on the host for external access.
+                  - Expose: The port will be exposed for inter-container communication.
+                  - None: The port will not be exposed or published.
+                schema:
+                  type: string
+                  default: "published"
+                  enum:
+                    - value: "published"
+                      description: Publish port on the host for external access
+                    - value: "exposed"
+                      description: Expose port for inter-container communication
+                    - value: ""
+                      description: None
+              - variable: port_number
+                label: Port Number
+                schema:
+                  type: int
+                  default: 8060
+                  min: 1
+                  max: 65535
+                  required: true
+              - variable: host_ips
+                label: Host IPs
+                description: IPs on the host to bind this port.
+                schema:
+                  type: list
+                  show_if: [["bind_mode", "=", "published"]]
+                  default: []
+                  items:
+                    - variable: host_ip
+                      label: Host IP
+                      schema:
+                        type: string
+                        required: true
+                        $ref:
+                          - definitions/node_bind_ip
+        - variable: host_network
+          label: Host Network
+          description: |
+            Bind to the host network so NAS Doctor can see all host network interfaces.
+            Not needed for SMART monitoring, which uses the /dev mount and SYS_RAWIO.
+          schema:
+            type: boolean
+            default: false
+        - variable: dns_nameservers
+          label: DNS Nameservers
+          schema:
+            type: list
+            default: []
+            items:
+              - variable: option
+                label: Option
+                schema:
+                  type: string
+                  required: true
+
+  - variable: storage
+    label: ""
+    group: Storage Configuration
+    schema:
+      type: dict
+      attrs:
+        - variable: data
+          label: NAS Doctor Data Storage
+          description: Persistent storage for SQLite database, config, and backups.
+          schema:
+            type: dict
+            attrs:
+              - variable: type
+                label: Type
+                description: |
+                  ixVolume: Is dataset created automatically by the system.
+                  Host Path: Is a path that already exists on the system.
+                schema:
+                  type: string
+                  required: true
+                  default: "ix_volume"
+                  enum:
+                    - value: "host_path"
+                      description: Host Path (Path that already exists on the system)
+                    - value: "ix_volume"
+                      description: ixVolume (Dataset created automatically by the system)
+              - variable: ix_volume_config
+                label: ixVolume Configuration
+                schema:
+                  type: dict
+                  show_if: [["type", "=", "ix_volume"]]
+                  $ref:
+                    - "normalize/ix_volume"
+                  attrs:
+                    - variable: acl_enable
+                      label: Enable ACL
+                      schema:
+                        type: boolean
+                        default: false
+                    - variable: dataset_name
+                      label: Dataset Name
+                      schema:
+                        type: string
+                        required: true
+                        hidden: true
+                        default: "data"
+                    - variable: acl_entries
+                      label: ACL Configuration
+                      schema:
+                        type: dict
+                        show_if: [["acl_enable", "=", true]]
+                        attrs: []
+              - variable: host_path_config
+                label: Host Path Configuration
+                schema:
+                  type: dict
+                  show_if: [["type", "=", "host_path"]]
+                  attrs:
+                    - variable: acl_enable
+                      label: Enable ACL
+                      schema:
+                        type: boolean
+                        default: false
+                    - variable: acl
+                      label: ACL Configuration
+                      schema:
+                        type: dict
+                        show_if: [["acl_enable", "=", true]]
+                        attrs: []
+                        $ref:
+                          - "normalize/acl"
+                    - variable: path
+                      label: Host Path
+                      schema:
+                        type: hostpath
+                        show_if: [["acl_enable", "=", false]]
+                        required: true
+        - variable: additional_storage
+          label: Additional Storage
+          schema:
+            type: list
+            default: []
+            items:
+              - variable: storageEntry
+                label: Storage Entry
+                schema:
+                  type: dict
+                  attrs:
+                    - variable: type
+                      label: Type
+                      schema:
+                        type: string
+                        required: true
+                        default: "host_path"
+                        enum:
+                          - value: "host_path"
+                            description: Host Path (Path that already exists on the system)
+                          - value: "ix_volume"
+                            description: ixVolume (Dataset created automatically by the system)
+                          - value: "cifs"
+                            description: SMB/CIFS Share
+                          - value: "nfs"
+                            description: NFS Share
+                    - variable: read_only
+                      label: Read Only
+                      schema:
+                        type: boolean
+                        default: false
+                    - variable: mount_path
+                      label: Mount Path
+                      description: The path inside the container to mount the storage.
+                      schema:
+                        type: path
+                        required: true
+                    - variable: host_path_config
+                      label: Host Path Configuration
+                      schema:
+                        type: dict
+                        show_if: [["type", "=", "host_path"]]
+                        attrs:
+                          - variable: acl_enable
+                            label: Enable ACL
+                            schema:
+                              type: boolean
+                              default: false
+                          - variable: acl
+                            label: ACL Configuration
+                            schema:
+                              type: dict
+                              show_if: [["acl_enable", "=", true]]
+                              attrs: []
+                              $ref:
+                                - "normalize/acl"
+                          - variable: path
+                            label: Host Path
+                            schema:
+                              type: hostpath
+                              show_if: [["acl_enable", "=", false]]
+                              required: true
+                    - variable: ix_volume_config
+                      label: ixVolume Configuration
+                      schema:
+                        type: dict
+                        show_if: [["type", "=", "ix_volume"]]
+                        $ref:
+                          - "normalize/ix_volume"
+                        attrs:
+                          - variable: acl_enable
+                            label: Enable ACL
+                            schema:
+                              type: boolean
+                              default: false
+                          - variable: dataset_name
+                            label: Dataset Name
+                            schema:
+                              type: string
+                              required: true
+                              default: "storage_entry"
+                          - variable: acl_entries
+                            label: ACL Configuration
+                            schema:
+                              type: dict
+                              show_if: [["acl_enable", "=", true]]
+                              attrs: []
+                    - variable: cifs_config
+                      label: SMB Configuration
+                      schema:
+                        type: dict
+                        show_if: [["type", "=", "cifs"]]
+                        attrs:
+                          - variable: server
+                            label: Server
+                            schema:
+                              type: string
+                              required: true
+                          - variable: path
+                            label: Path
+                            schema:
+                              type: string
+                              required: true
+                          - variable: username
+                            label: Username
+                            schema:
+                              type: string
+                              required: true
+                          - variable: password
+                            label: Password
+                            schema:
+                              type: string
+                              required: true
+                              private: true
+                          - variable: domain
+                            label: Domain
+                            schema:
+                              type: string
+                    - variable: nfs_config
+                      label: NFS Configuration
+                      schema:
+                        type: dict
+                        show_if: [["type", "=", "nfs"]]
+                        attrs:
+                          - variable: server
+                            label: Server
+                            schema:
+                              type: string
+                              required: true
+                          - variable: path
+                            label: Path
+                            schema:
+                              type: string
+                              required: true
+
+  - variable: labels
+    label: ""
+    group: Labels Configuration
+    schema:
+      type: list
+      default: []
+      items:
+        - variable: label
+          label: Label
+          schema:
+            type: dict
+            attrs:
+              - variable: key
+                label: Key
+                schema:
+                  type: string
+                  required: true
+              - variable: value
+                label: Value
+                schema:
+                  type: string
+                  required: true
+              - variable: containers
+                label: Containers
+                schema:
+                  type: list
+                  items:
+                    - variable: container
+                      label: Container
+                      schema:
+                        type: string
+                        required: true
+                        enum:
+                          - value: nas-doctor
+                            description: nas-doctor
+
+  - variable: resources
+    label: ""
+    group: Resources Configuration
+    schema:
+      type: dict
+      attrs:
+        - variable: limits
+          label: Limits
+          schema:
+            type: dict
+            attrs:
+              - variable: cpus
+                label: CPUs
+                description: CPUs limit for NAS Doctor.
+                schema:
+                  type: int
+                  default: 2
+                  required: true
+              - variable: memory
+                label: Memory (in MB)
+                description: Memory limit for NAS Doctor.
+                schema:
+                  type: int
+                  default: 4096
+                  required: true
diff --git a/ix-dev/community/nas-doctor/templates/docker-compose.yaml b/ix-dev/community/nas-doctor/templates/docker-compose.yaml
new file mode 100644
index 00000000000..2501e0f89cd
--- /dev/null
+++ b/ix-dev/community/nas-doctor/templates/docker-compose.yaml
@@ -0,0 +1,71 @@
+{% set tpl = ix_lib.base.render.Render(values) %}
+
+{% set c1 = tpl.add_container(values.consts.nas_doctor_container_name, "image") %}
+{% set perm_container = tpl.deps.perms(values.consts.perms_container_name) %}
+{% set perms_config = {"uid": 0, "gid": 0, "mode": "check"} %}
+
+{# Run as root (uid 0, gid 0) with the SYS_RAWIO capability for smartctl access #}
+{% do c1.set_user(0, 0) %}
+{% do c1.add_caps(["SYS_RAWIO"]) %}
+
+{# Share the host PID namespace: Top Processes needs to see all host processes
+   and match them to Docker containers via cgroup inspection #}
+{% do c1.set_pid_mode("host") %}
+
+{% do c1.healthcheck.set_test("curl", {"port": values.network.web_port.port_number, "path": "/api/v1/health"}) %}
+
+{# Environment: listen address, scan interval, optional TZ, then user-supplied variables #}
+{% do c1.environment.add_env("NAS_DOCTOR_LISTEN", ":" ~ values.network.web_port.port_number) %}
+{% do c1.environment.add_env("NAS_DOCTOR_INTERVAL", values.nas_doctor.scan_interval) %}
+{% if values.nas_doctor.timezone %}
+  {% do c1.environment.add_env("TZ", values.nas_doctor.timezone) %}
+{% endif %}
+{% do c1.environment.add_user_envs(values.nas_doctor.additional_envs) %}
+
+{# Port mapping. NOTE(review): values.network.host_network is never read in this template; confirm the library applies it #}
+{% do c1.add_port(values.network.web_port) %}
+
+{# Docker socket for container monitoring (only when mount_docker_socket is set) #}
+{% if values.nas_doctor.mount_docker_socket %}
+  {% do c1.add_docker_socket() %}
+{% endif %}
+
+{# Storage: persistent data at /data, also registered with the perms container using perms_config #}
+{% do c1.add_storage("/data", values.storage.data) %}
+{% do perm_container.add_or_skip_action("data", values.storage.data, perms_config) %}
+
+{# Storage: host /var/log mounted read-only at /host/log #}
+{% if values.nas_doctor.mount_host_logs %}
+  {% do c1.add_storage("/host/log", {"type": "host_path", "host_path_config": {"path": "/var/log"}, "read_only": true}) %}
+{% endif %}
+
+{# Storage: host /mnt mounted read-only at /host/mnt (for disk space monitoring) #}
+{% if values.nas_doctor.mount_host_mnt %}
+  {% do c1.add_storage("/host/mnt", {"type": "host_path", "host_path_config": {"path": "/mnt"}, "read_only": true}) %}
+{% endif %}
+
+{# Storage: host /dev mounted read-only for SMART device access and GPU device nodes #}
+{% if values.nas_doctor.mount_dev %}
+  {% do c1.add_storage("/dev", {"type": "host_path", "host_path_config": {"path": "/dev"}, "read_only": true}) %}
+{% endif %}
+
+{# Storage: host /sys mounted read-only for GPU telemetry and drive mapping #}
+{% if values.nas_doctor.mount_sys %}
+  {% do c1.add_storage("/sys", {"type": "host_path", "host_path_config": {"path": "/sys"}, "read_only": true}) %}
+{% endif %}
+
+{# Additional user-defined storage mounts, each also registered with the perms container #}
+{% for store in values.storage.additional_storage %}
+  {% do c1.add_storage(store.mount_path, store) %}
+  {% do perm_container.add_or_skip_action(store.mount_path, store, perms_config) %}
+{% endfor %}
+
+{% if perm_container.has_actions() %}
+  {% do perm_container.activate() %}
+  {% do c1.depends.add_dependency(values.consts.perms_container_name, "service_completed_successfully") %}
+{% endif %}
+
+{# Portal entry for the web UI port #}
+{% do tpl.portals.add(values.network.web_port) %}
+
+{{ tpl.render() | tojson }}
diff --git a/ix-dev/community/nas-doctor/templates/test_values/basic-values.yaml b/ix-dev/community/nas-doctor/templates/test_values/basic-values.yaml
new file mode 100644
index 00000000000..0f3e6ea9c5f
--- /dev/null
+++ b/ix-dev/community/nas-doctor/templates/test_values/basic-values.yaml
@@ -0,0 +1,31 @@
+resources:
+  limits:
+    cpus: 2.0
+    memory: 4096
+
+nas_doctor:
+  scan_interval: "30m"
+  timezone: "Etc/UTC"
+  mount_docker_socket: true
+  mount_host_logs: true
+  mount_host_mnt: true
+  mount_dev: true
+  mount_sys: true
+  additional_envs: []
+
+network:
+  host_network: false
+  web_port:
+    bind_mode: published
+    port_number: 30060  # test port; differs from the questions.yaml default of 8060
+
+ix_volumes:
+  data: /opt/tests/mnt/nas-doctor/data
+
+storage:
+  data:
+    type: ix_volume
+    ix_volume_config:
+      dataset_name: data
+      create_host_path: true
+  additional_storage: []