diff --git a/cmd/main.go b/cmd/main.go
index 332623f0d..d7cc6cc8b 100644
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -25,6 +25,7 @@ import (
 	"path/filepath"
 	"time"
 
+	crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
 	"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
 
 	intController "github.com/splunk/splunk-operator/internal/controller"
@@ -55,6 +56,7 @@ import (
 	"github.com/splunk/splunk-operator/internal/controller"
 
 	cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1"
+	pgprometheus "github.com/splunk/splunk-operator/pkg/postgresql/shared/adapter/prometheus"
 	//+kubebuilder:scaffold:imports
 	//extapi "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
 )
@@ -282,18 +284,29 @@ func main() {
 		setupLog.Error(err, "unable to create controller", "controller", "Telemetry")
 		os.Exit(1)
 	}
+	pgMetricsRecorder := pgprometheus.NewPrometheusRecorder()
+	if err := pgprometheus.Register(crmetrics.Registry); err != nil {
+		setupLog.Error(err, "unable to register PostgreSQL metrics")
+		os.Exit(1)
+	}
+	pgFleetMetricsCollector := pgprometheus.NewFleetCollector()
+
 	if err := (&controller.PostgresDatabaseReconciler{
-		Client:   mgr.GetClient(),
-		Scheme:   mgr.GetScheme(),
-		Recorder: mgr.GetEventRecorderFor("postgresdatabase-controller"),
+		Client:         mgr.GetClient(),
+		Scheme:         mgr.GetScheme(),
+		Recorder:       mgr.GetEventRecorderFor("postgresdatabase-controller"),
+		Metrics:        pgMetricsRecorder,
+		FleetCollector: pgFleetMetricsCollector,
 	}).SetupWithManager(mgr); err != nil {
 		setupLog.Error(err, "unable to create controller", "controller", "PostgresDatabase")
 		os.Exit(1)
 	}
 	if err := (&controller.PostgresClusterReconciler{
-		Client:   mgr.GetClient(),
-		Scheme:   mgr.GetScheme(),
-		Recorder: mgr.GetEventRecorderFor("postgrescluster-controller"),
+		Client:         mgr.GetClient(),
+		Scheme:         mgr.GetScheme(),
+		Recorder:       mgr.GetEventRecorderFor("postgrescluster-controller"),
+		Metrics:        pgMetricsRecorder,
+		FleetCollector: pgFleetMetricsCollector,
 	}).SetupWithManager(mgr); err != nil {
 		setupLog.Error(err, "unable to create controller", "controller", "PostgresCluster")
 		os.Exit(1)
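Note on the wiring above: Register is called exactly once against controller-runtime's global crmetrics.Registry, so the new series are served from the manager's existing /metrics endpoint with no extra server. Registering the same collectors twice into one registry fails, which is why main.go treats a Register error as fatal. A minimal in-package sketch of that contract (the test name and the throwaway registry are illustrative, not part of this change):

package prometheus

import (
	"errors"
	"testing"

	"github.com/prometheus/client_golang/prometheus"
)

// TestRegisterTwice is a hypothetical in-package test: registering into a
// fresh registry succeeds once, then fails with AlreadyRegisteredError on
// the second attempt.
func TestRegisterTwice(t *testing.T) {
	reg := prometheus.NewRegistry()
	if err := Register(reg); err != nil {
		t.Fatalf("first Register failed: %v", err)
	}
	var are prometheus.AlreadyRegisteredError
	if err := Register(reg); !errors.As(err, &are) {
		t.Fatalf("expected AlreadyRegisteredError, got %v", err)
	}
}

Taking an injected prometheus.Registerer rather than using promauto's package-global registration is what makes this test possible against a fresh registry.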
diff --git a/internal/controller/postgrescluster_controller.go b/internal/controller/postgrescluster_controller.go
index 70b11c9e6..c49e7ba4d 100644
--- a/internal/controller/postgrescluster_controller.go
+++ b/internal/controller/postgrescluster_controller.go
@@ -22,6 +22,8 @@ import (
 	cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1"
 	enterprisev4 "github.com/splunk/splunk-operator/api/v4"
 	clustercore "github.com/splunk/splunk-operator/pkg/postgresql/cluster/core"
+	pgprometheus "github.com/splunk/splunk-operator/pkg/postgresql/shared/adapter/prometheus"
+	"github.com/splunk/splunk-operator/pkg/postgresql/shared/ports"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/equality"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -42,8 +44,10 @@ const (
 // PostgresClusterReconciler reconciles PostgresCluster resources.
 type PostgresClusterReconciler struct {
 	client.Client
-	Scheme   *runtime.Scheme
-	Recorder record.EventRecorder
+	Scheme         *runtime.Scheme
+	Recorder       record.EventRecorder
+	Metrics        ports.Recorder
+	FleetCollector *pgprometheus.FleetCollector
 }
 
 // +kubebuilder:rbac:groups=enterprise.splunk.com,resources=postgresclusters,verbs=get;list;watch;create;update;patch;delete
@@ -57,8 +61,10 @@ type PostgresClusterReconciler struct {
 // +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch
 
 func (r *PostgresClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
-	rc := &clustercore.ReconcileContext{Client: r.Client, Scheme: r.Scheme, Recorder: r.Recorder}
-	return clustercore.PostgresClusterService(ctx, rc, req)
+	rc := &clustercore.ReconcileContext{Client: r.Client, Scheme: r.Scheme, Recorder: r.Recorder, Metrics: r.Metrics}
+	result, err := clustercore.PostgresClusterService(ctx, rc, req)
+	r.FleetCollector.CollectClusterMetrics(ctx, r.Client, r.Metrics)
+	return result, err
 }
 
 // SetupWithManager registers the controller and owned resource watches.
diff --git a/internal/controller/postgresdatabase_controller.go b/internal/controller/postgresdatabase_controller.go
index 0c6db9628..ab54da0fd 100644
--- a/internal/controller/postgresdatabase_controller.go
+++ b/internal/controller/postgresdatabase_controller.go
@@ -24,6 +24,8 @@ import (
 	enterprisev4 "github.com/splunk/splunk-operator/api/v4"
 	dbadapter "github.com/splunk/splunk-operator/pkg/postgresql/database/adapter"
 	dbcore "github.com/splunk/splunk-operator/pkg/postgresql/database/core"
+	pgprometheus "github.com/splunk/splunk-operator/pkg/postgresql/shared/adapter/prometheus"
+	"github.com/splunk/splunk-operator/pkg/postgresql/shared/ports"
 
 	corev1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -42,8 +44,10 @@ import (
 // PostgresDatabaseReconciler reconciles a PostgresDatabase object.
 type PostgresDatabaseReconciler struct {
 	client.Client
-	Scheme   *runtime.Scheme
-	Recorder record.EventRecorder
+	Scheme         *runtime.Scheme
+	Recorder       record.EventRecorder
+	Metrics        ports.Recorder
+	FleetCollector *pgprometheus.FleetCollector
 }
 
 const (
@@ -71,8 +75,11 @@ func (r *PostgresDatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Req
 		}
 		return ctrl.Result{}, err
 	}
-	rc := &dbcore.ReconcileContext{Client: r.Client, Scheme: r.Scheme, Recorder: r.Recorder}
-	return dbcore.PostgresDatabaseService(ctx, rc, postgresDB, dbadapter.NewDBRepository)
+	rc := &dbcore.ReconcileContext{Client: r.Client, Scheme: r.Scheme, Recorder: r.Recorder, Metrics: r.Metrics}
+	result, err := dbcore.PostgresDatabaseService(ctx, rc, postgresDB, dbadapter.NewDBRepository)
+	r.FleetCollector.CollectDatabaseMetrics(ctx, r.Client, r.Metrics)
+
+	return result, err
 }
 
 // SetupWithManager sets up the controller with the Manager.
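Both reconcilers now refresh the fleet gauges after every pass, including failed ones, so the gauges converge even when individual reconciles error out. A wiring sketch for a unit test, assuming enterprisev4.AddToScheme is the scaffolded scheme installer and that the controller's not-found handling returns cleanly (both are assumptions, not shown in this diff):

package controller_test

import (
	"context"
	"testing"

	enterprisev4 "github.com/splunk/splunk-operator/api/v4"
	"github.com/splunk/splunk-operator/internal/controller"
	pgprometheus "github.com/splunk/splunk-operator/pkg/postgresql/shared/adapter/prometheus"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/tools/record"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

// TestReconcileAbsentDatabase is a hypothetical test: reconciling a
// PostgresDatabase that no longer exists should return cleanly rather
// than erroring, keeping requeue churn down.
func TestReconcileAbsentDatabase(t *testing.T) {
	scheme := runtime.NewScheme()
	if err := enterprisev4.AddToScheme(scheme); err != nil { // assumed scaffolded installer
		t.Fatal(err)
	}
	r := &controller.PostgresDatabaseReconciler{
		Client:         fake.NewClientBuilder().WithScheme(scheme).Build(),
		Scheme:         scheme,
		Recorder:       record.NewFakeRecorder(16),
		Metrics:        &pgprometheus.NoopRecorder{},
		FleetCollector: pgprometheus.NewFleetCollector(),
	}
	if _, err := r.Reconcile(context.Background(), ctrl.Request{
		NamespacedName: types.NamespacedName{Namespace: "default", Name: "absent"},
	}); err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
}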
diff --git a/pkg/postgresql/cluster/core/cluster.go b/pkg/postgresql/cluster/core/cluster.go
index 3334011c6..efbf00d14 100644
--- a/pkg/postgresql/cluster/core/cluster.go
+++ b/pkg/postgresql/cluster/core/cluster.go
@@ -24,6 +24,7 @@ import (
 	cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1"
 	password "github.com/sethvargo/go-password/password"
 	enterprisev4 "github.com/splunk/splunk-operator/api/v4"
+	"github.com/splunk/splunk-operator/pkg/postgresql/shared/ports"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/equality"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -67,7 +68,7 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl.
 	ctx = log.IntoContext(ctx, logger)
 
 	updateStatus := func(conditionType conditionTypes, status metav1.ConditionStatus, reason conditionReasons, message string, phase reconcileClusterPhases) error {
-		return setStatus(ctx, c, postgresCluster, conditionType, status, reason, message, phase)
+		return setStatus(ctx, c, rc.Metrics, postgresCluster, conditionType, status, reason, message, phase)
 	}
 
 	// Finalizer handling must come before any other processing.
@@ -384,7 +385,7 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl.
 	default:
 		oldConditions := make([]metav1.Condition, len(postgresCluster.Status.Conditions))
 		copy(oldConditions, postgresCluster.Status.Conditions)
-		if err := syncPoolerStatus(ctx, c, postgresCluster); err != nil {
+		if err := syncPoolerStatus(ctx, c, rc.Metrics, postgresCluster); err != nil {
 			logger.Error(err, "Failed to sync pooler status")
 			rc.emitWarning(postgresCluster, EventPoolerReconcileFailed, fmt.Sprintf("Failed to sync pooler status: %v", err))
 			if statusErr := updateStatus(poolerReady, metav1.ConditionFalse, reasonPoolerReconciliationFailed,
@@ -450,7 +451,7 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl.
 	if postgresCluster.Status.Phase != nil {
 		oldPhase = *postgresCluster.Status.Phase
 	}
-	if err := syncStatus(ctx, c, postgresCluster, cnpgCluster); err != nil {
+	if err := syncStatus(ctx, c, rc.Metrics, postgresCluster, cnpgCluster); err != nil {
 		logger.Error(err, "Failed to sync status")
 		if apierrors.IsConflict(err) {
 			logger.Info("Conflict during status update, will requeue")
@@ -478,7 +479,7 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl.
 			logger.Info("Poolers ready, syncing status")
 			poolerOldConditions := make([]metav1.Condition, len(postgresCluster.Status.Conditions))
 			copy(poolerOldConditions, postgresCluster.Status.Conditions)
-			_ = syncPoolerStatus(ctx, c, postgresCluster)
+			_ = syncPoolerStatus(ctx, c, rc.Metrics, postgresCluster)
 			rc.emitPoolerReadyTransition(postgresCluster, poolerOldConditions)
 		}
 	}
@@ -756,7 +757,7 @@ func deleteConnectionPoolers(ctx context.Context, c client.Client, cluster *ente
 }
 
 // syncPoolerStatus populates ConnectionPoolerStatus and the PoolerReady condition.
-func syncPoolerStatus(ctx context.Context, c client.Client, cluster *enterprisev4.PostgresCluster) error {
+func syncPoolerStatus(ctx context.Context, c client.Client, metrics ports.Recorder, cluster *enterprisev4.PostgresCluster) error {
 	rwPooler := &cnpgv1.Pooler{}
 	if err := c.Get(ctx, types.NamespacedName{
 		Name: poolerResourceName(cluster.Name, readWriteEndpoint),
@@ -777,13 +778,13 @@ func syncPoolerStatus(ctx context.Context, c client.Client, cluster *enterprisev
 
 	rwDesired, rwScheduled := poolerInstanceCount(rwPooler)
 	roDesired, roScheduled := poolerInstanceCount(roPooler)
-	return setStatus(ctx, c, cluster, poolerReady, metav1.ConditionTrue, reasonAllInstancesReady,
+	return setStatus(ctx, c, metrics, cluster, poolerReady, metav1.ConditionTrue, reasonAllInstancesReady,
 		fmt.Sprintf("%s: %d/%d, %s: %d/%d", readWriteEndpoint, rwScheduled, rwDesired, readOnlyEndpoint, roScheduled, roDesired),
 		readyClusterPhase)
 }
 
 // syncStatus maps CNPG Cluster state to PostgresCluster status.
-func syncStatus(ctx context.Context, c client.Client, cluster *enterprisev4.PostgresCluster, cnpgCluster *cnpgv1.Cluster) error {
+func syncStatus(ctx context.Context, c client.Client, metrics ports.Recorder, cluster *enterprisev4.PostgresCluster, cnpgCluster *cnpgv1.Cluster) error {
 	cluster.Status.ProvisionerRef = &corev1.ObjectReference{
 		APIVersion: "postgresql.cnpg.io/v1",
 		Kind:       "Cluster",
@@ -836,13 +837,13 @@ func syncStatus(ctx context.Context, c client.Client, cluster *enterprisev4.Post
 		message = fmt.Sprintf("CNPG cluster phase: %s", cnpgCluster.Status.Phase)
 	}
 
-	return setStatus(ctx, c, cluster, clusterReady, condStatus, reason, message, phase)
+	return setStatus(ctx, c, metrics, cluster, clusterReady, condStatus, reason, message, phase)
 }
 
 // setStatus sets the phase, condition and persists the status.
 // It skips the API write when the resulting status is identical to the current
 // state, avoiding unnecessary etcd churn and ResourceVersion bumps on stable clusters.
-func setStatus(ctx context.Context, c client.Client, cluster *enterprisev4.PostgresCluster, condType conditionTypes, status metav1.ConditionStatus, reason conditionReasons, message string, phase reconcileClusterPhases) error {
+func setStatus(ctx context.Context, c client.Client, metrics ports.Recorder, cluster *enterprisev4.PostgresCluster, condType conditionTypes, status metav1.ConditionStatus, reason conditionReasons, message string, phase reconcileClusterPhases) error {
 	before := cluster.Status.DeepCopy()
 
 	p := string(phase)
@@ -859,6 +860,8 @@ func setStatus(ctx context.Context, c client.Client, cluster *enterprisev4.Postg
 		return nil
 	}
 
+	metrics.IncStatusTransition(ports.ControllerCluster, string(condType), string(status), string(reason))
+
 	if err := c.Status().Update(ctx, cluster); err != nil {
 		return fmt.Errorf("failed to update PostgresCluster status: %w", err)
 	}
diff --git a/pkg/postgresql/cluster/core/types.go b/pkg/postgresql/cluster/core/types.go
index 042a5ae82..7a43322fe 100644
--- a/pkg/postgresql/cluster/core/types.go
+++ b/pkg/postgresql/cluster/core/types.go
@@ -4,6 +4,7 @@ import (
 	"time"
 
 	enterprisev4 "github.com/splunk/splunk-operator/api/v4"
+	"github.com/splunk/splunk-operator/pkg/postgresql/shared/ports"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/client-go/tools/record"
@@ -17,6 +18,7 @@ type ReconcileContext struct {
 	Client   client.Client
 	Scheme   *runtime.Scheme
 	Recorder record.EventRecorder
+	Metrics  ports.Recorder
 }
 
 // normalizedCNPGClusterSpec is a subset of cnpgv1.ClusterSpec fields used for drift detection.
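Because setStatus returns before the Update call when nothing changed, the transitions counter only moves on genuine condition changes, not on every reconcile of a stable cluster. In-package tests can pin that down with a small capturing double; a sketch (captureRecorder is hypothetical, not part of this change):

package core

import "github.com/splunk/splunk-operator/pkg/postgresql/shared/ports"

// captureRecorder records IncStatusTransition calls and ignores the gauges;
// wiring it into ReconcileContext{Metrics: ...} lets a test assert that a
// second, identical setStatus call appends no new entry.
type captureRecorder struct {
	transitions []string
}

func (r *captureRecorder) IncStatusTransition(controller, condition, status, reason string) {
	r.transitions = append(r.transitions, controller+"/"+condition+"="+status+" ("+reason+")")
}

func (r *captureRecorder) SetClusterPhases(map[string]float64)        {}
func (r *captureRecorder) SetPoolerEnabledClusters(float64)           {}
func (r *captureRecorder) SetDatabasePhases(map[string]float64)       {}
func (r *captureRecorder) SetManagedUsers(string, map[string]float64) {}

var _ ports.Recorder = (*captureRecorder)(nil)

Note the asymmetry with the database side below: persistStatus counts every status write, since applyStatus has no skip-identical check.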
diff --git a/pkg/postgresql/database/core/database.go b/pkg/postgresql/database/core/database.go
index f84a35fd9..56dfb705a 100644
--- a/pkg/postgresql/database/core/database.go
+++ b/pkg/postgresql/database/core/database.go
@@ -11,6 +11,7 @@ import (
 	cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1"
 	"github.com/sethvargo/go-password/password"
 	enterprisev4 "github.com/splunk/splunk-operator/api/v4"
+	"github.com/splunk/splunk-operator/pkg/postgresql/shared/ports"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/api/meta"
@@ -43,7 +44,7 @@ func PostgresDatabaseService(
 	logger.Info("Reconciling PostgresDatabase")
 
 	updateStatus := func(conditionType conditionTypes, conditionStatus metav1.ConditionStatus, reason conditionReasons, message string, phase reconcileDBPhases) error {
-		return persistStatus(ctx, c, postgresDB, conditionType, conditionStatus, reason, message, phase)
+		return persistStatus(ctx, c, rc.Metrics, postgresDB, conditionType, conditionStatus, reason, message, phase)
 	}
 
 	// Finalizer: cleanup on deletion, register on creation.
@@ -186,6 +187,10 @@ func PostgresDatabaseService(
 	if err := patchManagedRoles(ctx, c, postgresDB, cluster); err != nil {
 		logger.Error(err, "Failed to patch users in CNPG Cluster")
 		rc.emitWarning(postgresDB, EventManagedRolesPatchFailed, fmt.Sprintf("Failed to patch managed roles: %v", err))
+		if statusErr := updateStatus(rolesReady, metav1.ConditionFalse, reasonUsersCreationFailed,
+			fmt.Sprintf("Failed to patch managed roles: %v", err), failedDBPhase); statusErr != nil {
+			logger.Error(statusErr, "Failed to update status")
+		}
 		return ctrl.Result{}, err
 	}
 	rc.emitNormal(postgresDB, EventRoleReconciliationStarted, fmt.Sprintf("Patched managed roles, waiting for %d roles to reconcile", len(desiredUsers)))
@@ -223,6 +228,10 @@ func PostgresDatabaseService(
 	if err != nil {
 		logger.Error(err, "Failed to reconcile CNPG Databases")
 		rc.emitWarning(postgresDB, EventDatabasesReconcileFailed, fmt.Sprintf("Failed to reconcile databases: %v", err))
+		if statusErr := updateStatus(databasesReady, metav1.ConditionFalse, reasonDatabaseReconcileFailed,
+			fmt.Sprintf("Failed to reconcile databases: %v", err), failedDBPhase); statusErr != nil {
+			logger.Error(statusErr, "Failed to update status")
+		}
 		return ctrl.Result{}, err
 	}
 	if len(adopted) > 0 {
@@ -493,8 +502,9 @@ func verifyDatabasesReady(ctx context.Context, c client.Client, postgresDB *ente
 	return notReady, nil
 }
 
-func persistStatus(ctx context.Context, c client.Client, db *enterprisev4.PostgresDatabase, conditionType conditionTypes, conditionStatus metav1.ConditionStatus, reason conditionReasons, message string, phase reconcileDBPhases) error {
+func persistStatus(ctx context.Context, c client.Client, metrics ports.Recorder, db *enterprisev4.PostgresDatabase, conditionType conditionTypes, conditionStatus metav1.ConditionStatus, reason conditionReasons, message string, phase reconcileDBPhases) error {
 	applyStatus(db, conditionType, conditionStatus, reason, message, phase)
+	metrics.IncStatusTransition(ports.ControllerDatabase, string(conditionType), string(conditionStatus), string(reason))
 	return c.Status().Update(ctx, db)
 }
diff --git a/pkg/postgresql/database/core/database_unit_test.go b/pkg/postgresql/database/core/database_unit_test.go
index 8d4da6c52..aeaf5fa84 100644
--- a/pkg/postgresql/database/core/database_unit_test.go
+++ b/pkg/postgresql/database/core/database_unit_test.go
@@ -20,6 +20,7 @@ import (
 	cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1"
 	enterprisev4 "github.com/splunk/splunk-operator/api/v4"
+	pgprometheus "github.com/splunk/splunk-operator/pkg/postgresql/shared/adapter/prometheus"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 	corev1 "k8s.io/api/core/v1"
 
@@ -306,7 +307,7 @@ func TestVerifyRolesReady(t *testing.T) {
 				},
 			},
 		},
-		wantErr: "user main_db_rw reconciliation failed: [reserved role]",
+		wantErr: "reconciling user main_db_rw: [reserved role]",
 	},
 	{
 		name: "returns missing roles that are not reconciled yet",
@@ -591,6 +592,7 @@ func TestSetStatus(t *testing.T) {
 	err := persistStatus(
 		context.Background(),
 		c,
+		&pgprometheus.NoopRecorder{},
 		postgresDB,
 		clusterReady,
 		metav1.ConditionTrue,
diff --git a/pkg/postgresql/database/core/types.go b/pkg/postgresql/database/core/types.go
index bf07fd19f..8771f3c34 100644
--- a/pkg/postgresql/database/core/types.go
+++ b/pkg/postgresql/database/core/types.go
@@ -4,16 +4,18 @@ import (
 	"time"
 
 	enterprisev4 "github.com/splunk/splunk-operator/api/v4"
+	"github.com/splunk/splunk-operator/pkg/postgresql/shared/ports"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/client-go/tools/record"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
-// ReconcileContext bundles infrastructure dependencies injected by the controller
+// ReconcileContext bundles infrastructure dependencies injected by the controller.
 type ReconcileContext struct {
 	Client   client.Client
 	Scheme   *runtime.Scheme
 	Recorder record.EventRecorder
+	Metrics  ports.Recorder
 }
 
 type reconcileDBPhases string
@@ -76,9 +78,10 @@ const (
 	reasonUsersAvailable           conditionReasons = "UsersAvailable"
 	reasonRoleConflict             conditionReasons = "RoleConflict"
 	reasonConfigMapsCreationFailed conditionReasons = "ConfigMapsCreationFailed"
-	reasonConfigMapsCreated        conditionReasons = "ConfigMapsCreated"
-	reasonPrivilegesGranted        conditionReasons = "PrivilegesGranted"
-	reasonPrivilegesGrantFailed    conditionReasons = "PrivilegesGrantFailed"
+	reasonConfigMapsCreated        conditionReasons = "ConfigMapsCreated"
+	reasonDatabaseReconcileFailed  conditionReasons = "DatabaseReconcileFailed"
+	reasonPrivilegesGranted        conditionReasons = "PrivilegesGranted"
+	reasonPrivilegesGrantFailed    conditionReasons = "PrivilegesGrantFailed"
 
 	// ClusterReady sentinel values returned by ensureClusterReady.
 	// Exported so the controller adapter can switch on them if needed.
diff --git a/pkg/postgresql/shared/adapter/prometheus/collector.go b/pkg/postgresql/shared/adapter/prometheus/collector.go
new file mode 100644
index 000000000..bbd44944b
--- /dev/null
+++ b/pkg/postgresql/shared/adapter/prometheus/collector.go
@@ -0,0 +1,91 @@
+package prometheus
+
+import (
+	"context"
+
+	enterprisev4 "github.com/splunk/splunk-operator/api/v4"
+	"github.com/splunk/splunk-operator/pkg/postgresql/shared/ports"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+)
+
+// FleetCollector recomputes fleet-state gauges from the K8s API (informer cache).
+type FleetCollector struct{}
+
+// NewFleetCollector returns a new FleetCollector.
+func NewFleetCollector() *FleetCollector {
+	return &FleetCollector{}
+}
+
+// CollectClusterMetrics lists all PostgresCluster resources and updates phase
+// gauges, pooler gauges, and managed-user gauges.
+func (fc *FleetCollector) CollectClusterMetrics(ctx context.Context, c client.Client, recorder ports.Recorder) {
+	logger := log.FromContext(ctx)
+
+	var list enterprisev4.PostgresClusterList
+	if err := c.List(ctx, &list); err != nil {
+		logger.Error(err, "Failed to list PostgresClusters for fleet metrics")
+		return
+	}
+
+	phases := make(map[string]float64)
+	var poolerEnabledCount float64
+	managedUserStates := map[string]float64{
+		"desired":    0,
+		"reconciled": 0,
+		"pending":    0,
+		"failed":     0,
+	}
+
+	for i := range list.Items {
+		cluster := &list.Items[i]
+
+		// Phase gauge.
+		phase := "Unknown"
+		if cluster.Status.Phase != nil {
+			phase = *cluster.Status.Phase
+		}
+		phases[phase]++
+
+		// Pooler-enabled count.
+		if cluster.Spec.ConnectionPoolerEnabled != nil && *cluster.Spec.ConnectionPoolerEnabled {
+			poolerEnabledCount++
+		}
+
+		// Managed users.
+		managedUserStates["desired"] += float64(len(cluster.Spec.ManagedRoles))
+		if cluster.Status.ManagedRolesStatus != nil {
+			managedUserStates["reconciled"] += float64(len(cluster.Status.ManagedRolesStatus.Reconciled))
+			managedUserStates["pending"] += float64(len(cluster.Status.ManagedRolesStatus.Pending))
+			managedUserStates["failed"] += float64(len(cluster.Status.ManagedRolesStatus.Failed))
+		}
+	}
+
+	recorder.SetClusterPhases(phases)
+	recorder.SetPoolerEnabledClusters(poolerEnabledCount)
+	recorder.SetManagedUsers(ports.ControllerCluster, managedUserStates)
+}
+
+// CollectDatabaseMetrics lists all PostgresDatabase resources and updates
+// phase gauges.
+func (fc *FleetCollector) CollectDatabaseMetrics(ctx context.Context, c client.Client, recorder ports.Recorder) {
+	logger := log.FromContext(ctx)
+
+	var list enterprisev4.PostgresDatabaseList
+	if err := c.List(ctx, &list); err != nil {
+		logger.Error(err, "Failed to list PostgresDatabases for fleet metrics")
+		return
+	}
+
+	phases := make(map[string]float64)
+	for i := range list.Items {
+		db := &list.Items[i]
+		phase := "Unknown"
+		if db.Status.Phase != nil {
+			phase = *db.Status.Phase
+		}
+		phases[phase]++
+	}
+
+	recorder.SetDatabasePhases(phases)
+}
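The collector logs and returns on list failures, so a reconcile never fails because of metrics. A usage sketch against a fake client (the test, the phaseCapture double, and the enterprisev4.AddToScheme call are illustrative assumptions):

package prometheus

import (
	"context"
	"testing"

	enterprisev4 "github.com/splunk/splunk-operator/api/v4"
	"k8s.io/apimachinery/pkg/runtime"
	"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

// phaseCapture is a hypothetical double: it embeds NoopRecorder for the
// methods it does not care about and captures only SetDatabasePhases.
type phaseCapture struct {
	NoopRecorder
	phases map[string]float64
}

func (p *phaseCapture) SetDatabasePhases(phases map[string]float64) { p.phases = phases }

func TestCollectDatabaseMetricsCountsPhases(t *testing.T) {
	scheme := runtime.NewScheme()
	if err := enterprisev4.AddToScheme(scheme); err != nil { // assumed scaffolded installer
		t.Fatal(err)
	}
	ready := "Ready"
	db := &enterprisev4.PostgresDatabase{}
	db.Name, db.Namespace = "db-a", "default"
	db.Status.Phase = &ready
	c := fake.NewClientBuilder().WithScheme(scheme).WithObjects(db).Build()

	rec := &phaseCapture{}
	NewFleetCollector().CollectDatabaseMetrics(context.Background(), c, rec)
	if rec.phases["Ready"] != 1 {
		t.Fatalf("want one Ready database, got %v", rec.phases)
	}
}

Embedding NoopRecorder keeps the double small: only the method under test is overridden.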
diff --git a/pkg/postgresql/shared/adapter/prometheus/noop.go b/pkg/postgresql/shared/adapter/prometheus/noop.go
new file mode 100644
index 000000000..91b9307f8
--- /dev/null
+++ b/pkg/postgresql/shared/adapter/prometheus/noop.go
@@ -0,0 +1,15 @@
+package prometheus
+
+import "github.com/splunk/splunk-operator/pkg/postgresql/shared/ports"
+
+// NoopRecorder is a no-op implementation of ports.Recorder for use in tests.
+type NoopRecorder struct{}
+
+func (n *NoopRecorder) IncStatusTransition(string, string, string, string) {}
+func (n *NoopRecorder) SetClusterPhases(map[string]float64)                {}
+func (n *NoopRecorder) SetPoolerEnabledClusters(float64)                   {}
+func (n *NoopRecorder) SetDatabasePhases(map[string]float64)               {}
+func (n *NoopRecorder) SetManagedUsers(string, map[string]float64)         {}
+
+// Compile-time interface check.
+var _ ports.Recorder = (*NoopRecorder)(nil)
diff --git a/pkg/postgresql/shared/adapter/prometheus/recorder.go b/pkg/postgresql/shared/adapter/prometheus/recorder.go
new file mode 100644
index 000000000..61d3d409d
--- /dev/null
+++ b/pkg/postgresql/shared/adapter/prometheus/recorder.go
@@ -0,0 +1,91 @@
+package prometheus
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/splunk/splunk-operator/pkg/postgresql/shared/ports"
+)
+
+var (
+	statusTransitionsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Name: "splunk_operator_postgres_status_transitions_total",
+		Help: "Status condition transitions by controller, condition type, status, and reason.",
+	}, []string{"controller", "condition", "status", "reason"})
+
+	clusters = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Name: "splunk_operator_postgres_clusters",
+		Help: "Current number of PostgresCluster resources by status phase.",
+	}, []string{"phase"})
+
+	poolerEnabledClusters = prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "splunk_operator_postgres_clusters_pooler_enabled",
+		Help: "Current number of PostgresCluster resources with connection pooling enabled.",
+	})
+
+	databases = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Name: "splunk_operator_postgres_databases",
+		Help: "Current number of PostgresDatabase resources by status phase.",
+	}, []string{"phase"})
+
+	managedUsers = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Name: "splunk_operator_postgres_managed_users",
+		Help: "Current counts of managed users by state.",
+	}, []string{"controller", "state"})
+
+	allCollectors = []prometheus.Collector{
+		statusTransitionsTotal,
+		clusters,
+		poolerEnabledClusters,
+		databases,
+		managedUsers,
+	}
+)
+
+// Register registers all PostgreSQL metrics with the given registerer.
+// Call once at startup from cmd/main.go.
+func Register(registerer prometheus.Registerer) error {
+	for _, c := range allCollectors {
+		if err := registerer.Register(c); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// PrometheusRecorder implements ports.Recorder using Prometheus client_golang.
+type PrometheusRecorder struct{}
+
+// NewPrometheusRecorder returns a new PrometheusRecorder.
+func NewPrometheusRecorder() *PrometheusRecorder {
+	return &PrometheusRecorder{}
+}
+
+func (p *PrometheusRecorder) IncStatusTransition(controller, condition, status, reason string) {
+	statusTransitionsTotal.WithLabelValues(controller, condition, status, reason).Inc()
+}
+
+func (p *PrometheusRecorder) SetClusterPhases(phases map[string]float64) {
+	clusters.Reset() // drop stale label combinations before re-populating
+	for phase, count := range phases {
+		clusters.WithLabelValues(phase).Set(count)
+	}
+}
+
+func (p *PrometheusRecorder) SetPoolerEnabledClusters(count float64) {
+	poolerEnabledClusters.Set(count)
+}
+
+func (p *PrometheusRecorder) SetDatabasePhases(phases map[string]float64) {
+	databases.Reset() // drop stale label combinations before re-populating
+	for phase, count := range phases {
+		databases.WithLabelValues(phase).Set(count)
+	}
+}
+
+func (p *PrometheusRecorder) SetManagedUsers(controller string, states map[string]float64) {
+	for state, count := range states {
+		managedUsers.WithLabelValues(controller, state).Set(count)
+	}
+}
+
+// Compile-time interface check.
+var _ ports.Recorder = (*PrometheusRecorder)(nil)
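The Reset-then-set pattern in SetClusterPhases and SetDatabasePhases is what lets a phase disappear from /metrics when its count drops to zero. A hypothetical in-package test pinning that behavior with client_golang's testutil:

package prometheus

import (
	"strings"
	"testing"

	"github.com/prometheus/client_golang/prometheus/testutil"
)

// TestSetClusterPhasesResets: repopulating the gauge must drop phases that
// no longer occur, instead of leaving the stale "Failed" series at its
// last value.
func TestSetClusterPhasesResets(t *testing.T) {
	rec := NewPrometheusRecorder()
	rec.SetClusterPhases(map[string]float64{"Failed": 1, "Ready": 2})
	rec.SetClusterPhases(map[string]float64{"Ready": 3})

	expected := `
# HELP splunk_operator_postgres_clusters Current number of PostgresCluster resources by status phase.
# TYPE splunk_operator_postgres_clusters gauge
splunk_operator_postgres_clusters{phase="Ready"} 3
`
	if err := testutil.CollectAndCompare(clusters, strings.NewReader(expected)); err != nil {
		t.Fatal(err)
	}
}

SetManagedUsers gets by without a Reset because the fleet collector always writes all four fixed states.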
diff --git a/pkg/postgresql/shared/ports/metrics.go b/pkg/postgresql/shared/ports/metrics.go
new file mode 100644
index 000000000..79ec2fbfb
--- /dev/null
+++ b/pkg/postgresql/shared/ports/metrics.go
@@ -0,0 +1,35 @@
+package ports
+
+// Controller name labels.
+const (
+	ControllerCluster  = "postgrescluster"
+	ControllerDatabase = "postgresdatabase"
+)
+
+// Recorder is the port for all PostgreSQL controller metrics.
+// Core service packages depend on this interface, never on Prometheus directly.
+//
+// Reconcile-level metrics (total count, duration, error count) are handled
+// automatically by controller-runtime; see controller_runtime_reconcile_total,
+// controller_runtime_reconcile_time_seconds, and controller_runtime_reconcile_errors_total.
+//
+// Domain-specific business metrics are emitted automatically via IncStatusTransition
+// every time a status condition is written. Fleet-level gauges are populated by the
+// collector on each reconcile.
+type Recorder interface {
+	// IncStatusTransition increments the status transition counter.
+	// Called automatically by persistStatus/setStatus; no manual calls are needed in service code.
+	IncStatusTransition(controller, condition, status, reason string)
+
+	// SetClusterPhases sets gauge values for cluster counts by phase.
+	SetClusterPhases(phases map[string]float64)
+
+	// SetPoolerEnabledClusters sets the gauge for clusters with connection pooling enabled.
+	SetPoolerEnabledClusters(count float64)
+
+	// SetDatabasePhases sets gauge values for database counts by phase.
+	SetDatabasePhases(phases map[string]float64)
+
+	// SetManagedUsers sets the gauge for managed user states.
+	SetManagedUsers(controller string, states map[string]float64)
+}
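Since core packages only see this port, swapping the metrics backend is just another adapter. A sketch of a log-backed implementation (the package name and type are hypothetical, not part of this change), which can be handy for local debugging without a scrape target:

package logging

import (
	"github.com/go-logr/logr"

	"github.com/splunk/splunk-operator/pkg/postgresql/shared/ports"
)

// Recorder is a hypothetical alternative adapter that writes every metrics
// call to a structured logger instead of a time-series backend.
type Recorder struct {
	Log logr.Logger
}

func (r *Recorder) IncStatusTransition(controller, condition, status, reason string) {
	r.Log.Info("status transition",
		"controller", controller, "condition", condition, "status", status, "reason", reason)
}

func (r *Recorder) SetClusterPhases(phases map[string]float64) {
	r.Log.Info("cluster phases", "phases", phases)
}

func (r *Recorder) SetPoolerEnabledClusters(count float64) {
	r.Log.Info("pooler-enabled clusters", "count", count)
}

func (r *Recorder) SetDatabasePhases(phases map[string]float64) {
	r.Log.Info("database phases", "phases", phases)
}

func (r *Recorder) SetManagedUsers(controller string, states map[string]float64) {
	r.Log.Info("managed users", "controller", controller, "states", states)
}

var _ ports.Recorder = (*Recorder)(nil)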