Skip to content

Commit a1b796f

Browse files
committed
add basic grafana
1 parent 8b9b538 commit a1b796f

File tree

11 files changed

+1366
-20
lines changed

11 files changed

+1366
-20
lines changed

api/v4/postgrescluster_types.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,30 @@ type PostgresClusterSpec struct {
108108
// +kubebuilder:default=Retain
109109
// +optional
110110
ClusterDeletionPolicy *string `json:"clusterDeletionPolicy,omitempty"`
111+
112+
// Observability holds per-cluster overrides that can disable observability features enabled by the cluster class.
113+
// +optional
114+
Observability *PostgresObservabilityOverride `json:"observability,omitempty"`
115+
}
116+
117+
// PostgresObservabilityOverride holds per-cluster overrides for the observability options defined by the PostgresClusterClass.
118+
type PostgresObservabilityOverride struct {
119+
120+
// +optional
121+
PostgreSQL *FeatureDisableOverride `json:"postgresql,omitempty"`
122+
123+
// +optional
124+
PgBouncer *FeatureDisableOverride `json:"pgbouncer,omitempty"`
125+
126+
// +optional
127+
GrafanaDashboard *FeatureDisableOverride `json:"grafanaDashboard,omitempty"`
128+
}
129+
130+
type FeatureDisableOverride struct {
131+
// Disabled, when set to true, disables the feature even if it is enabled in the class.
132+
// +kubebuilder:default=false
133+
// +optional
134+
Disabled *bool `json:"disabled,omitempty"`
111135
}
112136

113137
// PostgresClusterResources defines references to Kubernetes resources related to the PostgresCluster, such as ConfigMaps and Secrets.

api/v4/postgresclusterclass_types.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,13 @@ type PostgresClusterClassConfig struct {
9999
// +kubebuilder:default=false
100100
// +optional
101101
ConnectionPoolerEnabled *bool `json:"connectionPoolerEnabled,omitempty"`
102+
103+
// Observability contains configuration for metrics and dashboards.
104+
// When enabled, creates metrics resources and Grafana dashboard for clusters using this class.
105+
// Individual features can be disabled per cluster through the observability field of the PostgresCluster CR.
106+
// +kubebuilder:default={}
107+
// +optional
108+
Observability *PostgresObservabilityClassConfig `json:"observability,omitempty"`
102109
}
103110

104111
// ConnectionPoolerMode defines the PgBouncer connection pooling strategy.
@@ -172,6 +179,29 @@ type PostgresClusterClassStatus struct {
172179
Phase *string `json:"phase,omitempty"`
173180
}
174181

182+
type PostgresObservabilityClassConfig struct {
183+
// +optional
184+
PostgreSQL *MetricsClassConfig `json:"postgresql,omitempty"`
185+
// +optional
186+
PgBouncer *MetricsClassConfig `json:"pgbouncer,omitempty"`
187+
// +optional
188+
GrafanaDashboard *GrafanaDashboardClassConfig `json:"grafanaDashboard,omitempty"`
189+
}
190+
191+
type MetricsClassConfig struct {
192+
// Enabled controls whether metrics resources should be created for this target.
193+
// +kubebuilder:default=false
194+
// +optional
195+
Enabled *bool `json:"enabled,omitempty"`
196+
}
197+
198+
type GrafanaDashboardClassConfig struct {
199+
// Enabled controls whether a Grafana dashboard ConfigMap should be created for clusters using this class.
200+
// +kubebuilder:default=false
201+
// +optional
202+
Enabled *bool `json:"enabled,omitempty"`
203+
}
204+
175205
// +kubebuilder:object:root=true
176206
// +kubebuilder:subresource:status
177207
// +kubebuilder:resource:scope=Cluster

cmd/main.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ import (
5555
"github.com/splunk/splunk-operator/internal/controller"
5656

5757
cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1"
58+
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
5859
//+kubebuilder:scaffold:imports
5960
//extapi "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
6061
)
@@ -69,6 +70,7 @@ func init() {
6970
utilruntime.Must(enterpriseApi.AddToScheme(scheme))
7071
utilruntime.Must(enterpriseApiV3.AddToScheme(scheme))
7172
utilruntime.Must(cnpgv1.AddToScheme(scheme))
73+
utilruntime.Must(monitoringv1.AddToScheme(scheme))
7274
//+kubebuilder:scaffold:scheme
7375
//utilruntime.Must(extapi.AddToScheme(scheme))
7476
}

config/samples/enterprise_v4_postgresclusterclass_dev.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,11 @@ spec:
2727
cpu: "1"
2828
memory: "2Gi"
2929
connectionPoolerEnabled: true
30+
observability:
31+
grafanaDashboard:
32+
enabled: true
33+
pgbouncer:
34+
enabled: true
3035

3136
cnpg:
3237
# Restart method - tolerate downtime in dev

internal/controller/postgrescluster_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ type PostgresClusterReconciler struct {
4545
Scheme *runtime.Scheme
4646
Recorder record.EventRecorder
4747
}
48-
48+
// +kubebuilder:rbac:groups=monitoring.coreos.com,resources=servicemonitors,verbs=get;list;watch;create;update;patch;delete
4949
// +kubebuilder:rbac:groups=enterprise.splunk.com,resources=postgresclusters,verbs=get;list;watch;create;update;patch;delete
5050
// +kubebuilder:rbac:groups=enterprise.splunk.com,resources=postgresclusters/status,verbs=get;update;patch
5151
// +kubebuilder:rbac:groups=enterprise.splunk.com,resources=postgresclusters/finalizers,verbs=update

internal/controller/suite_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ import (
3939
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
4040
ctrl "sigs.k8s.io/controller-runtime"
4141

42+
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
4243
enterpriseApiV3 "github.com/splunk/splunk-operator/api/v3"
4344
enterpriseApi "github.com/splunk/splunk-operator/api/v4"
4445
//+kubebuilder:scaffold:imports
@@ -109,6 +110,9 @@ var _ = BeforeSuite(func(ctx context.Context) {
109110
err = enterpriseApi.AddToScheme(clientgoscheme.Scheme)
110111
Expect(err).NotTo(HaveOccurred())
111112

113+
err = monitoringv1.AddToScheme(clientgoscheme.Scheme)
114+
Expect(err).NotTo(HaveOccurred())
115+
112116
//+kubebuilder:scaffold:scheme
113117

114118
// Create New Manager for controller

pkg/postgresql/cluster/core/cluster.go

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -396,6 +396,64 @@ func PostgresClusterService(ctx context.Context, rc *ReconcileContext, req ctrl.
396396
rc.emitPoolerReadyTransition(postgresCluster, oldConditions)
397397
}
398398

399+
if err := reconcilePostgreSQLMetricsService(ctx, c, rc.Scheme, postgresCluster, isPostgreSQLMetricsEnabled(postgresCluster, clusterClass)); err != nil {
400+
return ctrl.Result{}, err
401+
}
402+
403+
poolerMetricsEnabled := isConnectionPoolerMetricsEnabled(postgresCluster, clusterClass)
404+
rwPoolerMetricsEnabled := poolerMetricsEnabled && rwPoolerExists
405+
roPoolerMetricsEnabled := poolerMetricsEnabled && roPoolerExists
406+
if err := reconcileConnectionPoolerMetricsService(ctx, c, rc.Scheme, postgresCluster, readWriteEndpoint, rwPoolerMetricsEnabled); err != nil {
407+
return ctrl.Result{}, err
408+
}
409+
if err := reconcileConnectionPoolerMetricsService(ctx, c, rc.Scheme, postgresCluster, readOnlyEndpoint, roPoolerMetricsEnabled); err != nil {
410+
return ctrl.Result{}, err
411+
}
412+
413+
if err := reconcileGrafanaDashboardConfigMap(ctx, c, rc.Scheme, postgresCluster, isGrafanaDashboardEnabled(postgresCluster, clusterClass)); err != nil {
414+
return ctrl.Result{}, err
415+
}
416+
417+
serviceMonitorUnavailableEmitted := false
418+
handleServiceMonitorError := func(err error) (bool, error) {
419+
if err == nil {
420+
return false, nil
421+
}
422+
if !isServiceMonitorUnavailable(err) {
423+
return false, err
424+
}
425+
if !serviceMonitorUnavailableEmitted {
426+
serviceMonitorUnavailableEmitted = true
427+
logger.Info("ServiceMonitor CRD unavailable, continuing without ServiceMonitors")
428+
rc.emitWarning(postgresCluster, EventServiceMonitorUnavailable,
429+
"ServiceMonitor CRD not found; continuing without Prometheus ServiceMonitors")
430+
}
431+
return true, nil
432+
}
433+
434+
if handled, err := handleServiceMonitorError(
435+
reconcilePostgreSQLMetricsServiceMonitor(ctx, c, rc.Scheme, postgresCluster, isPostgreSQLMetricsEnabled(postgresCluster, clusterClass)),
436+
); err != nil {
437+
return ctrl.Result{}, err
438+
} else if handled {
439+
logger.Info("Skipped PostgreSQL ServiceMonitor reconciliation")
440+
}
441+
442+
if handled, err := handleServiceMonitorError(
443+
reconcileConnectionPoolerMetricsServiceMonitor(ctx, c, rc.Scheme, postgresCluster, readWriteEndpoint, rwPoolerMetricsEnabled),
444+
); err != nil {
445+
return ctrl.Result{}, err
446+
} else if handled {
447+
logger.Info("Skipped RW PgBouncer ServiceMonitor reconciliation")
448+
}
449+
if handled, err := handleServiceMonitorError(
450+
reconcileConnectionPoolerMetricsServiceMonitor(ctx, c, rc.Scheme, postgresCluster, readOnlyEndpoint, roPoolerMetricsEnabled),
451+
); err != nil {
452+
return ctrl.Result{}, err
453+
} else if handled {
454+
logger.Info("Skipped RO PgBouncer ServiceMonitor reconciliation")
455+
}
456+
399457
// Reconcile ConfigMap when CNPG cluster is healthy.
400458
if cnpgCluster.Status.Phase == cnpgv1.PhaseHealthy {
401459
logger.Info("CNPG Cluster healthy, reconciling ConfigMap")
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
{
2+
"title": "PostgreSQL __CLUSTER_NAME__",
3+
"uid": "pg-__CLUSTER_NAME__",
4+
"schemaVersion": 39,
5+
"version": 1,
6+
"refresh": "30s",
7+
"timezone": "browser",
8+
"tags": ["postgresql", "cnpg", "pgbouncer"],
9+
"editable": true,
10+
"graphTooltip": 0,
11+
"panels": [
12+
{
13+
"id": 1,
14+
"type": "stat",
15+
"title": "PostgreSQL Instances",
16+
"gridPos": { "x": 0, "y": 0, "w": 6, "h": 4 },
17+
"targets": [
18+
{
19+
"expr": "count(max by (pod) (cnpg_pg_postmaster_start_time_seconds{namespace=\"__NAMESPACE__\",service=\"__POSTGRES_SERVICE__\"}))",
20+
"refId": "A"
21+
}
22+
],
23+
"options": {
24+
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
25+
"orientation": "horizontal",
26+
"textMode": "value"
27+
}
28+
},
29+
{
30+
"id": 2,
31+
"type": "stat",
32+
"title": "RW PgBouncer Pods Up",
33+
"gridPos": { "x": 6, "y": 0, "w": 6, "h": 4 },
34+
"targets": [
35+
{
36+
"expr": "round(sum(max by (pod) (cnpg_pgbouncer_up{namespace=\"__NAMESPACE__\",service=\"__RW_POOLER_SERVICE__\"})))",
37+
"refId": "A"
38+
}
39+
],
40+
"options": {
41+
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
42+
"orientation": "horizontal",
43+
"textMode": "value"
44+
}
45+
},
46+
{
47+
"id": 3,
48+
"type": "stat",
49+
"title": "RO PgBouncer Pods Up",
50+
"gridPos": { "x": 12, "y": 0, "w": 6, "h": 4 },
51+
"targets": [
52+
{
53+
"expr": "round(sum(max by (pod) (cnpg_pgbouncer_up{namespace=\"__NAMESPACE__\",service=\"__RO_POOLER_SERVICE__\"})))",
54+
"refId": "A"
55+
}
56+
],
57+
"options": {
58+
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
59+
"orientation": "horizontal",
60+
"textMode": "value"
61+
}
62+
},
63+
{
64+
"id": 4,
65+
"type": "stat",
66+
"title": "Total Database Size",
67+
"gridPos": { "x": 18, "y": 0, "w": 6, "h": 4 },
68+
"targets": [
69+
{
70+
"expr": "sum(max by (datname) (cnpg_pg_database_size_bytes{namespace=\"__NAMESPACE__\",service=\"__POSTGRES_SERVICE__\"}))",
71+
"refId": "A"
72+
}
73+
],
74+
"fieldConfig": {
75+
"defaults": {
76+
"unit": "bytes"
77+
}
78+
},
79+
"options": {
80+
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
81+
"orientation": "horizontal",
82+
"textMode": "value"
83+
}
84+
},
85+
{
86+
"id": 5,
87+
"type": "timeseries",
88+
"title": "WAL Files by Pod",
89+
"gridPos": { "x": 0, "y": 4, "w": 8, "h": 8 },
90+
"targets": [
91+
{
92+
"expr": "round(max by (pod) (cnpg_pg_wal_files_total{namespace=\"__NAMESPACE__\",service=\"__POSTGRES_SERVICE__\"}))",
93+
"legendFormat": "{{pod}}",
94+
"refId": "A"
95+
}
96+
]
97+
},
98+
{
99+
"id": 6,
100+
"type": "timeseries",
101+
"title": "Archived WAL Rate by Pod",
102+
"gridPos": { "x": 8, "y": 4, "w": 8, "h": 8 },
103+
"targets": [
104+
{
105+
"expr": "max by (pod) (rate(cnpg_pg_stat_archiver_archived_count{namespace=\"__NAMESPACE__\",service=\"__POSTGRES_SERVICE__\"}[5m]))",
106+
"legendFormat": "{{pod}}",
107+
"refId": "A"
108+
}
109+
]
110+
},
111+
{
112+
"id": 7,
113+
"type": "timeseries",
114+
"title": "PgBouncer Active Clients",
115+
"gridPos": { "x": 16, "y": 4, "w": 8, "h": 8 },
116+
"targets": [
117+
{
118+
"expr": "round(sum(cnpg_pgbouncer_pools_cl_active{namespace=\"__NAMESPACE__\",service=\"__RW_POOLER_SERVICE__\"}))",
119+
"legendFormat": "rw",
120+
"refId": "A"
121+
},
122+
{
123+
"expr": "round(sum(cnpg_pgbouncer_pools_cl_active{namespace=\"__NAMESPACE__\",service=\"__RO_POOLER_SERVICE__\"}))",
124+
"legendFormat": "ro",
125+
"refId": "B"
126+
}
127+
]
128+
}
129+
],
130+
"templating": {
131+
"list": []
132+
},
133+
"annotations": {
134+
"list": []
135+
}
136+
}

pkg/postgresql/cluster/core/events.go

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -10,25 +10,26 @@ import (
1010
)
1111

1212
const (
13-
EventSecretReady = "SecretReady"
14-
EventConfigMapReady = "ConfigMapReady"
15-
EventClusterAdopted = "ClusterAdopted"
16-
EventClusterCreationStarted = "ClusterCreationStarted"
17-
EventClusterUpdateStarted = "ClusterUpdateStarted"
18-
EventClusterReady = "ClusterReady"
19-
EventPoolerCreationStarted = "PoolerCreationStarted"
20-
EventPoolerReady = "PoolerReady"
21-
EventCleanupComplete = "CleanupComplete"
22-
EventClusterClassNotFound = "ClusterClassNotFound"
23-
EventConfigMergeFailed = "ConfigMergeFailed"
24-
EventSecretReconcileFailed = "SecretReconcileFailed"
25-
EventClusterCreateFailed = "ClusterCreateFailed"
26-
EventClusterUpdateFailed = "ClusterUpdateFailed"
27-
EventManagedRolesFailed = "ManagedRolesFailed"
28-
EventPoolerReconcileFailed = "PoolerReconcileFailed"
29-
EventConfigMapReconcileFailed = "ConfigMapReconcileFailed"
30-
EventClusterDegraded = "ClusterDegraded"
31-
EventCleanupFailed = "CleanupFailed"
13+
EventSecretReady = "SecretReady"
14+
EventConfigMapReady = "ConfigMapReady"
15+
EventClusterAdopted = "ClusterAdopted"
16+
EventClusterCreationStarted = "ClusterCreationStarted"
17+
EventClusterUpdateStarted = "ClusterUpdateStarted"
18+
EventClusterReady = "ClusterReady"
19+
EventPoolerCreationStarted = "PoolerCreationStarted"
20+
EventPoolerReady = "PoolerReady"
21+
EventCleanupComplete = "CleanupComplete"
22+
EventClusterClassNotFound = "ClusterClassNotFound"
23+
EventConfigMergeFailed = "ConfigMergeFailed"
24+
EventSecretReconcileFailed = "SecretReconcileFailed"
25+
EventClusterCreateFailed = "ClusterCreateFailed"
26+
EventClusterUpdateFailed = "ClusterUpdateFailed"
27+
EventManagedRolesFailed = "ManagedRolesFailed"
28+
EventPoolerReconcileFailed = "PoolerReconcileFailed"
29+
EventConfigMapReconcileFailed = "ConfigMapReconcileFailed"
30+
EventServiceMonitorUnavailable = "ServiceMonitorUnavailable"
31+
EventClusterDegraded = "ClusterDegraded"
32+
EventCleanupFailed = "CleanupFailed"
3233
)
3334

3435
func (rc *ReconcileContext) emitNormal(obj client.Object, reason, message string) {

0 commit comments

Comments
 (0)