-
Notifications
You must be signed in to change notification settings - Fork 56
Expand file tree
/
Copy pathpg_exporter.yml
More file actions
2857 lines (2666 loc) · 215 KB
/
pg_exporter.yml
File metadata and controls
2857 lines (2666 loc) · 215 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#==============================================================#
# Desc : pg_exporter metrics collector definition
# Ver : PostgreSQL 10 ~ 18+ and pgbouncer 1.9~1.25+
# Ctime : 2019-12-09
# Mtime : 2026-03-21
# Homepage : https://pigsty.io
# Author : Ruohang Feng (rh@vonng.com)
# License : Apache-2.0 @ https://github.com/pgsty/pg_exporter
# Copyright : 2018-2026 Ruohang Feng / Vonng (rh@vonng.com)
#==============================================================#
#==============================================================#
# 1. Config File
#==============================================================#
# The configuration file for pg_exporter is a YAML file.
# Default configurations are retrieved via following precedence:
# 1. command line args: --config=<config path>
# 2. environment variables: PG_EXPORTER_CONFIG=<config path>
# 3. pg_exporter.yml (Current directory)
# 4. /etc/pg_exporter.yml (config file)
# 5. /etc/pg_exporter (config dir)
#==============================================================#
# 2. Config Format
#==============================================================#
# pg_exporter config could be a single YAML file, or a directory containing a series of separated YAML files.
# Each YAML config file consists of one or more metrics Collector definition, which are top-level objects.
# If a directory is provided, all YAML in that directory will be merged in alphabetic order.
# Collector definition examples are shown below.
#==============================================================#
# 3. Collector Example
#==============================================================#
# # Here is an example of a metrics collector definition
# pg_primary_only: # Collector branch name. Must be UNIQUE among the entire configuration
# name: pg # Collector namespace, used as METRIC PREFIX, set to branch name by default, can be override
# # the same namespace may contain multiple collector branches. It`s the user`s responsibility
# # to make sure that AT MOST ONE collector is picked for each namespace.
#
# desc: PostgreSQL basic information (on primary) # Collector description
# query: | # Metrics Query SQL
#
# SELECT extract(EPOCH FROM CURRENT_TIMESTAMP) AS timestamp,
# pg_current_wal_lsn() - '0/0' AS lsn,
# pg_current_wal_insert_lsn() - '0/0' AS insert_lsn,
# pg_current_wal_lsn() - '0/0' AS write_lsn,
# pg_current_wal_flush_lsn() - '0/0' AS flush_lsn,
# extract(EPOCH FROM now() - pg_postmaster_start_time()) AS uptime,
# extract(EPOCH FROM now() - pg_conf_load_time()) AS conf_reload_time,
# pg_is_in_backup() AS is_in_backup,
# extract(EPOCH FROM now() - pg_backup_start_time()) AS backup_time;
#
# # [OPTIONAL] metadata fields, control collector behavior
# ttl: 10 # Cache TTL: in seconds, how long will pg_exporter cache this collector`s query result.
# timeout: 0.1 # Query Timeout: in seconds, queries that exceed this limit will be canceled.
# min_version: 100000 # minimal supported version, boundary IS included. In server version number format,
# max_version: 130000 # maximal supported version, boundary NOT included, In server version number format
# fatal: false # Collector marked `fatal` fails, the entire scrape will abort immediately and marked as failed
# skip: false # Collector marked `skip` will not be installed during the planning procedure
#
# tags: [cluster, primary] # Collector tags, used for planning and scheduling
#
# # tags are list of strings, which could be:
# # * `cluster` marks this query as cluster level, so it will only execute once for the same PostgreSQL Server
# # * `primary` or `master` mark this query can only run on a primary instance (WILL NOT execute if pg_is_in_recovery())
# # * `standby` or `replica` mark this query can only run on a replica instance (WILL execute if pg_is_in_recovery())
# # some special tag prefix have special interpretation:
# # * `dbname:<dbname>` means this query will ONLY be executed on database with name `<dbname>`
# # * `username:<user>` means this query will only be executed when connect with user `<user>`
# # * `extension:<extname>` means this query will only be executed when extension `<extname>` is installed
# # * `schema:<nspname>` means this query will only by executed when schema `<nspname>` exist
# # * `not:<negtag>` means this query WILL NOT be executed when exporter is tagged with `<negtag>`
# # * `<tag>` means this query WILL be executed when exporter is tagged with `<tag>`
# # ( <tag> could not be cluster,primary,standby,master,replica,etc...)
#
# # One or more "predicate queries" may be defined for a metric query. These
# # are run before the main metric query (after any cache hit check). If all
# # of them, when run sequentially, return a single row with a single column
# # boolean true result, the main metric query is executed. If any of them
# # return false or return zero rows, the main query is skipped. If any
# # predicate query returns more than one row, a non-boolean result, or fails
# # with an error, the whole query is marked failed. Predicate queries can be
# # used to check for the presence of specific functions, tables, extensions,
# # settings, and vendor-specific pg features before running the main query.
#
# predicate_queries:
# - name: predicate query name
# predicate_query: |
# SELECT EXISTS (SELECT 1 FROM information_schema.routines WHERE routine_schema = 'pg_catalog' AND routine_name = 'pg_backup_start_time');
#
# metrics: # List of returned columns, each column must have a `name` and `usage`, `rename` and `description` are optional
# - timestamp: # Column name, should be exactly the same as returned column name
# usage: GAUGE # Metric type, `usage` could be
# * DISCARD: completely ignoring this field
# * LABEL: use columnName=columnValue as a label in metric
# * GAUGE: Mark column as a gauge metric, full name will be `<query.name>_<column.name>`
# * COUNTER: Same as above, except it is a counter rather than a gauge.
# rename: ts # [OPTIONAL] Alias, optional, the alias will be used instead of the column name
# description: xxxx # [OPTIONAL] Description of the column, will be used as a metric description
# default: 0 # [OPTIONAL] Default value, will be used when column is NULL
# scale: 1000 # [OPTIONAL] Scale the value by this factor
# - lsn:
# usage: COUNTER
# description: log sequence number, current write location (on primary)
# - insert_lsn:
# usage: COUNTER
# description: primary only, location of current wal inserting
# - write_lsn:
# usage: COUNTER
# description: primary only, location of current wal writing
# - flush_lsn:
# usage: COUNTER
# description: primary only, location of current wal syncing
# - uptime:
# usage: GAUGE
# description: seconds since postmaster start
# - conf_reload_time:
# usage: GAUGE
# description: seconds since last configuration reload
# - is_in_backup:
# usage: GAUGE
# description: 1 if backup is in progress
# - backup_time:
# usage: GAUGE
# description: seconds since the current backup start. null if don`t have one
#
# .... # you can also use rename & scale to customize the metric name and value:
# - checkpoint_write_time:
# rename: write_time
# usage: COUNTER
# scale: 1e-3
# description: Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in seconds
#==============================================================#
# 4. Collector Presets
#==============================================================#
# pg_exporter is shipped with a series of preset collectors (already numbered and ordered by filename)
#
# 1xx Basic metrics: basic info, metadata, settings
# 2xx Replication metrics: replication, walreceiver, downstream, sync standby, slots, subscription
# 3xx Persist metrics: size, wal, background writer, checkpointer, ssl, checkpoint, recovery, slru cache, shmem usage
# 4xx Activity metrics: backend count group by state, wait event, locks, xacts, queries
# 5xx Progress metrics: clustering, vacuuming, indexing, basebackup, copy
# 6xx Database metrics: pg_database, publication, subscription
# 7xx Object metrics: pg_class, table, index, function, sequence, default partition
# 8xx Optional metrics: optional metrics collector (disable by default, slow queries)
# 9xx Pgbouncer metrics: metrics from pgbouncer admin database `pgbouncer`
#
# 100-599 Metrics for entire database cluster (scrape once)
# 600-899 Metrics for single database instance (scrape for each database ,except for pg_db itself)
#==============================================================#
# 5. Cache TTL
#==============================================================#
# Cache can be used for reducing query overhead, it can be enabled by setting a non-zero value for `ttl`
# It is highly recommended to use cache to avoid duplicate scrapes. Especially when you got multiple Prometheus
# scraping the same instance with slow monitoring queries. Setting `ttl` to zero or leaving blank will disable
# result caching, which is the default behavior
#
# TTL has to be smaller than your scrape interval. 15s scrape interval and 10s TTL is a good start for
# production environment. Some expensive monitoring queries (such as size/bloat check) will have longer `ttl`
# which can also be used as a mechanism to achieve `different scrape frequency`
#==============================================================#
# 6. Query Timeout
#==============================================================#
# Collectors can be configured with an optional Timeout. If the collector's query executes more than that
# timeout, it will be canceled immediately. Setting the `timeout` to 0 or leaving blank will reset it to
# default timeout 0.1 (100ms). Setting it to any negative number will disable the query timeout feature.
# All queries have a default timeout of 100ms, if exceeded, the query will be canceled immediately to avoid
# avalanche. You can explicitly overwrite that option. but beware: in some extreme cases, if all your
# timeouts sum up greater your scrape/cache interval (usually 15s), the queries may still be jammed.
# or, you can just disable potential slow queries.
#==============================================================#
# 7. Version Compatibility
#==============================================================#
# Each collector has two optional version compatibility parameters: `min_version` and `max_version`.
# These two parameters specify the version compatibility of the collector. If target postgres/pgbouncer's
# version is less than `min_version`, or higher than `max_version`, the collector will not be installed.
# These two parameters are using PostgreSQL server version number format, which is a 6-digit integer
# format as <major:2 digit><minor:2 digit>:<release: 2 digit>.
# For example, 090600 stands for 9.6, and 120100 stands for 12.1
# And beware that version compatibility range is left-inclusive right exclusive: [min, max), set to zero or
# leaving blank will affect as -inf or +inf
#==============================================================#
# 8. Fatality
#==============================================================#
# If a collector is marked with `fatal` falls, the entire scrape operation will be marked as fail and key metrics
# `pg_up` / `pgbouncer_up` will be reset to 0. It is always a good practice to set up AT LEAST ONE fatal
# collector for pg_exporter. `pg.pg_primary_only` and `pgbouncer_list` are the default fatal collector.
#
# If a collector without `fatal` flag fails, it will increase global fail counters. But the scrape operation
# will carry on. The entire scrape result will not be marked as faile, thus will not affect the `<xx>_up` metric.
#==============================================================#
# 9. Skip
#==============================================================#
# Collector with `skip` flag set to true will NOT be installed.
# This could be a handy option to disable collectors
#==============================================================#
# 10. Tags and Planning
#==============================================================#
# Tags are designed for collector planning & schedule. It can be handy to customize which queries run
# on which instances. And thus you can use one-single monolith config for multiple environments
#
# Tags are a list of strings, each string could be:
# Pre-defined special tags
# * `cluster` marks this collector as cluster level, so it will ONLY BE EXECUTED ONCE for the same PostgreSQL Server
# * `primary` or `master` mark this collector as primary-only, so it WILL NOT work iff pg_is_in_recovery()
# * `standby` or `replica` mark this collector as replica-only, so it WILL work iff pg_is_in_recovery()
# Special tag prefix which have different interpretation:
# * `dbname:<dbname>` means this collector will ONLY work on database with name `<dbname>`
# * `username:<user>` means this collector will ONLY work when connect with user `<user>`
# * `extension:<extname>` means this collector will ONLY work when extension `<extname>` is installed
# * `schema:<nspname>` means this collector will only work when schema `<nspname>` exists
# Customized positive tags (filter) and negative tags (taint)
# * `not:<negtag>` means this collector WILL NOT work when exporter is tagged with `<negtag>`
# * `<tag>` means this query WILL work if exporter is tagged with `<tag>` (special tags not included)
#
# pg_exporter will trigger the Planning procedure after connecting to the target. It will gather database facts
# and match them with tags and other metadata (such as supported version range). Collector will only
# be installed if and only if it is compatible with the target server.
#==============================================================#
# 0110 pg
#==============================================================#
pg_primary_only:
name: pg
desc: PostgreSQL basic information (on primary)
query: |-
SELECT
extract(EPOCH FROM CURRENT_TIMESTAMP) AS timestamp,
extract(EPOCH FROM now() - pg_postmaster_start_time()) AS uptime,
extract(EPOCH FROM pg_postmaster_start_time()) AS boot_time,
pg_current_wal_lsn() - '0/0' AS lsn,
pg_current_wal_insert_lsn() - '0/0' AS insert_lsn,
pg_current_wal_lsn() - '0/0' AS write_lsn,
pg_current_wal_flush_lsn() - '0/0' AS flush_lsn,
NULL::BIGINT AS receive_lsn,
NULL::BIGINT AS replay_lsn,
extract(EPOCH FROM pg_conf_load_time()) AS reload_time,
extract(EPOCH FROM now() - pg_conf_load_time()) AS conf_reload_time,
NULL::FLOAT AS last_replay_time,
0::FLOAT AS lag,
pg_is_in_recovery() AS is_in_recovery,
FALSE AS is_wal_replay_paused;
tags: [ cluster, primary ]
ttl: 1
min_version: 100000
fatal: true
skip: false
metrics:
- timestamp: { usage: GAUGE ,description: "current database timestamp in unix epoch" }
- uptime: { usage: GAUGE ,description: "seconds since postmaster start" }
- boot_time: { usage: GAUGE ,description: "postmaster boot timestamp in unix epoch" }
- lsn: { usage: COUNTER ,description: "log sequence number, current write location" }
- insert_lsn: { usage: COUNTER ,description: "primary only, location of current wal inserting" }
- write_lsn: { usage: COUNTER ,description: "primary only, location of current wal writing" }
- flush_lsn: { usage: COUNTER ,description: "primary only, location of current wal syncing" }
- receive_lsn: { usage: COUNTER ,description: "replica only, location of wal synced to disk" }
- replay_lsn: { usage: COUNTER ,description: "replica only, location of wal applied" }
- reload_time: { usage: GAUGE ,description: "time when configuration was last reloaded" }
- conf_reload_time: { usage: GAUGE ,description: "seconds since last configuration reload" }
- last_replay_time: { usage: GAUGE ,description: "time when last transaction been replayed" }
- lag: { usage: GAUGE ,description: "replica only, replication lag in seconds" }
- is_in_recovery: { usage: GAUGE ,description: "1 if in recovery mode" }
- is_wal_replay_paused: { usage: GAUGE ,description: "1 if wal play is paused" }
pg_replica_only:
name: pg
desc: PostgreSQL basic information (on replica)
query: |-
SELECT
extract(EPOCH FROM CURRENT_TIMESTAMP) AS timestamp,
extract(EPOCH FROM now() - pg_postmaster_start_time()) AS uptime,
extract(EPOCH FROM pg_postmaster_start_time()) AS boot_time,
pg_last_wal_replay_lsn() - '0/0' AS lsn,
NULL::BIGINT AS insert_lsn,
NULL::BIGINT AS write_lsn,
NULL::BIGINT AS flush_lsn,
pg_last_wal_receive_lsn() - '0/0' AS receive_lsn,
pg_last_wal_replay_lsn() - '0/0' AS replay_lsn,
extract(EPOCH FROM pg_conf_load_time()) AS reload_time,
extract(EPOCH FROM now() - pg_conf_load_time()) AS conf_reload_time,
extract(EPOCH FROM pg_last_xact_replay_timestamp()) AS last_replay_time,
CASE WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
ELSE EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp()) END AS lag,
pg_is_in_recovery() AS is_in_recovery,
pg_is_wal_replay_paused() AS is_wal_replay_paused;
tags: [ cluster, replica ]
ttl: 1
min_version: 100000
fatal: true
skip: false
metrics:
- timestamp: { usage: GAUGE ,description: "current database timestamp in unix epoch" }
- uptime: { usage: GAUGE ,description: "seconds since postmaster start" }
- boot_time: { usage: GAUGE ,description: "postmaster boot timestamp in unix epoch" }
- lsn: { usage: COUNTER ,description: "log sequence number, current write location" }
- insert_lsn: { usage: COUNTER ,description: "primary only, location of current wal inserting" }
- write_lsn: { usage: COUNTER ,description: "primary only, location of current wal writing" }
- flush_lsn: { usage: COUNTER ,description: "primary only, location of current wal syncing" }
- receive_lsn: { usage: COUNTER ,description: "replica only, location of wal synced to disk" }
- replay_lsn: { usage: COUNTER ,description: "replica only, location of wal applied" }
- reload_time: { usage: GAUGE ,description: "time when configuration was last reloaded" }
- conf_reload_time: { usage: GAUGE ,description: "seconds since last configuration reload" }
- last_replay_time: { usage: GAUGE ,description: "time when last transaction been replayed" }
- lag: { usage: GAUGE ,description: "replica only, replication lag in seconds" }
- is_in_recovery: { usage: GAUGE ,description: "1 if in recovery mode" }
- is_wal_replay_paused: { usage: GAUGE ,description: "1 if wal play is paused" }
#==============================================================#
# 0120 pg_meta
#==============================================================#
pg_meta_13:
name: pg_meta
desc: PostgreSQL meta info for pg 13+, with extra primary conninfo
query: |
SELECT
(SELECT system_identifier FROM pg_control_system()) AS cluster_id,
current_setting('cluster_name') AS cluster_name,
current_setting('port') AS listen_port,
current_setting('data_directory', true) AS data_dir,
current_setting('config_file', true) AS conf_path,
current_setting('hba_file', true) AS hba_path,
current_setting('wal_level') AS wal_level,
current_setting('server_encoding') AS encoding,
current_setting('server_version') AS version,
current_setting('server_version_num') AS ver_num,
version() AS ver_str,
current_setting('shared_preload_libraries', true) AS extensions,
current_setting('primary_conninfo', true) AS primary_conninfo,
1 AS info
ttl: 10
min_version: 130000
tags: [ cluster ]
metrics:
- cluster_id: { usage: LABEL ,description: "cluster system identifier" }
- cluster_name: { usage: LABEL ,description: "cluster name" }
- listen_port: { usage: LABEL ,description: "listen port" }
- data_dir: { usage: LABEL ,description: "path to data directory" }
- conf_path: { usage: LABEL ,description: "path to postgresql.conf" }
- hba_path: { usage: LABEL ,description: "path to pg_hba.conf" }
- wal_level: { usage: LABEL ,description: "wal level" }
- encoding: { usage: LABEL ,description: "server encoding" }
- version: { usage: LABEL ,description: "server version in human-readable format" }
- ver_num: { usage: LABEL ,description: "server version number in machine-readable format" }
- ver_str: { usage: LABEL ,description: "complete version string" }
- extensions: { usage: LABEL ,description: "server installed preload libraries" }
- primary_conninfo: { usage: LABEL ,description: "connection string to upstream (do not set password here)" }
- info: { usage: GAUGE ,description: "constant 1" }
pg_meta_10:
name: pg_meta
desc: PostgreSQL meta info
query: |
SELECT
(SELECT system_identifier FROM pg_control_system()) AS cluster_id,
current_setting('cluster_name') AS cluster_name,
current_setting('port') AS listen_port,
current_setting('data_directory', true) AS data_dir,
current_setting('config_file', true) AS conf_path,
current_setting('hba_file', true) AS hba_path,
current_setting('wal_level') AS wal_level,
current_setting('server_encoding') AS encoding,
current_setting('server_version') AS version,
current_setting('server_version_num') AS ver_num,
version() AS ver_str,
current_setting('shared_preload_libraries', true) AS extensions,
'N/A' AS primary_conninfo,
1 AS info
ttl: 10
min_version: 090600
max_version: 130000
tags: [ cluster ]
metrics:
- cluster_id: { usage: LABEL ,description: "cluster system identifier" }
- cluster_name: { usage: LABEL ,description: "cluster name" }
- listen_port: { usage: LABEL ,description: "listen port" }
- data_dir: { usage: LABEL ,description: "path to data directory" }
- conf_path: { usage: LABEL ,description: "path to postgresql.conf" }
- hba_path: { usage: LABEL ,description: "path to pg_hba.conf" }
- wal_level: { usage: LABEL ,description: "wal level" }
- encoding: { usage: LABEL ,description: "server encoding" }
- version: { usage: LABEL ,description: "server version in human-readable format" }
- ver_num: { usage: LABEL ,description: "server version number in machine-readable format" }
- ver_str: { usage: LABEL ,description: "complete version string" }
- extensions: { usage: LABEL ,description: "server installed preload libraries" }
- primary_conninfo: { usage: LABEL ,description: "connection string to upstream (do not set password here)" }
- info: { usage: GAUGE ,description: "constant 1" }
#==============================================================#
# 0130 pg_setting
#==============================================================#
# Key PostgreSQL configuration parameters
# All parameters use current_setting(name, missing_ok) for version safety
# Parameters introduced after PG10 use missing_ok=true to return NULL on older versions
pg_setting:
name: pg_setting
desc: PostgreSQL shared configuration parameters (shared across all databases)
query: |
SELECT
current_setting('max_connections')::int AS max_connections,
current_setting('max_prepared_transactions')::int AS max_prepared_transactions,
current_setting('max_locks_per_transaction')::int AS max_locks_per_transaction,
current_setting('max_worker_processes')::int AS max_worker_processes,
current_setting('max_parallel_workers')::int AS max_parallel_workers,
current_setting('max_parallel_workers_per_gather')::int AS max_parallel_workers_per_gather,
current_setting('max_parallel_maintenance_workers', true)::int AS max_parallel_maintenance_workers,
current_setting('max_replication_slots')::int AS max_replication_slots,
current_setting('max_wal_senders')::int AS max_wal_senders,
current_setting('block_size')::int AS block_size,
current_setting('wal_block_size')::int AS wal_block_size,
pg_size_bytes(current_setting('segment_size')) AS segment_size,
pg_size_bytes(current_setting('wal_segment_size')) AS wal_segment_size,
CASE current_setting('data_checksums') WHEN 'on' THEN 1 ELSE 0 END AS data_checksums,
CASE current_setting('wal_log_hints') WHEN 'on' THEN 1 ELSE 0 END AS wal_log_hints,
CASE current_setting('fsync') WHEN 'on' THEN 1 ELSE 0 END AS fsync,
CASE current_setting('full_page_writes') WHEN 'on' THEN 1 ELSE 0 END AS full_page_writes,
CASE current_setting('wal_level') WHEN 'logical' THEN 3 WHEN 'replica' THEN 2 WHEN 'minimal' THEN 1 ELSE 0 END AS wal_level,
pg_size_bytes(current_setting('min_wal_size')) AS min_wal_size,
pg_size_bytes(current_setting('max_wal_size')) AS max_wal_size,
pg_size_bytes(current_setting('max_slot_wal_keep_size', true)) AS max_slot_wal_keep_size,
pg_size_bytes(current_setting('shared_buffers')) AS shared_buffers,
pg_size_bytes(current_setting('work_mem')) AS work_mem,
pg_size_bytes(current_setting('maintenance_work_mem')) AS maintenance_work_mem,
pg_size_bytes(current_setting('effective_cache_size')) AS effective_cache_size,
pg_size_bytes(current_setting('shared_memory_size', true)) AS shared_memory_size,
CASE current_setting('huge_pages_status', true) WHEN 'on' THEN 1 WHEN 'off' THEN 0 WHEN 'unknown' THEN -1 ELSE NULL END AS hugepage_status,
current_setting('shared_memory_size_in_huge_pages', true)::int AS hugepage_count,
CASE current_setting('archive_mode') WHEN 'off' THEN 0 WHEN 'on' THEN 1 WHEN 'always' THEN 2 ELSE -1 END AS archive_mode,
CASE current_setting('autovacuum') WHEN 'on' THEN 1 ELSE 0 END AS autovacuum,
current_setting('autovacuum_max_workers')::int AS autovacuum_max_workers,
extract(epoch from current_setting('checkpoint_timeout')::interval)::int AS checkpoint_timeout,
current_setting('checkpoint_completion_target')::float AS checkpoint_completion_target,
CASE current_setting('hot_standby') WHEN 'on' THEN 1 ELSE 0 END AS hot_standby,
CASE current_setting('synchronous_commit')
WHEN 'off' THEN 0 WHEN 'local' THEN 1 WHEN 'remote_write' THEN 2
WHEN 'on' THEN 3 WHEN 'remote_apply' THEN 4 ELSE -1 END AS synchronous_commit,
CASE current_setting('io_method', true)
WHEN 'sync' THEN 0 WHEN 'worker' THEN 1 WHEN 'io_uring' THEN 2 ELSE NULL END AS io_method;
ttl: 10
min_version: 100000
tags: [ cluster ]
metrics:
- max_connections: { usage: GAUGE ,description: "maximum number of concurrent connections to the database server" }
- max_prepared_transactions: { usage: GAUGE ,description: "maximum number of transactions that can be in the prepared state simultaneously" }
- max_locks_per_transaction: { usage: GAUGE ,description: "maximum number of locks per transaction" }
- max_worker_processes: { usage: GAUGE ,description: "maximum number of background processes" }
- max_parallel_workers: { usage: GAUGE ,description: "maximum number of parallel workers that can be active at one time" }
- max_parallel_workers_per_gather: { usage: GAUGE ,description: "maximum number of parallel workers per Gather node" }
- max_parallel_maintenance_workers: { usage: GAUGE ,description: "maximum number of parallel maintenance workers (PG11+, NULL on older)" }
- max_replication_slots: { usage: GAUGE ,description: "maximum number of replication slots" }
- max_wal_senders: { usage: GAUGE ,description: "maximum number of concurrent WAL sender connections" }
- block_size: { usage: GAUGE ,description: "database block size in bytes (default 8192)" }
- wal_block_size: { usage: GAUGE ,description: "WAL block size in bytes" }
- segment_size: { usage: GAUGE ,description: "database file segment size in bytes" }
- wal_segment_size: { usage: GAUGE ,description: "WAL segment size in bytes" }
- data_checksums: { usage: GAUGE ,description: "data checksums enabled, 1=on 0=off" }
- wal_log_hints: { usage: GAUGE ,description: "WAL log hints enabled, 1=on 0=off" }
- fsync: { usage: GAUGE ,description: "fsync enabled (CRITICAL for data safety), 1=on 0=off" }
- full_page_writes: { usage: GAUGE ,description: "full page writes enabled, 1=on 0=off" }
- wal_level: { usage: GAUGE ,description: "WAL level, 1=minimal 2=replica 3=logical" }
- min_wal_size: { usage: GAUGE ,description: "minimum WAL size in bytes" }
- max_wal_size: { usage: GAUGE ,description: "maximum WAL size in bytes" }
- max_slot_wal_keep_size: { usage: GAUGE ,description: "maximum WAL size retained by replication slots in bytes (PG13+, NULL on older)" }
- shared_buffers: { usage: GAUGE ,description: "shared buffer size in bytes" }
- work_mem: { usage: GAUGE ,description: "work memory size in bytes" }
- maintenance_work_mem: { usage: GAUGE ,description: "maintenance work memory size in bytes" }
- effective_cache_size: { usage: GAUGE ,description: "planner's assumption about effective OS cache size in bytes" }
- shared_memory_size: { usage: GAUGE ,description: "total shared memory size in bytes (PG13+, NULL on older)" }
- hugepage_status: { usage: GAUGE ,description: "huge pages status, 1=on 0=off -1=unknown NULL=unavailable (PG14+)" }
- hugepage_count: { usage: GAUGE ,description: "number of huge pages needed for shared memory (PG14+, NULL on older)" }
- archive_mode: { usage: GAUGE ,description: "archive mode, 0=off 1=on 2=always" }
- autovacuum: { usage: GAUGE ,description: "autovacuum enabled, 1=on 0=off" }
- autovacuum_max_workers: { usage: GAUGE ,description: "maximum number of autovacuum worker processes" }
- checkpoint_timeout: { usage: GAUGE ,description: "checkpoint timeout in seconds" }
- checkpoint_completion_target: { usage: GAUGE ,description: "checkpoint completion target (0.0-1.0)" }
- hot_standby: { usage: GAUGE ,description: "hot standby mode enabled, 1=on 0=off" }
- synchronous_commit: { usage: GAUGE ,description: "synchronous commit level, 0=off 1=local 2=remote_write 3=on 4=remote_apply" }
- io_method: { usage: GAUGE ,description: "I/O method (PG18+), 0=sync 1=worker 2=io_uring NULL=unavailable" }
#==============================================================#
# 0210 pg_repl
#==============================================================#
pg_repl_12:
name: pg_repl
desc: PostgreSQL replication stat metrics 12+
query: |
SELECT application_name AS appname, usename, coalesce(client_addr::TEXT,'localhost') AS address, pid::TEXT, client_port,
CASE state WHEN 'streaming' THEN 0 WHEN 'startup' THEN 1 WHEN 'catchup' THEN 2 WHEN 'backup' THEN 3 WHEN 'stopping' THEN 4 ELSE -1 END AS state,
CASE sync_state WHEN 'async' THEN 0 WHEN 'potential' THEN 1 WHEN 'sync' THEN 2 WHEN 'quorum' THEN 3 ELSE -1 END AS sync_state,
sync_priority, backend_xmin::TEXT::BIGINT AS backend_xmin, current.lsn - '0/0' AS lsn,
current.lsn - sent_lsn AS sent_diff, current.lsn - write_lsn AS write_diff, current.lsn - flush_lsn AS flush_diff, current.lsn - replay_lsn AS replay_diff,
sent_lsn - '0/0' AS sent_lsn, write_lsn - '0/0' AS write_lsn, flush_lsn - '0/0' AS flush_lsn, replay_lsn - '0/0' AS replay_lsn,
coalesce(extract(EPOCH FROM write_lag), 0) AS write_lag, coalesce(extract(EPOCH FROM flush_lag), 0) AS flush_lag, coalesce(extract(EPOCH FROM replay_lag), 0) AS replay_lag,
extract(EPOCH FROM current_timestamp) AS "time", extract(EPOCH FROM backend_start) AS launch_time, extract(EPOCH FROM reply_time) AS reply_time
FROM pg_stat_replication, (SELECT CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END AS lsn) current;
ttl: 10
min_version: 120000
tags: [ cluster ]
metrics:
- appname: { usage: LABEL ,description: "Name of the application that is connected to this WAL sender" }
- usename: { usage: LABEL ,description: "Name of the user logged into this WAL sender process" }
- address: { usage: LABEL ,description: "IP address of the client connected to this WAL sender, localhost for unix socket" }
- pid: { usage: LABEL ,description: "Process ID of the WAL sender process" }
- client_port: { usage: GAUGE ,description: "TCP port number that the client is using for communication with this WAL sender, or -1 if a Unix socket is used" }
- state: { usage: GAUGE ,description: "Current WAL sender encoded state 0-4 for streaming|startup|catchup|backup|stopping" }
- sync_state: { usage: GAUGE ,description: "Encoded synchronous state of this standby server, 0-3 for async|potential|sync|quorum" }
- sync_priority: { usage: GAUGE ,description: "Priority of this standby server for being chosen as the synchronous standby" }
- backend_xmin: { usage: COUNTER ,description: "This standby's xmin horizon reported by hot_standby_feedback." }
- lsn: { usage: COUNTER ,description: "Current log position on this server" }
- sent_diff: { usage: GAUGE ,description: "Last log position sent to this standby server diff with current lsn" }
- write_diff: { usage: GAUGE ,description: "Last log position written to disk by this standby server diff with current lsn" }
- flush_diff: { usage: GAUGE ,description: "Last log position flushed to disk by this standby server diff with current lsn" }
- replay_diff: { usage: GAUGE ,description: "Last log position replayed into the database on this standby server diff with current lsn" }
- sent_lsn: { usage: COUNTER ,description: "Last write-ahead log location sent on this connection" }
- write_lsn: { usage: COUNTER ,description: "Last write-ahead log location written to disk by this standby server" }
- flush_lsn: { usage: COUNTER ,description: "Last write-ahead log location flushed to disk by this standby server" }
- replay_lsn: { usage: COUNTER ,description: "Last write-ahead log location replayed into the database on this standby server" }
- write_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written it" }
- flush_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written and flushed it" }
- replay_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written, flushed and applied it" }
- time: { usage: COUNTER ,description: "Current timestamp in unix epoch" }
- launch_time: { usage: COUNTER ,description: "Time when this process was started, i.e., when the client connected to this WAL sender" }
- reply_time: { usage: GAUGE ,description: "Send time of last reply message received from standby server" }
pg_repl_10:
name: pg_repl
desc: PostgreSQL replication stat metrics v10 v11
query: |
SELECT application_name AS appname, usename, coalesce(client_addr::TEXT,'localhost') AS address, pid::TEXT, client_port,
CASE state WHEN 'streaming' THEN 0 WHEN 'startup' THEN 1 WHEN 'catchup' THEN 2 WHEN 'backup' THEN 3 WHEN 'stopping' THEN 4 ELSE -1 END AS state,
CASE sync_state WHEN 'async' THEN 0 WHEN 'potential' THEN 1 WHEN 'sync' THEN 2 WHEN 'quorum' THEN 3 ELSE -1 END AS sync_state,
sync_priority, backend_xmin::TEXT::BIGINT AS backend_xmin, current.lsn - '0/0' AS lsn,
current.lsn - sent_lsn AS sent_diff, current.lsn - write_lsn AS write_diff, current.lsn - flush_lsn AS flush_diff, current.lsn - replay_lsn AS replay_diff,
sent_lsn - '0/0' AS sent_lsn, write_lsn - '0/0' AS write_lsn, flush_lsn - '0/0' AS flush_lsn, replay_lsn - '0/0' AS replay_lsn,
coalesce(extract(EPOCH FROM write_lag), 0) AS write_lag, coalesce(extract(EPOCH FROM flush_lag), 0) AS flush_lag, coalesce(extract(EPOCH FROM replay_lag), 0) AS replay_lag,
extract(EPOCH FROM current_timestamp) AS "time", extract(EPOCH FROM backend_start) AS launch_time
FROM pg_stat_replication, (SELECT CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END AS lsn) current;
ttl: 10
min_version: 100000
max_version: 120000
tags: [ cluster ]
metrics:
- appname: { usage: LABEL ,description: "Name of the application that is connected to this WAL sender" }
- usename: { usage: LABEL ,description: "Name of the user logged into this WAL sender process" }
- address: { usage: LABEL ,description: "IP address of the client connected to this WAL sender, localhost for unix socket" }
- pid: { usage: LABEL ,description: "Process ID of the WAL sender process" }
- client_port: { usage: GAUGE ,description: "TCP port number that the client is using for communication with this WAL sender, or -1 if a Unix socket is used" }
- state: { usage: GAUGE ,description: "Current WAL sender encoded state 0-4 for streaming|startup|catchup|backup|stopping" }
- sync_state: { usage: GAUGE ,description: "Encoded synchronous state of this standby server, 0-3 for async|potential|sync|quorum" }
- sync_priority: { usage: GAUGE ,description: "Priority of this standby server for being chosen as the synchronous standby" }
- backend_xmin: { usage: COUNTER ,description: "This standby's xmin horizon reported by hot_standby_feedback." }
- lsn: { usage: COUNTER ,description: "Current log position on this server" }
- sent_diff: { usage: GAUGE ,description: "Last log position sent to this standby server diff with current lsn" }
- write_diff: { usage: GAUGE ,description: "Last log position written to disk by this standby server diff with current lsn" }
- flush_diff: { usage: GAUGE ,description: "Last log position flushed to disk by this standby server diff with current lsn" }
- replay_diff: { usage: GAUGE ,description: "Last log position replayed into the database on this standby server diff with current lsn" }
- sent_lsn: { usage: COUNTER ,description: "Last write-ahead log location sent on this connection" }
- write_lsn: { usage: COUNTER ,description: "Last write-ahead log location written to disk by this standby server" }
- flush_lsn: { usage: COUNTER ,description: "Last write-ahead log location flushed to disk by this standby server" }
- replay_lsn: { usage: COUNTER ,description: "Last write-ahead log location replayed into the database on this standby server" }
- write_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written it" }
- flush_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written and flushed it" }
- replay_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written, flushed and applied it" }
- time: { usage: COUNTER ,description: "Current timestamp in unix epoch" }
- launch_time: { usage: COUNTER ,description: "Time when this process was started, i.e., when the client connected to this WAL sender" }
#==============================================================#
# 0220 pg_sync_standby
#==============================================================#
pg_sync_standby:
name: pg_sync_standby
desc: PostgreSQL synchronous standby status and names
query: |
SELECT CASE WHEN names <> '' THEN names ELSE '<null>' END AS names, CASE WHEN names <> '' THEN 1 ELSE 0 END AS enabled FROM (SELECT current_setting('synchronous_standby_names') AS names) n;
ttl: 10
min_version: 090400
tags: [ cluster ]
metrics:
- names: { usage: LABEL ,description: "List of standby servers that can support synchronous replication, <null> if not enabled" }
- enabled: { usage: GAUGE ,description: "Synchronous commit enabled, 1 if enabled, 0 if disabled" }
#==============================================================#
# 0230 pg_downstream
#==============================================================#
pg_downstream:
name: pg_downstream
desc: PostgreSQL replication client count group by state
query: |
SELECT l.state, coalesce(count, 0 ) AS count FROM unnest(ARRAY ['streaming','startup','catchup', 'backup', 'stopping']) l(state) LEFT JOIN (SELECT state, count(*) AS count FROM pg_stat_replication GROUP BY state)r ON l.state = r.state;
ttl: 10
min_version: 090400
tags: [ cluster ]
metrics:
- state: { usage: LABEL ,description: "Replication client state, could be one of startup|catchup|streaming|backup|stopping" }
- count: { usage: GAUGE ,description: "Count of corresponding state" }
#==============================================================#
# 0240 pg_slot
#==============================================================#
pg_slot_17:
name: pg_slot
desc: PostgreSQL replication slot metrics v17, slot also exists on standby
query: |-
SELECT s.slot_name, s.slot_type, plugin, database AS datname,datoid,active_pid,
active,temporary,two_phase,conflicting,failover,synced,
xmin::TEXT::BIGINT AS xmin,catalog_xmin::TEXT::BIGINT AS catalog_xmin,
restart_lsn - '0/0' AS restart_lsn, confirmed_flush_lsn - '0/0' AS confirm_lsn,
CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END - restart_lsn AS retained_bytes,
safe_wal_size, CASE wal_status WHEN 'reserved' THEN 0 WHEN 'extended' THEN 1 WHEN 'unreserved' THEN 2 WHEN 'lost' THEN 3 ELSE -1 END AS wal_status,
spill_txns,spill_count,spill_bytes,stream_txns,stream_count,stream_bytes,total_txns,total_bytes,extract(EPOCH FROM stats_reset) AS reset_time,
extract(EPOCH FROM inactive_since) AS inactive_since, CASE invalidation_reason WHEN 'wal_removed' THEN 1 WHEN 'rows_removed' THEN 2 WHEN 'wal_level_insufficient' THEN 3 ELSE 0 END AS invalidation_reason
FROM pg_replication_slots s LEFT OUTER JOIN pg_stat_replication_slots ss ON s.slot_name = ss.slot_name;
ttl: 10
min_version: 170000
tags: [ cluster ]
metrics:
- slot_name: { usage: LABEL ,description: "A unique, cluster-wide identifier for the replication slot" }
- slot_type: { usage: LABEL ,description: "The slot type, physical or logical" }
- plugin: { usage: LABEL ,description: "The base name of the shared object containing the output plugin this logical slot is using, or null for physical slots." }
- datname: { usage: LABEL ,description: "The name of the database this slot is associated with, logical slots only, null for physical slot" }
- datoid: { usage: GAUGE ,description: "The OID of the database this slot is associated with, logical slots only, null for physical slot" }
- active_pid: { usage: GAUGE ,description: "The process ID of the session streaming data for this slot. NULL if inactive." }
- active: { usage: GAUGE ,description: "True(1) if this slot is currently actively being used" }
- temporary: { usage: GAUGE ,description: "True(1) if this is a temporary replication slot." }
- two_phase: { usage: GAUGE ,description: "True(1) if the slot is enabled for decoding prepared transactions. Always false for physical slots." }
- conflicting: { usage: GAUGE ,description: "True(1) if this logical slot conflicted with recovery. Always NULL for physical slots." }
- failover: { usage: GAUGE ,description: "True(1) if this is a logical slot enabled to be synced to the standbys" }
- synced: { usage: GAUGE ,description: "True(1) if this is a logical slot that was synced from a primary server" }
- xmin: { usage: COUNTER ,description: "The oldest transaction that this slot needs the database to retain." }
- catalog_xmin: { usage: COUNTER ,description: "The oldest transaction affecting the system catalogs that this slot needs the database to retain." }
- restart_lsn: { usage: COUNTER ,description: "The address (LSN) of oldest WAL which still might be required by the consumer of this slot" }
- confirm_lsn: { usage: COUNTER ,description: "The address (LSN) up to which the logical slot's consumer has confirmed receiving data." }
- retained_bytes: { usage: GAUGE ,description: "Size of bytes that retained for this slot" }
- safe_wal_size: { usage: GAUGE ,description: "bytes that can be written to WAL which will not make slot into lost" }
- wal_status: { usage: GAUGE ,description: "WAL reserve status 0-3 means reserved,extended,unreserved,lost, -1 means other" }
- spill_txns: { usage: COUNTER ,description: "Xacts that spilled to disk due to logical decode mem exceeding (subtrans included)" }
- spill_count: { usage: COUNTER ,description: "Xacts that spilled to disk due to logical decode mem exceeding" }
- spill_bytes: { usage: COUNTER ,description: "Bytes that spilled to disk due to logical decode mem exceeding" }
- stream_txns: { usage: COUNTER ,description: "Xacts that streamed to decoding output plugin after mem exceed" }
- stream_count: { usage: COUNTER ,description: "Xacts that streamed to decoding output plugin after mem exceed" }
- stream_bytes: { usage: COUNTER ,description: "Bytes that streamed to decoding output plugin after mem exceed" }
- total_txns: { usage: COUNTER ,description: "Number of decoded xacts sent to the decoding output plugin for this slot" }
- total_bytes: { usage: COUNTER ,description: "Number of decoded bytes sent to the decoding output plugin for this slot" }
- reset_time: { usage: GAUGE ,description: "When statistics were last reset" }
- invalidation_reason: { usage: GAUGE ,description: "ok=0, wal_removed=1, rows_removed=2, wal_level_insufficient=3" }
- inactive_since: { usage: GAUGE ,description: "The time when the slot became inactive" }
pg_slot_16:
name: pg_slot
desc: PostgreSQL replication slot metrics v16 with conflicting, now slot also exists on standby
query: |-
SELECT s.slot_name, s.slot_type, plugin, database AS datname,datoid,active_pid,
active,temporary,two_phase,conflicting,xmin::TEXT::BIGINT AS xmin,catalog_xmin::TEXT::BIGINT AS catalog_xmin,
restart_lsn - '0/0' AS restart_lsn, confirmed_flush_lsn - '0/0' AS confirm_lsn,
CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END - restart_lsn AS retained_bytes,
safe_wal_size, CASE wal_status WHEN 'reserved' THEN 0 WHEN 'extended' THEN 1 WHEN 'unreserved' THEN 2 WHEN 'lost' THEN 3 ELSE -1 END AS wal_status,
spill_txns,spill_count,spill_bytes,stream_txns,stream_count,stream_bytes,total_txns,total_bytes,extract(EPOCH FROM stats_reset) AS reset_time
FROM pg_replication_slots s LEFT OUTER JOIN pg_stat_replication_slots ss ON s.slot_name = ss.slot_name;
ttl: 10
min_version: 160000
max_version: 170000
tags: [ cluster ]
metrics:
- slot_name: { usage: LABEL ,description: "A unique, cluster-wide identifier for the replication slot" }
- slot_type: { usage: LABEL ,description: "The slot type, physical or logical" }
- plugin: { usage: LABEL ,description: "The base name of the shared object containing the output plugin this logical slot is using, or null for physical slots." }
- datname: { usage: LABEL ,description: "The name of the database this slot is associated with, logical slots only, null for physical slot" }
- datoid: { usage: GAUGE ,description: "The OID of the database this slot is associated with, logical slots only, null for physical slot" }
- active_pid: { usage: GAUGE ,description: "The process ID of the session streaming data for this slot. NULL if inactive." }
- active: { usage: GAUGE ,description: "True(1) if this slot is currently actively being used" }
- temporary: { usage: GAUGE ,description: "True(1) if this is a temporary replication slot." }
- two_phase: { usage: GAUGE ,description: "True(1) if the slot is enabled for decoding prepared transactions. Always false for physical slots." }
- conflicting: { usage: GAUGE ,description: "True if this logical slot conflicted with recovery. Always NULL for physical slots." }
- xmin: { usage: COUNTER ,description: "The oldest transaction that this slot needs the database to retain." }
- catalog_xmin: { usage: COUNTER ,description: "The oldest transaction affecting the system catalogs that this slot needs the database to retain." }
- restart_lsn: { usage: COUNTER ,description: "The address (LSN) of oldest WAL which still might be required by the consumer of this slot" }
- confirm_lsn: { usage: COUNTER ,description: "The address (LSN) up to which the logical slot's consumer has confirmed receiving data." }
- retained_bytes: { usage: GAUGE ,description: "Size of bytes that retained for this slot" }
- safe_wal_size: { usage: GAUGE ,description: "bytes that can be written to WAL which will not make slot into lost" }
- wal_status: { usage: GAUGE ,description: "WAL reserve status 0-3 means reserved,extended,unreserved,lost, -1 means other" }
- spill_txns: { usage: COUNTER ,description: "Xacts that spilled to disk due to logical decode mem exceeding (subtrans included)" }
- spill_count: { usage: COUNTER ,description: "Xacts that spilled to disk due to logical decode mem exceeding" }
- spill_bytes: { usage: COUNTER ,description: "Bytes that spilled to disk due to logical decode mem exceeding" }
- stream_txns: { usage: COUNTER ,description: "Xacts that streamed to decoding output plugin after mem exceed" }
- stream_count: { usage: COUNTER ,description: "Xacts that streamed to decoding output plugin after mem exceed" }
- stream_bytes: { usage: COUNTER ,description: "Bytes that streamed to decoding output plugin after mem exceed" }
- total_txns: { usage: COUNTER ,description: "Number of decoded xacts sent to the decoding output plugin for this slot" }
- total_bytes: { usage: COUNTER ,description: "Number of decoded bytes sent to the decoding output plugin for this slot" }
- reset_time: { usage: GAUGE ,description: "When statistics were last reset" }
pg_slot_14:
name: pg_slot
desc: PostgreSQL replication slot metrics v14 with pg_stat_replication_slots metrics
query: |-
SELECT s.slot_name, s.slot_type, plugin, database AS datname,datoid,active_pid,
active,temporary,two_phase,xmin::TEXT::BIGINT AS xmin,catalog_xmin::TEXT::BIGINT AS catalog_xmin,
restart_lsn - '0/0' AS restart_lsn, confirmed_flush_lsn - '0/0' AS confirm_lsn,
CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END - restart_lsn AS retained_bytes,
safe_wal_size, CASE wal_status WHEN 'reserved' THEN 0 WHEN 'extended' THEN 1 WHEN 'unreserved' THEN 2 WHEN 'lost' THEN 3 ELSE -1 END AS wal_status,
spill_txns,spill_count,spill_bytes,stream_txns,stream_count,stream_bytes,total_txns,total_bytes,extract(EPOCH FROM stats_reset) AS reset_time
FROM pg_replication_slots s LEFT OUTER JOIN pg_stat_replication_slots ss ON s.slot_name = ss.slot_name;
ttl: 10
min_version: 140000
max_version: 160000
tags: [ cluster, primary ]
metrics:
- slot_name: { usage: LABEL ,description: "A unique, cluster-wide identifier for the replication slot" }
- slot_type: { usage: LABEL ,description: "The slot type, physical or logical" }
- plugin: { usage: LABEL ,description: "The base name of the shared object containing the output plugin this logical slot is using, or null for physical slots." }
- datname: { usage: LABEL ,description: "The name of the database this slot is associated with, logical slots only, null for physical slot" }
- datoid: { usage: GAUGE ,description: "The OID of the database this slot is associated with, logical slots only, null for physical slot" }
- active_pid: { usage: GAUGE ,description: "The process ID of the session streaming data for this slot. NULL if inactive." }
- active: { usage: GAUGE ,description: "True(1) if this slot is currently actively being used" }
- temporary: { usage: GAUGE ,description: "True(1) if this is a temporary replication slot." }
- two_phase: { usage: GAUGE ,description: "True(1) if the slot is enabled for decoding prepared transactions. Always false for physical slots." }
- xmin: { usage: COUNTER ,description: "The oldest transaction that this slot needs the database to retain." }
- catalog_xmin: { usage: COUNTER ,description: "The oldest transaction affecting the system catalogs that this slot needs the database to retain." }
- restart_lsn: { usage: COUNTER ,description: "The address (LSN) of oldest WAL which still might be required by the consumer of this slot" }
- confirm_lsn: { usage: COUNTER ,description: "The address (LSN) up to which the logical slot's consumer has confirmed receiving data." }
- retained_bytes: { usage: GAUGE ,description: "Size of bytes that retained for this slot" }
- safe_wal_size: { usage: GAUGE ,description: "bytes that can be written to WAL which will not make slot into lost" }
- wal_status: { usage: GAUGE ,description: "WAL reserve status 0-3 means reserved,extended,unreserved,lost, -1 means other" }
- spill_txns: { usage: COUNTER ,description: "Xacts that spilled to disk due to logical decode mem exceeding (subtrans included)" }
- spill_count: { usage: COUNTER ,description: "Xacts that spilled to disk due to logical decode mem exceeding" }
- spill_bytes: { usage: COUNTER ,description: "Bytes that spilled to disk due to logical decode mem exceeding" }
- stream_txns: { usage: COUNTER ,description: "Xacts that streamed to decoding output plugin after mem exceed" }
- stream_count: { usage: COUNTER ,description: "Xacts that streamed to decoding output plugin after mem exceed" }
- stream_bytes: { usage: COUNTER ,description: "Bytes that streamed to decoding output plugin after mem exceed" }
- total_txns: { usage: COUNTER ,description: "Number of decoded xacts sent to the decoding output plugin for this slot" }
- total_bytes: { usage: COUNTER ,description: "Number of decoded bytes sent to the decoding output plugin for this slot" }
- reset_time: { usage: GAUGE ,description: "When statistics were last reset" }
pg_slot_13:
name: pg_slot
desc: PostgreSQL replication slot metrics v13 (wal safe size and status)
query: |-
SELECT slot_name, slot_type, plugin, database AS datname,datoid,active_pid,
active,temporary,xmin::TEXT::BIGINT AS xmin,catalog_xmin::TEXT::BIGINT AS catalog_xmin,
restart_lsn - '0/0' AS restart_lsn, confirmed_flush_lsn - '0/0' AS confirm_lsn,
CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END - restart_lsn AS retained_bytes,
safe_wal_size, CASE wal_status WHEN 'reserved' THEN 0 WHEN 'extended' THEN 1 WHEN 'unreserved' THEN 2 WHEN 'lost' THEN 3 ELSE -1 END AS wal_status
FROM pg_replication_slots;
ttl: 10
min_version: 130000
max_version: 140000
tags: [ cluster, primary ]
metrics:
- slot_name: { usage: LABEL ,description: "A unique, cluster-wide identifier for the replication slot" }
- slot_type: { usage: LABEL ,description: "The slot type, physical or logical" }
- plugin: { usage: LABEL ,description: "The base name of the shared object containing the output plugin this logical slot is using, or null for physical slots." }
- datname: { usage: LABEL ,description: "The name of the database this slot is associated with, logical slots only, null for physical slot" }
- datoid: { usage: GAUGE ,description: "The OID of the database this slot is associated with, logical slots only, null for physical slot" }
- active_pid: { usage: GAUGE ,description: "The process ID of the session streaming data for this slot. NULL if inactive." }
- active: { usage: GAUGE ,description: "True(1) if this slot is currently actively being used" }
- temporary: { usage: GAUGE ,description: "True(1) if this is a temporary replication slot." }
- xmin: { usage: COUNTER ,description: "The oldest transaction that this slot needs the database to retain." }
- catalog_xmin: { usage: COUNTER ,description: "The oldest transaction affecting the system catalogs that this slot needs the database to retain." }
- restart_lsn: { usage: COUNTER ,description: "The address (LSN) of oldest WAL which still might be required by the consumer of this slot" }
- confirm_lsn: { usage: COUNTER ,description: "The address (LSN) up to which the logical slot's consumer has confirmed receiving data." }
- retained_bytes: { usage: GAUGE ,description: "Size of bytes that retained for this slot" }
- safe_wal_size: { usage: GAUGE ,description: "bytes that can be written to WAL which will not make slot into lost" }
- wal_status: { usage: GAUGE ,description: "WAL reserve status 0-3 means reserved,extended,unreserved,lost, -1 means other" }
pg_slot_10:
name: pg_slot
desc: PostgreSQL replication slot metrics 10 ~ 12
query: |-
SELECT slot_name, slot_type, plugin, database AS datname,datoid,active_pid,
active,temporary,xmin::TEXT::BIGINT AS xmin,catalog_xmin::TEXT::BIGINT AS catalog_xmin,
restart_lsn - '0/0' AS restart_lsn, confirmed_flush_lsn - '0/0' AS confirm_lsn,
CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END - restart_lsn AS retained_bytes
FROM pg_replication_slots;
ttl: 10
min_version: 100000
max_version: 130000
tags: [ cluster, primary ]
metrics:
- slot_name: { usage: LABEL ,description: "A unique, cluster-wide identifier for the replication slot" }
- slot_type: { usage: LABEL ,description: "The slot type, physical or logical" }
- plugin: { usage: LABEL ,description: "The base name of the shared object containing the output plugin this logical slot is using, or null for physical slots." }
- datname: { usage: LABEL ,description: "The name of the database this slot is associated with, logical slots only, null for physical slot" }
- datoid: { usage: GAUGE ,description: "The OID of the database this slot is associated with, logical slots only, null for physical slot" }
- active_pid: { usage: GAUGE ,description: "The process ID of the session streaming data for this slot. NULL if inactive." }
- active: { usage: GAUGE ,description: "True(1) if this slot is currently actively being used" }
- temporary: { usage: GAUGE ,description: "True(1) if this is a temporary replication slot." }
- xmin: { usage: COUNTER ,description: "The oldest transaction that this slot needs the database to retain." }
- catalog_xmin: { usage: COUNTER ,description: "The oldest transaction affecting the system catalogs that this slot needs the database to retain." }
- restart_lsn: { usage: COUNTER ,description: "The address (LSN) of oldest WAL which still might be required by the consumer of this slot" }
- confirm_lsn: { usage: COUNTER ,description: "The address (LSN) up to which the logical slot's consumer has confirmed receiving data." }
- retained_bytes: { usage: GAUGE ,description: "Size of bytes that retained for this slot" }
#==============================================================#
# 0250 pg_recv
#==============================================================#
pg_recv_13:
name: pg_recv
desc: PostgreSQL walreceiver metrics 13+
query: |-
SELECT
coalesce(sender_host, (regexp_match(conninfo, '.*host=(\S+).*'))[1]) AS sender_host, coalesce(sender_port::TEXT, (regexp_match(conninfo, '.*port=(\S+).*'))[1]) AS sender_port, coalesce(slot_name, 'NULL') AS slot_name,
pid, CASE status WHEN 'streaming' THEN 0 WHEN 'startup' THEN 1 WHEN 'catchup' THEN 2 WHEN 'backup' THEN 3 WHEN 'stopping' THEN 4 ELSE -1 END AS state,
receive_start_lsn - '0/0' AS init_lsn,receive_start_tli AS init_tli,
flushed_lsn - '0/0' AS flush_lsn,written_lsn - '0/0' AS write_lsn, received_tli AS flush_tli, latest_end_lsn - '0/0' AS reported_lsn,
last_msg_send_time AS msg_send_time,last_msg_receipt_time AS msg_recv_time,latest_end_time AS reported_time,now() AS time FROM pg_stat_wal_receiver;
ttl: 10
min_version: 130000
tags: [ cluster, replica ]
metrics:
- sender_host: { usage: LABEL ,description: "Host of the PostgreSQL instance this WAL receiver is connected to" }
- sender_port: { usage: LABEL ,description: "Port number of the PostgreSQL instance this WAL receiver is connected to." }
- slot_name: { usage: LABEL ,description: "Replication slot name used by this WAL receiver" }
- pid: { usage: GAUGE ,description: "Process ID of the WAL receiver process" }
- state: { usage: GAUGE ,description: "Encoded activity status of the WAL receiver process 0-4 for streaming|startup|catchup|backup|stopping" }
- init_lsn: { usage: COUNTER ,description: "First write-ahead log location used when WAL receiver is started" }
- init_tli: { usage: COUNTER ,description: "First timeline number used when WAL receiver is started" }
- flush_lsn: { usage: COUNTER ,description: "Last write-ahead log location already received and flushed to disk" }
- write_lsn: { usage: COUNTER ,description: "Last write-ahead log location already received and written to disk, but not flushed." }
- flush_tli: { usage: COUNTER ,description: "Timeline number of last write-ahead log location received and flushed to disk" }
- reported_lsn: { usage: COUNTER ,description: "Last write-ahead log location reported to origin WAL sender" }
- msg_send_time: { usage: GAUGE ,description: "Send time of last message received from origin WAL sender" }
- msg_recv_time: { usage: GAUGE ,description: "Receipt time of last message received from origin WAL sender" }
- reported_time: { usage: GAUGE ,description: "Time of last write-ahead log location reported to origin WAL sender" }
- time: { usage: GAUGE ,description: "Time of current snapshot" }
pg_recv_11:
name: pg_recv
desc: PostgreSQL walreceiver metrics (11-12)
query: |-
SELECT
coalesce(sender_host, (regexp_match(conninfo, '.*host=(\S+).*'))[1]) AS sender_host, coalesce(sender_port::TEXT, (regexp_match(conninfo, '.*port=(\S+).*'))[1]) AS sender_port, coalesce(slot_name, 'NULL') AS slot_name,
pid, CASE status WHEN 'streaming' THEN 0 WHEN 'startup' THEN 1 WHEN 'catchup' THEN 2 WHEN 'backup' THEN 3 WHEN 'stopping' THEN 4 ELSE -1 END AS state,
receive_start_lsn - '0/0' AS init_lsn,receive_start_tli AS init_tli,
received_lsn - '0/0' AS flush_lsn, received_tli AS flush_tli, latest_end_lsn - '0/0' AS reported_lsn,
last_msg_send_time AS msg_send_time,last_msg_receipt_time AS msg_recv_time,latest_end_time AS reported_time,now() AS time FROM pg_stat_wal_receiver;
ttl: 10
tags: [ cluster, replica ]
min_version: 110000
max_version: 130000
metrics:
- sender_host: { usage: LABEL ,description: "Host of the PostgreSQL instance this WAL receiver is connected to" }
- sender_port: { usage: LABEL ,description: "Port number of the PostgreSQL instance this WAL receiver is connected to." }
- slot_name: { usage: LABEL ,description: "Replication slot name used by this WAL receiver" }
- pid: { usage: GAUGE ,description: "Process ID of the WAL receiver process" }
- state: { usage: GAUGE ,description: "Encoded activity status of the WAL receiver process 0-4 for streaming|startup|catchup|backup|stopping" }
- init_lsn: { usage: COUNTER ,description: "First write-ahead log location used when WAL receiver is started" }
- init_tli: { usage: COUNTER ,description: "First timeline number used when WAL receiver is started" }
- flush_lsn: { usage: COUNTER ,description: "Last write-ahead log location already received and flushed to disk" }
- flush_tli: { usage: COUNTER ,description: "Timeline number of last write-ahead log location received and flushed to disk" }
- reported_lsn: { usage: COUNTER ,description: "Last write-ahead log location reported to origin WAL sender" }
- msg_send_time: { usage: GAUGE ,description: "Send time of last message received from origin WAL sender" }
- msg_recv_time: { usage: GAUGE ,description: "Receipt time of last message received from origin WAL sender" }
- reported_time: { usage: GAUGE ,description: "Time of last write-ahead log location reported to origin WAL sender" }
- time: { usage: GAUGE ,description: "Time of current snapshot" }
pg_recv_10:
name: pg_recv
desc: PostgreSQL walreceiver metrics (10)
query: |-
SELECT
(regexp_match(conninfo, '.*host=(\S+).*'))[1] AS sender_host, (regexp_match(conninfo, '.*port=(\S+).*'))[1] AS sender_port, coalesce(slot_name, 'NULL') AS slot_name,
pid, CASE status WHEN 'streaming' THEN 0 WHEN 'startup' THEN 1 WHEN 'catchup' THEN 2 WHEN 'backup' THEN 3 WHEN 'stopping' THEN 4 ELSE -1 END AS state,
receive_start_lsn - '0/0' AS init_lsn,receive_start_tli AS init_tli,
received_lsn - '0/0' AS flush_lsn, received_tli AS flush_tli, latest_end_lsn - '0/0' AS reported_lsn,
last_msg_send_time AS msg_send_time,last_msg_receipt_time AS msg_recv_time,latest_end_time AS reported_time,now() AS time FROM pg_stat_wal_receiver;
ttl: 10
tags: [ cluster, replica ]
min_version: 100000
max_version: 110000
metrics:
- sender_host: { usage: LABEL ,description: "Host of the PostgreSQL instance this WAL receiver is connected to" }
- sender_port: { usage: LABEL ,description: "Port number of the PostgreSQL instance this WAL receiver is connected to." }
- slot_name: { usage: LABEL ,description: "Replication slot name used by this WAL receiver" }
- pid: { usage: GAUGE ,description: "Process ID of the WAL receiver process" }
- state: { usage: GAUGE ,description: "Encoded activity status of the WAL receiver process 0-4 for streaming|startup|catchup|backup|stopping" }
- init_lsn: { usage: COUNTER ,description: "First write-ahead log location used when WAL receiver is started" }
- init_tli: { usage: COUNTER ,description: "First timeline number used when WAL receiver is started" }
- flush_lsn: { usage: COUNTER ,description: "Last write-ahead log location already received and flushed to disk" }
- flush_tli: { usage: COUNTER ,description: "Timeline number of last write-ahead log location received and flushed to disk" }
- reported_lsn: { usage: COUNTER ,description: "Last write-ahead log location reported to origin WAL sender" }
- msg_send_time: { usage: GAUGE ,description: "Send time of last message received from origin WAL sender" }
- msg_recv_time: { usage: GAUGE ,description: "Receipt time of last message received from origin WAL sender" }
- reported_time: { usage: GAUGE ,description: "Time of last write-ahead log location reported to origin WAL sender" }
- time: { usage: GAUGE ,description: "Time of current snapshot" }
#==============================================================#
# 0260 pg_sub
#==============================================================#
pg_sub_16:
name: pg_sub
desc: PostgreSQL subscription statistics (16+)
query: |-
SELECT
s1.subname, subid AS id, pid, received_lsn, reported_lsn,
msg_send_time, msg_recv_time, reported_time,
apply_error_count, sync_error_count
FROM
(SELECT
subname, subid, pid,
received_lsn - '0/0' AS received_lsn, latest_end_lsn - '0/0' AS reported_lsn,
extract(epoch from last_msg_send_time) AS msg_send_time,
extract(epoch from last_msg_receipt_time) AS msg_recv_time,
extract(epoch from latest_end_time) AS reported_time
FROM pg_stat_subscription
WHERE relid IS NULL AND leader_pid IS NULL) s1
LEFT OUTER JOIN pg_stat_subscription_stats s2 USING(subid);
ttl: 10
min_version: 160000
tags: [ cluster ]
metrics:
- subname: { usage: LABEL ,description: "Name of this subscription" }
- id: { usage: GAUGE ,description: "OID of the subscription" }
- pid: { usage: GAUGE ,description: "Process ID of the subscription leader apply worker" }
- received_lsn: { usage: COUNTER ,description: "Last write-ahead log location received" }
- reported_lsn: { usage: COUNTER ,description: "Last write-ahead log location reported to origin WAL sender" }
- msg_send_time: { usage: GAUGE ,description: "Send time of last message received from origin WAL sender" }
- msg_recv_time: { usage: GAUGE ,description: "Receipt time of last message received from origin WAL sender" }
- reported_time: { usage: GAUGE ,description: "Time of last write-ahead log location reported to origin WAL sender" }
- apply_error_count: { usage: COUNTER ,description: "Number of times an error occurred while applying changes" }
- sync_error_count: { usage: COUNTER ,description: "Number of times an error occurred during the initial table synchronization" }
pg_sub_15:
name: pg_sub
desc: PostgreSQL subscription statistics (15)
query: |-
SELECT
s1.subname, subid AS id, pid, received_lsn, reported_lsn,
msg_send_time, msg_recv_time, reported_time,
apply_error_count, sync_error_count
FROM
(SELECT
subname, subid, pid,
received_lsn - '0/0' AS received_lsn, latest_end_lsn - '0/0' AS reported_lsn,
extract(epoch from last_msg_send_time) AS msg_send_time,
extract(epoch from last_msg_receipt_time) AS msg_recv_time,
extract(epoch from latest_end_time) AS reported_time
FROM pg_stat_subscription WHERE relid ISNULL) s1
LEFT OUTER JOIN pg_stat_subscription_stats s2 USING(subid);
ttl: 10
min_version: 150000
max_version: 160000
tags: [ cluster ]
metrics:
- subname: { usage: LABEL ,description: "Name of this subscription" }
- id: { usage: GAUGE ,description: "OID of the subscription" }
- pid: { usage: GAUGE ,description: "Process ID of the subscription main apply worker process" }
- received_lsn: { usage: COUNTER ,description: "Last write-ahead log location received" }
- reported_lsn: { usage: COUNTER ,description: "Last write-ahead log location reported to origin WAL sender" }
- msg_send_time: { usage: GAUGE ,description: "Send time of last message received from origin WAL sender" }
- msg_recv_time: { usage: GAUGE ,description: "Receipt time of last message received from origin WAL sender" }
- reported_time: { usage: GAUGE ,description: "Time of last write-ahead log location reported to origin WAL sender" }
- apply_error_count: { usage: COUNTER ,description: "Number of times an error occurred while applying changes." }
- sync_error_count: { usage: COUNTER ,description: "Number of times an error occurred during the initial table synchronization" }
pg_sub_10:
name: pg_sub
desc: PostgreSQL subscription statistics (10-14)
query: |-
SELECT
subname, subid AS id, pid,
received_lsn - '0/0' AS received_lsn, latest_end_lsn - '0/0' AS reported_lsn,
extract(epoch from last_msg_send_time) AS msg_send_time,
extract(epoch from last_msg_receipt_time) AS msg_recv_time,
extract(epoch from latest_end_time) AS reported_time
FROM pg_stat_subscription WHERE relid ISNULL;
ttl: 10
min_version: 100000
max_version: 150000
tags: [ cluster ]
metrics:
- subname: { usage: LABEL ,description: "Name of this subscription" }
- id: { usage: GAUGE ,description: "OID of the subscription" }
- pid: { usage: GAUGE ,description: "Process ID of the subscription main apply worker process" }