Derived Metrics
Definitions of the monitoring metrics that Pigsty derives from raw exporter metrics.
The rules below define all of Pigsty's derived metrics.
Derived Metrics for Node
---
# Prometheus rule files declare their rule groups under a top-level `groups:` key.
groups:
# node-rules: recording rules derived from node exporter metrics
- name: node-rules
rules:
#==============================================================#
# Aliveness #
#==============================================================#
# TODO: change this to your node exporter port
# node_exporter_up: the scrape `up` series of targets whose instance label ends in :9099
- record: node_exporter_up
expr: up{instance=~".*:9099"}
# node:uptime: evaluation time minus the boot timestamp
- record: node:uptime
expr: time() - node_boot_time_seconds{}
#==============================================================#
# CPU #
#==============================================================#
# Levels: node:cpu:* keeps the cpu label, node:ins:* aggregates it away,
# node:cls:* aggregates by the cls label.
# cpu mode time ratio
- record: node:cpu:cpu_mode
expr: irate(node_cpu_seconds_total{}[1m])
- record: node:ins:cpu_mode
expr: sum without (cpu) (node:cpu:cpu_mode)
- record: node:cls:cpu_mode
expr: sum by (cls, mode) (node:ins:cpu_mode)
# cpu schedule time-slices
- record: node:cpu:sched_timeslices
expr: irate(node_schedstat_timeslices_total{}[1m])
- record: node:ins:sched_timeslices
expr: sum without (cpu) (node:cpu:sched_timeslices)
- record: node:cls:sched_timeslices
expr: sum by (cls) (node:ins:sched_timeslices)
# deprecated alias: this rule used to be recorded under a misspelled name;
# kept so that existing queries keep working
- record: node:cls:sched_timeslicesa
expr: sum by (cls) (node:ins:sched_timeslices)
# per-core cpu usage: 1 minus the idle-mode ratio
- record: node:cpu:cpu_usage
expr: 1 - sum without (mode) (node:cpu:cpu_mode{mode="idle"})
# cpu count: number of per-core usage series of each instance.
# Recorded after node:cpu:cpu_usage because rules in a group run in file
# order; placed earlier it would read the previous evaluation's samples.
- record: node:ins:cpu_count
expr: count without (cpu) (node:cpu:cpu_usage)
- record: node:cls:cpu_count
expr: sum by (cls) (node:ins:cpu_count)
# cpu usage: the instance level is the mean over cores, the cluster level
# is the mean over instances weighted by their core count
- record: node:ins:cpu_usage
expr: sum without (cpu) (node:cpu:cpu_usage) / node:ins:cpu_count
- record: node:cls:cpu_usage
expr: sum by (cls) (node:ins:cpu_usage * node:ins:cpu_count) / sum by (cls) (node:ins:cpu_count)
# cpu usage avg5m: 5-minute moving average of the usage series above
- record: node:cpu:cpu_usage_avg5m
expr: avg_over_time(node:cpu:cpu_usage[5m])
- record: node:ins:cpu_usage_avg5m
expr: avg_over_time(node:ins:cpu_usage[5m])
- record: node:cls:cpu_usage_avg5m
expr: avg_over_time(node:cls:cpu_usage[5m])
#==============================================================#
# Memory #
#==============================================================#
# mem usage
# mem_app: total memory minus free, buffers, cached, slab, page tables and swap cached
- record: node:ins:mem_app
expr: node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes - node_memory_Slab_bytes - node_memory_PageTables_bytes - node_memory_SwapCached_bytes
# mem_free: free plus cached memory
- record: node:ins:mem_free
expr: node_memory_MemFree_bytes{} + node_memory_Cached_bytes{}
# mem_usage: mem_app as a fraction of total memory (per instance, then per cluster)
- record: node:ins:mem_usage
expr: node:ins:mem_app / node_memory_MemTotal_bytes
- record: node:cls:mem_usage
expr: sum by (cls) (node:ins:mem_app) / sum by (cls) (node_memory_MemTotal_bytes)
# swap_usage: 1 minus the free fraction of swap
# NOTE(review): a host reporting SwapTotal = 0 makes this 0 / 0 (NaN) — confirm whether a guard is wanted
- record: node:ins:swap_usage
expr: 1 - node_memory_SwapFree_bytes{} / node_memory_SwapTotal_bytes{}
#==============================================================#
# Disk #
#==============================================================#
# Levels: node:dev:* is per device, node:ins:* sums over devices,
# node:cls:* sums by the cls label.
# The device matcher keeps names made only of letters, '-' and '_';
# device names containing digits do not match.
# disk read iops
- record: node:dev:disk_read_iops
expr: irate(node_disk_reads_completed_total{device=~"[a-zA-Z-_]+"}[1m])
- record: node:ins:disk_read_iops
expr: sum without (device) (node:dev:disk_read_iops)
- record: node:cls:disk_read_iops
expr: sum by (cls) (node:ins:disk_read_iops)
# disk write iops
- record: node:dev:disk_write_iops
expr: irate(node_disk_writes_completed_total{device=~"[a-zA-Z-_]+"}[1m])
- record: node:ins:disk_write_iops
expr: sum without (device) (node:dev:disk_write_iops)
- record: node:cls:disk_write_iops
expr: sum by (cls) (node:ins:disk_write_iops)
# disk iops: read + write
- record: node:dev:disk_iops
expr: node:dev:disk_read_iops + node:dev:disk_write_iops
- record: node:ins:disk_iops
expr: node:ins:disk_read_iops + node:ins:disk_write_iops
- record: node:cls:disk_iops
expr: node:cls:disk_read_iops + node:cls:disk_write_iops
# read bandwidth (rate1m)
- record: node:dev:disk_read_rate
expr: rate(node_disk_read_bytes_total{device=~"[a-zA-Z-_]+"}[1m])
- record: node:ins:disk_read_rate
expr: sum without (device) (node:dev:disk_read_rate)
- record: node:cls:disk_read_rate
expr: sum by (cls) (node:ins:disk_read_rate)
# write bandwidth (rate1m)
- record: node:dev:disk_write_rate
expr: rate(node_disk_written_bytes_total{device=~"[a-zA-Z-_]+"}[1m])
- record: node:ins:disk_write_rate
expr: sum without (device) (node:dev:disk_write_rate)
- record: node:cls:disk_write_rate
expr: sum by (cls) (node:ins:disk_write_rate)
# io bandwidth (rate1m): read + write
- record: node:dev:disk_io_rate
expr: node:dev:disk_read_rate + node:dev:disk_write_rate
- record: node:ins:disk_io_rate
expr: node:ins:disk_read_rate + node:ins:disk_write_rate
- record: node:cls:disk_io_rate
expr: node:cls:disk_read_rate + node:cls:disk_write_rate
# read/write total time (rate1m of the time counters)
- record: node:dev:disk_read_time
expr: rate(node_disk_read_time_seconds_total{device=~"[a-zA-Z-_]+"}[1m])
# write time must come from the write counter, not the read counter
- record: node:dev:disk_write_time
expr: rate(node_disk_write_time_seconds_total{device=~"[a-zA-Z-_]+"}[1m])
# read/write response time: time spent divided by operations completed
# NOTE(review): the time series use rate() while the iops series use irate(),
# so numerator and denominator cover different windows — confirm this is intended
- record: node:dev:disk_read_rt
expr: node:dev:disk_read_time / node:dev:disk_read_iops
- record: node:dev:disk_write_rt
expr: node:dev:disk_write_time / node:dev:disk_write_iops
# combined response time: the divisor is the recorded node:dev:disk_iops series
- record: node:dev:disk_rt
expr: (node:dev:disk_read_time + node:dev:disk_write_time) / node:dev:disk_iops
#==============================================================#
# Network #
#==============================================================#
# transmit bandwidth (out)
- record: node:dev:network_tx
expr: irate(node_network_transmit_bytes_total{}[1m])
# instance total skips devices named lo or starting with bond
- record: node:ins:network_tx
expr: sum without (device) (node:dev:network_tx{device!~"lo|bond.*"})
- record: node:cls:network_tx
expr: sum by (cls) (node:ins:network_tx)
# receive bandwidth (in)
- record: node:dev:network_rx
expr: irate(node_network_receive_bytes_total{}[1m])
# instance total skips devices named lo or starting with bond
- record: node:ins:network_rx
expr: sum without (device) (node:dev:network_rx{device!~"lo|bond.*"})
- record: node:cls:network_rx
expr: sum by (cls) (node:ins:network_rx)
# io bandwidth: transmit + receive
# NOTE(review): the device-level record is named network_io_rate while the
# instance and cluster levels are named network_io — confirm before renaming
- record: node:dev:network_io_rate
expr: node:dev:network_tx + node:dev:network_rx
- record: node:ins:network_io
expr: node:ins:network_tx + node:ins:network_rx
- record: node:cls:network_io
expr: node:cls:network_tx + node:cls:network_rx
#==============================================================#
# Schedule #
#==============================================================#
# normalized load: load average divided by the instance cpu count
- record: node:ins:stdload1
expr: node_load1 / node:ins:cpu_count
- record: node:ins:stdload5
expr: node_load5 / node:ins:cpu_count
- record: node:ins:stdload15
expr: node_load15 / node:ins:cpu_count
# process
- record: node:ins:forks
expr: irate(node_forks_total[1m])
# interrupt & context switch
- record: node:ins:interrupt
expr: irate(node_intr_total[1m])
# deprecated alias: this rule used to be recorded under a misspelled name;
# kept so that existing queries keep working
- record: node:ins:intrrupt
expr: irate(node_intr_total[1m])
- record: node:ins:ctx_switch
expr: irate(node_context_switches_total{}[1m])
#==============================================================#
# VM #
#==============================================================#
# each rule is the irate over a 1m window of the matching node_vmstat_* counter
# page faults
- record: node:ins:pagefault
expr: irate(node_vmstat_pgfault[1m])
# page in / page out
- record: node:ins:pagein
expr: irate(node_vmstat_pgpgin[1m])
- record: node:ins:pageout
expr: irate(node_vmstat_pgpgout[1m])
# swap in / swap out
- record: node:ins:swapin
expr: irate(node_vmstat_pswpin[1m])
- record: node:ins:swapout
expr: irate(node_vmstat_pswpout[1m])
#==============================================================#
# FS #
#==============================================================#
# filesystem space usage
# the node:fs:*_bytes and *_inode rules skip fstype values starting with nfs, rootfs or tmpfs,
# then take the max across the device and fstype labels
- record: node:fs:free_bytes
expr: max without(device, fstype) (node_filesystem_free_bytes{fstype!~"(n|root|tmp)fs.*"})
- record: node:fs:avail_bytes
expr: max without(device, fstype) (node_filesystem_avail_bytes{fstype!~"(n|root|tmp)fs.*"})
- record: node:fs:size_bytes
expr: max without(device, fstype) (node_filesystem_size_bytes{fstype!~"(n|root|tmp)fs.*"})
# space_usage: 1 minus the available fraction of the filesystem size
- record: node:fs:space_usage
expr: 1 - (node:fs:avail_bytes{} / node:fs:size_bytes{})
- record: node:fs:free_inode
expr: max without(device, fstype) (node_filesystem_files_free{fstype!~"(n|root|tmp)fs.*"})
- record: node:fs:total_inode
expr: max without(device, fstype) (node_filesystem_files{fstype!~"(n|root|tmp)fs.*"})
# space delta and prediction
# space_deriv_1h: negated 1h derivative of available bytes, so it is positive while space shrinks
# NOTE(review): these two rules read the raw node_filesystem_avail_bytes (no fstype filter,
# device/fstype labels kept) rather than node:fs:avail_bytes — confirm this is intended
- record: node:fs:space_deriv_1h
expr: 0 - deriv(node_filesystem_avail_bytes{}[1h])
# space_exhaust: available bytes divided by the consumption rate; only positive results are kept
- record: node:fs:space_exhaust
expr: (node_filesystem_avail_bytes{} / node:fs:space_deriv_1h{}) > 0
# fs inode usage: 1 minus the free fraction of inodes
- record: node:fs:inode_usage
expr: 1 - (node:fs:free_inode / node:fs:total_inode)
# file descriptor usage: allocated descriptors over the maximum
- record: node:ins:fd_usage
expr: node_filefd_allocated / node_filefd_maximum
#==============================================================#
# TCP #
#==============================================================#
# tcp segments (rate1m): in, out and retransmitted
- record: node:ins:tcp_insegs
expr: rate(node_netstat_Tcp_InSegs{}[1m])
- record: node:ins:tcp_outsegs
expr: rate(node_netstat_Tcp_OutSegs{}[1m])
- record: node:ins:tcp_retranssegs
expr: rate(node_netstat_Tcp_RetransSegs{}[1m])
# tcp_segs: in + out
- record: node:ins:tcp_segs
expr: node:ins:tcp_insegs + node:ins:tcp_outsegs
# retransmit: retransmitted segments as a fraction of outgoing segments
- record: node:ins:tcp_retrans_rate
expr: node:ins:tcp_retranssegs / node:ins:tcp_outsegs
# overflow: rate1m of the listen overflow counter
- record: node:ins:tcp_overflow_rate
expr: rate(node_netstat_TcpExt_ListenOverflows[1m])
#==============================================================#
# Netstat #
#==============================================================#
# tcp open (rate1m): passive and active opens
- record: node:ins:tcp_passive_opens
expr: rate(node_netstat_Tcp_PassiveOpens[1m])
- record: node:ins:tcp_active_opens
expr: rate(node_netstat_Tcp_ActiveOpens[1m])
# tcp close (rate1m): failed attempts and resets of established connections
- record: node:ins:tcp_attempt_fails
expr: rate(node_netstat_Tcp_AttemptFails[1m])
- record: node:ins:tcp_estab_resets
expr: rate(node_netstat_Tcp_EstabResets[1m])
# tcp drop (rate1m): listen overflows and listen drops
# NOTE(review): tcp_overflow has the same expression as node:ins:tcp_overflow_rate
- record: node:ins:tcp_overflow
expr: rate(node_netstat_TcpExt_ListenOverflows[1m])
- record: node:ins:tcp_dropped
expr: rate(node_netstat_TcpExt_ListenDrops[1m])
#==============================================================#
# NTP #
#==============================================================#
# ntp_offset_range: spread (max minus min) of the NTP offset inside each cluster
- record: node:cls:ntp_offset_range
expr: max by (cls)(node_ntp_offset_seconds) - min by (cls)(node_ntp_offset_seconds)
...
Derived Metrics about postgres and pgbouncer
---
#==============================================================#
# File : pgsql.yml
# Ctime : 2020-04-22
# Mtime : 2020-12-03
# Desc : Record and alert rules for postgres
# Path : /etc/prometheus/rules/pgsql.yml
# Copyright (C) 2018-2021 Ruohang Feng
#==============================================================#
groups:
################################################################
# PgSQL Rules #
################################################################
# pgsql-rules: recording rules derived from postgres and pgbouncer metrics
- name: pgsql-rules
rules:
#==============================================================#
# Aliveness #
#==============================================================#
# TODO: change these to your pg_exporter & pgbouncer_exporter port
# both rules copy the scrape `up` series, selected by the port suffix of the instance label
- record: pg_exporter_up
expr: up{instance=~".*:9185"}
- record: pgbouncer_exporter_up
expr: up{instance=~".*:9127"}
#==============================================================#
# Identity #
#==============================================================#
# pg_is_primary / pg_is_replica: complementary views of pg_in_recovery
# (assumes pg_in_recovery and pg_up are 0/1 gauges — TODO confirm)
- record: pg_is_primary
expr: 1 - pg_in_recovery
- record: pg_is_replica
expr: pg_in_recovery
# pg_status: pg_up weighted by 2 plus the primary flag, packed into one number
- record: pg_status
expr: (pg_up{} * 2) + (1 - pg_in_recovery{})
# encoded: 0:replica[DOWN] 1:primary[DOWN] 2:replica 3:primary
#==============================================================#
# Age #
#==============================================================#
# age: max database age per instance (datname matching template[0-9] is skipped),
# then max per cluster and max overall
- record: pg:ins:age
expr: max without (datname) (pg_database_age{datname!~"template[0-9]"})
- record: pg:cls:age
expr: max by (cls) (pg:ins:age)
- record: pg:all:age
expr: max(pg:cls:age)
# age derive and prediction
# age_deriv_1h: per-second derivative of the database age over the last hour
- record: pg:db:age_deriv_1h
expr: deriv(pg_database_age{}[1h])
# age_exhaust: remaining headroom up to 2147483648 (2^31) divided by that derivative
- record: pg:db:age_exhaust
expr: (2147483648 - pg_database_age{}) / pg:db:age_deriv_1h
#==============================================================#
# Sessions #
#==============================================================#
# Levels: pg:db:* is per database, pg:ins:* sums over datname, pg:svc:* sums
# by (cls, role), pg:cls:* sums by cls, pg:all:* sums everything.
# session count (by state)
- record: pg:db:sessions
expr: pg_activity_count
- record: pg:ins:sessions
expr: sum without (datname) (pg:db:sessions)
- record: pg:svc:sessions
expr: sum by (cls, role, state) (pg:ins:sessions)
- record: pg:cls:sessions
expr: sum by (cls, state) (pg:ins:sessions)
- record: pg:all:sessions
expr: sum by (state) (pg:cls:sessions)
# backends
- record: pg:db:backends
expr: pg_db_numbackends
- record: pg:ins:backends
expr: sum without (datname) (pg_db_numbackends)
- record: pg:svc:backends
expr: sum by (cls, role) (pg:ins:backends)
- record: pg:cls:backends
expr: sum by (cls) (pg:ins:backends)
- record: pg:all:backends
expr: sum(pg:cls:backends)
# active backends: sessions whose state is "active"
- record: pg:ins:active_backends
expr: pg:ins:sessions{state="active"}
- record: pg:svc:active_backends
expr: sum by (cls, role) (pg:ins:active_backends)
- record: pg:cls:active_backends
expr: sum by (cls) (pg:ins:active_backends)
- record: pg:all:active_backends
expr: sum(pg:cls:active_backends)
# idle in xact backends (including abort): sessions whose state starts with "idle in".
# The svc/cls/all levels aggregate the ixact series, not the active one.
- record: pg:ins:ixact_backends
expr: pg:ins:sessions{state=~"idle in.*"}
- record: pg:svc:ixact_backends
expr: sum by (cls, role) (pg:ins:ixact_backends)
- record: pg:cls:ixact_backends
expr: sum by (cls) (pg:ins:ixact_backends)
- record: pg:all:ixact_backends
expr: sum(pg:cls:ixact_backends)
#==============================================================#
# Servers (Pgbouncer) #
#==============================================================#
# Every family below follows the same ladder: pg:pool:* is the pgbouncer pool
# metric without the "pgbouncer" database, pg:db:* sums over user,
# pg:ins:* sums over user and datname, pg:svc:* sums by (cls, role),
# pg:cls:* sums by cls, pg:all:* sums everything.
# active servers
- record: pg:pool:active_servers
expr: pgbouncer_pool_active_servers{datname!="pgbouncer"}
- record: pg:db:active_servers
expr: sum without(user) (pg:pool:active_servers)
- record: pg:ins:active_servers
expr: sum without(user, datname) (pg:pool:active_servers)
- record: pg:svc:active_servers
expr: sum by (cls, role) (pg:ins:active_servers)
- record: pg:cls:active_servers
expr: sum by (cls) (pg:ins:active_servers)
- record: pg:all:active_servers
expr: sum(pg:cls:active_servers)
# idle servers
- record: pg:pool:idle_servers
expr: pgbouncer_pool_idle_servers{datname!="pgbouncer"}
- record: pg:db:idle_servers
expr: sum without(user) (pg:pool:idle_servers)
- record: pg:ins:idle_servers
expr: sum without(user, datname) (pg:pool:idle_servers)
- record: pg:svc:idle_servers
expr: sum by (cls, role) (pg:ins:idle_servers)
- record: pg:cls:idle_servers
expr: sum by (cls) (pg:ins:idle_servers)
- record: pg:all:idle_servers
expr: sum(pg:cls:idle_servers)
# used servers
- record: pg:pool:used_servers
expr: pgbouncer_pool_used_servers{datname!="pgbouncer"}
- record: pg:db:used_servers
expr: sum without(user) (pg:pool:used_servers)
- record: pg:ins:used_servers
expr: sum without(user, datname) (pg:pool:used_servers)
- record: pg:svc:used_servers
expr: sum by (cls, role) (pg:ins:used_servers)
- record: pg:cls:used_servers
expr: sum by (cls) (pg:ins:used_servers)
- record: pg:all:used_servers
expr: sum(pg:cls:used_servers)
# tested servers
- record: pg:pool:tested_servers
expr: pgbouncer_pool_tested_servers{datname!="pgbouncer"}
- record: pg:db:tested_servers
expr: sum without(user) (pg:pool:tested_servers)
- record: pg:ins:tested_servers
expr: sum without(user, datname) (pg:pool:tested_servers)
- record: pg:svc:tested_servers
expr: sum by (cls, role) (pg:ins:tested_servers)
- record: pg:cls:tested_servers
expr: sum by (cls) (pg:ins:tested_servers)
- record: pg:all:tested_servers
expr: sum(pg:cls:tested_servers)
# login servers
- record: pg:pool:login_servers
expr: pgbouncer_pool_login_servers{datname!="pgbouncer"}
- record: pg:db:login_servers
expr: sum without(user) (pg:pool:login_servers)
- record: pg:ins:login_servers
expr: sum without(user, datname) (pg:pool:login_servers)
- record: pg:svc:login_servers
expr: sum by (cls, role) (pg:ins:login_servers)
- record: pg:cls:login_servers
expr: sum by (cls) (pg:ins:login_servers)
- record: pg:all:login_servers
expr: sum(pg:cls:login_servers)
#==============================================================#
# Clients (Pgbouncer) #
#==============================================================#
# Both families follow the same ladder: pg:pool:* is the pgbouncer pool metric
# without the "pgbouncer" database, pg:db:* sums over user, pg:ins:* sums over
# user and datname, then svc / cls / all sum by (cls, role), by cls and overall.
# active clients
- record: pg:pool:active_clients
expr: pgbouncer_pool_active_clients{datname!="pgbouncer"}
- record: pg:db:active_clients
expr: sum without(user) (pg:pool:active_clients)
- record: pg:ins:active_clients
expr: sum without(user, datname) (pg:pool:active_clients)
- record: pg:svc:active_clients
expr: sum by (cls, role) (pg:ins:active_clients)
- record: pg:cls:active_clients
expr: sum by (cls) (pg:ins:active_clients)
- record: pg:all:active_clients
expr: sum(pg:cls:active_clients)
# waiting clients
- record: pg:pool:waiting_clients
expr: pgbouncer_pool_waiting_clients{datname!="pgbouncer"}
- record: pg:db:waiting_clients
expr: sum without(user) (pg:pool:waiting_clients)
- record: pg:ins:waiting_clients
expr: sum without(user, datname) (pg:pool:waiting_clients)
- record: pg:svc:waiting_clients
expr: sum by (cls, role) (pg:ins:waiting_clients)
- record: pg:cls:waiting_clients
expr: sum by (cls) (pg:ins:waiting_clients)
- record: pg:all:waiting_clients
expr: sum(pg:cls:waiting_clients)
#==============================================================#
# Transactions #
#==============================================================#
# Levels: pg:db:* is per database, pg:ins:* sums over datname, pg:svc:* sums
# by (cls, role), pg:cls:* sums by cls, pg:all:* sums everything.
# "realtime" rules use irate over a 1m window, the plain ones use rate.
# commits (realtime)
- record: pg:db:commits_realtime
expr: irate(pg_db_xact_commit{}[1m])
- record: pg:ins:commits_realtime
expr: sum without (datname) (pg:db:commits_realtime)
- record: pg:svc:commits_realtime
expr: sum by (cls, role) (pg:ins:commits_realtime)
- record: pg:cls:commits_realtime
expr: sum by (cls) (pg:ins:commits_realtime)
- record: pg:all:commits_realtime
expr: sum(pg:cls:commits_realtime)
# commits (rate1m)
- record: pg:db:commits
expr: rate(pg_db_xact_commit{}[1m])
- record: pg:ins:commits
expr: sum without (datname) (pg:db:commits)
- record: pg:svc:commits
expr: sum by (cls, role) (pg:ins:commits)
- record: pg:cls:commits
expr: sum by (cls) (pg:ins:commits)
- record: pg:all:commits
expr: sum(pg:cls:commits)
# rollbacks realtime
- record: pg:db:rollbacks_realtime
expr: irate(pg_db_xact_rollback{}[1m])
- record: pg:ins:rollbacks_realtime
expr: sum without (datname) (pg:db:rollbacks_realtime)
- record: pg:svc:rollbacks_realtime
expr: sum by (cls, role) (pg:ins:rollbacks_realtime)
- record: pg:cls:rollbacks_realtime
expr: sum by (cls) (pg:ins:rollbacks_realtime)
- record: pg:all:rollbacks_realtime
expr: sum(pg:cls:rollbacks_realtime)
# rollbacks
- record: pg:db:rollbacks
expr: rate(pg_db_xact_rollback{}[1m])
- record: pg:ins:rollbacks
expr: sum without (datname) (pg:db:rollbacks)
- record: pg:svc:rollbacks
expr: sum by (cls, role) (pg:ins:rollbacks)
- record: pg:cls:rollbacks
expr: sum by (cls) (pg:ins:rollbacks)
- record: pg:all:rollbacks
expr: sum(pg:cls:rollbacks)
# xacts (realtime): all finished transactions, i.e. commits plus rollbacks
# (counting commits alone would only duplicate pg:db:commits_realtime)
- record: pg:db:xacts_realtime
expr: pg:db:commits_realtime + pg:db:rollbacks_realtime
- record: pg:ins:xacts_realtime
expr: sum without (datname) (pg:db:xacts_realtime)
- record: pg:svc:xacts_realtime
expr: sum by (cls, role) (pg:ins:xacts_realtime)
- record: pg:cls:xacts_realtime
expr: sum by (cls) (pg:ins:xacts_realtime)
- record: pg:all:xacts_realtime
expr: sum(pg:cls:xacts_realtime)
# xacts (rate1m): commits plus rollbacks
- record: pg:db:xacts
expr: pg:db:commits + pg:db:rollbacks
- record: pg:ins:xacts
expr: sum without (datname) (pg:db:xacts)
- record: pg:svc:xacts
expr: sum by (cls, role) (pg:ins:xacts)
- record: pg:cls:xacts
expr: sum by (cls) (pg:ins:xacts)
- record: pg:all:xacts
expr: sum(pg:cls:xacts)
# xacts avg30m: 30-minute moving average
- record: pg:db:xacts_avg30m
expr: avg_over_time(pg:db:xacts[30m])
- record: pg:ins:xacts_avg30m
expr: avg_over_time(pg:ins:xacts[30m])
- record: pg:svc:xacts_avg30m
expr: avg_over_time(pg:svc:xacts[30m])
- record: pg:cls:xacts_avg30m
expr: avg_over_time(pg:cls:xacts[30m])
- record: pg:all:xacts_avg30m
expr: avg_over_time(pg:all:xacts[30m])
# xacts µ: the avg30m series averaged over another 30 minutes
- record: pg:db:xacts_mu
expr: avg_over_time(pg:db:xacts_avg30m[30m])
- record: pg:ins:xacts_mu
expr: avg_over_time(pg:ins:xacts_avg30m[30m])
- record: pg:svc:xacts_mu
expr: avg_over_time(pg:svc:xacts_avg30m[30m])
- record: pg:cls:xacts_mu
expr: avg_over_time(pg:cls:xacts_avg30m[30m])
- record: pg:all:xacts_mu
expr: avg_over_time(pg:all:xacts_avg30m[30m])
# xacts σ: sigma, the standard deviation of xacts over 30 minutes
- record: pg:db:xacts_sigma
expr: stddev_over_time(pg:db:xacts[30m])
- record: pg:ins:xacts_sigma
expr: stddev_over_time(pg:ins:xacts[30m])
- record: pg:svc:xacts_sigma
expr: stddev_over_time(pg:svc:xacts[30m])
- record: pg:cls:xacts_sigma
expr: stddev_over_time(pg:cls:xacts[30m])
- record: pg:all:xacts_sigma
expr: stddev_over_time(pg:all:xacts[30m])
#==============================================================#
# TPS (Pgbouncer) #
#==============================================================#
# Levels: pg:db:* is per database, pg:ins:* sums over datname, pg:svc:* sums
# by (cls, role), pg:cls:* sums by cls, pg:all:* sums everything.
# TPS realtime (irate1m)
# NOTE(review): this selector does not exclude datname="pgbouncer", unlike pg:db:tps — confirm
- record: pg:db:tps_realtime
expr: irate(pgbouncer_stat_total_xact_count{}[1m])
- record: pg:ins:tps_realtime
expr: sum without(datname) (pg:db:tps_realtime{})
- record: pg:svc:tps_realtime
expr: sum by(cls, role) (pg:ins:tps_realtime{})
- record: pg:cls:tps_realtime
expr: sum by(cls) (pg:ins:tps_realtime{})
- record: pg:all:tps_realtime
expr: sum(pg:cls:tps_realtime{})
# TPS (rate1m)
# taken directly from the avg_xact_count metric, no rate() is applied here
- record: pg:db:tps
expr: pgbouncer_stat_avg_xact_count{datname!="pgbouncer"}
- record: pg:ins:tps
expr: sum without(datname) (pg:db:tps)
- record: pg:svc:tps
expr: sum by (cls, role) (pg:ins:tps)
- record: pg:cls:tps
expr: sum by(cls) (pg:ins:tps)
- record: pg:all:tps
expr: sum(pg:cls:tps)
# tps : avg30m, the 30-minute moving average
- record: pg:db:tps_avg30m
expr: avg_over_time(pg:db:tps[30m])
- record: pg:ins:tps_avg30m
expr: avg_over_time(pg:ins:tps[30m])
- record: pg:svc:tps_avg30m
expr: avg_over_time(pg:svc:tps[30m])
- record: pg:cls:tps_avg30m
expr: avg_over_time(pg:cls:tps[30m])
- record: pg:all:tps_avg30m
expr: avg_over_time(pg:all:tps[30m])
# tps µ: the avg30m series averaged over another 30 minutes
- record: pg:db:tps_mu
expr: avg_over_time(pg:db:tps_avg30m[30m])
- record: pg:ins:tps_mu
expr: avg_over_time(pg:ins:tps_avg30m[30m])
- record: pg:svc:tps_mu
expr: avg_over_time(pg:svc:tps_avg30m[30m])
- record: pg:cls:tps_mu
expr: avg_over_time(pg:cls:tps_avg30m[30m])
- record: pg:all:tps_mu
expr: avg_over_time(pg:all:tps_avg30m[30m])
# tps σ: standard deviation of tps over 30 minutes
- record: pg:db:tps_sigma
expr: stddev_over_time(pg:db:tps[30m])
- record: pg:ins:tps_sigma
expr: stddev_over_time(pg:ins:tps[30m])
- record: pg:svc:tps_sigma
expr: stddev_over_time(pg:svc:tps[30m])
- record: pg:cls:tps_sigma
expr: stddev_over_time(pg:cls:tps[30m])
- record: pg:all:tps_sigma
expr: stddev_over_time(pg:all:tps[30m])
# xact rt (rate1m)
# db level scales the avg_xact_time metric; ins and svc levels divide the
# total time rate by the total count rate. All are divided by 1000000
# (presumably microseconds to seconds — TODO confirm the source unit).
- record: pg:db:xact_rt
expr: pgbouncer_stat_avg_xact_time{datname!="pgbouncer"} / 1000000
- record: pg:ins:xact_rt
expr: sum without(datname) (rate(pgbouncer_stat_total_xact_time[1m])) / sum without(datname) (rate(pgbouncer_stat_total_xact_count[1m])) / 1000000
- record: pg:svc:xact_rt
expr: sum by (cls, role) (rate(pgbouncer_stat_total_xact_time[1m])) / sum by (cls, role) (rate(pgbouncer_stat_total_xact_count[1m])) / 1000000
# xact_rt avg30m
- record: pg:db:xact_rt_avg30m
expr: avg_over_time(pg:db:xact_rt[30m])
- record: pg:ins:xact_rt_avg30m
expr: avg_over_time(pg:ins:xact_rt[30m])
- record: pg:svc:xact_rt_avg30m
expr: avg_over_time(pg:svc:xact_rt[30m])
# xact_rt µ: the avg30m series averaged over another 30 minutes
- record: pg:db:xact_rt_mu
expr: avg_over_time(pg:db:xact_rt_avg30m[30m])
- record: pg:ins:xact_rt_mu
expr: avg_over_time(pg:ins:xact_rt_avg30m[30m])
- record: pg:svc:xact_rt_mu
expr: avg_over_time(pg:svc:xact_rt_avg30m[30m])
# xact_rt σ: stddev30m
- record: pg:db:xact_rt_sigma
expr: stddev_over_time(pg:db:xact_rt[30m])
- record: pg:ins:xact_rt_sigma
expr: stddev_over_time(pg:ins:xact_rt[30m])
- record: pg:svc:xact_rt_sigma
expr: stddev_over_time(pg:svc:xact_rt[30m])
#==============================================================#
# QPS (Pgbouncer) #
#==============================================================#
# Levels: pg:db:* is per database, pg:ins:* sums over datname, pg:svc:* sums
# by (cls, role), pg:cls:* sums by cls, pg:all:* sums everything.
# QPS realtime (irate1m)
# NOTE(review): this selector does not exclude datname="pgbouncer", unlike pg:db:qps — confirm
- record: pg:db:qps_realtime
expr: irate(pgbouncer_stat_total_query_count{}[1m])
- record: pg:ins:qps_realtime
expr: sum without(datname) (pg:db:qps_realtime{})
- record: pg:svc:qps_realtime
expr: sum by(cls, role) (pg:ins:qps_realtime{})
- record: pg:cls:qps_realtime
expr: sum by(cls) (pg:ins:qps_realtime{})
- record: pg:all:qps_realtime
expr: sum(pg:cls:qps_realtime{})
# qps (rate1m)
# taken directly from the avg_query_count metric, no rate() is applied here
- record: pg:db:qps
expr: pgbouncer_stat_avg_query_count{datname!="pgbouncer"}
- record: pg:ins:qps
expr: sum without(datname) (pg:db:qps)
- record: pg:svc:qps
expr: sum by (cls, role) (pg:ins:qps)
- record: pg:cls:qps
expr: sum by(cls) (pg:ins:qps)
- record: pg:all:qps
expr: sum(pg:cls:qps)
# qps avg30m: 30-minute moving average
- record: pg:db:qps_avg30m
expr: avg_over_time(pg:db:qps[30m])
- record: pg:ins:qps_avg30m
expr: avg_over_time(pg:ins:qps[30m])
- record: pg:svc:qps_avg30m
expr: avg_over_time(pg:svc:qps[30m])
- record: pg:cls:qps_avg30m
expr: avg_over_time(pg:cls:qps[30m])
- record: pg:all:qps_avg30m
expr: avg_over_time(pg:all:qps[30m])
# qps µ: the avg30m series averaged over another 30 minutes
- record: pg:db:qps_mu
expr: avg_over_time(pg:db:qps_avg30m[30m])
- record: pg:ins:qps_mu
expr: avg_over_time(pg:ins:qps_avg30m[30m])
- record: pg:svc:qps_mu
expr: avg_over_time(pg:svc:qps_avg30m[30m])
- record: pg:cls:qps_mu
expr: avg_over_time(pg:cls:qps_avg30m[30m])
- record: pg:all:qps_mu
expr: avg_over_time(pg:all:qps_avg30m[30m])
# qps σ: stddev30m qps
- record: pg:db:qps_sigma
expr: stddev_over_time(pg:db:qps[30m])
- record: pg:ins:qps_sigma
expr: stddev_over_time(pg:ins:qps[30m])
- record: pg:svc:qps_sigma
expr: stddev_over_time(pg:svc:qps[30m])
- record: pg:cls:qps_sigma
expr: stddev_over_time(pg:cls:qps[30m])
- record: pg:all:qps_sigma
expr: stddev_over_time(pg:all:qps[30m])
# query rt (1m avg)
# db level scales the avg_query_time metric; ins and svc levels divide the
# total time rate by the total count rate. All are divided by 1000000
# (presumably microseconds to seconds — TODO confirm the source unit).
- record: pg:db:query_rt
expr: pgbouncer_stat_avg_query_time{datname!="pgbouncer"} / 1000000
- record: pg:ins:query_rt
expr: sum without(datname) (rate(pgbouncer_stat_total_query_time[1m])) / sum without(datname) (rate(pgbouncer_stat_total_query_count[1m])) / 1000000
- record: pg:svc:query_rt
expr: sum by (cls, role) (rate(pgbouncer_stat_total_query_time[1m])) / sum by (cls, role) (rate(pgbouncer_stat_total_query_count[1m])) / 1000000
# query_rt avg30m
- record: pg:db:query_rt_avg30m
expr: avg_over_time(pg:db:query_rt[30m])
- record: pg:ins:query_rt_avg30m
expr: avg_over_time(pg:ins:query_rt[30m])
- record: pg:svc:query_rt_avg30m
expr: avg_over_time(pg:svc:query_rt[30m])
# query_rt µ: the avg30m series averaged over another 30 minutes
- record: pg:db:query_rt_mu
expr: avg_over_time(pg:db:query_rt_avg30m[30m])
- record: pg:ins:query_rt_mu
expr: avg_over_time(pg:ins:query_rt_avg30m[30m])
- record: pg:svc:query_rt_mu
expr: avg_over_time(pg:svc:query_rt_avg30m[30m])
# query_rt σ: stddev30m
- record: pg:db:query_rt_sigma
expr: stddev_over_time(pg:db:query_rt[30m])
- record: pg:ins:query_rt_sigma
expr: stddev_over_time(pg:ins:query_rt[30m])
- record: pg:svc:query_rt_sigma
expr: stddev_over_time(pg:svc:query_rt[30m])
#==============================================================#
# PG Load #
#==============================================================#
# Load is transaction time per wall-clock second divided by the cpu count.
# Scaling rule: the / 1000000 factor is applied exactly once. The
# pg:ins:*_time_rate* series already carry it, so rules built on them
# must not divide again; rules built on the raw counter (load0) must.
# transaction time spent per second, over 1m / 5m / 15m windows
- record: pg:ins:xact_time_rate1m
expr: sum without (datname) (rate(pgbouncer_stat_total_xact_time{}[1m])) / 1000000
- record: pg:ins:xact_time_rate5m
expr: sum without (datname) (rate(pgbouncer_stat_total_xact_time{}[5m])) / 1000000
- record: pg:ins:xact_time_rate15m
expr: sum without (datname) (rate(pgbouncer_stat_total_xact_time{}[15m])) / 1000000
# query time spent per second, over 1m / 5m / 15m windows
- record: pg:ins:query_time_rate1m
expr: sum without (datname) (rate(pgbouncer_stat_total_query_time{}[1m])) / 1000000
- record: pg:ins:query_time_rate5m
expr: sum without (datname) (rate(pgbouncer_stat_total_query_time{}[5m])) / 1000000
- record: pg:ins:query_time_rate15m
expr: sum without (datname) (rate(pgbouncer_stat_total_query_time{}[15m])) / 1000000
# instance level load: matched to the node cpu count on the ip label
- record: pg:ins:load0
expr: sum without (datname) (irate(pgbouncer_stat_total_xact_time{}[1m])) / on (ip) group_left() node:ins:cpu_count / 1000000
- record: pg:ins:load1
expr: pg:ins:xact_time_rate1m / on (ip) group_left() node:ins:cpu_count
- record: pg:ins:load5
expr: pg:ins:xact_time_rate5m / on (ip) group_left() node:ins:cpu_count
- record: pg:ins:load15
expr: pg:ins:xact_time_rate15m / on (ip) group_left() node:ins:cpu_count
# service level load: matched to the summed cpu count on the svc label
# (assumes node:ins:cpu_count carries a svc label — TODO confirm)
- record: pg:svc:load0
expr: sum by (svc, cls, role) (irate(pgbouncer_stat_total_xact_time{}[1m])) / on (svc) group_left() sum by (svc) (node:ins:cpu_count{}) / 1000000
# load1/5/15 read the already-scaled pg:ins:xact_time_rate* series
- record: pg:svc:load1
expr: sum by (svc, cls, role) (pg:ins:xact_time_rate1m) / on (svc) group_left() sum by (svc) (node:ins:cpu_count{})
- record: pg:svc:load5
expr: sum by (svc, cls, role) (pg:ins:xact_time_rate5m) / on (svc) group_left() sum by (svc) (node:ins:cpu_count{})
- record: pg:svc:load15
expr: sum by (svc, cls, role) (pg:ins:xact_time_rate15m) / on (svc) group_left() sum by (svc) (node:ins:cpu_count{})
# cluster level load: matched to the cluster cpu count on the cls label
- record: pg:cls:load0
expr: sum by (cls) (irate(pgbouncer_stat_total_xact_time{}[1m])) / on (cls) node:cls:cpu_count{} / 1000000
- record: pg:cls:load1
expr: sum by (cls) (pg:ins:xact_time_rate1m) / on (cls) node:cls:cpu_count
- record: pg:cls:load5
expr: sum by (cls) (pg:ins:xact_time_rate5m) / on (cls) node:cls:cpu_count
- record: pg:cls:load15
expr: sum by (cls) (pg:ins:xact_time_rate15m) / on (cls) node:cls:cpu_count
#==============================================================#
# PG Saturation #
#==============================================================#
# max value of pg_load and cpu_usage
# `load > usage or usage` keeps the load sample where it exceeds cpu usage and
# falls back to the cpu usage sample otherwise.
# NOTE(review): the comparison has no `on (...)` clause, whereas the load rules
# match node series on ip / cls — confirm the label sets line up here
# instance level saturation
- record: pg:ins:saturation0
expr: pg:ins:load0 > node:ins:cpu_usage or node:ins:cpu_usage
- record: pg:ins:saturation1
expr: pg:ins:load1 > node:ins:cpu_usage or node:ins:cpu_usage
- record: pg:ins:saturation5
expr: pg:ins:load5 > node:ins:cpu_usage or node:ins:cpu_usage
- record: pg:ins:saturation15
expr: pg:ins:load15 > node:ins:cpu_usage or node:ins:cpu_usage
# cluster level saturation
- record: pg:cls:saturation0
expr: pg:cls:load0 > node:cls:cpu_usage or node:cls:cpu_usage
- record: pg:cls:saturation1
expr: pg:cls:load1 > node:cls:cpu_usage or node:cls:cpu_usage
- record: pg:cls:saturation5
expr: pg:cls:load5 > node:cls:cpu_usage or node:cls:cpu_usage
- record: pg:cls:saturation15
expr: pg:cls:load15 > node:cls:cpu_usage or node:cls:cpu_usage
#==============================================================#
# CRUD #
#==============================================================#
# Levels: pg:db:* is the irate of the per-database counter, pg:ins:* sums over
# datname, pg:svc:* sums by (cls, role), pg:cls:* sums by cls, pg:all:* sums
# the cluster series. Cluster-level write rules (inserted / updated / deleted /
# modified) only count instances labelled role="primary".
# rows touched
- record: pg:db:tup_touched
expr: irate(pg_db_tup_fetched{}[1m])
- record: pg:ins:tup_touched
expr: sum without(datname) (pg:db:tup_touched)
- record: pg:svc:tup_touched
expr: sum by (cls, role) (pg:ins:tup_touched)
- record: pg:cls:tup_touched
expr: sum by (cls) (pg:ins:tup_touched)
- record: pg:all:tup_touched
expr: sum(pg:cls:tup_touched)
# selected
- record: pg:db:tup_selected
expr: irate(pg_db_tup_returned{}[1m])
- record: pg:ins:tup_selected
expr: sum without(datname) (pg:db:tup_selected)
- record: pg:svc:tup_selected
expr: sum by (cls, role) (pg:ins:tup_selected)
- record: pg:cls:tup_selected
expr: sum by (cls) (pg:ins:tup_selected)
- record: pg:all:tup_selected
expr: sum(pg:cls:tup_selected)
# inserted
- record: pg:db:tup_inserted
expr: irate(pg_db_tup_inserted{}[1m])
- record: pg:ins:tup_inserted
expr: sum without(datname) (pg:db:tup_inserted)
- record: pg:svc:tup_inserted
expr: sum by (cls, role) (pg:ins:tup_inserted)
- record: pg:cls:tup_inserted
expr: sum by (cls) (pg:ins:tup_inserted{role="primary"})
- record: pg:all:tup_inserted
expr: sum(pg:cls:tup_inserted)
# updated
- record: pg:db:tup_updated
expr: irate(pg_db_tup_updated{}[1m])
- record: pg:ins:tup_updated
expr: sum without(datname) (pg:db:tup_updated)
- record: pg:svc:tup_updated
expr: sum by (cls, role) (pg:ins:tup_updated)
- record: pg:cls:tup_updated
expr: sum by (cls) (pg:ins:tup_updated{role="primary"})
- record: pg:all:tup_updated
expr: sum(pg:cls:tup_updated)
# deleted
- record: pg:db:tup_deleted
expr: irate(pg_db_tup_deleted{}[1m])
- record: pg:ins:tup_deleted
expr: sum without(datname) (pg:db:tup_deleted)
- record: pg:svc:tup_deleted
expr: sum by (cls, role) (pg:ins:tup_deleted)
- record: pg:cls:tup_deleted
expr: sum by (cls) (pg:ins:tup_deleted{role="primary"})
- record: pg:all:tup_deleted
expr: sum(pg:cls:tup_deleted)
# modified
- record: pg:db:tup_modified
expr: irate(pg_db_tup_modified{}[1m])
- record: pg:ins:tup_modified
expr: sum without(datname) (pg:db:tup_modified)
- record: pg:svc:tup_modified
expr: sum by (cls, role) (pg:ins:tup_modified)
- record: pg:cls:tup_modified
expr: sum by (cls) (pg:ins:tup_modified{role="primary"})
# the global total sums the cluster-level modified series, not the deleted one
- record: pg:all:tup_modified
expr: sum(pg:cls:tup_modified)
#==============================================================#
# Object Access #
#==============================================================#
# table access: 1m average per-second scan rates per table
- record: pg:table:idx_scan
  expr: rate(pg_table_idx_scan[1m])
- record: pg:table:seq_scan
  expr: rate(pg_table_seq_scan[1m])
# instantaneous variant (irate) of the table index-scan counter
# NOTE(review): built from pg_table_idx_scan only; sequential scans are not
# part of this figure — confirm that is intended
- record: pg:table:qps_realtime
  expr: irate(pg_table_idx_scan[1m])
# index access: 1m average and instantaneous per-second scan rates per index
- record: pg:index:idx_scan
  expr: rate(pg_index_idx_scan[1m])
- record: pg:index:qps_realtime
  expr: irate(pg_index_idx_scan[1m])
# func access: calls per second and average time per call (1m window)
- record: pg:func:call
  expr: rate(pg_func_calls[1m])
# total-time rate divided by call rate; no unit scaling is applied here
- record: pg:func:rt
  expr: rate(pg_func_total_time[1m]) / pg:func:call
# query access: calls per second and average time per call (1m window)
- record: pg:query:call
  expr: rate(pg_query_calls[1m])
# total-time rate divided by call rate, then divided by 1000
# NOTE(review): presumably converts milliseconds to seconds — verify against
# the exporter's unit for pg_query_total_time
- record: pg:query:rt
  expr: rate(pg_query_total_time[1m]) / pg:query:call / 1000
#==============================================================#
# Blocks IO #
#==============================================================#
# block counters: growth over the trailing 1 minute, per database
- record: pg:db:blks_read_1m
  expr: increase(pg_db_blks_read[1m])
- record: pg:db:blks_hit_1m
  expr: increase(pg_db_blks_hit[1m])
- record: pg:db:blks_access_1m
  expr: increase(pg_db_blks_access[1m])
# buffer hit rate (1m): hits divided by accesses, per database
- record: pg:db:buffer_hit_rate
  expr: pg:db:blks_hit_1m / pg:db:blks_access_1m
# instance-level hit rate: sum hits and accesses over databases first,
# then divide (a ratio of sums, not an average of per-db ratios)
- record: pg:ins:hit_rate
  expr: >-
    sum without (datname) (pg:db:blks_hit_1m)
    /
    sum without (datname) (pg:db:blks_access_1m)
# block read / write time: 1m average per-second rate of the timing counters
- record: pg:db:read_time_usage
  expr: rate(pg_db_blk_read_time[1m])
- record: pg:db:write_time_usage
  expr: rate(pg_db_blk_write_time[1m])
# combined I/O time usage: read plus write
- record: pg:db:io_time_usage
  expr: pg:db:read_time_usage + pg:db:write_time_usage
#==============================================================#
# Traffic IO (Pgbouncer) #
#==============================================================#
# transmit bandwidth (sent, out): irate of pgbouncer's total_sent counter,
# excluding the series whose datname is "pgbouncer"
- record: pg:db:tx
  expr: irate(pgbouncer_stat_total_sent{datname!="pgbouncer"}[1m])
# instance level: collapse the user and datname labels
- record: pg:ins:tx
  expr: sum without (user, datname) (pg:db:tx)
# service level: one series per (cls, role)
- record: pg:svc:tx
  expr: sum by (cls, role) (pg:ins:tx)
# cluster level: one series per cls
- record: pg:cls:tx
  expr: sum by (cls) (pg:ins:tx)
# global total over all clusters
- record: pg:all:tx
  expr: sum(pg:cls:tx)
# receive bandwidth (received, in): irate of pgbouncer's total_received
# counter, excluding the series whose datname is "pgbouncer"
- record: pg:db:rx
  expr: irate(pgbouncer_stat_total_received{datname!="pgbouncer"}[1m])
# instance level: collapse the user and datname labels, the same label set
# pg:ins:tx drops, so rx and tx series always line up label-for-label
- record: pg:ins:rx
  expr: sum without (user, datname) (pg:db:rx)
# service level: one series per (cls, role)
- record: pg:svc:rx
  expr: sum by (cls, role) (pg:ins:rx)
# cluster level: one series per cls
- record: pg:cls:rx
  expr: sum by (cls) (pg:ins:rx)
# global total over all clusters
- record: pg:all:rx
  expr: sum(pg:cls:rx)
#==============================================================#
# Lock #
#==============================================================#
# lock count by mode: the mode label is preserved at every level
- record: pg:db:locks
  expr: pg_lock_count
# instance level: collapse datname, keep mode
- record: pg:ins:locks
  expr: sum without (datname) (pg:db:locks)
# service level: one series per (cls, role, mode)
- record: pg:svc:locks
  expr: sum by (cls, role, mode) (pg:ins:locks)
# cluster level: one series per (cls, mode)
- record: pg:cls:locks
  expr: sum by (cls, mode) (pg:ins:locks)
# total lock count: every mode added together
- record: pg:db:lock_count
  expr: sum without (mode) (pg_lock_count)
# instance level: collapse the datname label
- record: pg:ins:lock_count
  expr: sum without (datname) (pg:db:lock_count)
# service level: one series per (cls, role)
- record: pg:svc:lock_count
  expr: sum by (cls, role) (pg:ins:lock_count)
# cluster level: one series per cls
- record: pg:cls:lock_count
  expr: sum by (cls) (pg:ins:lock_count)
# read category lock: counts AccessShareLock only
- record: pg:db:rlock
  expr: sum without (mode) (pg_lock_count{mode="AccessShareLock"})
# instance level: collapse the datname label
- record: pg:ins:rlock
  expr: sum without (datname) (pg:db:rlock)
# service level: one series per (cls, role)
- record: pg:svc:rlock
  expr: sum by (cls, role) (pg:ins:rlock)
# cluster level: one series per cls
- record: pg:cls:rlock
  expr: sum by (cls) (pg:ins:rlock)
# write category lock (insert|update|delete):
# counts RowShareLock and RowExclusiveLock
- record: pg:db:wlock
  expr: sum without (mode) (pg_lock_count{mode=~"RowShareLock|RowExclusiveLock"})
# instance level: collapse the datname label
- record: pg:ins:wlock
  expr: sum without (datname) (pg:db:wlock)
# service level: one series per (cls, role)
- record: pg:svc:wlock
  expr: sum by (cls, role) (pg:ins:wlock)
# cluster level: one series per cls
- record: pg:cls:wlock
  expr: sum by (cls) (pg:ins:wlock)
# exclusive category lock: counts the five modes listed in the matcher below
- record: pg:db:xlock
  expr: sum without (mode) (pg_lock_count{mode=~"AccessExclusiveLock|ExclusiveLock|ShareRowExclusiveLock|ShareLock|ShareUpdateExclusiveLock"})
# instance level: collapse the datname label
- record: pg:ins:xlock
  expr: sum without (datname) (pg:db:xlock)
# service level: one series per (cls, role)
- record: pg:svc:xlock
  expr: sum by (cls, role) (pg:ins:xlock)
# cluster level: one series per cls
- record: pg:cls:xlock
  expr: sum by (cls) (pg:ins:xlock)
#==============================================================#
# Temp #
#==============================================================#
# temp bytes: 1m average per-second rate of pg_db_temp_bytes
- record: pg:db:temp_bytes
  expr: rate(pg_db_temp_bytes[1m])
# instance level: collapse the datname label
- record: pg:ins:temp_bytes
  expr: sum without (datname) (pg:db:temp_bytes)
# service level: one series per (cls, role)
- record: pg:svc:temp_bytes
  expr: sum by (cls, role) (pg:ins:temp_bytes)
# cluster level: one series per cls
- record: pg:cls:temp_bytes
  expr: sum by (cls) (pg:ins:temp_bytes)
# temp file count: growth of pg_db_temp_files over the last 1 minute
# (an absolute increase, not a per-second rate)
- record: pg:db:temp_files
  expr: increase(pg_db_temp_files[1m])
# instance level: collapse the datname label
- record: pg:ins:temp_files
  expr: sum without (datname) (pg:db:temp_files)
# service level: one series per (cls, role)
- record: pg:svc:temp_files
  expr: sum by (cls, role) (pg:ins:temp_files)
# cluster level: one series per cls
- record: pg:cls:temp_files
  expr: sum by (cls) (pg:ins:temp_files)
#==============================================================#
# Size #
#==============================================================#
# database size: instance record mirrors pg_size_database unchanged;
# the cluster record sums it per cls
- record: pg:ins:db_size
  expr: pg_size_database
- record: pg:cls:db_size
  expr: sum by (cls) (pg:ins:db_size)
# wal size: instance record mirrors pg_size_wal unchanged;
# the cluster record sums it per cls
- record: pg:ins:wal_size
  expr: pg_size_wal
- record: pg:cls:wal_size
  expr: sum by (cls) (pg:ins:wal_size)
# log size: instance record mirrors pg_size_log unchanged
- record: pg:ins:log_size
  expr: pg_size_log
# cluster level: sum of the instance record per cls. Reads pg:ins:log_size
# rather than the raw metric so it follows the same ins -> cls chain as
# pg:cls:db_size and pg:cls:wal_size; the values are the same series.
- record: pg:cls:log_size
  expr: sum by (cls) (pg:ins:log_size)
#==============================================================#
# Checkpoint #
#==============================================================#
# checkpoint stats
# instance record mirrors pg_checkpoint_elapse unchanged
- record: pg:ins:last_ckpt
  expr: pg_checkpoint_elapse
# growth of the timed / requested checkpoint counters over the last 30s
- record: pg:ins:ckpt_timed
  expr: increase(pg_bgwriter_checkpoints_timed[30s])
- record: pg:ins:ckpt_req
  expr: increase(pg_bgwriter_checkpoints_req[30s])
# checkpoints (timed + requested) during the last hour.
# Computed from the raw counters. pg:ins:ckpt_timed and pg:ins:ckpt_req are
# already increase() results, i.e. gauges; increase() over a gauge treats
# every drop as a counter reset and yields a meaningless total.
# The label set is the same as before: no aggregation by cls is applied.
- record: pg:cls:ckpt_1h
  expr: increase(pg_bgwriter_checkpoints_timed[1h]) + increase(pg_bgwriter_checkpoints_req[1h])
# buffer flush: irate of the bgwriter buffer counters multiplied by 8192
# NOTE(review): 8192 presumably is the block size in bytes — confirm for
# builds with a non-default block size
- record: pg:ins:buf_flush_backend
  expr: irate(pg_bgwriter_buffers_backend[1m]) * 8192
- record: pg:ins:buf_flush_checkpoint
  expr: irate(pg_bgwriter_buffers_checkpoint[1m]) * 8192
# total flush: backend part plus checkpoint part
- record: pg:ins:buf_flush
  expr: pg:ins:buf_flush_backend + pg:ins:buf_flush_checkpoint
# service level: one series per (cls, role)
- record: pg:svc:buf_flush
  expr: sum by (cls, role) (pg:ins:buf_flush)
# cluster level: one series per cls
- record: pg:cls:buf_flush
  expr: sum by (cls) (pg:ins:buf_flush)
# global total over all clusters
- record: pg:all:buf_flush
  expr: sum(pg:cls:buf_flush)
# buffer alloc: irate of pg_bgwriter_buffers_alloc multiplied by 8192
# NOTE(review): 8192 presumably is the block size in bytes — confirm
- record: pg:ins:buf_alloc
  expr: irate(pg_bgwriter_buffers_alloc[1m]) * 8192
# service level: one series per (cls, role)
- record: pg:svc:buf_alloc
  expr: sum by (cls, role) (pg:ins:buf_alloc)
# cluster level: one series per cls
- record: pg:cls:buf_alloc
  expr: sum by (cls) (pg:ins:buf_alloc)
# global total over all clusters
- record: pg:all:buf_alloc
  expr: sum(pg:cls:buf_alloc)
#==============================================================#
# LSN #
#==============================================================#
# timeline & LSN: plain aliases of the pg_checkpoint_* metrics
# (no aggregation, labels pass through unchanged)
- record: pg_timeline
  expr: pg_checkpoint_tli
- record: pg:ins:redo_lsn
  expr: pg_checkpoint_redo_lsn
- record: pg:ins:checkpoint_lsn
  expr: pg_checkpoint_checkpoint_lsn
# wal rate: 1m average per-second rate of pg_lsn
- record: pg:ins:wal_rate
  expr: rate(pg_lsn[1m])
# cluster level: highest rate among series labelled role="primary"
- record: pg:cls:wal_rate
  expr: max by (cls) (pg:ins:wal_rate{role="primary"})
# global: sum of the per-cluster rates
- record: pg:all:wal_rate
  expr: sum(pg:cls:wal_rate)
#==============================================================#
# Replication #
#==============================================================#
# lag time from replica's view: instance record mirrors pg_lag unchanged
- record: pg:ins:lag_seconds
  expr: pg_lag
# cluster level: worst (largest) lag within each cls
- record: pg:cls:lag_seconds
  expr: max by (cls) (pg:ins:lag_seconds)
# global: worst lag over all clusters
- record: pg:all:lag_seconds
  expr: max(pg:cls:lag_seconds)
# sync status
# label_replace copies the whole application_name value into the ins label,
# so application_name must be set to the replica's ins name
- record: pg:ins:sync_status
  expr: max by (ins, svc, cls) (label_replace(pg_replication_sync_status, "ins", "$1", "application_name", "(.+)"))
# lag of self, in LSN distance: pg_replication_lsn minus pg_replication_replay_lsn,
# re-labelled the same way (application_name must be set to the standby ins name)
- record: pg:ins:lag_bytes
  expr: max by (ins, svc, cls, role) (label_replace(pg_replication_lsn - pg_replication_replay_lsn, "ins", "$1", "application_name", "(.+)"))
# cluster level: worst (largest) lag within each cls
- record: pg:cls:lag_bytes
  expr: max by (cls) (pg:ins:lag_bytes)
# global: worst lag over all clusters
- record: pg:all:lag_bytes
  expr: max(pg:cls:lag_bytes)
# replication slot retained bytes: plain alias of pg_slot_retained_bytes
- record: pg:ins:slot_retained_bytes
  expr: pg_slot_retained_bytes
# replica walreceiver: plain aliases of the pg_walreceiver_* metrics
# (init/last LSN and init/last timeline)
- record: pg:ins:recv_init_lsn
  expr: pg_walreceiver_init_lsn
- record: pg:ins:recv_last_lsn
  expr: pg_walreceiver_last_lsn
- record: pg:ins:recv_init_tli
  expr: pg_walreceiver_init_tli
- record: pg:ins:recv_last_tli
  expr: pg_walreceiver_last_tli
#==============================================================#
# Cluster Level Metrics #
#==============================================================#
# cluster member count
# one series per (cls, ins) whose pg_status equals 3
# NOTE(review): 3 presumably encodes the leader/primary state — confirm
# against the exporter's pg_status definition
- record: pg:cls:leader
  expr: count by (cls, ins) (max by (cls, ins) (pg_status) == 3)
# number of distinct ins values reporting pg_up in each cls
- record: pg:cls:size
  expr: count by (cls) (max by (cls, ins) (pg_up))
# highest checkpoint timeline id seen in each cls
- record: pg:cls:timeline
  expr: max by (cls) (pg_checkpoint_tli)
# instances per cls with pg_in_recovery == 0
- record: pg:cls:primarys
  expr: count by (cls) (max by (cls, ins) (pg_in_recovery) == 0)
# instances per cls with pg_in_recovery == 1
- record: pg:cls:replicas
  expr: count by (cls) (max by (cls, ins) (pg_in_recovery) == 1)
# 1 when the largest pg_sync_standby_enabled value in the cls is above 0, else 0
- record: pg:cls:synchronous
  expr: max by (cls) (pg_sync_standby_enabled) > bool 0
# non-primary instances that expose streaming replication series with a
# positive LSN, one series per (cls, role, ins, ip)
- record: pg:cls:bridging_instances
  expr: count by (cls, role, ins, ip) (pg_replication_lsn{state="streaming", role!="primary"} > 0)
# number of such instances per cls
- record: pg:cls:bridging
  expr: count by (cls) (pg:cls:bridging_instances)
# number of streaming replication series reported by non-primary instances
# per cls (no "> 0" filter here, unlike pg:cls:bridging_instances)
- record: pg:cls:cascading
  expr: count by (cls) (pg_replication_lsn{state="streaming", role!="primary"})
#==============================================================#
# Pgbouncer List #
#==============================================================#
# object list: each record selects one value of the list label from
# pgbouncer_list_items
- record: pg:ins:pools
  expr: pgbouncer_list_items{list="pools"}
- record: pg:ins:pool_databases
  expr: pgbouncer_list_items{list="databases"}
- record: pg:ins:pool_users
  expr: pgbouncer_list_items{list="users"}
# client-side lists
- record: pg:ins:login_clients
  expr: pgbouncer_list_items{list="login_clients"}
- record: pg:ins:free_clients
  expr: pgbouncer_list_items{list="free_clients"}
- record: pg:ins:used_clients
  expr: pgbouncer_list_items{list="used_clients"}
# server-side list
- record: pg:ins:free_servers
  expr: pgbouncer_list_items{list="free_servers"}
#==============================================================#
# DBConfig (Pgbouncer) #
#==============================================================#
# per-database pgbouncer settings; the series whose datname is "pgbouncer"
# is excluded from every rule below
# pool size plus reserve pool
- record: pg:db:pool_max_conn
  expr: >-
    pgbouncer_database_pool_size{datname!="pgbouncer"}
    +
    pgbouncer_database_reserve_pool{datname!="pgbouncer"}
- record: pg:db:pool_size
  expr: pgbouncer_database_pool_size{datname!="pgbouncer"}
- record: pg:db:pool_reserve_size
  expr: pgbouncer_database_reserve_pool{datname!="pgbouncer"}
- record: pg:db:pool_current_conn
  expr: pgbouncer_database_current_connections{datname!="pgbouncer"}
- record: pg:db:pool_paused
  expr: pgbouncer_database_paused{datname!="pgbouncer"}
- record: pg:db:pool_disabled
  expr: pgbouncer_database_disabled{datname!="pgbouncer"}
#==============================================================#
# Waiting (Pgbouncer) #
#==============================================================#
# average wait time: pgbouncer's avg_wait_time divided by 1000000
# NOTE(review): presumably microseconds converted to seconds — confirm
- record: pg:db:wait_rt
  expr: pgbouncer_stat_avg_wait_time{datname!="pgbouncer"} / 1000000
# max wait time among all clients
# pool level: maxwait plus maxwait_us scaled by 1/1000000 (division binds
# tighter than addition, so only the _us term is scaled)
# NOTE(review): presumably whole seconds plus a microsecond remainder — confirm
- record: pg:pool:maxwait
  expr: pgbouncer_pool_maxwait{datname!="pgbouncer"} + pgbouncer_pool_maxwait_us{datname!="pgbouncer"} / 1000000
# database level: worst wait over users
- record: pg:db:maxwait
  expr: max without (user) (pg:pool:maxwait)
# instance level: worst wait over users and databases
- record: pg:ins:maxwait
  expr: max without (user, datname) (pg:db:maxwait)
# service level: worst wait per (cls, role)
- record: pg:svc:maxwait
  expr: max by (cls, role) (pg:ins:maxwait)
# cluster level: worst wait per cls
- record: pg:cls:maxwait
  expr: max by (cls) (pg:ins:maxwait)
# global: worst wait over all clusters
- record: pg:all:maxwait
  expr: max(pg:cls:maxwait)
...
Last modified 2021-03-28: update en docs (f994b54)