Reference
- 1: Example Config
- 2: Kernel Optimize
- 3: Metrics List
- 4: Derived Metrics
- 5: Alert Rules
- 6: Sandbox Stdout
- 7: PG Exporter
- 8: Prometheus Service Discovery
- 9: Tuned Template
- 10: Patroni Template
1 - Example Config
Below is the default configuration file used for the sandbox environment: pigsty.yml
---
######################################################################
# File : pigsty.yml
# Path : pigsty.yml
# Desc : Pigsty Configuration file
# Note : follow ansible inventory file format
# Ctime : 2020-05-22
# Mtime : 2021-03-16
# Copyright (C) 2018-2021 Ruohang Feng
######################################################################
######################################################################
# Development Environment Inventory #
######################################################################
all: # top-level namespace, match all hosts
#==================================================================#
# Clusters #
#==================================================================#
# postgres database clusters are defined as kv pairs in `all.children`
# where the key is the cluster name and the value is an object consisting
# of cluster members (hosts) and ad-hoc variables (vars)
# meta nodes are defined in the special group "meta" with `meta_node=true`
children:
#-----------------------------
# meta controller
#-----------------------------
meta: # special group 'meta' defines the main controller machine
vars:
meta_node: true # mark node as meta controller
ansible_group_priority: 99 # meta group is top priority
# nodes in meta group
hosts: {10.10.10.10: {ansible_host: meta}}
#-----------------------------
# cluster: pg-meta
#-----------------------------
pg-meta:
# - cluster members - #
hosts:
10.10.10.10: {pg_seq: 1, pg_role: primary, ansible_host: meta}
# - cluster configs - #
vars:
pg_cluster: pg-meta # define actual cluster name
pg_version: 13 # define installed pgsql version
node_tune: tiny # tune node into oltp|olap|crit|tiny mode
pg_conf: tiny.yml # tune pgsql into oltp/olap/crit/tiny mode
patroni_mode: pause # enter maintenance mode, {default|pause|remove}
patroni_watchdog_mode: off # disable watchdog (require|automatic|off)
pg_lc_ctype: en_US.UTF8 # enable pg_trgm i18n char support
pg_users:
# complete example of user/role definition for production user
- name: dbuser_meta # example production user with read-write access
password: DBUser.Meta # example user's password, can be encrypted
login: true # can login, true by default (should be false for role)
superuser: false # is superuser? false by default
createdb: false # can create database? false by default
createrole: false # can create role? false by default
inherit: true # can this role use inherited privileges?
replication: false # can this role do replication? false by default
bypassrls: false # can this role bypass row level security? false by default
connlimit: -1 # connection limit, -1 disable limit
expire_at: '2030-12-31' # 'timestamp' when this role is expired
expire_in: 365 # now + n days when this role is expired (OVERWRITE expire_at)
roles: [dbrole_readwrite] # dbrole_admin|dbrole_readwrite|dbrole_readonly
pgbouncer: true # add this user to pgbouncer? false by default (true for production user)
parameters: # user's default search path
search_path: public
comment: test user
# simple example for personal user definition
- name: dbuser_vonng2 # personal user example which only has limited access to the offline instance
password: DBUser.Vonng # or to instances explicitly marked with `pg_offline_query = true`
roles: [dbrole_offline] # personal/stats/ETL users should be granted dbrole_offline
expire_in: 365 # expire in 365 days since creation
pgbouncer: false # personal user should NOT be allowed to login with pgbouncer
comment: example personal user for interactive queries
pg_databases:
- name: meta # name is the only required field for a database
# owner: postgres # optional, database owner
# template: template1 # optional, template1 by default
# encoding: UTF8 # optional, UTF8 by default , must same as template database, leave blank to set to db default
# locale: C # optional, C by default , must same as template database, leave blank to set to db default
# lc_collate: C # optional, C by default , must same as template database, leave blank to set to db default
# lc_ctype: C # optional, C by default , must same as template database, leave blank to set to db default
allowconn: true # optional, true by default, false disable connect at all
revokeconn: false # optional, false by default, true revoke connect from public # (only default user and owner have connect privilege on database)
# tablespace: pg_default # optional, 'pg_default' is the default tablespace
connlimit: -1 # optional, connection limit, -1 or none disable limit (default)
extensions: # optional, extension name and where to create
- {name: postgis, schema: public}
parameters: # optional, extra parameters with ALTER DATABASE
enable_partitionwise_join: true
pgbouncer: true # optional, add this database to pgbouncer list? true by default
comment: pigsty meta database # optional, comment string for database
pg_default_database: meta # default database will be used as primary monitor target
# proxy settings
vip_mode: l2 # enable/disable vip (require members in same LAN)
vip_address: 10.10.10.2 # virtual ip address
vip_cidrmask: 8 # cidr network mask length
vip_interface: eth1 # interface to add virtual ip
#-----------------------------
# cluster: pg-test
#-----------------------------
pg-test: # define cluster named 'pg-test'
# - cluster members - #
hosts:
10.10.10.11: {pg_seq: 1, pg_role: primary, ansible_host: node-1}
10.10.10.12: {pg_seq: 2, pg_role: replica, ansible_host: node-2}
10.10.10.13: {pg_seq: 3, pg_role: offline, ansible_host: node-3}
# - cluster configs - #
vars:
# basic settings
pg_cluster: pg-test # define actual cluster name
pg_version: 13 # define installed pgsql version
node_tune: tiny # tune node into oltp|olap|crit|tiny mode
pg_conf: tiny.yml # tune pgsql into oltp/olap/crit/tiny mode
# business users, adjust on your own needs
pg_users:
- name: test # example production user with read-write access
password: test # example user's password
roles: [dbrole_readwrite] # dbrole_admin|dbrole_readwrite|dbrole_readonly|dbrole_offline
pgbouncer: true # production user that access via pgbouncer
comment: default test user for production usage
pg_databases: # create a business database 'test'
- name: test # use the simplest form
pg_default_database: test # default database will be used as primary monitor target
# proxy settings
vip_mode: l2 # enable/disable vip (require members in same LAN)
vip_address: 10.10.10.3 # virtual ip address
vip_cidrmask: 8 # cidr network mask length
vip_interface: eth1 # interface to add virtual ip
#==================================================================#
# Globals #
#==================================================================#
vars:
#------------------------------------------------------------------------------
# CONNECTION PARAMETERS
#------------------------------------------------------------------------------
# this section defines connection parameters
# ansible_user: vagrant # admin user with ssh access and sudo privilege
proxy_env: # global proxy env when downloading packages
no_proxy: "localhost,127.0.0.1,10.0.0.0/8,192.168.0.0/16,*.pigsty,*.aliyun.com,mirrors.aliyuncs.com,mirrors.tuna.tsinghua.edu.cn,mirrors.zju.edu.cn"
# http_proxy: ''
# https_proxy: ''
# all_proxy: ''
#------------------------------------------------------------------------------
# REPO PROVISION
#------------------------------------------------------------------------------
# this section defines how to build a local repo
# - repo basic - #
repo_enabled: true # build local yum repo on meta nodes?
repo_name: pigsty # local repo name
repo_address: yum.pigsty # repo external address (ip:port or url)
repo_port: 80 # listen port, must be consistent with repo_address
repo_home: /www # default repo dir location
repo_rebuild: false # force re-download packages
repo_remove: true # remove existing repos
# - where to download - #
repo_upstreams:
- name: base
description: CentOS-$releasever - Base - Aliyun Mirror
baseurl:
- http://mirrors.aliyun.com/centos/$releasever/os/$basearch/
- http://mirrors.aliyuncs.com/centos/$releasever/os/$basearch/
- http://mirrors.cloud.aliyuncs.com/centos/$releasever/os/$basearch/
gpgcheck: no
failovermethod: priority
- name: updates
description: CentOS-$releasever - Updates - Aliyun Mirror
baseurl:
- http://mirrors.aliyun.com/centos/$releasever/updates/$basearch/
- http://mirrors.aliyuncs.com/centos/$releasever/updates/$basearch/
- http://mirrors.cloud.aliyuncs.com/centos/$releasever/updates/$basearch/
gpgcheck: no
failovermethod: priority
- name: extras
description: CentOS-$releasever - Extras - Aliyun Mirror
baseurl:
- http://mirrors.aliyun.com/centos/$releasever/extras/$basearch/
- http://mirrors.aliyuncs.com/centos/$releasever/extras/$basearch/
- http://mirrors.cloud.aliyuncs.com/centos/$releasever/extras/$basearch/
gpgcheck: no
failovermethod: priority
- name: epel
description: CentOS $releasever - EPEL - Aliyun Mirror
baseurl: http://mirrors.aliyun.com/epel/$releasever/$basearch
gpgcheck: no
failovermethod: priority
- name: grafana
description: Grafana - TsingHua Mirror
gpgcheck: no
baseurl: https://mirrors.tuna.tsinghua.edu.cn/grafana/yum/rpm
- name: prometheus
description: Prometheus and exporters
gpgcheck: no
baseurl: https://packagecloud.io/prometheus-rpm/release/el/$releasever/$basearch
# consider using ZJU PostgreSQL mirror in mainland china
- name: pgdg-common
description: PostgreSQL common RPMs for RHEL/CentOS $releasever - $basearch
gpgcheck: no
# baseurl: https://download.postgresql.org/pub/repos/yum/common/redhat/rhel-$releasever-$basearch
baseurl: http://mirrors.zju.edu.cn/postgresql/repos/yum/common/redhat/rhel-$releasever-$basearch
- name: pgdg13
description: PostgreSQL 13 for RHEL/CentOS $releasever - $basearch
gpgcheck: no
# baseurl: https://download.postgresql.org/pub/repos/yum/13/redhat/rhel-$releasever-$basearch
baseurl: http://mirrors.zju.edu.cn/postgresql/repos/yum/13/redhat/rhel-$releasever-$basearch
- name: centos-sclo
description: CentOS-$releasever - SCLo
gpgcheck: no
mirrorlist: http://mirrorlist.centos.org?arch=$basearch&release=7&repo=sclo-sclo
- name: centos-sclo-rh
description: CentOS-$releasever - SCLo rh
gpgcheck: no
mirrorlist: http://mirrorlist.centos.org?arch=$basearch&release=7&repo=sclo-rh
- name: nginx
description: Nginx Official Yum Repo
skip_if_unavailable: true
gpgcheck: no
baseurl: http://nginx.org/packages/centos/$releasever/$basearch/
- name: haproxy
description: Copr repo for haproxy
skip_if_unavailable: true
gpgcheck: no
baseurl: https://download.copr.fedorainfracloud.org/results/roidelapluie/haproxy/epel-$releasever-$basearch/
# for latest consul & kubernetes
- name: harbottle
description: Copr repo for main owned by harbottle
skip_if_unavailable: true
gpgcheck: no
baseurl: https://download.copr.fedorainfracloud.org/results/harbottle/main/epel-$releasever-$basearch/
# - what to download - #
repo_packages:
# repo bootstrap packages
- epel-release nginx wget yum-utils yum createrepo # bootstrap packages
# node basic packages
- ntp chrony uuid lz4 nc pv jq vim-enhanced make patch bash lsof wget unzip git tuned # basic system util
- readline zlib openssl libyaml libxml2 libxslt perl-ExtUtils-Embed ca-certificates # basic pg dependency
- numactl grubby sysstat dstat iotop bind-utils net-tools tcpdump socat ipvsadm telnet # system utils
# dcs & monitor packages
- grafana prometheus2 pushgateway alertmanager # monitor and ui
- node_exporter postgres_exporter nginx_exporter blackbox_exporter # exporter
- consul consul_exporter consul-template etcd # dcs
# python3 dependencies
- ansible python python-pip python-psycopg2 audit # ansible & python
- python3 python3-psycopg2 python36-requests python3-etcd python3-consul # python3
- python36-urllib3 python36-idna python36-pyOpenSSL python36-cryptography # python3 patroni extra deps
# proxy and load balancer
- haproxy keepalived dnsmasq # proxy and dns
# postgres common Packages
- patroni patroni-consul patroni-etcd pgbouncer pg_cli pgbadger pg_activity # major components
- pgcenter boxinfo check_postgres emaj pgbconsole pg_bloat_check pgquarrel # other common utils
- barman barman-cli pgloader pgFormatter pitrery pspg pgxnclient PyGreSQL pgadmin4 tail_n_mail
# postgres 13 packages
- postgresql13* postgis31* citus_13 timescaledb_13 # pgrouting_13 # postgres 13 and postgis 31
- pg_repack13 pg_squeeze13 # maintenance extensions
- pg_qualstats13 pg_stat_kcache13 system_stats_13 bgw_replstatus13 # stats extensions
- plr13 plsh13 plpgsql_check_13 plproxy13 pldebugger13 # PL extensions
- hdfs_fdw_13 mongo_fdw13 mysql_fdw_13 ogr_fdw13 redis_fdw_13 pgbouncer_fdw13 # FDW extensions
- wal2json13 count_distinct13 ddlx_13 geoip13 orafce13 # MISC extensions
- rum_13 hypopg_13 ip4r13 jsquery_13 logerrors_13 periods_13 pg_auto_failover_13 pg_catcheck13
- pg_fkpart13 pg_jobmon13 pg_partman13 pg_prioritize_13 pg_track_settings13 pgaudit15_13
- pgcryptokey13 pgexportdoc13 pgimportdoc13 pgmemcache-13 pgmp13 pgq-13
- pguint13 pguri13 prefix13 safeupdate_13 semver13 table_version13 tdigest13
repo_url_packages:
- https://github.com/Vonng/pg_exporter/releases/download/v0.3.2/pg_exporter-0.3.2-1.el7.x86_64.rpm
- https://github.com/cybertec-postgresql/vip-manager/releases/download/v0.6/vip-manager_0.6-1_amd64.rpm
- http://guichaz.free.fr/polysh/files/polysh-0.4-1.noarch.rpm
#------------------------------------------------------------------------------
# NODE PROVISION
#------------------------------------------------------------------------------
# this section defines how to provision nodes
# nodename: # if defined, node's hostname will be overwritten
# - node dns - #
node_dns_hosts: # static dns records in /etc/hosts
- 10.10.10.10 yum.pigsty
node_dns_server: add # add (default) | none (skip) | overwrite (remove old settings)
node_dns_servers: # dynamic nameserver in /etc/resolv.conf
- 10.10.10.10
node_dns_options: # dns resolv options
- options single-request-reopen timeout:1 rotate
- domain service.consul
# - node repo - #
node_repo_method: local # none|local|public (use local repo for production env)
node_repo_remove: true # whether remove existing repo
node_local_repo_url: # local repo url (if method=local, make sure firewall is configured or disabled)
- http://yum.pigsty/pigsty.repo
# - node packages - #
node_packages: # common packages for all nodes
- wget,yum-utils,ntp,chrony,tuned,uuid,lz4,vim-minimal,make,patch,bash,lsof,wget,unzip,git,readline,zlib,openssl
- numactl,grubby,sysstat,dstat,iotop,bind-utils,net-tools,tcpdump,socat,ipvsadm,telnet,tuned,pv,jq
- python3,python3-psycopg2,python36-requests,python3-etcd,python3-consul
- python36-urllib3,python36-idna,python36-pyOpenSSL,python36-cryptography
- node_exporter,consul,consul-template,etcd,haproxy,keepalived,vip-manager
node_extra_packages: # extra packages for all nodes
- patroni,patroni-consul,patroni-etcd,pgbouncer,pgbadger,pg_activity
node_meta_packages: # packages for meta nodes only
- grafana,prometheus2,alertmanager,nginx_exporter,blackbox_exporter,pushgateway
- dnsmasq,nginx,ansible,pgbadger,polysh
# - node features - #
node_disable_numa: false # disable numa, important for production database, reboot required
node_disable_swap: false # disable swap, important for production database
node_disable_firewall: true # disable firewall (required if using kubernetes)
node_disable_selinux: true # disable selinux (required if using kubernetes)
node_static_network: true # keep dns resolver settings after reboot
node_disk_prefetch: false # setup disk prefetch on HDD to increase performance
# - node kernel modules - #
node_kernel_modules:
- softdog
- br_netfilter
- ip_vs
- ip_vs_rr
- ip_vs_wrr
- ip_vs_sh
- nf_conntrack_ipv4
# - node tuned - #
node_tune: tiny # install and activate tuned profile: none|oltp|olap|crit|tiny
node_sysctl_params: # set additional sysctl parameters, k:v format
net.bridge.bridge-nf-call-iptables: 1 # for kubernetes
# - node user - #
node_admin_setup: true # setup a default admin user?
node_admin_uid: 88 # uid and gid for admin user
node_admin_username: admin # default admin user
node_admin_ssh_exchange: true # exchange ssh keys among the cluster?
node_admin_pks: # public key list that will be installed
- 'ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAAAgQC7IMAMNavYtWwzAJajKqwdn3ar5BhvcwCnBTxxEkXhGlCO2vfgosSAQMEflfgvkiI5nM1HIFQ8KINlx1XLO7SdL5KdInG5LIJjAFh0pujS4kNCT9a5IGvSq1BrzGqhbEcwWYdju1ZPYBcJm/MG+JD0dYCh8vfrYB/cYMD0SOmNkQ== vagrant@pigsty.com'
# - node ntp - #
node_ntp_service: ntp # ntp or chrony
node_ntp_config: true # overwrite existing ntp config?
node_timezone: Asia/Shanghai # default node timezone
node_ntp_servers: # default NTP servers
- pool cn.pool.ntp.org iburst
- pool pool.ntp.org iburst
- pool time.pool.aliyun.com iburst
- server 10.10.10.10 iburst
#------------------------------------------------------------------------------
# META PROVISION
#------------------------------------------------------------------------------
# - ca - #
ca_method: create # create|copy|recreate
ca_subject: "/CN=root-ca" # self-signed CA subject
ca_homedir: /ca # ca cert directory
ca_cert: ca.crt # ca public key/cert
ca_key: ca.key # ca private key
# - nginx - #
nginx_upstream:
- { name: home, host: pigsty, url: "127.0.0.1:3000"}
- { name: consul, host: c.pigsty, url: "127.0.0.1:8500" }
- { name: grafana, host: g.pigsty, url: "127.0.0.1:3000" }
- { name: prometheus, host: p.pigsty, url: "127.0.0.1:9090" }
- { name: alertmanager, host: a.pigsty, url: "127.0.0.1:9093" }
- { name: haproxy, host: h.pigsty, url: "127.0.0.1:9091" }
# - nameserver - #
dns_records: # dynamic dns record resolved by dnsmasq
- 10.10.10.2 pg-meta # sandbox vip for pg-meta
- 10.10.10.3 pg-test # sandbox vip for pg-test
- 10.10.10.10 meta-1 # sandbox node meta-1 (node-0)
- 10.10.10.11 node-1 # sandbox node node-1
- 10.10.10.12 node-2 # sandbox node node-2
- 10.10.10.13 node-3 # sandbox node node-3
- 10.10.10.10 pigsty
- 10.10.10.10 y.pigsty yum.pigsty
- 10.10.10.10 c.pigsty consul.pigsty
- 10.10.10.10 g.pigsty grafana.pigsty
- 10.10.10.10 p.pigsty prometheus.pigsty
- 10.10.10.10 a.pigsty alertmanager.pigsty
- 10.10.10.10 n.pigsty ntp.pigsty
- 10.10.10.10 h.pigsty haproxy.pigsty
# - prometheus - #
prometheus_data_dir: /export/prometheus/data # prometheus data dir
prometheus_options: '--storage.tsdb.retention=30d'
prometheus_reload: false # reload prometheus instead of recreate it
prometheus_sd_method: consul # service discovery method: static|consul|etcd
prometheus_scrape_interval: 2s # global scrape & evaluation interval
prometheus_scrape_timeout: 1s # scrape timeout
prometheus_sd_interval: 2s # service discovery refresh interval
# - grafana - #
grafana_url: http://admin:admin@10.10.10.10:3000 # grafana url
grafana_admin_password: admin # default grafana admin user password
grafana_plugin: install # none|install|reinstall
grafana_cache: /www/pigsty/grafana/plugins.tar.gz # path to grafana plugins tarball
grafana_customize: true # customize grafana resources
grafana_plugins: # default grafana plugins list
- redis-datasource
- simpod-json-datasource
- fifemon-graphql-datasource
- sbueringer-consul-datasource
- camptocamp-prometheus-alertmanager-datasource
- ryantxu-ajax-panel
- marcusolsson-hourly-heatmap-panel
- michaeldmoore-multistat-panel
- marcusolsson-treemap-panel
- pr0ps-trackmap-panel
- dalvany-image-panel
- magnesium-wordcloud-panel
- cloudspout-button-panel
- speakyourcode-button-panel
- jdbranham-diagram-panel
- grafana-piechart-panel
- snuids-radar-panel
- digrich-bubblechart-panel
grafana_git_plugins:
- https://github.com/Vonng/grafana-echarts
#------------------------------------------------------------------------------
# DCS PROVISION
#------------------------------------------------------------------------------
service_registry: consul # where to register services: none | consul | etcd | both
dcs_type: consul # consul | etcd | both
dcs_name: pigsty # consul dc name | etcd initial cluster token
dcs_servers: # dcs server dict in name:ip format
meta-1: 10.10.10.10 # you could use existing dcs cluster
# meta-2: 10.10.10.11 # host which have their IP listed here will be init as server
# meta-3: 10.10.10.12 # 3 or 5 dcs nodes are recommend for production environment
dcs_exists_action: clean # abort|skip|clean if dcs server already exists
dcs_disable_purge: false # set to true to disable purge functionality for good (force dcs_exists_action = abort)
consul_data_dir: /var/lib/consul # consul data dir (/var/lib/consul by default)
etcd_data_dir: /var/lib/etcd # etcd data dir (/var/lib/etcd by default)
#------------------------------------------------------------------------------
# POSTGRES INSTALLATION
#------------------------------------------------------------------------------
# - dbsu - #
pg_dbsu: postgres # os user for database, postgres by default (changing it is not recommended!)
pg_dbsu_uid: 26 # os dbsu uid and gid, 26 for default postgres users and groups
pg_dbsu_sudo: limit # none|limit|all|nopass (Privilege for dbsu, limit is recommended)
pg_dbsu_home: /var/lib/pgsql # postgres dbsu home directory
pg_dbsu_ssh_exchange: false # exchange ssh keys within the same cluster
# - postgres packages - #
pg_version: 13 # default postgresql version
pgdg_repo: false # use official pgdg yum repo (disable if you have local mirror)
pg_add_repo: false # add postgres related repo before install (useful if you want a simple install)
pg_bin_dir: /usr/pgsql/bin # postgres binary dir
pg_packages:
- postgresql${pg_version}*
- postgis31_${pg_version}*
- pgbouncer patroni pg_exporter pgbadger
- patroni patroni-consul patroni-etcd pgbouncer pgbadger pg_activity
- python3 python3-psycopg2 python36-requests python3-etcd python3-consul
- python36-urllib3 python36-idna python36-pyOpenSSL python36-cryptography
pg_extensions:
- pg_repack${pg_version} pg_qualstats${pg_version} pg_stat_kcache${pg_version} wal2json${pg_version}
# - ogr_fdw${pg_version} mysql_fdw_${pg_version} redis_fdw_${pg_version} mongo_fdw${pg_version} hdfs_fdw_${pg_version}
# - count_distinct${version} ddlx_${version} geoip${version} orafce${version} # popular features
# - hypopg_${version} ip4r${version} jsquery_${version} logerrors_${version} periods_${version} pg_auto_failover_${version} pg_catcheck${version}
# - pg_fkpart${version} pg_jobmon${version} pg_partman${version} pg_prioritize_${version} pg_track_settings${version} pgaudit15_${version}
# - pgcryptokey${version} pgexportdoc${version} pgimportdoc${version} pgmemcache-${version} pgmp${version} pgq-${version} pgquarrel pgrouting_${version}
# - pguint${version} pguri${version} prefix${version} safeupdate_${version} semver${version} table_version${version} tdigest${version}
#------------------------------------------------------------------------------
# POSTGRES PROVISION
#------------------------------------------------------------------------------
# - identity - #
# pg_cluster: # [REQUIRED] cluster name (validated during pg_preflight)
# pg_seq: 0 # [REQUIRED] instance seq (validated during pg_preflight)
# pg_role: replica # [REQUIRED] service role (validated during pg_preflight)
pg_hostname: false # overwrite node hostname with pg instance name
pg_nodename: true # overwrite consul nodename with pg instance name
# - retention - #
# pg_exists_action, available options: abort|clean|skip
# - abort: abort entire play's execution (default)
# - clean: remove existing cluster (dangerous)
# - skip: end current play for this host
# pg_exists: false # auxiliary flag variable (DO NOT SET THIS)
pg_exists_action: clean
pg_disable_purge: false # set to true to disable pg purge functionality for good (force pg_exists_action = abort)
# - storage - #
pg_data: /pg/data # postgres data directory
pg_fs_main: /export # data disk mount point /pg -> {{ pg_fs_main }}/postgres/{{ pg_instance }}
pg_fs_bkup: /var/backups # backup disk mount point /pg/* -> {{ pg_fs_bkup }}/postgres/{{ pg_instance }}/*
# - connection - #
pg_listen: '0.0.0.0' # postgres listen address, '0.0.0.0' by default (all ipv4 addr)
pg_port: 5432 # postgres port (5432 by default)
pg_localhost: /var/run/postgresql # localhost unix socket dir for connection
# - patroni - #
# patroni_mode, available options: default|pause|remove
# - default: default ha mode
# - pause: into maintenance mode
# - remove: remove patroni after bootstrap
patroni_mode: default # pause|default|remove
pg_namespace: /pg # top level key namespace in dcs
patroni_port: 8008 # default patroni port
patroni_watchdog_mode: automatic # watchdog mode: off|automatic|required
pg_conf: tiny.yml # user provided patroni config template path
# - localization - #
pg_encoding: UTF8 # default to UTF8
pg_locale: C # default to C
pg_lc_collate: C # default to C
pg_lc_ctype: en_US.UTF8 # default to en_US.UTF8
# - pgbouncer - #
pgbouncer_port: 6432 # pgbouncer port (6432 by default)
pgbouncer_poolmode: transaction # pooling mode: (transaction pooling by default)
pgbouncer_max_db_conn: 100 # important! do not set this larger than postgres max conn or conn limit
#------------------------------------------------------------------------------
# POSTGRES TEMPLATE
#------------------------------------------------------------------------------
# - template - #
pg_init: pg-init # init script for cluster template
# - system roles - #
pg_replication_username: replicator # system replication user
pg_replication_password: DBUser.Replicator # system replication password
pg_monitor_username: dbuser_monitor # system monitor user
pg_monitor_password: DBUser.Monitor # system monitor password
pg_admin_username: dbuser_admin # system admin user
pg_admin_password: DBUser.Admin # system admin password
# - default roles - #
# check http://pigsty.cc/zh/docs/concepts/provision/acl/ for more details
pg_default_roles:
# common production readonly user
- name: dbrole_readonly # production read-only roles
login: false
comment: role for global readonly access
# common production read-write user
- name: dbrole_readwrite # production read-write roles
login: false
roles: [dbrole_readonly] # read-write includes read-only access
comment: role for global read-write access
# offline has the same privileges as readonly, but with HBA access limited to offline instances only,
# for the purpose of running slow queries, interactive queries and ETL tasks
- name: dbrole_offline
login: false
comment: role for restricted read-only access (offline instance)
# admin has the privilege to issue DDL changes
- name: dbrole_admin
login: false
bypassrls: true
comment: role for object creation
roles: [dbrole_readwrite,pg_monitor,pg_signal_backend]
# dbsu, name is designated by `pg_dbsu`. It is not recommended to set a password for dbsu
- name: postgres
superuser: true
comment: system superuser
# default replication user, name is designated by `pg_replication_username`, and password is set by `pg_replication_password`
- name: replicator
replication: true # for replication user
bypassrls: true # logical replication require bypassrls
roles: [pg_monitor, dbrole_readonly] # logical replication require select privileges
comment: system replicator
# default monitor user, name is designated by `pg_monitor_username`, and password is set by `pg_monitor_password`
- name: dbuser_monitor
connlimit: 16
comment: system monitor user
roles: [pg_monitor, dbrole_readonly]
# default admin user, name is designated by `pg_admin_username`, and password is set by `pg_admin_password`
- name: dbuser_admin
bypassrls: true
superuser: true
comment: system admin user
roles: [dbrole_admin]
# default stats user, for ETL and slow queries
- name: dbuser_stats
password: DBUser.Stats
comment: business offline user for offline queries and ETL
roles: [dbrole_offline]
# - privileges - #
# objects created by dbsu and admin will have their privileges properly set
pg_default_privileges:
- GRANT USAGE ON SCHEMAS TO dbrole_readonly
- GRANT SELECT ON TABLES TO dbrole_readonly
- GRANT SELECT ON SEQUENCES TO dbrole_readonly
- GRANT EXECUTE ON FUNCTIONS TO dbrole_readonly
- GRANT USAGE ON SCHEMAS TO dbrole_offline
- GRANT SELECT ON TABLES TO dbrole_offline
- GRANT SELECT ON SEQUENCES TO dbrole_offline
- GRANT EXECUTE ON FUNCTIONS TO dbrole_offline
- GRANT INSERT, UPDATE, DELETE ON TABLES TO dbrole_readwrite
- GRANT USAGE, UPDATE ON SEQUENCES TO dbrole_readwrite
- GRANT TRUNCATE, REFERENCES, TRIGGER ON TABLES TO dbrole_admin
- GRANT CREATE ON SCHEMAS TO dbrole_admin
# - schemas - #
pg_default_schemas: [monitor] # default schemas to be created
# - extension - #
pg_default_extensions: # default extensions to be created
- { name: 'pg_stat_statements', schema: 'monitor' }
- { name: 'pgstattuple', schema: 'monitor' }
- { name: 'pg_qualstats', schema: 'monitor' }
- { name: 'pg_buffercache', schema: 'monitor' }
- { name: 'pageinspect', schema: 'monitor' }
- { name: 'pg_prewarm', schema: 'monitor' }
- { name: 'pg_visibility', schema: 'monitor' }
- { name: 'pg_freespacemap', schema: 'monitor' }
- { name: 'pg_repack', schema: 'monitor' }
- name: postgres_fdw
- name: file_fdw
- name: btree_gist
- name: btree_gin
- name: pg_trgm
- name: intagg
- name: intarray
# - hba - #
pg_offline_query: false # set to true to enable offline query on instance
pg_reload: true # reload postgres after hba changes
pg_hba_rules: # postgres host-based authentication rules
- title: allow meta node password access
role: common
rules:
- host all all 10.10.10.10/32 md5
- title: allow intranet admin password access
role: common
rules:
- host all +dbrole_admin 10.0.0.0/8 md5
- host all +dbrole_admin 172.16.0.0/12 md5
- host all +dbrole_admin 192.168.0.0/16 md5
- title: allow intranet password access
role: common
rules:
- host all all 10.0.0.0/8 md5
- host all all 172.16.0.0/12 md5
- host all all 192.168.0.0/16 md5
- title: allow local read/write (local production user via pgbouncer)
role: common
rules:
- local all +dbrole_readonly md5
- host all +dbrole_readonly 127.0.0.1/32 md5
- title: allow offline query (ETL,SAGA,Interactive) on offline instance
role: offline
rules:
- host all +dbrole_offline 10.0.0.0/8 md5
- host all +dbrole_offline 172.16.0.0/12 md5
- host all +dbrole_offline 192.168.0.0/16 md5
pg_hba_rules_extra: [] # extra hba rules (for cluster/instance overwrite)
pgbouncer_hba_rules: # pgbouncer host-based authentication rules
- title: local password access
role: common
rules:
- local all all md5
- host all all 127.0.0.1/32 md5
- title: intranet password access
role: common
rules:
- host all all 10.0.0.0/8 md5
- host all all 172.16.0.0/12 md5
- host all all 192.168.0.0/16 md5
pgbouncer_hba_rules_extra: [] # extra pgbouncer hba rules (for cluster/instance overwrite)
#------------------------------------------------------------------------------
# MONITOR PROVISION
#------------------------------------------------------------------------------
# - install - #
exporter_install: none # none|yum|binary, none by default
exporter_repo_url: '' # if set, repo will be added to /etc/yum.repos.d/ before yum installation
# - collect - #
exporter_metrics_path: /metrics # default metric path for pg related exporter
# - node exporter - #
node_exporter_enabled: true # setup node_exporter on instance
node_exporter_port: 9100 # default port for node exporter
node_exporter_options: '--no-collector.softnet --collector.systemd --collector.ntp --collector.tcpstat --collector.processes'
# - pg exporter - #
pg_exporter_config: pg_exporter-demo.yaml # default config files for pg_exporter
pg_exporter_enabled: true # setup pg_exporter on instance
pg_exporter_port: 9630 # default port for pg exporter
pg_exporter_url: '' # optional, if not set, generate from reference parameters
# - pgbouncer exporter - #
pgbouncer_exporter_enabled: true # setup pgbouncer_exporter on instance (if you don't have pgbouncer, disable it)
pgbouncer_exporter_port: 9631 # default port for pgbouncer exporter
pgbouncer_exporter_url: '' # optional, if not set, generate from reference parameters
#------------------------------------------------------------------------------
# SERVICE PROVISION
#------------------------------------------------------------------------------
pg_weight: 100 # default load balance weight (instance level)
# - service - #
pg_services: # how to expose postgres service in cluster?
# primary service will route {ip|name}:5433 to primary pgbouncer (5433->6432 rw)
- name: primary # service name {{ pg_cluster }}_primary
src_ip: "*"
src_port: 5433
dst_port: pgbouncer # 5433 route to pgbouncer
check_url: /primary # primary health check, success when instance is primary
selector: "[]" # select all instance as primary service candidate
# replica service will route {ip|name}:5434 to replica pgbouncer (5434->6432 ro)
- name: replica # service name {{ pg_cluster }}_replica
src_ip: "*"
src_port: 5434
dst_port: pgbouncer
check_url: /read-only # read-only health check. (including primary)
selector: "[]" # select all instance as replica service candidate
selector_backup: "[? pg_role == `primary`]" # primary are used as backup server in replica service
# default service will route {ip|name}:5436 to primary postgres (5436->5432 primary)
- name: default # service's actual name is {{ pg_cluster }}-{{ service.name }}
src_ip: "*" # service bind ip address, * for all, vip for cluster virtual ip address
src_port: 5436 # bind port, mandatory
dst_port: postgres # target port: postgres|pgbouncer|port_number , pgbouncer(6432) by default
check_method: http # health check method: only http is available for now
check_port: patroni # health check port: patroni|pg_exporter|port_number , patroni by default
check_url: /primary # health check url path, / as default
check_code: 200 # health check http code, 200 as default
selector: "[]" # instance selector
haproxy: # haproxy specific fields
maxconn: 3000 # default front-end connection
balance: roundrobin # load balance algorithm (roundrobin by default)
default_server_options: 'inter 3s fastinter 1s downinter 5s rise 3 fall 3 on-marked-down shutdown-sessions slowstart 30s maxconn 3000 maxqueue 128 weight 100'
# offline service will route {ip|name}:5438 to offline postgres (5438->5432 offline)
- name: offline # service name {{ pg_cluster }}_offline
src_ip: "*"
src_port: 5438
dst_port: postgres
check_url: /replica # offline MUST be a replica
selector: "[? pg_role == `offline` || pg_offline_query ]" # instances with pg_role == 'offline' or instance marked with 'pg_offline_query == true'
selector_backup: "[? pg_role == `replica` && !pg_offline_query]" # replica are used as backup server in offline service
pg_services_extra: [] # extra services to be added
# - haproxy - #
haproxy_enabled: true # enable haproxy among every cluster members
haproxy_reload: true # reload haproxy after config
haproxy_policy: roundrobin # roundrobin, leastconn
haproxy_admin_auth_enabled: false # enable authentication for haproxy admin?
haproxy_admin_username: admin # default haproxy admin username
haproxy_admin_password: admin # default haproxy admin password
haproxy_exporter_port: 9101 # default admin/exporter port
haproxy_client_timeout: 3h # client side connection timeout
haproxy_server_timeout: 3h # server side connection timeout
# - vip - #
vip_mode: none # none | l2 | l4
vip_reload: true # whether reload service after config
# vip_address: 127.0.0.1 # virtual ip address ip (l2 or l4)
# vip_cidrmask: 24 # virtual ip address cidr mask (l2 only)
# vip_interface: eth0 # virtual ip network interface (l2 only)
...
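Since this file follows the Ansible inventory format, it can be inspected with the standard Ansible tooling before any playbook is run. A minimal sketch, assuming Ansible is installed on the meta node and SSH access to the nodes is already configured:
# render the inventory as Ansible sees it (groups, hosts and merged vars)
ansible-inventory -i pigsty.yml --list
# list the hosts that belong to the pg-test cluster group
ansible pg-test -i pigsty.yml --list-hosts
# check connectivity to every node defined in the inventory
ansible all -i pigsty.yml -m ping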
2 - Kernel Optimize
Pigsty uses tuned to adjust operating system settings. tuned is the tuning tool that ships with CentOS 7.
Pigsty Tuned Configuration
By default, Pigsty installs four tuned profiles for the operating system:
tuned-adm profile oltp # activate OLTP mode
tuned-adm profile olap # activate OLAP mode
tuned-adm profile crit # activate CRIT mode
tuned-adm profile tiny # activate TINY mode
Basic Tuned Operations
# To start tuned, run the following command as root:
systemctl start tuned
# To enable tuned to start at every boot, enter:
systemctl enable tuned
# For other tuned controls, such as selecting a profile, use:
tuned-adm
# To list the installed profiles (this command requires the tuned service to be running):
tuned-adm list
# To show the currently active profile, run:
tuned-adm active
# To select or activate a profile, run:
tuned-adm profile profile
# for example
tuned-adm profile powersave
# To let tuned recommend the profile best suited for your system, without changing any existing profile or using the logic applied during installation, run:
tuned-adm recommend
# To turn off all tuning:
tuned-adm off
To list all available profiles and identify the currently active one, run:
tuned-adm list
To show only the currently active profile, run:
tuned-adm active
To switch to one of the available profiles, run:
tuned-adm profile profile_name
For example:
tuned-adm profile server-powersave
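After switching profiles, it is worth confirming that the expected kernel parameters were actually applied. A small sketch using the oltp profile as an example; the parameters checked here are ones set in the OLTP configuration below:
# activate the OLTP profile shipped by Pigsty, then confirm it is active
tuned-adm profile oltp
tuned-adm active
# spot-check a few parameters written by the profile
sysctl vm.swappiness kernel.numa_balancing net.core.somaxconn
cat /sys/kernel/mm/transparent_hugepage/enabled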
OLTP Configuration
# tuned configuration
#==============================================================#
# File : tuned.conf
# Mtime : 2020-06-29
# Desc : Tune operating system to oltp mode
# Path : /etc/tuned/oltp/tuned.conf
# Author : Vonng(fengruohang@outlook.com)
# Copyright (C) 2019-2020 Ruohang Feng
#==============================================================#
[main]
summary=Optimize for PostgreSQL OLTP System
include=network-latency
[cpu]
force_latency=1
governor=performance
energy_perf_bias=performance
min_perf_pct=100
[vm]
# disable transparent hugepages
transparent_hugepages=never
[sysctl]
#-------------------------------------------------------------#
# KERNEL #
#-------------------------------------------------------------#
# disable numa balancing
kernel.numa_balancing=0
# total shmem size in bytes: $(expr $(getconf _PHYS_PAGES) / 2 \* $(getconf PAGE_SIZE))
{% if param_shmall is defined and param_shmall != '' %}
kernel.shmall = {{ param_shmall }}
{% endif %}
# total shmem size in pages: $(expr $(getconf _PHYS_PAGES) / 2)
{% if param_shmmax is defined and param_shmmax != '' %}
kernel.shmmax = {{ param_shmmax }}
{% endif %}
# total shmem segs 4096 -> 8192
kernel.shmmni=8192
# total msg queue number, set to mem size in MB
kernel.msgmni=32768
# max length of message queue
kernel.msgmnb=65536
# max size of message
kernel.msgmax=65536
kernel.pid_max=131072
# max(Sem in Set)=2048, max(Sem)=max(Sem in Set) x max(SemSet) , max(Sem per Ops)=2048, max(SemSet)=65536
kernel.sem=2048 134217728 2048 65536
# do not sched postgres process in group
kernel.sched_autogroup_enabled = 0
# total time the scheduler will consider a migrated process cache hot and, thus, less likely to be remigrated
# default = 0.5ms (500000ns), update to 5ms, depending on your typical query (e.g. < 1ms)
kernel.sched_migration_cost_ns=5000000
#-------------------------------------------------------------#
# VM #
#-------------------------------------------------------------#
# try not using swap
vm.swappiness=0
# disable when most mem are for file cache
vm.zone_reclaim_mode=0
# overcommit threshold = 80%
vm.overcommit_memory=2
vm.overcommit_ratio=80
# vm.dirty_background_bytes=67108864 # 64MB mem (2xRAID cache) wake the bgwriter
vm.dirty_background_ratio=3 # latency-performance default
vm.dirty_ratio=10 # latency-performance default
# deny access on 0x00000 - 0x10000
vm.mmap_min_addr=65536
#-------------------------------------------------------------#
# Filesystem #
#-------------------------------------------------------------#
# max open files: 382589 -> 167772160
fs.file-max=167772160
# max concurrent unfinished async io, should be larger than 1M. 65536->1M
fs.aio-max-nr=1048576
#-------------------------------------------------------------#
# Network #
#-------------------------------------------------------------#
# max connection in listen queue (triggers retrans if full)
net.core.somaxconn=65535
net.core.netdev_max_backlog=8192
# tcp receive/transmit buffer default = 256KiB
net.core.rmem_default=262144
net.core.wmem_default=262144
# receive/transmit buffer limit = 4MiB
net.core.rmem_max=4194304
net.core.wmem_max=4194304
# ip options
net.ipv4.ip_forward=1
net.ipv4.ip_nonlocal_bind=1
net.ipv4.ip_local_port_range=32768 65000
# tcp options
net.ipv4.tcp_timestamps=1
net.ipv4.tcp_tw_reuse=1
net.ipv4.tcp_tw_recycle=0
net.ipv4.tcp_syncookies=0
net.ipv4.tcp_synack_retries=1
net.ipv4.tcp_syn_retries=1
# tcp read/write buffer
net.ipv4.tcp_rmem="4096 87380 16777216"
net.ipv4.tcp_wmem="4096 16384 16777216"
net.ipv4.udp_mem="3145728 4194304 16777216"
# tcp probe fail interval: 75s -> 20s
net.ipv4.tcp_keepalive_intvl=20
# tcp break after 3 * 20s = 1m
net.ipv4.tcp_keepalive_probes=3
# probe period = 1 min
net.ipv4.tcp_keepalive_time=60
net.ipv4.tcp_fin_timeout=5
net.ipv4.tcp_max_tw_buckets=262144
net.ipv4.tcp_max_syn_backlog=8192
net.ipv4.neigh.default.gc_thresh1=80000
net.ipv4.neigh.default.gc_thresh2=90000
net.ipv4.neigh.default.gc_thresh3=100000
net.bridge.bridge-nf-call-iptables=1
net.bridge.bridge-nf-call-ip6tables=1
net.bridge.bridge-nf-call-arptables=1
# max connection tracking number
net.netfilter.nf_conntrack_max=1048576
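The {% if %} blocks in the profile only set kernel.shmall / kernel.shmmax when param_shmall / param_shmmax are supplied at template render time. The formulas quoted in the comments can be evaluated on the target host as follows (a plain shell sketch; the variable names here are illustrative, not part of the template):
# half of physical memory, in bytes and in pages, as quoted in the template comments
shmem_bytes=$(expr $(getconf _PHYS_PAGES) / 2 \* $(getconf PAGE_SIZE))
shmem_pages=$(expr $(getconf _PHYS_PAGES) / 2)
echo "bytes=${shmem_bytes} pages=${shmem_pages}"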
OLAP Configuration
# tuned configuration
#==============================================================#
# File : tuned.conf
# Mtime : 2020-09-18
# Desc : Tune operating system to olap mode
# Path : /etc/tuned/olap/tuned.conf
# Author : Vonng(fengruohang@outlook.com)
# Copyright (C) 2019-2020 Ruohang Feng
#==============================================================#
[main]
summary=Optimize for PostgreSQL OLAP System
include=network-throughput
[cpu]
force_latency=1
governor=performance
energy_perf_bias=performance
min_perf_pct=100
[vm]
# disable transparent hugepages
transparent_hugepages=never
[sysctl]
#-------------------------------------------------------------#
# KERNEL #
#-------------------------------------------------------------#
# disable numa balancing
kernel.numa_balancing=0
# total shmem size in bytes: $(expr $(getconf _PHYS_PAGES) / 2 \* $(getconf PAGE_SIZE))
{% if param_shmall is defined and param_shmall != '' %}
kernel.shmall = {{ param_shmall }}
{% endif %}
# total shmem size in pages: $(expr $(getconf _PHYS_PAGES) / 2)
{% if param_shmmax is defined and param_shmmax != '' %}
kernel.shmmax = {{ param_shmmax }}
{% endif %}
# total shmem segs 4096 -> 8192
kernel.shmmni=8192
# total msg queue number, set to mem size in MB
kernel.msgmni=32768
# max length of message queue
kernel.msgmnb=65536
# max size of message
kernel.msgmax=65536
kernel.pid_max=131072
# max(Sem in Set)=2048, max(Sem)=max(Sem in Set) x max(SemSet) , max(Sem per Ops)=2048, max(SemSet)=65536
kernel.sem=2048 134217728 2048 65536
# do not sched postgres process in group
kernel.sched_autogroup_enabled = 0
# total time the scheduler will consider a migrated process cache hot and, thus, less likely to be remigrated
# default = 0.5ms (500000ns), update to 5ms, depending on your typical query (e.g. < 1ms)
kernel.sched_migration_cost_ns=5000000
#-------------------------------------------------------------#
# VM #
#-------------------------------------------------------------#
# try not using swap
# vm.swappiness=10
# disable when most mem are for file cache
vm.zone_reclaim_mode=0
# overcommit threshold = 80%
vm.overcommit_memory=2
vm.overcommit_ratio=80
vm.dirty_background_ratio = 10 # throughput-performance default
vm.dirty_ratio=80 # throughput-performance default 40 -> 80
# deny access on 0x00000 - 0x10000
vm.mmap_min_addr=65536
#-------------------------------------------------------------#
# Filesystem #
#-------------------------------------------------------------#
# max open files: 382589 -> 167772160
fs.file-max=167772160
# max concurrent unfinished async io, should be larger than 1M. 65536->1M
fs.aio-max-nr=1048576
#-------------------------------------------------------------#
# Network #
#-------------------------------------------------------------#
# max connection in listen queue (triggers retrans if full)
net.core.somaxconn=65535
net.core.netdev_max_backlog=8192
# tcp receive/transmit buffer default = 256KiB
net.core.rmem_default=262144
net.core.wmem_default=262144
# receive/transmit buffer limit = 4MiB
net.core.rmem_max=4194304
net.core.wmem_max=4194304
# ip options
net.ipv4.ip_forward=1
net.ipv4.ip_nonlocal_bind=1
net.ipv4.ip_local_port_range=32768 65000
# tcp options
net.ipv4.tcp_timestamps=1
net.ipv4.tcp_tw_reuse=1
net.ipv4.tcp_tw_recycle=0
net.ipv4.tcp_syncookies=0
net.ipv4.tcp_synack_retries=1
net.ipv4.tcp_syn_retries=1
# tcp read/write buffer
net.ipv4.tcp_rmem="4096 87380 16777216"
net.ipv4.tcp_wmem="4096 16384 16777216"
net.ipv4.udp_mem="3145728 4194304 16777216"
# tcp probe fail interval: 75s -> 20s
net.ipv4.tcp_keepalive_intvl=20
# tcp break after 3 * 20s = 1m
net.ipv4.tcp_keepalive_probes=3
# probe period = 1 min
net.ipv4.tcp_keepalive_time=60
net.ipv4.tcp_fin_timeout=5
net.ipv4.tcp_max_tw_buckets=262144
net.ipv4.tcp_max_syn_backlog=8192
net.ipv4.neigh.default.gc_thresh1=80000
net.ipv4.neigh.default.gc_thresh2=90000
net.ipv4.neigh.default.gc_thresh3=100000
net.bridge.bridge-nf-call-iptables=1
net.bridge.bridge-nf-call-ip6tables=1
net.bridge.bridge-nf-call-arptables=1
# max connection tracking number
net.netfilter.nf_conntrack_max=1048576
CRIT Configuration
# tuned configuration
#==============================================================#
# File : tuned.conf
# Mtime : 2020-06-29
# Desc : Tune operating system to crit mode
# Path : /etc/tuned/crit/tuned.conf
# Author : Vonng(fengruohang@outlook.com)
# Copyright (C) 2019-2020 Ruohang Feng
#==============================================================#
[main]
summary=Optimize for PostgreSQL CRIT System
include=network-latency
[cpu]
force_latency=1
governor=performance
energy_perf_bias=performance
min_perf_pct=100
[vm]
# disable transparent hugepages
transparent_hugepages=never
[sysctl]
#-------------------------------------------------------------#
# KERNEL #
#-------------------------------------------------------------#
# disable numa balancing
kernel.numa_balancing=0
# total shmem size in bytes: $(expr $(getconf _PHYS_PAGES) / 2 \* $(getconf PAGE_SIZE))
{% if param_shmall is defined and param_shmall != '' %}
kernel.shmall = {{ param_shmall }}
{% endif %}
# total shmem size in pages: $(expr $(getconf _PHYS_PAGES) / 2)
{% if param_shmmax is defined and param_shmmax != '' %}
kernel.shmmax = {{ param_shmmax }}
{% endif %}
# total shmem segs 4096 -> 8192
kernel.shmmni=8192
# total msg queue number, set to mem size in MB
kernel.msgmni=32768
# max length of message queue
kernel.msgmnb=65536
# max size of message
kernel.msgmax=65536
kernel.pid_max=131072
# max(Sem in Set)=2048, max(Sem)=max(Sem in Set) x max(SemSet) , max(Sem per Ops)=2048, max(SemSet)=65536
kernel.sem=2048 134217728 2048 65536
# do not sched postgres process in group
kernel.sched_autogroup_enabled = 0
# total time the scheduler will consider a migrated process cache hot and, thus, less likely to be remigrated
# default = 0.5ms (500000ns), update to 5ms, depending on your typical query (e.g. < 1ms)
kernel.sched_migration_cost_ns=5000000
#-------------------------------------------------------------#
# VM #
#-------------------------------------------------------------#
# try not using swap
vm.swappiness=0
# disable when most mem are for file cache
vm.zone_reclaim_mode=0
# overcommit threshold = 100%
vm.overcommit_memory=2
vm.overcommit_ratio=100
# 64MB mem (2xRAID cache) wake the bgwriter
vm.dirty_background_bytes=67108864
# vm.dirty_background_ratio=3 # latency-performance default
vm.dirty_ratio=6 # latency-performance default
# deny access on 0x00000 - 0x10000
vm.mmap_min_addr=65536
#-------------------------------------------------------------#
# Filesystem #
#-------------------------------------------------------------#
# max open files: 382589 -> 167772160
fs.file-max=167772160
# max concurrent unfinished async io, should be larger than 1M. 65536->1M
fs.aio-max-nr=1048576
#-------------------------------------------------------------#
# Network #
#-------------------------------------------------------------#
# max connection in listen queue (triggers retrans if full)
net.core.somaxconn=65535
net.core.netdev_max_backlog=8192
# tcp receive/transmit buffer default = 256KiB
net.core.rmem_default=262144
net.core.wmem_default=262144
# receive/transmit buffer limit = 4MiB
net.core.rmem_max=4194304
net.core.wmem_max=4194304
# ip options
net.ipv4.ip_forward=1
net.ipv4.ip_nonlocal_bind=1
net.ipv4.ip_local_port_range=32768 65000
# tcp options
net.ipv4.tcp_timestamps=1
net.ipv4.tcp_tw_reuse=1
net.ipv4.tcp_tw_recycle=0
net.ipv4.tcp_syncookies=0
net.ipv4.tcp_synack_retries=1
net.ipv4.tcp_syn_retries=1
# tcp read/write buffer
net.ipv4.tcp_rmem="4096 87380 16777216"
net.ipv4.tcp_wmem="4096 16384 16777216"
net.ipv4.udp_mem="3145728 4194304 16777216"
# tcp probe fail interval: 75s -> 20s
net.ipv4.tcp_keepalive_intvl=20
# tcp break after 3 * 20s = 1m
net.ipv4.tcp_keepalive_probes=3
# probe period = 1 min
net.ipv4.tcp_keepalive_time=60
net.ipv4.tcp_fin_timeout=5
net.ipv4.tcp_max_tw_buckets=262144
net.ipv4.tcp_max_syn_backlog=8192
net.ipv4.neigh.default.gc_thresh1=80000
net.ipv4.neigh.default.gc_thresh2=90000
net.ipv4.neigh.default.gc_thresh3=100000
net.bridge.bridge-nf-call-iptables=1
net.bridge.bridge-nf-call-ip6tables=1
net.bridge.bridge-nf-call-arptables=1
# max connection tracking number
net.netfilter.nf_conntrack_max=1048576
TINY Configuration
# tuned configuration
#==============================================================#
# File : tuned.conf
# Mtime : 2020-06-29
# Desc : Tune operating system to tiny mode
# Path : /etc/tuned/tiny/tuned.conf
# Author : Vonng(fengruohang@outlook.com)
# Copyright (C) 2019-2020 Ruohang Feng
#==============================================================#
[main]
summary=Optimize for PostgreSQL TINY System
# include=virtual-guest
[vm]
# disable transparent hugepages
transparent_hugepages=never
[sysctl]
#-------------------------------------------------------------#
# KERNEL #
#-------------------------------------------------------------#
# disable numa balancing
kernel.numa_balancing=0
# If a workload mostly uses anonymous memory and it hits this limit, the entire
# working set is buffered for I/O, and any more write buffering would require
# swapping, so it's time to throttle writes until I/O can catch up. Workloads
# that mostly use file mappings may be able to use even higher values.
#
# The generator of dirty data starts writeback at this percentage (system default
# is 20%)
vm.dirty_ratio = 40
# Filesystem I/O is usually much more efficient than swapping, so try to keep
# swapping low. It's usually safe to go even lower than this on systems with
# server-grade storage.
vm.swappiness = 30
#-------------------------------------------------------------#
# Network #
#-------------------------------------------------------------#
# tcp options
net.ipv4.tcp_timestamps=1
net.ipv4.tcp_tw_reuse=1
net.ipv4.tcp_tw_recycle=0
net.ipv4.tcp_syncookies=0
net.ipv4.tcp_synack_retries=1
net.ipv4.tcp_syn_retries=1
# tcp probe fail interval: 75s -> 20s
net.ipv4.tcp_keepalive_intvl=20
# tcp break after 3 * 20s = 1m
net.ipv4.tcp_keepalive_probes=3
# probe period = 1 min
net.ipv4.tcp_keepalive_time=60
Database Kernel Tuning Reference
# Database kernel optimisation
fs.aio-max-nr = 1048576                     # limit on concurrent unfinished async I/O requests, should not be less than 1M
fs.file-max = 16777216                      # open at most 16M files
# kernel
kernel.shmmax = 485058                      # max shared memory, in pages: $(expr $(getconf _PHYS_PAGES) / 2)
kernel.shmall = 1986797568                  # total shared memory size: $(expr $(getconf _PHYS_PAGES) / 2 \* $(getconf PAGE_SIZE))
kernel.shmmni = 16384                       # max number of shared memory segments system-wide: 4096 -> 16384
kernel.msgmni = 32768                       # number of message queues system-wide, affects how many agents can be started; set to memory size in MB
kernel.msgmnb = 65536                       # affects the size of each message queue
kernel.msgmax = 65536                       # affects the size of messages that can be sent in a queue
kernel.numa_balancing = 0                   # disable NUMA balancing
kernel.sched_migration_cost_ns = 5000000    # within 5ms the scheduler still considers a migrated process cache-hot
kernel.sem = 2048 134217728 2048 65536      # max 2048 semaphores per set, 134217728 semaphores system-wide, max 2048 ops per call, 65536 semaphore sets in total
# vm
vm.dirty_ratio = 80                         # hard limit: block write requests and flush once dirty pages exceed 80%
vm.dirty_background_bytes = 268435456       # 256MB of dirty data wakes the writeback process
vm.dirty_expire_centisecs = 6000            # data dirtied more than 1 minute ago is considered due for writeback
vm.dirty_writeback_centisecs= 500           # writeback process runs every 5 seconds
vm.mmap_min_addr = 65536                    # deny access to memory below 0x10000
vm.zone_reclaim_mode = 0                    # disable NUMA zone reclaim
# vm swap
vm.swappiness = 0                           # avoid swap, although the high watermark may still trigger it
vm.overcommit_memory = 2                    # allow a certain degree of overcommit
vm.overcommit_ratio = 50                    # allowed overcommit: $((($mem - $swap) * 100 / $mem))
# tcp memory
net.ipv4.tcp_rmem = 8192 65536 16777216     # tcp read buffer (min/default/max)
net.ipv4.tcp_wmem = 8192 65536 16777216     # tcp write buffer (min/default/max)
net.ipv4.tcp_mem = 131072 262144 16777216   # tcp memory usage thresholds (low/pressure/high, in pages)
net.core.rmem_default = 262144              # default receive buffer size: 256K
net.core.rmem_max = 4194304                 # max receive buffer size: 4M
net.core.wmem_default = 262144              # default send buffer size: 256K
net.core.wmem_max = 4194304                 # max send buffer size: 4M
# tcp keepalive
net.ipv4.tcp_keepalive_intvl = 20           # interval between unacknowledged keepalive probes: default 75s -> 20s
net.ipv4.tcp_keepalive_probes = 3           # 3 * 20s = disconnect after 1 minute
net.ipv4.tcp_keepalive_time = 60            # keepalive probing period: 1 minute
# tcp port reuse
net.ipv4.tcp_tw_reuse = 1                   # allow TIME_WAIT sockets to be reused for new TCP connections, 0 by default
net.ipv4.tcp_tw_recycle = 0                 # fast recycling, deprecated
net.ipv4.tcp_fin_timeout = 5                # seconds to stay in the FIN-WAIT-2 state
net.ipv4.tcp_timestamps = 1
# tcp anti-flood
net.ipv4.tcp_syncookies = 1                 # send syncookies when the SYN_RECV queue overflows, protects against malicious floods
net.ipv4.tcp_synack_retries = 1             # number of SYN+ACK retransmissions before giving up (default 5)
net.ipv4.tcp_syn_retries = 1                # number of SYN packets sent before the kernel gives up establishing a connection
# tcp load-balancer
net.ipv4.ip_forward = 1                     # enable IP forwarding
net.ipv4.ip_nonlocal_bind = 1               # allow binding to non-local addresses
net.netfilter.nf_conntrack_max = 1048576    # max number of tracked connections
net.ipv4.ip_local_port_range = 10000 65535  # local port range
net.ipv4.tcp_max_tw_buckets = 262144        # 256k TIME_WAIT sockets
net.core.somaxconn = 65535                  # limit on the LISTEN backlog queue; triggers retransmission when full
net.ipv4.tcp_max_syn_backlog = 8192         # SYN queue size: 1024 -> 8192
net.core.netdev_max_backlog = 8192          # queue length allowed when the NIC receives packets faster than the kernel can process
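These reference values are plain sysctl settings, so outside of tuned they can be applied by placing them in a drop-in file and reloading. A minimal sketch with just a few of the values above (the file name is illustrative):
# write a subset of the settings into a dedicated sysctl drop-in file
cat > /etc/sysctl.d/90-postgres-tuning.conf <<'EOF'
kernel.numa_balancing = 0
vm.swappiness = 0
net.core.somaxconn = 65535
EOF
# load all sysctl configuration files, including the new drop-in
sysctl --system
# verify a single value
sysctl kernel.numa_balancing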
3 - Metrics List
Below is the list of monitoring metrics currently available in Pigsty.
For the rules used to define derived metrics, please refer to Derived Metrics.
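A quick way to check that a given metric is actually being collected is to query the Prometheus HTTP API directly. A minimal sketch, assuming the sandbox alias p.pigsty for the meta node's Prometheus and the jq utility from the node packages in the example config above:
# ask Prometheus for the per-backend HTTP request rate over the last minute
curl -sG 'http://p.pigsty/api/v1/query' \
     --data-urlencode 'query=rate(haproxy_backend_http_requests_total[1m])' | jq '.data.result'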
List of Monitoring Metrics
name |
---|
go_gc_duration_seconds |
go_gc_duration_seconds_count |
go_gc_duration_seconds_sum |
go_goroutines |
go_info |
go_memstats_alloc_bytes |
go_memstats_alloc_bytes_total |
go_memstats_buck_hash_sys_bytes |
go_memstats_frees_total |
go_memstats_gc_cpu_fraction |
go_memstats_gc_sys_bytes |
go_memstats_heap_alloc_bytes |
go_memstats_heap_idle_bytes |
go_memstats_heap_inuse_bytes |
go_memstats_heap_objects |
go_memstats_heap_released_bytes |
go_memstats_heap_sys_bytes |
go_memstats_last_gc_time_seconds |
go_memstats_lookups_total |
go_memstats_mallocs_total |
go_memstats_mcache_inuse_bytes |
go_memstats_mcache_sys_bytes |
go_memstats_mspan_inuse_bytes |
go_memstats_mspan_sys_bytes |
go_memstats_next_gc_bytes |
go_memstats_other_sys_bytes |
go_memstats_stack_inuse_bytes |
go_memstats_stack_sys_bytes |
go_memstats_sys_bytes |
go_threads |
haproxy_backend_active_servers |
haproxy_backend_backup_servers |
haproxy_backend_bytes_in_total |
haproxy_backend_bytes_out_total |
haproxy_backend_check_last_change_seconds |
haproxy_backend_check_up_down_total |
haproxy_backend_client_aborts_total |
haproxy_backend_connect_time_average_seconds |
haproxy_backend_connection_attempts_total |
haproxy_backend_connection_errors_total |
haproxy_backend_connection_reuses_total |
haproxy_backend_current_queue |
haproxy_backend_current_sessions |
haproxy_backend_downtime_seconds_total |
haproxy_backend_failed_header_rewriting_total |
haproxy_backend_http_cache_hits_total |
haproxy_backend_http_cache_lookups_total |
haproxy_backend_http_comp_bytes_bypassed_total |
haproxy_backend_http_comp_bytes_in_total |
haproxy_backend_http_comp_bytes_out_total |
haproxy_backend_http_comp_responses_total |
haproxy_backend_http_requests_total |
haproxy_backend_http_responses_total |
haproxy_backend_internal_errors_total |
haproxy_backend_last_session_seconds |
haproxy_backend_limit_sessions |
haproxy_backend_loadbalanced_total |
haproxy_backend_max_connect_time_seconds |
haproxy_backend_max_queue |
haproxy_backend_max_queue_time_seconds |
haproxy_backend_max_response_time_seconds |
haproxy_backend_max_session_rate |
haproxy_backend_max_sessions |
haproxy_backend_max_total_time_seconds |
haproxy_backend_queue_time_average_seconds |
haproxy_backend_redispatch_warnings_total |
haproxy_backend_requests_denied_total |
haproxy_backend_response_errors_total |
haproxy_backend_response_time_average_seconds |
haproxy_backend_responses_denied_total |
haproxy_backend_retry_warnings_total |
haproxy_backend_server_aborts_total |
haproxy_backend_sessions_total |
haproxy_backend_status |
haproxy_backend_total_time_average_seconds |
haproxy_backend_weight |
haproxy_frontend_bytes_in_total |
haproxy_frontend_bytes_out_total |
haproxy_frontend_connections_rate_max |
haproxy_frontend_connections_total |
haproxy_frontend_current_sessions |
haproxy_frontend_denied_connections_total |
haproxy_frontend_denied_sessions_total |
haproxy_frontend_failed_header_rewriting_total |
haproxy_frontend_http_cache_hits_total |
haproxy_frontend_http_cache_lookups_total |
haproxy_frontend_http_comp_bytes_bypassed_total |
haproxy_frontend_http_comp_bytes_in_total |
haproxy_frontend_http_comp_bytes_out_total |
haproxy_frontend_http_comp_responses_total |
haproxy_frontend_http_requests_rate_max |
haproxy_frontend_http_requests_total |
haproxy_frontend_http_responses_total |
haproxy_frontend_intercepted_requests_total |
haproxy_frontend_internal_errors_total |
haproxy_frontend_limit_session_rate |
haproxy_frontend_limit_sessions |
haproxy_frontend_max_session_rate |
haproxy_frontend_max_sessions |
haproxy_frontend_request_errors_total |
haproxy_frontend_requests_denied_total |
haproxy_frontend_responses_denied_total |
haproxy_frontend_sessions_total |
haproxy_frontend_status |
haproxy_process_active_peers |
haproxy_process_busy_polling_enabled |
haproxy_process_connected_peers |
haproxy_process_connections_total |
haproxy_process_current_backend_ssl_key_rate |
haproxy_process_current_connection_rate |
haproxy_process_current_connections |
haproxy_process_current_frontend_ssl_key_rate |
haproxy_process_current_run_queue |
haproxy_process_current_session_rate |
haproxy_process_current_ssl_connections |
haproxy_process_current_ssl_rate |
haproxy_process_current_tasks |
haproxy_process_current_zlib_memory |
haproxy_process_dropped_logs_total |
haproxy_process_frontent_ssl_reuse |
haproxy_process_hard_max_connections |
haproxy_process_http_comp_bytes_in_total |
haproxy_process_http_comp_bytes_out_total |
haproxy_process_idle_time_percent |
haproxy_process_jobs |
haproxy_process_limit_connection_rate |
haproxy_process_limit_http_comp |
haproxy_process_limit_session_rate |
haproxy_process_limit_ssl_rate |
haproxy_process_listeners |
haproxy_process_max_backend_ssl_key_rate |
haproxy_process_max_connection_rate |
haproxy_process_max_connections |
haproxy_process_max_fds |
haproxy_process_max_frontend_ssl_key_rate |
haproxy_process_max_memory_bytes |
haproxy_process_max_pipes |
haproxy_process_max_session_rate |
haproxy_process_max_sockets |
haproxy_process_max_ssl_connections |
haproxy_process_max_ssl_rate |
haproxy_process_max_zlib_memory |
haproxy_process_nbproc |
haproxy_process_nbthread |
haproxy_process_pipes_free_total |
haproxy_process_pipes_used_total |
haproxy_process_pool_allocated_bytes |
haproxy_process_pool_failures_total |
haproxy_process_pool_used_bytes |
haproxy_process_relative_process_id |
haproxy_process_requests_total |
haproxy_process_ssl_cache_lookups_total |
haproxy_process_ssl_cache_misses_total |
haproxy_process_ssl_connections_total |
haproxy_process_start_time_seconds |
haproxy_process_stopping |
haproxy_process_unstoppable_jobs |
haproxy_server_bytes_in_total |
haproxy_server_bytes_out_total |
haproxy_server_check_code |
haproxy_server_check_duration_seconds |
haproxy_server_check_failures_total |
haproxy_server_check_last_change_seconds |
haproxy_server_check_status |
haproxy_server_check_up_down_total |
haproxy_server_client_aborts_total |
haproxy_server_connect_time_average_seconds |
haproxy_server_connection_attempts_total |
haproxy_server_connection_errors_total |
haproxy_server_connection_reuses_total |
haproxy_server_current_queue |
haproxy_server_current_sessions |
haproxy_server_current_throttle |
haproxy_server_downtime_seconds_total |
haproxy_server_failed_header_rewriting_total |
haproxy_server_internal_errors_total |
haproxy_server_last_session_seconds |
haproxy_server_limit_sessions |
haproxy_server_loadbalanced_total |
haproxy_server_max_connect_time_seconds |
haproxy_server_max_queue |
haproxy_server_max_queue_time_seconds |
haproxy_server_max_response_time_seconds |
haproxy_server_max_session_rate |
haproxy_server_max_sessions |
haproxy_server_max_total_time_seconds |
haproxy_server_queue_limit |
haproxy_server_queue_time_average_seconds |
haproxy_server_redispatch_warnings_total |
haproxy_server_response_errors_total |
haproxy_server_response_time_average_seconds |
haproxy_server_responses_denied_total |
haproxy_server_retry_warnings_total |
haproxy_server_server_aborts_total |
haproxy_server_server_idle_connections_current |
haproxy_server_server_idle_connections_limit |
haproxy_server_sessions_total |
haproxy_server_status |
haproxy_server_total_time_average_seconds |
haproxy_server_weight |
node:cls:cpu_count |
node:cls:cpu_mode |
node:cls:cpu_usage |
node:cls:cpu_usage_avg5m |
node:cls:disk_io_rate |
node:cls:disk_iops |
node:cls:disk_read_iops |
node:cls:disk_read_rate |
node:cls:disk_write_iops |
node:cls:disk_write_rate |
node:cls:mem_usage |
node:cls:network_io |
node:cls:network_rx |
node:cls:network_tx |
node:cls:ntp_offset_range |
node:cls:sched_timeslices |
node:cpu:cpu_mode |
node:cpu:cpu_usage |
node:cpu:cpu_usage_avg5m |
node:cpu:sched_timeslices |
node:dev:disk_io_rate |
node:dev:disk_iops |
node:dev:disk_read_iops |
node:dev:disk_read_rate |
node:dev:disk_read_rt |
node:dev:disk_read_time |
node:dev:disk_write_iops |
node:dev:disk_write_rate |
node:dev:disk_write_rt |
node:dev:disk_write_time |
node:dev:network_io_rate |
node:dev:network_rx |
node:dev:network_tx |
node:fs:avail_bytes |
node:fs:free_bytes |
node:fs:free_inode |
node:fs:inode_usage |
node:fs:size_bytes |
node:fs:space_deriv_1h |
node:fs:space_exhaust |
node:fs:space_usage |
node:fs:total_inode |
node:ins:cpu_count |
node:ins:cpu_mode |
node:ins:cpu_usage |
node:ins:cpu_usage_avg5m |
node:ins:ctx_switch |
node:ins:disk_io_rate |
node:ins:disk_iops |
node:ins:disk_read_iops |
node:ins:disk_read_rate |
node:ins:disk_write_iops |
node:ins:disk_write_rate |
node:ins:fd_usage |
node:ins:forks |
node:ins:intrrupt |
node:ins:mem_app |
node:ins:mem_free |
node:ins:mem_usage |
node:ins:network_io |
node:ins:network_rx |
node:ins:network_tx |
node:ins:pagefault |
node:ins:pagein |
node:ins:pageout |
node:ins:sched_timeslices |
node:ins:stdload1 |
node:ins:stdload15 |
node:ins:stdload5 |
node:ins:swap_usage |
node:ins:swapin |
node:ins:swapout |
node:ins:tcp_active_opens |
node:ins:tcp_dropped |
node:ins:tcp_insegs |
node:ins:tcp_outsegs |
node:ins:tcp_overflow |
node:ins:tcp_overflow_rate |
node:ins:tcp_passive_opens |
node:ins:tcp_retrans_rate |
node:ins:tcp_retranssegs |
node:ins:tcp_segs |
node:uptime |
node_arp_entries |
node_boot_time_seconds |
node_context_switches_total |
node_cooling_device_cur_state |
node_cooling_device_max_state |
node_cpu_guest_seconds_total |
node_cpu_seconds_total |
node_disk_io_now |
node_disk_io_time_seconds_total |
node_disk_io_time_weighted_seconds_total |
node_disk_read_bytes_total |
node_disk_read_time_seconds_total |
node_disk_reads_completed_total |
node_disk_reads_merged_total |
node_disk_write_time_seconds_total |
node_disk_writes_completed_total |
node_disk_writes_merged_total |
node_disk_written_bytes_total |
node_entropy_available_bits |
node_exporter_build_info |
node_filefd_allocated |
node_filefd_maximum |
node_filesystem_avail_bytes |
node_filesystem_device_error |
node_filesystem_files |
node_filesystem_files_free |
node_filesystem_free_bytes |
node_filesystem_readonly |
node_filesystem_size_bytes |
node_forks_total |
node_intr_total |
node_ipvs_connections_total |
node_ipvs_incoming_bytes_total |
node_ipvs_incoming_packets_total |
node_ipvs_outgoing_bytes_total |
node_ipvs_outgoing_packets_total |
node_load1 |
node_load15 |
node_load5 |
node_memory_Active_anon_bytes |
node_memory_Active_bytes |
node_memory_Active_file_bytes |
node_memory_AnonHugePages_bytes |
node_memory_AnonPages_bytes |
node_memory_Bounce_bytes |
node_memory_Buffers_bytes |
node_memory_Cached_bytes |
node_memory_CmaFree_bytes |
node_memory_CmaTotal_bytes |
node_memory_CommitLimit_bytes |
node_memory_Committed_AS_bytes |
node_memory_DirectMap2M_bytes |
node_memory_DirectMap4k_bytes |
node_memory_Dirty_bytes |
node_memory_HardwareCorrupted_bytes |
node_memory_HugePages_Free |
node_memory_HugePages_Rsvd |
node_memory_HugePages_Surp |
node_memory_HugePages_Total |
node_memory_Hugepagesize_bytes |
node_memory_Inactive_anon_bytes |
node_memory_Inactive_bytes |
node_memory_Inactive_file_bytes |
node_memory_KernelStack_bytes |
node_memory_Mapped_bytes |
node_memory_MemAvailable_bytes |
node_memory_MemFree_bytes |
node_memory_MemTotal_bytes |
node_memory_Mlocked_bytes |
node_memory_NFS_Unstable_bytes |
node_memory_PageTables_bytes |
node_memory_Percpu_bytes |
node_memory_SReclaimable_bytes |
node_memory_SUnreclaim_bytes |
node_memory_Shmem_bytes |
node_memory_Slab_bytes |
node_memory_SwapCached_bytes |
node_memory_SwapFree_bytes |
node_memory_SwapTotal_bytes |
node_memory_Unevictable_bytes |
node_memory_VmallocChunk_bytes |
node_memory_VmallocTotal_bytes |
node_memory_VmallocUsed_bytes |
node_memory_WritebackTmp_bytes |
node_memory_Writeback_bytes |
node_netstat_Icmp6_InErrors |
node_netstat_Icmp6_InMsgs |
node_netstat_Icmp6_OutMsgs |
node_netstat_Icmp_InErrors |
node_netstat_Icmp_InMsgs |
node_netstat_Icmp_OutMsgs |
node_netstat_Ip6_InOctets |
node_netstat_Ip6_OutOctets |
node_netstat_IpExt_InOctets |
node_netstat_IpExt_OutOctets |
node_netstat_Ip_Forwarding |
node_netstat_TcpExt_ListenDrops |
node_netstat_TcpExt_ListenOverflows |
node_netstat_TcpExt_SyncookiesFailed |
node_netstat_TcpExt_SyncookiesRecv |
node_netstat_TcpExt_SyncookiesSent |
node_netstat_TcpExt_TCPSynRetrans |
node_netstat_Tcp_ActiveOpens |
node_netstat_Tcp_CurrEstab |
node_netstat_Tcp_InErrs |
node_netstat_Tcp_InSegs |
node_netstat_Tcp_OutSegs |
node_netstat_Tcp_PassiveOpens |
node_netstat_Tcp_RetransSegs |
node_netstat_Udp6_InDatagrams |
node_netstat_Udp6_InErrors |
node_netstat_Udp6_NoPorts |
node_netstat_Udp6_OutDatagrams |
node_netstat_Udp6_RcvbufErrors |
node_netstat_Udp6_SndbufErrors |
node_netstat_UdpLite6_InErrors |
node_netstat_UdpLite_InErrors |
node_netstat_Udp_InDatagrams |
node_netstat_Udp_InErrors |
node_netstat_Udp_NoPorts |
node_netstat_Udp_OutDatagrams |
node_netstat_Udp_RcvbufErrors |
node_netstat_Udp_SndbufErrors |
node_network_address_assign_type |
node_network_carrier |
node_network_carrier_changes_total |
node_network_device_id |
node_network_dormant |
node_network_flags |
node_network_iface_id |
node_network_iface_link |
node_network_iface_link_mode |
node_network_info |
node_network_mtu_bytes |
node_network_net_dev_group |
node_network_protocol_type |
node_network_receive_bytes_total |
node_network_receive_compressed_total |
node_network_receive_drop_total |
node_network_receive_errs_total |
node_network_receive_fifo_total |
node_network_receive_frame_total |
node_network_receive_multicast_total |
node_network_receive_packets_total |
node_network_transmit_bytes_total |
node_network_transmit_carrier_total |
node_network_transmit_colls_total |
node_network_transmit_compressed_total |
node_network_transmit_drop_total |
node_network_transmit_errs_total |
node_network_transmit_fifo_total |
node_network_transmit_packets_total |
node_network_transmit_queue_length |
node_network_up |
node_nf_conntrack_entries |
node_nf_conntrack_entries_limit |
node_ntp_leap |
node_ntp_offset_seconds |
node_ntp_reference_timestamp_seconds |
node_ntp_root_delay_seconds |
node_ntp_root_dispersion_seconds |
node_ntp_rtt_seconds |
node_ntp_sanity |
node_ntp_stratum |
node_power_supply_capacity |
node_power_supply_cyclecount |
node_power_supply_energy_full |
node_power_supply_energy_full_design |
node_power_supply_energy_watthour |
node_power_supply_info |
node_power_supply_online |
node_power_supply_power_watt |
node_power_supply_present |
node_power_supply_voltage_min_design |
node_power_supply_voltage_volt |
node_processes_max_processes |
node_processes_max_threads |
node_processes_pids |
node_processes_state |
node_processes_threads |
node_procs_blocked |
node_procs_running |
node_schedstat_running_seconds_total |
node_schedstat_timeslices_total |
node_schedstat_waiting_seconds_total |
node_scrape_collector_duration_seconds |
node_scrape_collector_success |
node_sockstat_FRAG6_inuse |
node_sockstat_FRAG6_memory |
node_sockstat_FRAG_inuse |
node_sockstat_FRAG_memory |
node_sockstat_RAW6_inuse |
node_sockstat_RAW_inuse |
node_sockstat_TCP6_inuse |
node_sockstat_TCP_alloc |
node_sockstat_TCP_inuse |
node_sockstat_TCP_mem |
node_sockstat_TCP_mem_bytes |
node_sockstat_TCP_orphan |
node_sockstat_TCP_tw |
node_sockstat_UDP6_inuse |
node_sockstat_UDPLITE6_inuse |
node_sockstat_UDPLITE_inuse |
node_sockstat_UDP_inuse |
node_sockstat_UDP_mem |
node_sockstat_UDP_mem_bytes |
node_sockstat_sockets_used |
node_systemd_socket_accepted_connections_total |
node_systemd_socket_current_connections |
node_systemd_system_running |
node_systemd_timer_last_trigger_seconds |
node_systemd_unit_state |
node_systemd_units |
node_systemd_version |
node_tcp_connection_states |
node_textfile_scrape_error |
node_time_seconds |
node_timex_estimated_error_seconds |
node_timex_frequency_adjustment_ratio |
node_timex_loop_time_constant |
node_timex_maxerror_seconds |
node_timex_offset_seconds |
node_timex_pps_calibration_total |
node_timex_pps_error_total |
node_timex_pps_frequency_hertz |
node_timex_pps_jitter_seconds |
node_timex_pps_jitter_total |
node_timex_pps_shift_seconds |
node_timex_pps_stability_exceeded_total |
node_timex_pps_stability_hertz |
node_timex_status |
node_timex_sync_status |
node_timex_tai_offset_seconds |
node_timex_tick_seconds |
node_udp_queues |
node_uname_info |
node_vmstat_pgfault |
node_vmstat_pgmajfault |
node_vmstat_pgpgin |
node_vmstat_pgpgout |
node_vmstat_pswpin |
node_vmstat_pswpout |
node_xfs_allocation_btree_compares_total |
node_xfs_allocation_btree_lookups_total |
node_xfs_allocation_btree_records_deleted_total |
node_xfs_allocation_btree_records_inserted_total |
node_xfs_block_map_btree_compares_total |
node_xfs_block_map_btree_lookups_total |
node_xfs_block_map_btree_records_deleted_total |
node_xfs_block_map_btree_records_inserted_total |
node_xfs_block_mapping_extent_list_compares_total |
node_xfs_block_mapping_extent_list_deletions_total |
node_xfs_block_mapping_extent_list_insertions_total |
node_xfs_block_mapping_extent_list_lookups_total |
node_xfs_block_mapping_reads_total |
node_xfs_block_mapping_unmaps_total |
node_xfs_block_mapping_writes_total |
node_xfs_directory_operation_create_total |
node_xfs_directory_operation_getdents_total |
node_xfs_directory_operation_lookup_total |
node_xfs_directory_operation_remove_total |
node_xfs_extent_allocation_blocks_allocated_total |
node_xfs_extent_allocation_blocks_freed_total |
node_xfs_extent_allocation_extents_allocated_total |
node_xfs_extent_allocation_extents_freed_total |
node_xfs_read_calls_total |
node_xfs_vnode_active_total |
node_xfs_vnode_allocate_total |
node_xfs_vnode_get_total |
node_xfs_vnode_hold_total |
node_xfs_vnode_reclaim_total |
node_xfs_vnode_release_total |
node_xfs_vnode_remove_total |
node_xfs_write_calls_total |
pg:all:active_backends |
pg:all:age |
pg:all:backends |
pg:all:buf_alloc |
pg:all:buf_flush |
pg:all:commits |
pg:all:commits_realtime |
pg:all:ixact_backends |
pg:all:lag_bytes |
pg:all:lag_seconds |
pg:all:qps_realtime |
pg:all:rollbacks |
pg:all:rollbacks_realtime |
pg:all:sessions |
pg:all:tps_realtime |
pg:all:tup_deleted |
pg:all:tup_inserted |
pg:all:tup_modified |
pg:all:tup_selected |
pg:all:tup_touched |
pg:all:tup_updated |
pg:all:wal_rate |
pg:all:xacts |
pg:all:xacts_avg30m |
pg:all:xacts_mu |
pg:all:xacts_realtime |
pg:all:xacts_sigma |
pg:cls:active_backends |
pg:cls:age |
pg:cls:backends |
pg:cls:buf_alloc |
pg:cls:buf_flush |
pg:cls:ckpt_1h |
pg:cls:commits |
pg:cls:commits_realtime |
pg:cls:ixact_backends |
pg:cls:lag_bytes |
pg:cls:lag_seconds |
pg:cls:leader |
pg:cls:load0 |
pg:cls:load1 |
pg:cls:load15 |
pg:cls:load5 |
pg:cls:lock_count |
pg:cls:locks |
pg:cls:primarys |
pg:cls:qps_realtime |
pg:cls:replicas |
pg:cls:rlock |
pg:cls:rollbacks |
pg:cls:rollbacks_realtime |
pg:cls:saturation0 |
pg:cls:saturation1 |
pg:cls:saturation15 |
pg:cls:saturation5 |
pg:cls:sessions |
pg:cls:size |
pg:cls:synchronous |
pg:cls:temp_bytes |
pg:cls:temp_files |
pg:cls:timeline |
pg:cls:tps_realtime |
pg:cls:tup_deleted |
pg:cls:tup_inserted |
pg:cls:tup_modified |
pg:cls:tup_selected |
pg:cls:tup_touched |
pg:cls:tup_updated |
pg:cls:wal_rate |
pg:cls:wlock |
pg:cls:xacts |
pg:cls:xacts_avg30m |
pg:cls:xacts_mu |
pg:cls:xacts_realtime |
pg:cls:xacts_sigma |
pg:cls:xlock |
pg:db:age_deriv_1h |
pg:db:age_exhaust |
pg:db:backends |
pg:db:blks_access_1m |
pg:db:blks_hit_1m |
pg:db:blks_read_1m |
pg:db:buffer_hit_rate |
pg:db:commits |
pg:db:commits_realtime |
pg:db:io_time_usage |
pg:db:lock_count |
pg:db:locks |
pg:db:pool_current_conn |
pg:db:pool_disabled |
pg:db:pool_max_conn |
pg:db:pool_paused |
pg:db:pool_reserve_size |
pg:db:pool_size |
pg:db:qps_realtime |
pg:db:read_time_usage |
pg:db:rlock |
pg:db:rollbacks |
pg:db:rollbacks_realtime |
pg:db:sessions |
pg:db:temp_bytes |
pg:db:temp_files |
pg:db:tps_realtime |
pg:db:tup_deleted |
pg:db:tup_inserted |
pg:db:tup_modified |
pg:db:tup_selected |
pg:db:tup_touched |
pg:db:tup_updated |
pg:db:wlock |
pg:db:write_time_usage |
pg:db:xacts |
pg:db:xacts_avg30m |
pg:db:xacts_mu |
pg:db:xacts_realtime |
pg:db:xacts_sigma |
pg:db:xlock |
pg:ins:active_backends |
pg:ins:age |
pg:ins:backends |
pg:ins:buf_alloc |
pg:ins:buf_flush |
pg:ins:buf_flush_backend |
pg:ins:buf_flush_checkpoint |
pg:ins:checkpoint_lsn |
pg:ins:ckpt_req |
pg:ins:ckpt_timed |
pg:ins:commits |
pg:ins:commits_realtime |
pg:ins:free_clients |
pg:ins:free_servers |
pg:ins:hit_rate |
pg:ins:ixact_backends |
pg:ins:lag_bytes |
pg:ins:lag_seconds |
pg:ins:last_ckpt |
pg:ins:load0 |
pg:ins:load1 |
pg:ins:load15 |
pg:ins:load5 |
pg:ins:lock_count |
pg:ins:locks |
pg:ins:login_clients |
pg:ins:pool_databases |
pg:ins:pool_users |
pg:ins:pools |
pg:ins:qps_realtime |
pg:ins:query_rt |
pg:ins:query_rt_avg30m |
pg:ins:query_rt_mu |
pg:ins:query_rt_sigma |
pg:ins:query_time_rate15m |
pg:ins:query_time_rate1m |
pg:ins:query_time_rate5m |
pg:ins:recv_init_lsn |
pg:ins:recv_init_tli |
pg:ins:recv_last_lsn |
pg:ins:recv_last_tli |
pg:ins:redo_lsn |
pg:ins:rlock |
pg:ins:rollbacks |
pg:ins:rollbacks_realtime |
pg:ins:saturation0 |
pg:ins:saturation1 |
pg:ins:saturation15 |
pg:ins:saturation5 |
pg:ins:sessions |
pg:ins:slot_retained_bytes |
pg:ins:temp_bytes |
pg:ins:temp_files |
pg:ins:tps_realtime |
pg:ins:tup_deleted |
pg:ins:tup_inserted |
pg:ins:tup_modified |
pg:ins:tup_selected |
pg:ins:tup_touched |
pg:ins:tup_updated |
pg:ins:used_clients |
pg:ins:wal_rate |
pg:ins:wlock |
pg:ins:xact_rt |
pg:ins:xact_rt_avg30m |
pg:ins:xact_rt_mu |
pg:ins:xact_rt_sigma |
pg:ins:xact_time_rate15m |
pg:ins:xact_time_rate1m |
pg:ins:xact_time_rate5m |
pg:ins:xacts |
pg:ins:xacts_avg30m |
pg:ins:xacts_mu |
pg:ins:xacts_realtime |
pg:ins:xacts_sigma |
pg:ins:xlock |
pg:query:call |
pg:query:rt |
pg:svc:active_backends |
pg:svc:backends |
pg:svc:buf_alloc |
pg:svc:buf_flush |
pg:svc:commits |
pg:svc:commits_realtime |
pg:svc:ixact_backends |
pg:svc:load0 |
pg:svc:load1 |
pg:svc:load15 |
pg:svc:load5 |
pg:svc:lock_count |
pg:svc:locks |
pg:svc:qps_realtime |
pg:svc:query_rt |
pg:svc:query_rt_avg30m |
pg:svc:query_rt_mu |
pg:svc:query_rt_sigma |
pg:svc:rlock |
pg:svc:rollbacks |
pg:svc:rollbacks_realtime |
pg:svc:sessions |
pg:svc:temp_bytes |
pg:svc:temp_files |
pg:svc:tps_realtime |
pg:svc:tup_deleted |
pg:svc:tup_inserted |
pg:svc:tup_modified |
pg:svc:tup_selected |
pg:svc:tup_touched |
pg:svc:tup_updated |
pg:svc:wlock |
pg:svc:xact_rt |
pg:svc:xact_rt_avg30m |
pg:svc:xact_rt_mu |
pg:svc:xact_rt_sigma |
pg:svc:xacts |
pg:svc:xacts_avg30m |
pg:svc:xacts_mu |
pg:svc:xacts_realtime |
pg:svc:xacts_sigma |
pg:svc:xlock |
pg_activity_count |
pg_activity_max_conn_duration |
pg_activity_max_duration |
pg_activity_max_tx_duration |
pg_backend_count |
pg_backup_time |
pg_bgwriter_buffers_alloc |
pg_bgwriter_buffers_backend |
pg_bgwriter_buffers_backend_fsync |
pg_bgwriter_buffers_checkpoint |
pg_bgwriter_buffers_clean |
pg_bgwriter_checkpoint_sync_time |
pg_bgwriter_checkpoint_write_time |
pg_bgwriter_checkpoints_req |
pg_bgwriter_checkpoints_timed |
pg_bgwriter_maxwritten_clean |
pg_bgwriter_stats_reset |
pg_boot_time |
pg_checkpoint_checkpoint_lsn |
pg_checkpoint_elapse |
pg_checkpoint_full_page_writes |
pg_checkpoint_newest_commit_ts_xid |
pg_checkpoint_next_multi_offset |
pg_checkpoint_next_multixact_id |
pg_checkpoint_next_oid |
pg_checkpoint_next_xid |
pg_checkpoint_next_xid_epoch |
pg_checkpoint_oldest_active_xid |
pg_checkpoint_oldest_commit_ts_xid |
pg_checkpoint_oldest_multi_dbid |
pg_checkpoint_oldest_multi_xid |
pg_checkpoint_oldest_xid |
pg_checkpoint_oldest_xid_dbid |
pg_checkpoint_prev_tli |
pg_checkpoint_redo_lsn |
pg_checkpoint_time |
pg_checkpoint_tli |
pg_class_relage |
pg_class_relpages |
pg_class_relsize |
pg_class_reltuples |
pg_conf_reload_time |
pg_database_age |
pg_database_allow_conn |
pg_database_conn_limit |
pg_database_frozen_xid |
pg_database_is_template |
pg_db_blk_read_time |
pg_db_blk_write_time |
pg_db_blks_access |
pg_db_blks_hit |
pg_db_blks_read |
pg_db_checksum_failures |
pg_db_checksum_last_failure |
pg_db_confl_bufferpin |
pg_db_confl_deadlock |
pg_db_confl_lock |
pg_db_confl_snapshot |
pg_db_confl_tablespace |
pg_db_conflicts |
pg_db_deadlocks |
pg_db_numbackends |
pg_db_stats_reset |
pg_db_temp_bytes |
pg_db_temp_files |
pg_db_tup_deleted |
pg_db_tup_fetched |
pg_db_tup_inserted |
pg_db_tup_modified |
pg_db_tup_returned |
pg_db_tup_updated |
pg_db_xact_commit |
pg_db_xact_rollback |
pg_db_xact_total |
pg_downstream_count |
pg_exporter_last_scrape_time |
pg_exporter_query_cache_ttl |
pg_exporter_query_scrape_duration |
pg_exporter_query_scrape_error_count |
pg_exporter_query_scrape_hit_count |
pg_exporter_query_scrape_metric_count |
pg_exporter_query_scrape_total_count |
pg_exporter_scrape_duration |
pg_exporter_scrape_error_count |
pg_exporter_scrape_total_count |
pg_exporter_server_scrape_duration |
pg_exporter_server_scrape_total_count |
pg_exporter_server_scrape_total_seconds |
pg_exporter_up |
pg_exporter_uptime |
pg_flush_lsn |
pg_func_calls |
pg_func_self_time |
pg_func_total_time |
pg_in_recovery |
pg_index_bloat_ratio |
pg_index_bloat_size |
pg_index_idx_blks_hit |
pg_index_idx_blks_read |
pg_index_idx_scan |
pg_index_idx_tup_fetch |
pg_index_idx_tup_read |
pg_insert_lsn |
pg_is_in_backup |
pg_is_in_recovery |
pg_is_primary |
pg_is_replica |
pg_is_wal_replay_paused |
pg_lag |
pg_last_replay_time |
pg_lock_count |
pg_lsn |
pg_meta_info |
pg_query_blk_io_time |
pg_query_calls |
pg_query_max_time |
pg_query_mean_time |
pg_query_min_time |
pg_query_rows |
pg_query_stddev_time |
pg_query_total_time |
pg_query_wal_bytes |
pg_receive_lsn |
pg_replay_lsn |
pg_setting_block_size |
pg_setting_data_checksums |
pg_setting_max_connections |
pg_setting_max_locks_per_transaction |
pg_setting_max_prepared_transactions |
pg_setting_max_replication_slots |
pg_setting_max_wal_senders |
pg_setting_max_worker_processes |
pg_setting_wal_log_hints |
pg_shmem_allocated_size |
pg_shmem_offset |
pg_shmem_size |
pg_size_bytes |
pg_slru_blks_exists |
pg_slru_blks_hit |
pg_slru_blks_read |
pg_slru_blks_written |
pg_slru_blks_zeroed |
pg_slru_flushes |
pg_slru_stats_reset |
pg_slru_truncates |
pg_status |
pg_sync_standby_disabled |
pg_sync_standby_enabled |
pg_table_analyze_count |
pg_table_autoanalyze_count |
pg_table_autovacuum_count |
pg_table_bloat_ratio |
pg_table_bloat_size |
pg_table_heap_blks_hit |
pg_table_heap_blks_read |
pg_table_idx_blks_hit |
pg_table_idx_blks_read |
pg_table_idx_scan |
pg_table_idx_tup_fetch |
pg_table_last_analyze |
pg_table_last_autoanalyze |
pg_table_last_autovacuum |
pg_table_last_vacuum |
pg_table_n_dead_tup |
pg_table_n_live_tup |
pg_table_n_mod_since_analyze |
pg_table_n_tup_del |
pg_table_n_tup_hot_upd |
pg_table_n_tup_ins |
pg_table_n_tup_mod |
pg_table_n_tup_upd |
pg_table_seq_scan |
pg_table_seq_tup_read |
pg_table_size_bytes |
pg_table_size_indexsize |
pg_table_size_relsize |
pg_table_size_toastsize |
pg_table_tbl_scan |
pg_table_tidx_blks_hit |
pg_table_tidx_blks_read |
pg_table_toast_blks_hit |
pg_table_toast_blks_read |
pg_table_tup_read |
pg_table_vacuum_count |
pg_timeline |
pg_timestamp |
pg_up |
pg_uptime |
pg_version |
pg_write_lsn |
pg_xact_xmax |
pg_xact_xmin |
pg_xact_xnum |
pgbouncer_database_current_connections |
pgbouncer_database_disabled |
pgbouncer_database_max_connections |
pgbouncer_database_paused |
pgbouncer_database_pool_size |
pgbouncer_database_reserve_pool |
pgbouncer_exporter_last_scrape_time |
pgbouncer_exporter_query_cache_ttl |
pgbouncer_exporter_query_scrape_duration |
pgbouncer_exporter_query_scrape_error_count |
pgbouncer_exporter_query_scrape_hit_count |
pgbouncer_exporter_query_scrape_metric_count |
pgbouncer_exporter_query_scrape_total_count |
pgbouncer_exporter_scrape_duration |
pgbouncer_exporter_scrape_error_count |
pgbouncer_exporter_scrape_total_count |
pgbouncer_exporter_server_scrape_duration |
pgbouncer_exporter_server_scrape_total_count |
pgbouncer_exporter_server_scrape_total_seconds |
pgbouncer_exporter_up |
pgbouncer_exporter_uptime |
pgbouncer_in_recovery |
pgbouncer_list_items |
pgbouncer_pool_active_clients |
pgbouncer_pool_active_servers |
pgbouncer_pool_idle_servers |
pgbouncer_pool_login_servers |
pgbouncer_pool_maxwait |
pgbouncer_pool_maxwait_us |
pgbouncer_pool_tested_servers |
pgbouncer_pool_used_servers |
pgbouncer_pool_waiting_clients |
pgbouncer_stat_avg_query_count |
pgbouncer_stat_avg_query_time |
pgbouncer_stat_avg_recv |
pgbouncer_stat_avg_sent |
pgbouncer_stat_avg_wait_time |
pgbouncer_stat_avg_xact_count |
pgbouncer_stat_avg_xact_time |
pgbouncer_stat_total_query_count |
pgbouncer_stat_total_query_time |
pgbouncer_stat_total_received |
pgbouncer_stat_total_sent |
pgbouncer_stat_total_wait_time |
pgbouncer_stat_total_xact_count |
pgbouncer_stat_total_xact_time |
pgbouncer_up |
pgbouncer_version |
process_cpu_seconds_total |
process_max_fds |
process_open_fds |
process_resident_memory_bytes |
process_start_time_seconds |
process_virtual_memory_bytes |
process_virtual_memory_max_bytes |
promhttp_metric_handler_errors_total |
promhttp_metric_handler_requests_in_flight |
promhttp_metric_handler_requests_total |
scrape_duration_seconds |
scrape_samples_post_metric_relabeling |
scrape_samples_scraped |
scrape_series_added |
up |
4 - Derived Metrics
Here are the rules that define all of Pigsty's derived metrics.
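These derived metrics are ordinary Prometheus recording rules, loaded from /etc/prometheus/rules/ on the meta node. Names follow the pattern <prefix>:<level>:<metric>, where the level (e.g. cpu, dev, fs, ins, cls for nodes; db, ins, svc, cls, all for PostgreSQL) indicates how far the underlying series have been aggregated. Once recorded, they can be used in dashboards and alerts like any other metric; for example (the thresholds below are purely illustrative):
# instances whose 5-minute average CPU usage exceeds 80%
node:ins:cpu_usage_avg5m > 0.8
# clusters whose replication lag exceeds 8 seconds
pg:cls:lag_seconds > 8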
Derived Metrics for Node
---
- name: node-rules
rules:
#==============================================================#
# Aliveness #
#==============================================================#
# TODO: change this to your node exporter port
- record: node_exporter_up
expr: up{instance=~".*:9099"}
- record: node:uptime
expr: time() - node_boot_time_seconds{}
#==============================================================#
# CPU #
#==============================================================#
# cpu mode time ratio
- record: node:cpu:cpu_mode
expr: irate(node_cpu_seconds_total{}[1m])
- record: node:ins:cpu_mode
expr: sum without (cpu) (node:cpu:cpu_mode)
- record: node:cls:cpu_mode
expr: sum by (cls, mode) (node:ins:cpu_mode)
# cpu schedule time-slices
- record: node:cpu:sched_timeslices
expr: irate(node_schedstat_timeslices_total{}[1m])
- record: node:ins:sched_timeslices
expr: sum without (cpu) (node:cpu:sched_timeslices)
- record: node:cls:sched_timeslices
expr: sum by (cls) (node:ins:sched_timeslices)
# cpu count
- record: node:ins:cpu_count
expr: count without (cpu) (node:cpu:cpu_usage)
- record: node:cls:cpu_count
expr: sum by (cls) (node:ins:cpu_count)
# cpu usage
- record: node:cpu:cpu_usage
expr: 1 - sum without (mode) (node:cpu:cpu_mode{mode="idle"})
- record: node:ins:cpu_usage
expr: sum without (cpu) (node:cpu:cpu_usage) / node:ins:cpu_count
- record: node:cls:cpu_usage
expr: sum by (cls) (node:ins:cpu_usage * node:ins:cpu_count) / sum by (cls) (node:ins:cpu_count)
# cpu usage avg5m
- record: node:cpu:cpu_usage_avg5m
expr: avg_over_time(node:cpu:cpu_usage[5m])
- record: node:ins:cpu_usage_avg5m
expr: avg_over_time(node:ins:cpu_usage[5m])
- record: node:cls:cpu_usage_avg5m
expr: avg_over_time(node:cls:cpu_usage[5m])
#==============================================================#
# Memory #
#==============================================================#
# mem usage
- record: node:ins:mem_app
expr: node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes - node_memory_Slab_bytes - node_memory_PageTables_bytes - node_memory_SwapCached_bytes
- record: node:ins:mem_free
expr: node_memory_MemFree_bytes{} + node_memory_Cached_bytes{}
- record: node:ins:mem_usage
expr: node:ins:mem_app / node_memory_MemTotal_bytes
- record: node:cls:mem_usage
expr: sum by (cls) (node:ins:mem_app) / sum by (cls) (node_memory_MemTotal_bytes)
- record: node:ins:swap_usage
expr: 1 - node_memory_SwapFree_bytes{} / node_memory_SwapTotal_bytes{}
#==============================================================#
# Disk #
#==============================================================#
# disk read iops
- record: node:dev:disk_read_iops
expr: irate(node_disk_reads_completed_total{device=~"[a-zA-Z-_]+"}[1m])
- record: node:ins:disk_read_iops
expr: sum without (device) (node:dev:disk_read_iops)
- record: node:cls:disk_read_iops
expr: sum by (cls) (node:ins:disk_read_iops)
# disk write iops
- record: node:dev:disk_write_iops
expr: irate(node_disk_writes_completed_total{device=~"[a-zA-Z-_]+"}[1m])
- record: node:ins:disk_write_iops
expr: sum without (device) (node:dev:disk_write_iops)
- record: node:cls:disk_write_iops
expr: sum by (cls) (node:ins:disk_write_iops)
# disk iops
- record: node:dev:disk_iops
expr: node:dev:disk_read_iops + node:dev:disk_write_iops
- record: node:ins:disk_iops
expr: node:ins:disk_read_iops + node:ins:disk_write_iops
- record: node:cls:disk_iops
expr: node:cls:disk_read_iops + node:cls:disk_write_iops
# read bandwidth (rate1m)
- record: node:dev:disk_read_rate
expr: rate(node_disk_read_bytes_total{device=~"[a-zA-Z-_]+"}[1m])
- record: node:ins:disk_read_rate
expr: sum without (device) (node:dev:disk_read_rate)
- record: node:cls:disk_read_rate
expr: sum by (cls) (node:ins:disk_read_rate)
# write bandwidth (rate1m)
- record: node:dev:disk_write_rate
expr: rate(node_disk_written_bytes_total{device=~"[a-zA-Z-_]+"}[1m])
- record: node:ins:disk_write_rate
expr: sum without (device) (node:dev:disk_write_rate)
- record: node:cls:disk_write_rate
expr: sum by (cls) (node:ins:disk_write_rate)
# io bandwidth (rate1m)
- record: node:dev:disk_io_rate
expr: node:dev:disk_read_rate + node:dev:disk_write_rate
- record: node:ins:disk_io_rate
expr: node:ins:disk_read_rate + node:ins:disk_write_rate
- record: node:cls:disk_io_rate
expr: node:cls:disk_read_rate + node:cls:disk_write_rate
# read/write total time
- record: node:dev:disk_read_time
expr: rate(node_disk_read_time_seconds_total{device=~"[a-zA-Z-_]+"}[1m])
- record: node:dev:disk_write_time
expr: rate(node_disk_write_time_seconds_total{device=~"[a-zA-Z-_]+"}[1m])
# read/write response time
- record: node:dev:disk_read_rt
expr: node:dev:disk_read_time / node:dev:disk_read_iops
- record: node:dev:disk_write_rt
expr: node:dev:disk_write_time / node:dev:disk_write_iops
- record: node:dev:disk_rt
expr: (node:dev:disk_read_time + node:dev:disk_write_time) / node:dev:disk_iops
#==============================================================#
# Network #
#==============================================================#
# transmit bandwidth (out)
- record: node:dev:network_tx
expr: irate(node_network_transmit_bytes_total{}[1m])
- record: node:ins:network_tx
expr: sum without (device) (node:dev:network_tx{device!~"lo|bond.*"})
- record: node:cls:network_tx
expr: sum by (cls) (node:ins:network_tx)
# receive bandwidth (in)
- record: node:dev:network_rx
expr: irate(node_network_receive_bytes_total{}[1m])
- record: node:ins:network_rx
expr: sum without (device) (node:dev:network_rx{device!~"lo|bond.*"})
- record: node:cls:network_rx
expr: sum by (cls) (node:ins:network_rx)
# io bandwidth
- record: node:dev:network_io_rate
expr: node:dev:network_tx + node:dev:network_rx
- record: node:ins:network_io
expr: node:ins:network_tx + node:ins:network_rx
- record: node:cls:network_io
expr: node:cls:network_tx + node:cls:network_rx
#==============================================================#
# Schedule #
#==============================================================#
# normalized load
- record: node:ins:stdload1
expr: node_load1 / node:ins:cpu_count
- record: node:ins:stdload5
expr: node_load5 / node:ins:cpu_count
- record: node:ins:stdload15
expr: node_load15 / node:ins:cpu_count
# process
- record: node:ins:forks
expr: irate(node_forks_total[1m])
# interrupt & context switch
- record: node:ins:intrrupt
expr: irate(node_intr_total[1m])
- record: node:ins:ctx_switch
expr: irate(node_context_switches_total{}[1m])
#==============================================================#
# VM #
#==============================================================#
- record: node:ins:pagefault
expr: irate(node_vmstat_pgfault[1m])
- record: node:ins:pagein
expr: irate(node_vmstat_pgpgin[1m])
- record: node:ins:pageout
expr: irate(node_vmstat_pgpgout[1m])
- record: node:ins:swapin
expr: irate(node_vmstat_pswpin[1m])
- record: node:ins:swapout
expr: irate(node_vmstat_pswpout[1m])
#==============================================================#
# FS #
#==============================================================#
# filesystem space usage
- record: node:fs:free_bytes
expr: max without(device, fstype) (node_filesystem_free_bytes{fstype!~"(n|root|tmp)fs.*"})
- record: node:fs:avail_bytes
expr: max without(device, fstype) (node_filesystem_avail_bytes{fstype!~"(n|root|tmp)fs.*"})
- record: node:fs:size_bytes
expr: max without(device, fstype) (node_filesystem_size_bytes{fstype!~"(n|root|tmp)fs.*"})
- record: node:fs:space_usage
expr: 1 - (node:fs:avail_bytes{} / node:fs:size_bytes{})
- record: node:fs:free_inode
expr: max without(device, fstype) (node_filesystem_files_free{fstype!~"(n|root|tmp)fs.*"})
- record: node:fs:total_inode
expr: max without(device, fstype) (node_filesystem_files{fstype!~"(n|root|tmp)fs.*"})
# space delta and prediction
- record: node:fs:space_deriv_1h
expr: 0 - deriv(node_filesystem_avail_bytes{}[1h])
- record: node:fs:space_exhaust
expr: (node_filesystem_avail_bytes{} / node:fs:space_deriv_1h{}) > 0
# fs inode usage
- record: node:fs:inode_usage
expr: 1 - (node:fs:free_inode / node:fs:total_inode)
# file descriptor usage
- record: node:ins:fd_usage
expr: node_filefd_allocated / node_filefd_maximum
#==============================================================#
# TCP #
#==============================================================#
# tcp segments (rate1m)
- record: node:ins:tcp_insegs
expr: rate(node_netstat_Tcp_InSegs{}[1m])
- record: node:ins:tcp_outsegs
expr: rate(node_netstat_Tcp_OutSegs{}[1m])
- record: node:ins:tcp_retranssegs
expr: rate(node_netstat_Tcp_RetransSegs{}[1m])
- record: node:ins:tcp_segs
expr: node:ins:tcp_insegs + node:ins:tcp_outsegs
# retransmit
- record: node:ins:tcp_retrans_rate
expr: node:ins:tcp_retranssegs / node:ins:tcp_outsegs
# overflow
- record: node:ins:tcp_overflow_rate
expr: rate(node_netstat_TcpExt_ListenOverflows[1m])
#==============================================================#
# Netstat #
#==============================================================#
# tcp open (rate1m)
- record: node:ins:tcp_passive_opens
expr: rate(node_netstat_Tcp_PassiveOpens[1m])
- record: node:ins:tcp_active_opens
expr: rate(node_netstat_Tcp_ActiveOpens[1m])
# tcp close
- record: node:ins:tcp_attempt_fails
expr: rate(node_netstat_Tcp_AttemptFails[1m])
- record: node:ins:tcp_estab_resets
expr: rate(node_netstat_Tcp_EstabResets[1m])
# tcp drop
- record: node:ins:tcp_overflow
expr: rate(node_netstat_TcpExt_ListenOverflows[1m])
- record: node:ins:tcp_dropped
expr: rate(node_netstat_TcpExt_ListenDrops[1m])
#==============================================================#
# NTP #
#==============================================================#
- record: node:cls:ntp_offset_range
expr: max by (cls)(node_ntp_offset_seconds) - min by (cls)(node_ntp_offset_seconds)
...
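Since each level is defined as an aggregation of the level below it, the same derived metric can be drilled down from cluster to device granularity with plain label matchers. A usage sketch, where the label values are sandbox examples:
node:cls:disk_io_rate{cls="pg-meta"}                   # whole cluster
node:ins:disk_io_rate{instance="10.10.10.10:9099"}     # one node (node_exporter on port 9099, as noted above)
node:dev:disk_io_rate{cls="pg-meta", device="sda"}     # one block device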
Derived Metrics for Postgres and Pgbouncer
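Besides the plain rates, the Postgres and Pgbouncer rules also record 30-minute moving averages (*_avg30m), means (*_mu), and standard deviations (*_sigma) for key throughput and latency metrics, which makes it straightforward to sketch simple anomaly bands on top of them (the 3-sigma width below is an arbitrary illustration):
# current cluster TPS compared against a 3-sigma band around its recent mean
pg:cls:tps{cls="pg-meta"}
pg:cls:tps_mu{cls="pg-meta"} + 3 * pg:cls:tps_sigma{cls="pg-meta"}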
---
#==============================================================#
# File : pgsql.yml
# Ctime : 2020-04-22
# Mtime : 2020-12-03
# Desc : Record and alert rules for postgres
# Path : /etc/prometheus/rules/pgsql.yml
# Copyright (C) 2018-2021 Ruohang Feng
#==============================================================#
groups:
################################################################
# PgSQL Rules #
################################################################
- name: pgsql-rules
rules:
#==============================================================#
# Aliveness #
#==============================================================#
# TODO: change these to your pg_exporter & pgbouncer_exporter port
- record: pg_exporter_up
expr: up{instance=~".*:9185"}
- record: pgbouncer_exporter_up
expr: up{instance=~".*:9127"}
#==============================================================#
# Identity #
#==============================================================#
- record: pg_is_primary
expr: 1 - pg_in_recovery
- record: pg_is_replica
expr: pg_in_recovery
- record: pg_status
expr: (pg_up{} * 2) + (1 - pg_in_recovery{})
# encoded: 0:replica[DOWN] 1:primary[DOWN] 2:replica 3:primary
#==============================================================#
# Age #
#==============================================================#
# age
- record: pg:ins:age
expr: max without (datname) (pg_database_age{datname!~"template[0-9]"})
- record: pg:cls:age
expr: max by (cls) (pg:ins:age)
- record: pg:all:age
expr: max(pg:cls:age)
# age derive and prediction
- record: pg:db:age_deriv_1h
expr: deriv(pg_database_age{}[1h])
- record: pg:db:age_exhaust
expr: (2147483648 - pg_database_age{}) / pg:db:age_deriv_1h
#==============================================================#
# Sessions #
#==============================================================#
# session count (by state)
- record: pg:db:sessions
expr: pg_activity_count
- record: pg:ins:sessions
expr: sum without (datname) (pg:db:sessions)
- record: pg:svc:sessions
expr: sum by (cls, role, state) (pg:ins:sessions)
- record: pg:cls:sessions
expr: sum by (cls, state) (pg:ins:sessions)
- record: pg:all:sessions
expr: sum by (state) (pg:cls:sessions)
# backends
- record: pg:db:backends
expr: pg_db_numbackends
- record: pg:ins:backends
expr: sum without (datname) (pg_db_numbackends)
- record: pg:svc:backends
expr: sum by (cls, role) (pg:ins:backends)
- record: pg:cls:backends
expr: sum by (cls) (pg:ins:backends)
- record: pg:all:backends
expr: sum(pg:cls:backends)
# active backends
- record: pg:ins:active_backends
expr: pg:ins:sessions{state="active"}
- record: pg:svc:active_backends
expr: sum by (cls, role) (pg:ins:active_backends)
- record: pg:cls:active_backends
expr: sum by (cls) (pg:ins:active_backends)
- record: pg:all:active_backends
expr: sum(pg:cls:active_backends)
# idle in xact backends (including abort)
- record: pg:ins:ixact_backends
expr: pg:ins:sessions{state=~"idle in.*"}
- record: pg:svc:ixact_backends
expr: sum by (cls, role) (pg:ins:ixact_backends)
- record: pg:cls:ixact_backends
expr: sum by (cls) (pg:ins:ixact_backends)
- record: pg:all:ixact_backends
expr: sum(pg:cls:ixact_backends)
#==============================================================#
# Servers (Pgbouncer) #
#==============================================================#
# active servers
- record: pg:pool:active_servers
expr: pgbouncer_pool_active_servers{datname!="pgbouncer"}
- record: pg:db:active_servers
expr: sum without(user) (pg:pool:active_servers)
- record: pg:ins:active_servers
expr: sum without(user, datname) (pg:pool:active_servers)
- record: pg:svc:active_servers
expr: sum by (cls, role) (pg:ins:active_servers)
- record: pg:cls:active_servers
expr: sum by (cls) (pg:ins:active_servers)
- record: pg:all:active_servers
expr: sum(pg:cls:active_servers)
# idle servers
- record: pg:pool:idle_servers
expr: pgbouncer_pool_idle_servers{datname!="pgbouncer"}
- record: pg:db:idle_servers
expr: sum without(user) (pg:pool:idle_servers)
- record: pg:ins:idle_servers
expr: sum without(user, datname) (pg:pool:idle_servers)
- record: pg:svc:idle_servers
expr: sum by (cls, role) (pg:ins:idle_servers)
- record: pg:cls:idle_servers
expr: sum by (cls) (pg:ins:idle_servers)
- record: pg:all:idle_servers
expr: sum(pg:cls:idle_servers)
# used servers
- record: pg:pool:used_servers
expr: pgbouncer_pool_used_servers{datname!="pgbouncer"}
- record: pg:db:used_servers
expr: sum without(user) (pg:pool:used_servers)
- record: pg:ins:used_servers
expr: sum without(user, datname) (pg:pool:used_servers)
- record: pg:svc:used_servers
expr: sum by (cls, role) (pg:ins:used_servers)
- record: pg:cls:used_servers
expr: sum by (cls) (pg:ins:used_servers)
- record: pg:all:used_servers
expr: sum(pg:cls:used_servers)
# tested servers
- record: pg:pool:tested_servers
expr: pgbouncer_pool_tested_servers{datname!="pgbouncer"}
- record: pg:db:tested_servers
expr: sum without(user) (pg:pool:tested_servers)
- record: pg:ins:tested_servers
expr: sum without(user, datname) (pg:pool:tested_servers)
- record: pg:svc:tested_servers
expr: sum by (cls, role) (pg:ins:tested_servers)
- record: pg:cls:tested_servers
expr: sum by (cls) (pg:ins:tested_servers)
- record: pg:all:tested_servers
expr: sum(pg:cls:tested_servers)
# login servers
- record: pg:pool:login_servers
expr: pgbouncer_pool_login_servers{datname!="pgbouncer"}
- record: pg:db:login_servers
expr: sum without(user) (pg:pool:login_servers)
- record: pg:ins:login_servers
expr: sum without(user, datname) (pg:pool:login_servers)
- record: pg:svc:login_servers
expr: sum by (cls, role) (pg:ins:login_servers)
- record: pg:cls:login_servers
expr: sum by (cls) (pg:ins:login_servers)
- record: pg:all:login_servers
expr: sum(pg:cls:login_servers)
#==============================================================#
# Clients (Pgbouncer) #
#==============================================================#
# active clients
- record: pg:pool:active_clients
expr: pgbouncer_pool_active_clients{datname!="pgbouncer"}
- record: pg:db:active_clients
expr: sum without(user) (pg:pool:active_clients)
- record: pg:ins:active_clients
expr: sum without(user, datname) (pg:pool:active_clients)
- record: pg:svc:active_clients
expr: sum by (cls, role) (pg:ins:active_clients)
- record: pg:cls:active_clients
expr: sum by (cls) (pg:ins:active_clients)
- record: pg:all:active_clients
expr: sum(pg:cls:active_clients)
# waiting clients
- record: pg:pool:waiting_clients
expr: pgbouncer_pool_waiting_clients{datname!="pgbouncer"}
- record: pg:db:waiting_clients
expr: sum without(user) (pg:pool:waiting_clients)
- record: pg:ins:waiting_clients
expr: sum without(user, datname) (pg:pool:waiting_clients)
- record: pg:svc:waiting_clients
expr: sum by (cls, role) (pg:ins:waiting_clients)
- record: pg:cls:waiting_clients
expr: sum by (cls) (pg:ins:waiting_clients)
- record: pg:all:waiting_clients
expr: sum(pg:cls:waiting_clients)
#==============================================================#
# Transactions #
#==============================================================#
# commits (realtime)
- record: pg:db:commits_realtime
expr: irate(pg_db_xact_commit{}[1m])
- record: pg:ins:commits_realtime
expr: sum without (datname) (pg:db:commits_realtime)
- record: pg:svc:commits_realtime
expr: sum by (cls, role) (pg:ins:commits_realtime)
- record: pg:cls:commits_realtime
expr: sum by (cls) (pg:ins:commits_realtime)
- record: pg:all:commits_realtime
expr: sum(pg:cls:commits_realtime)
# commits (rate1m)
- record: pg:db:commits
expr: rate(pg_db_xact_commit{}[1m])
- record: pg:ins:commits
expr: sum without (datname) (pg:db:commits)
- record: pg:svc:commits
expr: sum by (cls, role) (pg:ins:commits)
- record: pg:cls:commits
expr: sum by (cls) (pg:ins:commits)
- record: pg:all:commits
expr: sum(pg:cls:commits)
# rollbacks realtime
- record: pg:db:rollbacks_realtime
expr: irate(pg_db_xact_rollback{}[1m])
- record: pg:ins:rollbacks_realtime
expr: sum without (datname) (pg:db:rollbacks_realtime)
- record: pg:svc:rollbacks_realtime
expr: sum by (cls, role) (pg:ins:rollbacks_realtime)
- record: pg:cls:rollbacks_realtime
expr: sum by (cls) (pg:ins:rollbacks_realtime)
- record: pg:all:rollbacks_realtime
expr: sum(pg:cls:rollbacks_realtime)
# rollbacks
- record: pg:db:rollbacks
expr: rate(pg_db_xact_rollback{}[1m])
- record: pg:ins:rollbacks
expr: sum without (datname) (pg:db:rollbacks)
- record: pg:svc:rollbacks
expr: sum by (cls, role) (pg:ins:rollbacks)
- record: pg:cls:rollbacks
expr: sum by (cls) (pg:ins:rollbacks)
- record: pg:all:rollbacks
expr: sum(pg:cls:rollbacks)
# xacts (realtime)
- record: pg:db:xacts_realtime
expr: irate(pg_db_xact_commit{}[1m])
- record: pg:ins:xacts_realtime
expr: sum without (datname) (pg:db:xacts_realtime)
- record: pg:svc:xacts_realtime
expr: sum by (cls, role) (pg:ins:xacts_realtime)
- record: pg:cls:xacts_realtime
expr: sum by (cls) (pg:ins:xacts_realtime)
- record: pg:all:xacts_realtime
expr: sum(pg:cls:xacts_realtime)
# xacts (rate1m)
- record: pg:db:xacts
expr: rate(pg_db_xact_commit{}[1m])
- record: pg:ins:xacts
expr: sum without (datname) (pg:db:xacts)
- record: pg:svc:xacts
expr: sum by (cls, role) (pg:ins:xacts)
- record: pg:cls:xacts
expr: sum by (cls) (pg:ins:xacts)
- record: pg:all:xacts
expr: sum(pg:cls:xacts)
# xacts avg30m
- record: pg:db:xacts_avg30m
expr: avg_over_time(pg:db:xacts[30m])
- record: pg:ins:xacts_avg30m
expr: avg_over_time(pg:ins:xacts[30m])
- record: pg:svc:xacts_avg30m
expr: avg_over_time(pg:svc:xacts[30m])
- record: pg:cls:xacts_avg30m
expr: avg_over_time(pg:cls:xacts[30m])
- record: pg:all:xacts_avg30m
expr: avg_over_time(pg:all:xacts[30m])
# xacts µ
- record: pg:db:xacts_mu
expr: avg_over_time(pg:db:xacts_avg30m[30m])
- record: pg:ins:xacts_mu
expr: avg_over_time(pg:ins:xacts_avg30m[30m])
- record: pg:svc:xacts_mu
expr: avg_over_time(pg:svc:xacts_avg30m[30m])
- record: pg:cls:xacts_mu
expr: avg_over_time(pg:cls:xacts_avg30m[30m])
- record: pg:all:xacts_mu
expr: avg_over_time(pg:all:xacts_avg30m[30m])
# xacts σ: sigma
- record: pg:db:xacts_sigma
expr: stddev_over_time(pg:db:xacts[30m])
- record: pg:ins:xacts_sigma
expr: stddev_over_time(pg:ins:xacts[30m])
- record: pg:svc:xacts_sigma
expr: stddev_over_time(pg:svc:xacts[30m])
- record: pg:cls:xacts_sigma
expr: stddev_over_time(pg:cls:xacts[30m])
- record: pg:all:xacts_sigma
expr: stddev_over_time(pg:all:xacts[30m])
#==============================================================#
# TPS (Pgbouncer) #
#==============================================================#
# TPS realtime (irate1m)
- record: pg:db:tps_realtime
expr: irate(pgbouncer_stat_total_xact_count{}[1m])
- record: pg:ins:tps_realtime
expr: sum without(datname) (pg:db:tps_realtime{})
- record: pg:svc:tps_realtime
expr: sum by(cls, role) (pg:ins:tps_realtime{})
- record: pg:cls:tps_realtime
expr: sum by(cls) (pg:ins:tps_realtime{})
- record: pg:all:tps_realtime
expr: sum(pg:cls:tps_realtime{})
# TPS (rate1m)
- record: pg:db:tps
expr: pgbouncer_stat_avg_xact_count{datname!="pgbouncer"}
- record: pg:ins:tps
expr: sum without(datname) (pg:db:tps)
- record: pg:svc:tps
expr: sum by (cls, role) (pg:ins:tps)
- record: pg:cls:tps
expr: sum by(cls) (pg:ins:tps)
- record: pg:all:tps
expr: sum(pg:cls:tps)
# tps : avg30m
- record: pg:db:tps_avg30m
expr: avg_over_time(pg:db:tps[30m])
- record: pg:ins:tps_avg30m
expr: avg_over_time(pg:ins:tps[30m])
- record: pg:svc:tps_avg30m
expr: avg_over_time(pg:svc:tps[30m])
- record: pg:cls:tps_avg30m
expr: avg_over_time(pg:cls:tps[30m])
- record: pg:all:tps_avg30m
expr: avg_over_time(pg:all:tps[30m])
# tps µ
- record: pg:db:tps_mu
expr: avg_over_time(pg:db:tps_avg30m[30m])
- record: pg:ins:tps_mu
expr: avg_over_time(pg:ins:tps_avg30m[30m])
- record: pg:svc:tps_mu
expr: avg_over_time(pg:svc:tps_avg30m[30m])
- record: pg:cls:tps_mu
expr: avg_over_time(pg:cls:tps_avg30m[30m])
- record: pg:all:tps_mu
expr: avg_over_time(pg:all:tps_avg30m[30m])
# tps σ
- record: pg:db:tps_sigma
expr: stddev_over_time(pg:db:tps[30m])
- record: pg:ins:tps_sigma
expr: stddev_over_time(pg:ins:tps[30m])
- record: pg:svc:tps_sigma
expr: stddev_over_time(pg:svc:tps[30m])
- record: pg:cls:tps_sigma
expr: stddev_over_time(pg:cls:tps[30m])
- record: pg:all:tps_sigma
expr: stddev_over_time(pg:all:tps[30m])
# xact rt (rate1m)
- record: pg:db:xact_rt
expr: pgbouncer_stat_avg_xact_time{datname!="pgbouncer"} / 1000000
- record: pg:ins:xact_rt
expr: sum without(datname) (rate(pgbouncer_stat_total_xact_time[1m])) / sum without(datname) (rate(pgbouncer_stat_total_xact_count[1m])) / 1000000
- record: pg:svc:xact_rt
expr: sum by (cls, role) (rate(pgbouncer_stat_total_xact_time[1m])) / sum by (cls, role) (rate(pgbouncer_stat_total_xact_count[1m])) / 1000000
# xact_rt avg30m
- record: pg:db:xact_rt_avg30m
expr: avg_over_time(pg:db:xact_rt[30m])
- record: pg:ins:xact_rt_avg30m
expr: avg_over_time(pg:ins:xact_rt[30m])
- record: pg:svc:xact_rt_avg30m
expr: avg_over_time(pg:svc:xact_rt[30m])
# xact_rt µ
- record: pg:db:xact_rt_mu
expr: avg_over_time(pg:db:xact_rt_avg30m[30m])
- record: pg:ins:xact_rt_mu
expr: avg_over_time(pg:ins:xact_rt_avg30m[30m])
- record: pg:svc:xact_rt_mu
expr: avg_over_time(pg:svc:xact_rt_avg30m[30m])
# xact_rt σ: stddev30m
- record: pg:db:xact_rt_sigma
expr: stddev_over_time(pg:db:xact_rt[30m])
- record: pg:ins:xact_rt_sigma
expr: stddev_over_time(pg:ins:xact_rt[30m])
- record: pg:svc:xact_rt_sigma
expr: stddev_over_time(pg:svc:xact_rt[30m])
#==============================================================#
# QPS (Pgbouncer) #
#==============================================================#
# QPS realtime (irate1m)
- record: pg:db:qps_realtime
expr: irate(pgbouncer_stat_total_query_count{}[1m])
- record: pg:ins:qps_realtime
expr: sum without(datname) (pg:db:qps_realtime{})
- record: pg:svc:qps_realtime
expr: sum by(cls, role) (pg:ins:qps_realtime{})
- record: pg:cls:qps_realtime
expr: sum by(cls) (pg:ins:qps_realtime{})
- record: pg:all:qps_realtime
expr: sum(pg:cls:qps_realtime{})
# qps (rate1m)
- record: pg:db:qps
expr: pgbouncer_stat_avg_query_count{datname!="pgbouncer"}
- record: pg:ins:qps
expr: sum without(datname) (pg:db:qps)
- record: pg:svc:qps
expr: sum by (cls, role) (pg:ins:qps)
- record: pg:cls:qps
expr: sum by(cls) (pg:ins:qps)
- record: pg:all:qps
expr: sum(pg:cls:qps)
# qps avg30m
- record: pg:db:qps_avg30m
expr: avg_over_time(pg:db:qps[30m])
- record: pg:ins:qps_avg30m
expr: avg_over_time(pg:ins:qps[30m])
- record: pg:svc:qps_avg30m
expr: avg_over_time(pg:svc:qps[30m])
- record: pg:cls:qps_avg30m
expr: avg_over_time(pg:cls:qps[30m])
- record: pg:all:qps_avg30m
expr: avg_over_time(pg:all:qps[30m])
# qps µ
- record: pg:db:qps_mu
expr: avg_over_time(pg:db:qps_avg30m[30m])
- record: pg:ins:qps_mu
expr: avg_over_time(pg:ins:qps_avg30m[30m])
- record: pg:svc:qps_mu
expr: avg_over_time(pg:svc:qps_avg30m[30m])
- record: pg:cls:qps_mu
expr: avg_over_time(pg:cls:qps_avg30m[30m])
- record: pg:all:qps_mu
expr: avg_over_time(pg:all:qps_avg30m[30m])
# qps σ: stddev30m qps
- record: pg:db:qps_sigma
expr: stddev_over_time(pg:db:qps[30m])
- record: pg:ins:qps_sigma
expr: stddev_over_time(pg:ins:qps[30m])
- record: pg:svc:qps_sigma
expr: stddev_over_time(pg:svc:qps[30m])
- record: pg:cls:qps_sigma
expr: stddev_over_time(pg:cls:qps[30m])
- record: pg:all:qps_sigma
expr: stddev_over_time(pg:all:qps[30m])
# query rt (1m avg)
- record: pg:db:query_rt
expr: pgbouncer_stat_avg_query_time{datname!="pgbouncer"} / 1000000
- record: pg:ins:query_rt
expr: sum without(datname) (rate(pgbouncer_stat_total_query_time[1m])) / sum without(datname) (rate(pgbouncer_stat_total_query_count[1m])) / 1000000
- record: pg:svc:query_rt
expr: sum by (cls, role) (rate(pgbouncer_stat_total_query_time[1m])) / sum by (cls, role) (rate(pgbouncer_stat_total_query_count[1m])) / 1000000
# query_rt avg30m
- record: pg:db:query_rt_avg30m
expr: avg_over_time(pg:db:query_rt[30m])
- record: pg:ins:query_rt_avg30m
expr: avg_over_time(pg:ins:query_rt[30m])
- record: pg:svc:query_rt_avg30m
expr: avg_over_time(pg:svc:query_rt[30m])
# query_rt µ
- record: pg:db:query_rt_mu
expr: avg_over_time(pg:db:query_rt_avg30m[30m])
- record: pg:ins:query_rt_mu
expr: avg_over_time(pg:ins:query_rt_avg30m[30m])
- record: pg:svc:query_rt_mu
expr: avg_over_time(pg:svc:query_rt_avg30m[30m])
# query_rt σ: stddev30m
- record: pg:db:query_rt_sigma
expr: stddev_over_time(pg:db:query_rt[30m])
- record: pg:ins:query_rt_sigma
expr: stddev_over_time(pg:ins:query_rt[30m])
- record: pg:svc:query_rt_sigma
expr: stddev_over_time(pg:svc:query_rt[30m])
#==============================================================#
# PG Load #
#==============================================================#
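# pg load: seconds of transaction time consumed per wall-clock second,
# normalized by the node's CPU count (a database-side analogue of OS load1/5/15)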
# seconds spent on transactions in the last minute
- record: pg:ins:xact_time_rate1m
expr: sum without (datname) (rate(pgbouncer_stat_total_xact_time{}[1m])) / 1000000
- record: pg:ins:xact_time_rate5m
expr: sum without (datname) (rate(pgbouncer_stat_total_xact_time{}[5m])) / 1000000
- record: pg:ins:xact_time_rate15m
expr: sum without (datname) (rate(pgbouncer_stat_total_xact_time{}[15m])) / 1000000
# seconds spent on queries in the last minute
- record: pg:ins:query_time_rate1m
expr: sum without (datname) (rate(pgbouncer_stat_total_query_time{}[1m])) / 1000000
- record: pg:ins:query_time_rate5m
expr: sum without (datname) (rate(pgbouncer_stat_total_query_time{}[5m])) / 1000000
- record: pg:ins:query_time_rate15m
expr: sum without (datname) (rate(pgbouncer_stat_total_query_time{}[15m])) / 1000000
# instance level load
- record: pg:ins:load0
expr: sum without (datname) (irate(pgbouncer_stat_total_xact_time{}[1m])) / on (ip) group_left() node:ins:cpu_count / 1000000
- record: pg:ins:load1
expr: pg:ins:xact_time_rate1m / on (ip) group_left() node:ins:cpu_count
- record: pg:ins:load5
expr: pg:ins:xact_time_rate5m / on (ip) group_left() node:ins:cpu_count
- record: pg:ins:load15
expr: pg:ins:xact_time_rate15m / on (ip) group_left() node:ins:cpu_count
# service level load
- record: pg:svc:load0
expr: sum by (svc, cls, role) (irate(pgbouncer_stat_total_xact_time{}[1m])) / on (svc) group_left() sum by (svc) (node:ins:cpu_count{}) / 1000000
- record: pg:svc:load1
expr: sum by (svc, cls, role) (pg:ins:xact_time_rate1m) / on (svc) group_left() sum by (svc) (node:ins:cpu_count{}) / 1000000
- record: pg:svc:load5
expr: sum by (svc, cls, role) (pg:ins:xact_time_rate5m) / on (svc) group_left() sum by (svc) (node:ins:cpu_count{}) / 1000000
- record: pg:svc:load15
expr: sum by (svc, cls, role) (pg:ins:xact_time_rate15m) / on (svc) group_left() sum by (svc) (node:ins:cpu_count{}) / 1000000
# cluster level load
- record: pg:cls:load0
expr: sum by (cls) (irate(pgbouncer_stat_total_xact_time{}[1m])) / on (cls) node:cls:cpu_count{} / 1000000
- record: pg:cls:load1
expr: sum by (cls) (pg:ins:xact_time_rate1m) / on (cls) node:cls:cpu_count
- record: pg:cls:load5
expr: sum by (cls) (pg:ins:xact_time_rate5m) / on (cls) node:cls:cpu_count
- record: pg:cls:load15
expr: sum by (cls) (pg:ins:xact_time_rate15m) / on (cls) node:cls:cpu_count
#==============================================================#
# PG Saturation #
#==============================================================#
# max value of pg_load and cpu_usage
# instance level saturation
- record: pg:ins:saturation0
expr: pg:ins:load0 > node:ins:cpu_usage or node:ins:cpu_usage
- record: pg:ins:saturation1
expr: pg:ins:load1 > node:ins:cpu_usage or node:ins:cpu_usage
- record: pg:ins:saturation5
expr: pg:ins:load5 > node:ins:cpu_usage or node:ins:cpu_usage
- record: pg:ins:saturation15
expr: pg:ins:load15 > node:ins:cpu_usage or node:ins:cpu_usage
# cluster level saturation
- record: pg:cls:saturation0
expr: pg:cls:load0 > node:cls:cpu_usage or node:cls:cpu_usage
- record: pg:cls:saturation1
expr: pg:cls:load1 > node:cls:cpu_usage or node:cls:cpu_usage
- record: pg:cls:saturation5
expr: pg:cls:load5 > node:cls:cpu_usage or node:cls:cpu_usage
- record: pg:cls:saturation15
expr: pg:cls:load15 > node:cls:cpu_usage or node:cls:cpu_usage
#==============================================================#
# CRUD #
#==============================================================#
# rows touched
- record: pg:db:tup_touched
expr: irate(pg_db_tup_fetched{}[1m])
- record: pg:ins:tup_touched
expr: sum without(datname) (pg:db:tup_touched)
- record: pg:svc:tup_touched
expr: sum by (cls, role) (pg:ins:tup_touched)
- record: pg:cls:tup_touched
expr: sum by (cls) (pg:ins:tup_touched)
- record: pg:all:tup_touched
expr: sum(pg:cls:tup_touched)
# selected
- record: pg:db:tup_selected
expr: irate(pg_db_tup_returned{}[1m])
- record: pg:ins:tup_selected
expr: sum without(datname) (pg:db:tup_selected)
- record: pg:svc:tup_selected
expr: sum by (cls, role) (pg:ins:tup_selected)
- record: pg:cls:tup_selected
expr: sum by (cls) (pg:ins:tup_selected)
- record: pg:all:tup_selected
expr: sum(pg:cls:tup_selected)
# inserted
- record: pg:db:tup_inserted
expr: irate(pg_db_tup_inserted{}[1m])
- record: pg:ins:tup_inserted
expr: sum without(datname) (pg:db:tup_inserted)
- record: pg:svc:tup_inserted
expr: sum by (cls, role) (pg:ins:tup_inserted)
- record: pg:cls:tup_inserted
expr: sum by (cls) (pg:ins:tup_inserted{role="primary"})
- record: pg:all:tup_inserted
expr: sum(pg:cls:tup_inserted)
# updated
- record: pg:db:tup_updated
expr: irate(pg_db_tup_updated{}[1m])
- record: pg:ins:tup_updated
expr: sum without(datname) (pg:db:tup_updated)
- record: pg:svc:tup_updated
expr: sum by (cls, role) (pg:ins:tup_updated)
- record: pg:cls:tup_updated
expr: sum by (cls) (pg:ins:tup_updated{role="primary"})
- record: pg:all:tup_updated
expr: sum(pg:cls:tup_updated)
# deleted
- record: pg:db:tup_deleted
expr: irate(pg_db_tup_deleted{}[1m])
- record: pg:ins:tup_deleted
expr: sum without(datname) (pg:db:tup_deleted)
- record: pg:svc:tup_deleted
expr: sum by (cls, role) (pg:ins:tup_deleted)
- record: pg:cls:tup_deleted
expr: sum by (cls) (pg:ins:tup_deleted{role="primary"})
- record: pg:all:tup_deleted
expr: sum(pg:cls:tup_deleted)
# modified
- record: pg:db:tup_modified
expr: irate(pg_db_tup_modified{}[1m])
- record: pg:ins:tup_modified
expr: sum without(datname) (pg:db:tup_modified)
- record: pg:svc:tup_modified
expr: sum by (cls, role) (pg:ins:tup_modified)
- record: pg:cls:tup_modified
expr: sum by (cls) (pg:ins:tup_modified{role="primary"})
- record: pg:all:tup_modified
expr: sum(pg:cls:tup_modified)
#==============================================================#
# Object Access #
#==============================================================#
# table access
- record: pg:table:idx_scan
expr: rate(pg_table_idx_scan{}[1m])
- record: pg:table:seq_scan
expr: rate(pg_table_seq_scan{}[1m])
- record: pg:table:qps_realtime
expr: irate(pg_table_idx_scan{}[1m])
# index access
- record: pg:index:idx_scan
expr: rate(pg_index_idx_scan{}[1m])
- record: pg:index:qps_realtime
expr: irate(pg_index_idx_scan{}[1m])
# func access
- record: pg:func:call
expr: rate(pg_func_calls{}[1m])
- record: pg:func:rt
expr: rate(pg_func_total_time{}[1m]) / pg:func:call
# query access
- record: pg:query:call
expr: rate(pg_query_calls{}[1m])
- record: pg:query:rt
expr: rate(pg_query_total_time{}[1m]) / pg:query:call / 1000
#==============================================================#
# Blocks IO #
#==============================================================#
# blocks read/hit/access in 1min
- record: pg:db:blks_read_1m
expr: increase(pg_db_blks_read{}[1m])
- record: pg:db:blks_hit_1m
expr: increase(pg_db_blks_hit{}[1m])
- record: pg:db:blks_access_1m
expr: increase(pg_db_blks_access{}[1m])
# buffer hit rate (1m)
- record: pg:db:buffer_hit_rate
expr: pg:db:blks_hit_1m / pg:db:blks_access_1m
- record: pg:ins:hit_rate
expr: sum without(datname) (pg:db:blks_hit_1m) / sum without(datname) (pg:db:blks_access_1m)
# read/write time usage
- record: pg:db:read_time_usage
expr: rate(pg_db_blk_read_time[1m])
- record: pg:db:write_time_usage
expr: rate(pg_db_blk_write_time[1m])
- record: pg:db:io_time_usage
expr: pg:db:read_time_usage + pg:db:write_time_usage
#==============================================================#
# Traffic IO (Pgbouncer) #
#==============================================================#
# transmit bandwidth (sent, out)
- record: pg:db:tx
expr: irate(pgbouncer_stat_total_sent{datname!="pgbouncer"}[1m])
- record: pg:ins:tx
expr: sum without (user, datname) (pg:db:tx)
- record: pg:svc:tx
expr: sum by (cls, role) (pg:ins:tx)
- record: pg:cls:tx
expr: sum by (cls) (pg:ins:tx)
- record: pg:all:tx
expr: sum(pg:cls:tx)
# receive bandwidth (received, in)
- record: pg:db:rx
expr: irate(pgbouncer_stat_total_received{datname!="pgbouncer"}[1m])
- record: pg:ins:rx
expr: sum without (datname) (pg:db:rx)
- record: pg:svc:rx
expr: sum by (cls, role) (pg:ins:rx)
- record: pg:cls:rx
expr: sum by (cls) (pg:ins:rx)
- record: pg:all:rx
expr: sum(pg:cls:rx)
#==============================================================#
# Lock #
#==============================================================#
# lock count by mode
- record: pg:db:locks
expr: pg_lock_count
- record: pg:ins:locks
expr: sum without(datname) (pg:db:locks)
- record: pg:svc:locks
expr: sum by (cls, role, mode) (pg:ins:locks)
- record: pg:cls:locks
expr: sum by (cls, mode) (pg:ins:locks)
# total lock count
- record: pg:db:lock_count
expr: sum without (mode) (pg_lock_count{})
- record: pg:ins:lock_count
expr: sum without(datname) (pg:db:lock_count)
- record: pg:svc:lock_count
expr: sum by (cls, role) (pg:ins:lock_count)
- record: pg:cls:lock_count
expr: sum by (cls) (pg:ins:lock_count)
# read category lock
- record: pg:db:rlock
expr: sum without (mode) (pg_lock_count{mode="AccessShareLock"})
- record: pg:ins:rlock
expr: sum without(datname) (pg:db:rlock)
- record: pg:svc:rlock
expr: sum by (cls, role) (pg:ins:rlock)
- record: pg:cls:rlock
expr: sum by (cls) (pg:ins:rlock)
# write category lock (insert|update|delete)
- record: pg:db:wlock
expr: sum without (mode) (pg_lock_count{mode=~"RowShareLock|RowExclusiveLock"})
- record: pg:ins:wlock
expr: sum without(datname) (pg:db:wlock)
- record: pg:svc:wlock
expr: sum by (cls, role) (pg:ins:wlock)
- record: pg:cls:wlock
expr: sum by (cls) (pg:ins:wlock)
# exclusive category lock
- record: pg:db:xlock
expr: sum without (mode) (pg_lock_count{mode=~"AccessExclusiveLock|ExclusiveLock|ShareRowExclusiveLock|ShareLock|ShareUpdateExclusiveLock"})
- record: pg:ins:xlock
expr: sum without(datname) (pg:db:xlock)
- record: pg:svc:xlock
expr: sum by (cls, role) (pg:ins:xlock)
- record: pg:cls:xlock
expr: sum by (cls) (pg:ins:xlock)
#==============================================================#
# Temp #
#==============================================================#
# temp files and bytes
- record: pg:db:temp_bytes
expr: rate(pg_db_temp_bytes{}[1m])
- record: pg:ins:temp_bytes
expr: sum without(datname) (pg:db:temp_bytes)
- record: pg:svc:temp_bytes
expr: sum by (cls, role) (pg:ins:temp_bytes)
- record: pg:cls:temp_bytes
expr: sum by (cls) (pg:ins:temp_bytes)
# temp file count in last 1m
- record: pg:db:temp_files
expr: increase(pg_db_temp_files{}[1m])
- record: pg:ins:temp_files
expr: sum without(datname) (pg:db:temp_files)
- record: pg:svc:temp_files
expr: sum by (cls, role) (pg:ins:temp_files)
- record: pg:cls:temp_files
expr: sum by (cls) (pg:ins:temp_files)
#==============================================================#
# Size #
#==============================================================#
# database size
- record: pg:ins:db_size
expr: pg_size_database
- record: pg:cls:db_size
expr: sum by (cls) (pg:ins:db_size)
# wal size
- record: pg:ins:wal_size
expr: pg_size_wal
- record: pg:cls:wal_size
expr: sum by (cls) (pg:ins:wal_size)
# log size
- record: pg:ins:log_size
expr: pg_size_log
- record: pg:cls:log_size
expr: sum by (cls) (pg_size_log)
#==============================================================#
# Checkpoint #
#==============================================================#
# checkpoint stats
- record: pg:ins:last_ckpt
expr: pg_checkpoint_elapse
- record: pg:ins:ckpt_timed
expr: increase(pg_bgwriter_checkpoints_timed{}[30s])
- record: pg:ins:ckpt_req
expr: increase(pg_bgwriter_checkpoints_req{}[30s])
- record: pg:cls:ckpt_1h
expr: increase(pg:ins:ckpt_timed[1h]) + increase(pg:ins:ckpt_req[1h])
# buffer flush & alloc
- record: pg:ins:buf_flush_backend
expr: irate(pg_bgwriter_buffers_backend{}[1m]) * 8192
- record: pg:ins:buf_flush_checkpoint
expr: irate(pg_bgwriter_buffers_checkpoint{}[1m]) * 8192
- record: pg:ins:buf_flush
expr: pg:ins:buf_flush_backend + pg:ins:buf_flush_checkpoint
- record: pg:svc:buf_flush
expr: sum by (cls, role) (pg:ins:buf_flush)
- record: pg:cls:buf_flush
expr: sum by (cls) (pg:ins:buf_flush)
- record: pg:all:buf_flush
expr: sum(pg:cls:buf_flush)
- record: pg:ins:buf_alloc
expr: irate(pg_bgwriter_buffers_alloc{}[1m]) * 8192
- record: pg:svc:buf_alloc
expr: sum by (cls, role) (pg:ins:buf_alloc)
- record: pg:cls:buf_alloc
expr: sum by (cls) (pg:ins:buf_alloc)
- record: pg:all:buf_alloc
expr: sum(pg:cls:buf_alloc)
#==============================================================#
# LSN #
#==============================================================#
# timeline & LSN
- record: pg_timeline
expr: pg_checkpoint_tli
- record: pg:ins:redo_lsn
expr: pg_checkpoint_redo_lsn
- record: pg:ins:checkpoint_lsn
expr: pg_checkpoint_checkpoint_lsn
# wal rate
- record: pg:ins:wal_rate
expr: rate(pg_lsn[1m])
- record: pg:cls:wal_rate
expr: max by (cls) (pg:ins:wal_rate{role="primary"})
- record: pg:all:wal_rate
expr: sum(pg:cls:wal_rate)
#==============================================================#
# Replication #
#==============================================================#
# lag time from replica's view
- record: pg:ins:lag_seconds
expr: pg_lag
- record: pg:cls:lag_seconds
expr: max by (cls) (pg:ins:lag_seconds)
- record: pg:all:lag_seconds
expr: max(pg:cls:lag_seconds)
# sync status
- record: pg:ins:sync_status # application_name must be set to the replica instance name
expr: max by (ins, svc, cls) (label_replace(pg_replication_sync_status, "ins", "$1", "application_name", "(.+)"))
# lag of self (application_name must be set to the standby instance name)
- record: pg:ins:lag_bytes
expr: max by (ins, svc, cls, role) (label_replace(pg_replication_lsn{} - pg_replication_replay_lsn{}, "ins", "$1", "application_name", "(.+)"))
- record: pg:cls:lag_bytes
expr: max by (cls) (pg:ins:lag_bytes)
- record: pg:all:lag_bytes
expr: max(pg:cls:lag_bytes)
# replication slot retained bytes
- record: pg:ins:slot_retained_bytes
expr: pg_slot_retained_bytes
# replica walreceiver
- record: pg:ins:recv_init_lsn
expr: pg_walreceiver_init_lsn
- record: pg:ins:recv_last_lsn
expr: pg_walreceiver_last_lsn
- record: pg:ins:recv_init_tli
expr: pg_walreceiver_init_tli
- record: pg:ins:recv_last_tli
expr: pg_walreceiver_last_tli
#==============================================================#
#                    Cluster Level Metrics                     #
#==============================================================#
# cluster member count
- record: pg:cls:leader
expr: count by (cls, ins) (max by (cls, ins) (pg_status{}) == 3)
- record: pg:cls:size
expr: count by (cls) (max by (cls, ins) (pg_up{}))
- record: pg:cls:timeline
expr: max by (cls) (pg_checkpoint_tli{})
- record: pg:cls:primarys
expr: count by (cls) (max by (cls, ins) (pg_in_recovery{}) == 0)
- record: pg:cls:replicas
expr: count by (cls) (max by (cls, ins) (pg_in_recovery{}) == 1)
- record: pg:cls:synchronous
expr: max by (cls) (pg_sync_standby_enabled) > bool 0
- record: pg:cls:bridging_instances
expr: count by (cls, role, ins, ip) (pg_replication_lsn{state="streaming", role!="primary"} > 0)
- record: pg:cls:bridging
expr: count by (cls) (pg:cls:bridging_instances)
- record: pg:cls:cascading
expr: count by (cls) (pg_replication_lsn{state="streaming", role!="primary"})
#==============================================================#
# Pgbouncer List #
#==============================================================#
# object list
- record: pg:ins:pools
expr: pgbouncer_list_items{list="pools"}
- record: pg:ins:pool_databases
expr: pgbouncer_list_items{list="databases"}
- record: pg:ins:pool_users
expr: pgbouncer_list_items{list="users"}
- record: pg:ins:login_clients
expr: pgbouncer_list_items{list="login_clients"}
- record: pg:ins:free_clients
expr: pgbouncer_list_items{list="free_clients"}
- record: pg:ins:used_clients
expr: pgbouncer_list_items{list="used_clients"}
- record: pg:ins:free_servers
expr: pgbouncer_list_items{list="free_servers"}
#==============================================================#
# DBConfig (Pgbouncer) #
#==============================================================#
- record: pg:db:pool_max_conn
expr: pgbouncer_database_pool_size{datname!="pgbouncer"} + pgbouncer_database_reserve_pool{datname!="pgbouncer"}
- record: pg:db:pool_size
expr: pgbouncer_database_pool_size{datname!="pgbouncer"}
- record: pg:db:pool_reserve_size
expr: pgbouncer_database_reserve_pool{datname!="pgbouncer"}
- record: pg:db:pool_current_conn
expr: pgbouncer_database_current_connections{datname!="pgbouncer"}
- record: pg:db:pool_paused
expr: pgbouncer_database_paused{datname!="pgbouncer"}
- record: pg:db:pool_disabled
expr: pgbouncer_database_disabled{datname!="pgbouncer"}
#==============================================================#
# Waiting (Pgbouncer) #
#==============================================================#
# average wait time
- record: pg:db:wait_rt
expr: pgbouncer_stat_avg_wait_time{datname!="pgbouncer"} / 1000000
# max wait time among all clients
- record: pg:pool:maxwait
expr: pgbouncer_pool_maxwait{datname!="pgbouncer"} + pgbouncer_pool_maxwait_us{datname!="pgbouncer"} / 1000000
- record: pg:db:maxwait
expr: max without(user) (pg:pool:maxwait)
- record: pg:ins:maxwait
expr: max without(user, datname) (pg:db:maxwait)
- record: pg:svc:maxwait
expr: max by (cls, role) (pg:ins:maxwait)
- record: pg:cls:maxwait
expr: max by (cls) (pg:ins:maxwait)
- record: pg:all:maxwait
expr: max(pg:cls:maxwait)
...
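When adapting the recording rules above, the modified file can be sanity-checked with promtool before Prometheus is reloaded. The path below is only an assumed example of where the rules file might live on the meta node; adjust it to your installation.
$ promtool check rules /etc/prometheus/rules/pgsql.yml    # assumed path; reports the rule count or any parse error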
5 - Alert Rules
Prometheus Alert Rules
Node Alert Rules
################################################################
# Node Alert #
################################################################
- name: node-alert
rules:
# node exporter down for 1m triggers a P1 alert
- alert: NODE_EXPORTER_DOWN
expr: up{instance=~"^.*:(9100)$"} == 0
for: 1m
labels:
severity: P1
annotations:
summary: "P1 Node Exporter Down: {{ $labels.ins }} {{ $value }}"
description: |
up[instance={{ $labels.instance }}] = {{ $value }} == 0
https://dba.p1staff.com/d/node?var-ip={{ $labels.instance }}&from=now-5m&to=now&refresh=10s
#==============================================================#
# CPU & Load #
#==============================================================#
# node avg CPU usage > 90% for 1m
- alert: NODE_CPU_HIGH
expr: node:ins:cpu_usage > 0.90
for: 1m
labels:
severity: P1
annotations:
summary: "P1 Node CPU High: {{ $labels.ins }} {{ $labels.ip }}"
description: |
node:ins:cpu_usage[ins={{ $labels.ins }}, ip={{ $labels.ip }}] = {{ $value }} > 90%
http://g.pigsty/d/node?&from=now-10m&to=now&viewPanel=28&fullscreen&var-ip={{ $labels.ip }}
# node load5 > 100%
- alert: NODE_LOAD_HIGH
expr: node:ins:stdload5 > 1
for: 3m
labels:
severity: P2
annotations:
summary: "P2 Node Load High: {{ $labels.ins }} {{ $labels.ip }}"
description: |
node:ins:stdload5[ins={{ $labels.ins }}, ip={{ $labels.ip }}] = {{ $value }} > 100%
http://g.pigsty/d/node?&from=now-10m&to=now&viewPanel=37&fullscreen&var-ip={{ $labels.ip }}
#==============================================================#
# Disk & Filesystem #
#==============================================================#
# main fs readonly triggers an immediate P0 alert
- alert: NODE_FS_READONLY
expr: node_filesystem_readonly{fstype!~"(n|root|tmp)fs.*"} == 1
labels:
severity: P0
annotations:
summary: "P0 Node Filesystem Readonly: {{ $labels.ins }} {{ $labels.ip }}"
description: |
node_filesystem_readonly{ins={{ $labels.ins }}, ip={{ $labels.ip }},fstype!~"(n|root|tmp)fs.*"} == 1
http://g.pigsty/d/node?&from=now-10m&to=now&viewPanel=110&fullscreen&var-ip={{ $labels.ip }}
# main fs usage > 90% for 1m triggers P1 alert
- alert: NODE_FS_SPACE_FULL
expr: node:fs:space_usage > 0.90
for: 1m
labels:
severity: P1
annotations:
summary: "P1 Node Filesystem Space Full: {{ $labels.ins }} {{ $labels.ip }}"
description: |
node:fs:space_usage[ins={{ $labels.ins }}, ip={{ $labels.ip }}] = {{ $value }} > 90%
http://g.pigsty/d/node?&from=now-10m&to=now&viewPanel=110&fullscreen&var-ip={{ $labels.ip }}
# main fs inode usage > 90% for 1m triggers P1 alert
- alert: NODE_FS_INODE_FULL
expr: node:fs:inode_usage > 0.90
for: 1m
labels:
severity: P1
annotations:
summary: "P1 Node Filesystem iNode Full: {{ $labels.ins }} {{ $labels.ip }}"
description: |
node:fs:inode_usage[ins={{ $labels.ins }}, ip={{ $labels.ip }}] = {{ $value }} > 90%
http://g.pigsty/d/node?&from=now-10m&to=now&viewPanel=110&fullscreen&var-ip={{ $labels.ip }}
# fd usage > 90% for 1m triggers P1 alert
- alert: NODE_FD_FULL
expr: node:fs:fd_usage > 0.90
for: 1m
labels:
severity: P1
annotations:
summary: "P1 Node File Descriptor Full: {{ $labels.ins }} {{ $labels.ip }}"
description: |
node:fs:fd_usage[ins={{ $labels.ins }}, ip={{ $labels.ip }}] = {{ $value }} > 90%
http://g.pigsty/d/node?&from=now-10m&to=now&viewPanel=58&fullscreen&var-ip={{ $labels.ip }}
# ssd read latency > 32ms for 3m (ignoring abnormally large outliers)
- alert: NODE_READ_LATENCY_HIGH
expr: node:dev:disk_read_rt < 10000 and node:dev:disk_read_rt > 0.032
for: 3m
labels:
severity: P2
annotations:
summary: "P2 Node Read Latency High: {{ $labels.ins }} {{ $labels.ip }}"
description: |
node:dev:disk_read_rt[ins={{ $labels.ins }}, ip={{ $labels.ip }}, device={{ $labels.device }}] = {{ $value }} > 32ms
http://g.pigsty/d/node?&from=now-10m&to=now&viewPanel=29&fullscreen&var-ip={{ $labels.ip }}
# ssd write latency > 16ms for 3m
- alert: NODE_WRITE_LATENCY_HIGH
expr: node:dev:disk_write_rt < 10000 and node:dev:disk_write_rt > 0.016
for: 3m
labels:
severity: P2
annotations:
summary: "P2 Node Write Latency High: {{ $labels.ins }} {{ $labels.ip }}"
description: |
node:dev:disk_write_rt[ins={{ $labels.ins }}, ip={{ $labels.ip }}, device={{ $labels.device }}] = {{ $value }} > 16ms
http://g.pigsty/d/node?&from=now-10m&to=now&viewPanel=29&fullscreen&var-ip={{ $labels.ip }}
#==============================================================#
# Memory #
#==============================================================#
# node memory usage > 80% for 1m triggers a P1 alert
- alert: NODE_MEM_HIGH
expr: node:ins:mem_usage > 0.80
for: 1m
labels:
severity: P1
annotations:
summary: "P1 Node Mem High: {{ $labels.ins }} {{ $labels.ip }}"
description: |
node:ins:mem_usage[ins={{ $labels.ins }}, ip={{ $labels.ip }}] = {{ $value }} > 80%
http://g.pigsty/d/node?&from=now-10m&to=now&viewPanel=40&fullscreen&var-ip={{ $labels.ip }}
#==============================================================#
# Network & TCP #
#==============================================================#
# node tcp listen overflow > 2 for 3m
- alert: NODE_TCP_LISTEN_OVERFLOW
expr: node:ins:tcp_overflow_rate > 2
for: 3m
labels:
severity: P1
annotations:
summary: "P1 Node TCP Listen Overflow: {{ $labels.ins }} {{ $labels.ip }}"
description: |
node:ins:tcp_overflow_rate[ins={{ $labels.ins }}, ip={{ $labels.ip }}] = {{ $value }} > 2
http://g.pigsty/d/node?&from=now-10m&to=now&viewPanel=55&fullscreen&var-ip={{ $labels.ip }}
# node tcp retrans > 32 per sec for 3m
- alert: NODE_TCP_RETRANS_HIGH
expr: node:ins:tcp_retranssegs > 32
for: 3m
labels:
severity: P2
annotations:
summary: "P2 Node TCP Retrans High: {{ $labels.ins }} {{ $labels.ip }}"
description: |
node:ins:tcp_retranssegs[ins={{ $labels.ins }}, ip={{ $labels.ip }}] = {{ $value }} > 32
http://g.pigsty/d/node?&from=now-10m&to=now&viewPanel=52&fullscreen&var-ip={{ $labels.ip }}
# node tcp conn > 32768 for 3m
- alert: NODE_TCP_CONN_HIGH
expr: node_netstat_Tcp_CurrEstab > 32768
for: 3m
labels:
severity: P2
annotations:
summary: "P2 Node TCP Connection High: {{ $labels.ins }} {{ $labels.ip }}"
description: |
node_netstat_Tcp_CurrEstab[ins={{ $labels.ins }}, ip={{ $labels.ip }}] = {{ $value }} > 32768
http://g.pigsty/d/node?&from=now-10m&to=now&viewPanel=54&fullscreen&var-ip={{ $labels.ip }}
#==============================================================#
# Misc #
#==============================================================#
# node ntp offset > 1s for 1m
- alert: NODE_NTP_OFFSET_HIGH
expr: abs(node_ntp_offset_seconds) > 1
for: 1m
labels:
severity: P1
annotations:
summary: "P1 Node NTP Offset High: {{ $labels.ins }} {{ $labels.ip }}"
description: |
node_ntp_offset_seconds[ins={{ $labels.ins }}, ip={{ $labels.ip }}] = {{ $value }} > 1s
http://g.pigsty/d/node?&from=now-10m&to=now&viewPanel=70&fullscreen&var-ip={{ $labels.ip }}
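To see how much headroom each node has before an alert such as NODE_CPU_HIGH would fire, the underlying derived metric can be queried through the Prometheus HTTP API. This is only a sketch: it assumes Prometheus listens on its default port 9090 on the meta node, and uses jq (included in the package list) for formatting.
$ curl -sG http://10.10.10.10:9090/api/v1/query --data-urlencode 'query=topk(5, node:ins:cpu_usage)' \
    | jq '.data.result[] | {ins: .metric.ins, usage: .value[1]}'    # top 5 nodes by cpu usage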
Database & Connection Pool Alert Rules
---
################################################################
# PgSQL Alert #
################################################################
- name: pgsql-alert
rules:
#==============================================================#
# Error / Aliveness #
#==============================================================#
# cluster size change triggers a P1 alert (warn: auto heal in 5min)
- alert: PGSQL_CLUSTER_SHRINK
expr: delta(pg:cls:size{}[5m]) < 0
for: 15s
labels:
severity: P1
annotations:
summary: 'delta(pg:cls:size{cls={{ $labels.cls }}}[5m]) = {{ $value | printf "%.0f" }} < 0'
description: |
http://g.pigsty/d/pg-cluster&from=now-10m&to=now&var-cls={{ $labels.cls }}
# postgres down for 15s triggers a P0 alert
- alert: PGSQL_DOWN
expr: pg_up{} == 0
labels:
severity: P0
annotations:
summary: "[P0] PGSQL_DOWN: {{ $labels.ins }} {{ $value }}"
description: |
pg_up[ins={{ $labels.ins }}] = {{ $value }} == 0
http://g.pigsty/d/pg-instance&from=now-10m&to=now&var-ins={{ $labels.ins }}
# pgbouncer down for 15s triggers a P0 alert
- alert: PGBOUNCER_DOWN
expr: pgbouncer_up{} == 0
labels:
severity: P0
annotations:
summary: "P0 Pgbouncer Down: {{ $labels.ins }} {{ $value }}"
description: |
pgbouncer_up[ins={{ $labels.ins }}] = {{ $value }} == 0
http://g.pigsty/d/pg-pgbouncer&from=now-10m&to=now&var-ins={{ $labels.ins }}
# pg/pgbouncer exporter down for 1m triggers a P1 alert
- alert: PGSQL_EXPORTER_DOWN
expr: up{instance=~"^.*:(9630|9631)$"} == 0
for: 1m
labels:
severity: P1
annotations:
summary: "P1 PG/PGB Exporter Down: {{ $labels.ins }} {{ $labels.instance }} {{ $value }}"
description: |
up[instance={{ $labels.instance }}] = {{ $value }} == 0
http://g.pigsty/d/pg-instance?from=now-10m&to=now&viewPanel=262&fullscreen&var-ins={{ $labels.ins }}
#==============================================================#
# Latency #
#==============================================================#
# replication break for 1m triggers a P1 alert (warn: heal in 5m)
- alert: PGSQL_REPLICATION_BREAK
expr: delta(pg_downstream_count{state="streaming"}[5m]) < 0
for: 1m
labels:
severity: P1
annotations:
summary: "P1 PG Replication Break: {{ $labels.ins }} {{ $value }}"
description: |
pg_downstream_count_delta[ins={{ $labels.ins }}] = {{ $value }} < 0
http://g.pigsty/d/pg-instance?from=now-10m&to=now&viewPanel=180&fullscreen&var-ins={{ $labels.ins }}
# replication lag greater than 8 second for 3m triggers a P1 alert
- alert: PGSQL_REPLICATION_LAG
expr: pg_replication_replay_lag{application_name!='pg_receivewal'} > 8
for: 3m
labels:
severity: P1
annotations:
summary: "P1 PG Replication Lagged: {{ $labels.ins }} {{ $value }}"
description: |
pg_replication_replay_lag[ins={{ $labels.ins }}] = {{ $value }} > 8s
http://g.pigsty/d/pg-instance?from=now-10m&to=now&viewPanel=384&fullscreen&var-ins={{ $labels.ins }}
# pg avg response time > 16ms
- alert: PGSQL_QUERY_RT_HIGH
expr: pg:ins:query_rt > 0.016
for: 1m
labels:
severity: P1
annotations:
summary: "P1 PG Query Response Time High: {{ $labels.ins }} {{ $value }}"
description: |
pg:ins:query_rt[ins={{ $labels.ins }}] = {{ $value }} > 16ms
http://g.pigsty/d/pg-instance?from=now-10m&to=now&viewPanel=137&fullscreen&var-ins={{ $labels.ins }}
#==============================================================#
# Saturation #
#==============================================================#
# pg load1 higher than 70% for 3m triggers a P1 alert
- alert: PGSQL_LOAD_HIGH
expr: pg:ins:load1{} > 0.70
for: 3m
labels:
severity: P1
annotations:
summary: "P1 PG Load High: {{ $labels.ins }} {{ $value }}"
description: |
pg:ins:load1[ins={{ $labels.ins }}] = {{ $value }} > 70%
http://g.pigsty/d/pg-instance?from=now-10m&to=now&viewPanel=210&fullscreen&var-ins={{ $labels.ins }}
# active backends exceeding 2x the available cpu cores for 3m triggers a P1 alert
- alert: PGSQL_BACKEND_HIGH
expr: pg:ins:active_backends / on(ins) node:ins:cpu_count > 2
for: 3m
labels:
severity: P1
annotations:
summary: "P1 PG Backend High: {{ $labels.ins }} {{ $value }}"
description: |
pg:ins:active_backends/node:ins:cpu_count[ins={{ $labels.ins }}] = {{ $value }} > 2
http://g.pigsty/d/pg-instance?from=now-10m&to=now&viewPanel=150&fullscreen&var-ins={{ $labels.ins }}
# more than one idle-in-transaction backend for 3m triggers a P2 alert
- alert: PGSQL_IDLE_XACT_BACKEND_HIGH
expr: pg:ins:ixact_backends > 1
for: 3m
labels:
severity: P2
annotations:
summary: "P1 PG Idle In Transaction Backend High: {{ $labels.ins }} {{ $value }}"
description: |
pg:ins:ixact_backends[ins={{ $labels.ins }}] = {{ $value }} > 1
http://g.pigsty/d/pg-instance?from=now-10m&to=now&viewPanel=161&fullscreen&var-ins={{ $labels.ins }}
# more than 2 waiting clients for 3m triggers a P1 alert
- alert: PGSQL_CLIENT_QUEUING
expr: pg:ins:waiting_clients > 2
for: 3m
labels:
severity: P1
annotations:
summary: "P1 PG Client Queuing: {{ $labels.ins }} {{ $value }}"
description: |
pg:ins:waiting_clients[ins={{ $labels.ins }}] = {{ $value }} > 2
http://g.pigsty/d/pg-instance?from=now-10m&to=now&viewPanel=159&fullscreen&var-ins={{ $labels.ins }}
# xid age past half of the wraparound limit triggers a P1 alert
- alert: PGSQL_AGE_HIGH
expr: pg:ins:age > 1000000000
for: 3m
labels:
severity: P1
annotations:
summary: "P1 PG Age High: {{ $labels.ins }} {{ $value }}"
description: |
pg:ins:age[ins={{ $labels.ins }}] = {{ $value }} > 1000000000
http://g.pigsty/d/pg-instance?from=now-10m&to=now&viewPanel=172&fullscreen&var-ins={{ $labels.ins }}
#==============================================================#
# Traffic #
#==============================================================#
# more than 30k TPS lasting for 3m triggers a P1 alert (pgbouncer bottleneck)
- alert: PGSQL_TPS_HIGH
expr: pg:ins:xacts > 30000
for: 3m
labels:
severity: P1
annotations:
summary: "P1 Postgres TPS High: {{ $labels.ins }} {{ $value }}"
description: |
pg:ins:xacts[ins={{ $labels.ins }}] = {{ $value }} > 30000
http://g.pigsty/d/pg-instance?from=now-10m&to=now&viewPanel=125&fullscreen&var-ins={{ $labels.ins }}
...
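To check which of the alerts defined above are currently pending or firing, the Prometheus alerts endpoint can be queried directly. As before, this sketch assumes Prometheus is reachable on the meta node's default port 9090.
$ curl -s http://10.10.10.10:9090/api/v1/alerts \
    | jq '.data.alerts[] | {alert: .labels.alertname, severity: .labels.severity, state: .state}'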
6 - Sandbox Stdout
Makefile shortcuts and their expected outputs
Overview
# Clone the repo and enter it
cd /tmp && git clone git@github.com:Vonng/pigsty.git && cd pigsty
make up            # launch the vagrant VMs
make ssh           # configure ssh access to the VMs (one-time; no need to repeat on later startups)
sudo make dns      # write Pigsty static DNS records (prompts for sudo password; optional, one-time)
make download      # download the latest offline package (optional; greatly speeds up initialization)
make upload        # upload the offline package to the meta node
make init          # initialize Pigsty
make mon-view      # open the Pigsty monitoring homepage (default credentials: admin:admin)
clone
Clone the repository and enter the project directory; all subsequent operations are performed from the project root (here /tmp/pigsty is used as an example).
cd /tmp && git clone git@github.com:Vonng/pigsty.git && cd pigsty
clean
Remove all traces of the sandbox, if any.
$ make clean
cd vagrant && vagrant destroy -f --parallel; exit 0
==> vagrant: A new version of Vagrant is available: 2.2.14 (installed version: 2.2.13)!
==> vagrant: To upgrade visit: https://www.vagrantup.com/downloads.html
==> node-3: Forcing shutdown of VM...
==> node-3: Destroying VM and associated drives...
==> node-2: Forcing shutdown of VM...
==> node-2: Destroying VM and associated drives...
==> node-1: Forcing shutdown of VM...
==> node-1: Destroying VM and associated drives...
==> meta: Forcing shutdown of VM...
==> meta: Destroying VM and associated drives...
up
Running make up invokes the vagrant up command, which creates four virtual machines with VirtualBox according to the definitions in the Vagrantfile (the resulting VM list can be double-checked with vagrant status, as sketched after the output below).
Note that the first time vagrant up runs, it automatically downloads the CentOS/7 box image from the official site. If your network connection is poor (for example, without a proxy), this can take quite a while. You may also create the virtual machines yourself and deploy Pigsty following the instructions in the Deployment chapter (not recommended).
$ make up
cd vagrant && vagrant up
Bringing machine 'meta' up with 'virtualbox' provider...
Bringing machine 'node-1' up with 'virtualbox' provider...
Bringing machine 'node-2' up with 'virtualbox' provider...
Bringing machine 'node-3' up with 'virtualbox' provider...
==> meta: Cloning VM...
==> meta: Matching MAC address for NAT networking...
==> meta: Setting the name of the VM: vagrant_meta_1614587906789_29514
==> meta: Clearing any previously set network interfaces...
==> meta: Preparing network interfaces based on configuration...
meta: Adapter 1: nat
meta: Adapter 2: hostonly
==> meta: Forwarding ports...
meta: 22 (guest) => 2222 (host) (adapter 1)
==> meta: Running 'pre-boot' VM customizations...
==> meta: Booting VM...
==> meta: Waiting for machine to boot. This may take a few minutes...
meta: SSH address: 127.0.0.1:2222
meta: SSH username: vagrant
meta: SSH auth method: private key
==> meta: Machine booted and ready!
==> meta: Checking for guest additions in VM...
meta: No guest additions were detected on the base box for this VM! Guest
meta: additions are required for forwarded ports, shared folders, host only
meta: networking, and more. If SSH fails on this machine, please install
meta: the guest additions and repackage the box to continue.
meta:
meta: This is not an error message; everything may continue to work properly,
meta: in which case you may ignore this message.
==> meta: Setting hostname...
==> meta: Configuring and enabling network interfaces...
==> meta: Rsyncing folder: /Volumes/Data/pigsty/vagrant/ => /vagrant
==> meta: Running provisioner: shell...
meta: Running: /var/folders/_5/_0mbf4292pl9y4xgy0kn2r1h0000gn/T/vagrant-shell20210301-60046-1jv6obp.sh
meta: [INFO] write ssh config to /home/vagrant/.ssh
==> node-1: Cloning VM...
==> node-1: Matching MAC address for NAT networking...
==> node-1: Setting the name of the VM: vagrant_node-1_1614587930603_84690
==> node-1: Fixed port collision for 22 => 2222. Now on port 2200.
==> node-1: Clearing any previously set network interfaces...
==> node-1: Preparing network interfaces based on configuration...
node-1: Adapter 1: nat
node-1: Adapter 2: hostonly
==> node-1: Forwarding ports...
node-1: 22 (guest) => 2200 (host) (adapter 1)
==> node-1: Running 'pre-boot' VM customizations...
==> node-1: Booting VM...
==> node-1: Waiting for machine to boot. This may take a few minutes...
node-1: SSH address: 127.0.0.1:2200
node-1: SSH username: vagrant
node-1: SSH auth method: private key
==> node-1: Machine booted and ready!
==> node-1: Checking for guest additions in VM...
node-1: No guest additions were detected on the base box for this VM! Guest
node-1: additions are required for forwarded ports, shared folders, host only
node-1: networking, and more. If SSH fails on this machine, please install
node-1: the guest additions and repackage the box to continue.
node-1:
node-1: This is not an error message; everything may continue to work properly,
node-1: in which case you may ignore this message.
==> node-1: Setting hostname...
==> node-1: Configuring and enabling network interfaces...
==> node-1: Rsyncing folder: /Volumes/Data/pigsty/vagrant/ => /vagrant
==> node-1: Running provisioner: shell...
node-1: Running: /var/folders/_5/_0mbf4292pl9y4xgy0kn2r1h0000gn/T/vagrant-shell20210301-60046-5w83e1.sh
node-1: [INFO] write ssh config to /home/vagrant/.ssh
==> node-2: Cloning VM...
==> node-2: Matching MAC address for NAT networking...
==> node-2: Setting the name of the VM: vagrant_node-2_1614587953786_32441
==> node-2: Fixed port collision for 22 => 2222. Now on port 2201.
==> node-2: Clearing any previously set network interfaces...
==> node-2: Preparing network interfaces based on configuration...
node-2: Adapter 1: nat
node-2: Adapter 2: hostonly
==> node-2: Forwarding ports...
node-2: 22 (guest) => 2201 (host) (adapter 1)
==> node-2: Running 'pre-boot' VM customizations...
==> node-2: Booting VM...
==> node-2: Waiting for machine to boot. This may take a few minutes...
node-2: SSH address: 127.0.0.1:2201
node-2: SSH username: vagrant
node-2: SSH auth method: private key
==> node-2: Machine booted and ready!
==> node-2: Checking for guest additions in VM...
node-2: No guest additions were detected on the base box for this VM! Guest
node-2: additions are required for forwarded ports, shared folders, host only
node-2: networking, and more. If SSH fails on this machine, please install
node-2: the guest additions and repackage the box to continue.
node-2:
node-2: This is not an error message; everything may continue to work properly,
node-2: in which case you may ignore this message.
==> node-2: Setting hostname...
==> node-2: Configuring and enabling network interfaces...
==> node-2: Rsyncing folder: /Volumes/Data/pigsty/vagrant/ => /vagrant
==> node-2: Running provisioner: shell...
node-2: Running: /var/folders/_5/_0mbf4292pl9y4xgy0kn2r1h0000gn/T/vagrant-shell20210301-60046-1xljcde.sh
node-2: [INFO] write ssh config to /home/vagrant/.ssh
==> node-3: Cloning VM...
==> node-3: Matching MAC address for NAT networking...
==> node-3: Setting the name of the VM: vagrant_node-3_1614587977533_52921
==> node-3: Fixed port collision for 22 => 2222. Now on port 2202.
==> node-3: Clearing any previously set network interfaces...
==> node-3: Preparing network interfaces based on configuration...
node-3: Adapter 1: nat
node-3: Adapter 2: hostonly
==> node-3: Forwarding ports...
node-3: 22 (guest) => 2202 (host) (adapter 1)
==> node-3: Running 'pre-boot' VM customizations...
==> node-3: Booting VM...
==> node-3: Waiting for machine to boot. This may take a few minutes...
node-3: SSH address: 127.0.0.1:2202
node-3: SSH username: vagrant
node-3: SSH auth method: private key
==> node-3: Machine booted and ready!
==> node-3: Checking for guest additions in VM...
node-3: No guest additions were detected on the base box for this VM! Guest
node-3: additions are required for forwarded ports, shared folders, host only
node-3: networking, and more. If SSH fails on this machine, please install
node-3: the guest additions and repackage the box to continue.
node-3:
node-3: This is not an error message; everything may continue to work properly,
node-3: in which case you may ignore this message.
==> node-3: Setting hostname...
==> node-3: Configuring and enabling network interfaces...
==> node-3: Rsyncing folder: /Volumes/Data/pigsty/vagrant/ => /vagrant
==> node-3: Running provisioner: shell...
node-3: Running: /var/folders/_5/_0mbf4292pl9y4xgy0kn2r1h0000gn/T/vagrant-shell20210301-60046-1cykx8o.sh
node-3: [INFO] write ssh config to /home/vagrant/.ssh
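At this point the sandbox should consist of four running VMs (meta, node-1, node-2, node-3). This can be double-checked with vagrant itself; the command below is plain vagrant usage rather than a Makefile shortcut.
$ cd vagrant && vagrant status    # meta and node-1/2/3 should be listed as running (virtualbox)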
ssh
The default user on the newly created VMs is vagrant, so passwordless ssh access from the local machine to the VMs needs to be configured. Running make ssh invokes vagrant's ssh-config command and writes the ssh configuration for the pigsty VM nodes to ~/.ssh/pigsty_config.
Normally this command only needs to be run once, when the sandbox is launched for the first time; VMs brought up again later usually keep the same SSH configuration.
Once it completes, you can connect to the sandbox VM nodes through their SSH aliases, e.g. ssh node-1; an illustrative excerpt of the generated configuration is shown after the command output below.
$ make ssh
cd vagrant && vagrant ssh-config > ~/.ssh/pigsty_config 2>/dev/null; true
if ! grep --quiet "pigsty_config" ~/.ssh/config ; then (echo 'Include ~/.ssh/pigsty_config' && cat ~/.ssh/config) > ~/.ssh/config.tmp; mv ~/.ssh/config.tmp ~/.ssh/config && chmod 0600 ~/.ssh/config; fi
if ! grep --quiet "StrictHostKeyChecking=no" ~/.ssh/config ; then (echo 'StrictHostKeyChecking=no' && cat ~/.ssh/config) > ~/.ssh/config.tmp; mv ~/.ssh/config.tmp ~/.ssh/config && chmod 0600 ~/.ssh/config; fi
dns
This command writes the static DNS records of the Pigsty sandbox VMs into /etc/hosts; it normally only needs to be run once, when the sandbox is launched for the first time.
Once it completes, you can access web UIs such as http://g.pigsty by domain name from a local browser.
Note that the dns command must be run with sudo and will prompt for a password, since modifying /etc/hosts requires elevated privileges. A quick way to verify the records is sketched after the command output below.
$ sudo make dns
Password: # <enter your user password here>
if ! grep --quiet "pigsty dns records" /etc/hosts ; then cat files/dns >> /etc/hosts; fi
download
Download the latest Pigsty offline installation package (about 1GB) from the CDN to the local machine; the download takes roughly one minute.
$ make download
curl http://pigsty-1304147732.cos.accelerate.myqcloud.com/pkg.tgz -o files/pkg.tgz
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
100 1067M 100 1067M 0 0 15.2M 0 0:01:10 0:01:10 --:--:-- 29.0M
Pigsty is a complex software system. To keep the system stable, Pigsty downloads all dependent packages from the Internet during initialization and builds a local yum repository.
The dependencies total roughly 1GB, and download speed depends on your network. Although Pigsty uses mirror repositories wherever possible to speed up downloads, a few packages may still be blocked by firewalls and download very slowly. You can set a download proxy via the proxy_env configuration entry to complete the first download, or download the pre-built offline installation package directly.
The latest offline installation package is available at:
Github Release: https://github.com/Vonng/pigsty/releases
CDN Download: http://pigsty-1304147732.cos.accelerate.myqcloud.com/pkg.tgz
You can also download it manually and place it at files/pkg.tgz, as sketched below.
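A minimal sketch of the manual alternative mentioned above, using the same CDN URL as the Makefile target:
$ curl -L http://pigsty-1304147732.cos.accelerate.myqcloud.com/pkg.tgz -o files/pkg.tgz
# or, if a proxy is required (the proxy address is a placeholder):
$ http_proxy=http://127.0.0.1:8080 curl -L http://pigsty-1304147732.cos.accelerate.myqcloud.com/pkg.tgz -o files/pkg.tgz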
upload
Upload the downloaded offline installation package to the meta node and extract it there, which speeds up subsequent initialization.
$ make upload
ssh -t meta "sudo rm -rf /tmp/pkg.tgz"
Connection to 127.0.0.1 closed.
scp -r files/pkg.tgz meta:/tmp/pkg.tgz
pkg.tgz 100% 1068MB 53.4MB/s 00:19
ssh -t meta "sudo mkdir -p /www/pigsty/; sudo rm -rf /www/pigsty/*; sudo tar -xf /tmp/pkg.tgz --strip-component=1 -C /www/pigsty/"
Connection to 127.0.0.1 closed.
init
After completing the steps above, run make init to invoke ansible and initialize the Pigsty system.
$ make init
./sandbox.yml        # fast initialization: provision the meta node and regular database nodes in parallel
sandbox.yml is an initialization playbook prepared specifically for the local sandbox environment: by initializing the meta node and the database nodes at the same time, it cuts the total time roughly in half. For production environments it is recommended to run infra.yml and pgsql.yml in sequence to initialize the meta node and the regular nodes respectively, as sketched below.
If the offline installation package has already been uploaded to the meta node, initialization is fairly fast, taking about 5~10 minutes in total depending on machine specs.
If the offline package is not present, Pigsty downloads about 1GB of data from the Internet during initialization, which may take 20 minutes or longer depending on network conditions.
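For the production-style flow mentioned above, run the two playbooks in sequence instead of sandbox.yml; a minimal sketch:
$ ./infra.yml    # initialize the meta node (infrastructure) first
$ ./pgsql.yml    # then initialize the regular database nodes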
$ make init
./sandbox.yml # interleave sandbox provisioning
[WARNING]: Invalid characters were found in group names but not replaced, use -vvvv to see details
PLAY [Init local repo] ***********************************************************************************************************************************************************************************
TASK [repo : Create local repo directory] ****************************************************************************************************************************************************************
ok: [10.10.10.10]
TASK [repo : Backup & remove existing repos] *************************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [repo : Add required upstream repos] ****************************************************************************************************************************************************************
[WARNING]: Using a variable for a task's 'args' is unsafe in some situations (see https://docs.ansible.com/ansible/devel/reference_appendices/faq.html#argsplat-unsafe)
changed: [10.10.10.10] => (item={'name': 'base', 'description': 'CentOS-$releasever - Base - Aliyun Mirror', 'baseurl': ['http://mirrors.aliyun.com/centos/$releasever/os/$basearch/', 'http://mirrors.aliyuncs.com/centos/$releasever/os/$basearch/', 'http://mirrors.cloud.aliyuncs.com/centos/$releasever/os/$basearch/'], 'gpgcheck': False, 'failovermethod': 'priority'})
changed: [10.10.10.10] => (item={'name': 'updates', 'description': 'CentOS-$releasever - Updates - Aliyun Mirror', 'baseurl': ['http://mirrors.aliyun.com/centos/$releasever/updates/$basearch/', 'http://mirrors.aliyuncs.com/centos/$releasever/updates/$basearch/', 'http://mirrors.cloud.aliyuncs.com/centos/$releasever/updates/$basearch/'], 'gpgcheck': False, 'failovermethod': 'priority'})
changed: [10.10.10.10] => (item={'name': 'extras', 'description': 'CentOS-$releasever - Extras - Aliyun Mirror', 'baseurl': ['http://mirrors.aliyun.com/centos/$releasever/extras/$basearch/', 'http://mirrors.aliyuncs.com/centos/$releasever/extras/$basearch/', 'http://mirrors.cloud.aliyuncs.com/centos/$releasever/extras/$basearch/'], 'gpgcheck': False, 'failovermethod': 'priority'})
changed: [10.10.10.10] => (item={'name': 'epel', 'description': 'CentOS $releasever - EPEL - Aliyun Mirror', 'baseurl': 'http://mirrors.aliyun.com/epel/$releasever/$basearch', 'gpgcheck': False, 'failovermethod': 'priority'})
changed: [10.10.10.10] => (item={'name': 'grafana', 'description': 'Grafana - TsingHua Mirror', 'gpgcheck': False, 'baseurl': 'https://mirrors.tuna.tsinghua.edu.cn/grafana/yum/rpm'})
changed: [10.10.10.10] => (item={'name': 'prometheus', 'description': 'Prometheus and exporters', 'gpgcheck': False, 'baseurl': 'https://packagecloud.io/prometheus-rpm/release/el/$releasever/$basearch'})
changed: [10.10.10.10] => (item={'name': 'pgdg-common', 'description': 'PostgreSQL common RPMs for RHEL/CentOS $releasever - $basearch', 'gpgcheck': False, 'baseurl': 'http://mirrors.zju.edu.cn/postgresql/repos/yum/common/redhat/rhel-$releasever-$basearch'})
changed: [10.10.10.10] => (item={'name': 'pgdg13', 'description': 'PostgreSQL 13 for RHEL/CentOS $releasever - $basearch', 'gpgcheck': False, 'baseurl': 'http://mirrors.zju.edu.cn/postgresql/repos/yum/13/redhat/rhel-$releasever-$basearch'})
changed: [10.10.10.10] => (item={'name': 'centos-sclo', 'description': 'CentOS-$releasever - SCLo', 'gpgcheck': False, 'mirrorlist': 'http://mirrorlist.centos.org?arch=$basearch&release=7&repo=sclo-sclo'})
changed: [10.10.10.10] => (item={'name': 'centos-sclo-rh', 'description': 'CentOS-$releasever - SCLo rh', 'gpgcheck': False, 'mirrorlist': 'http://mirrorlist.centos.org?arch=$basearch&release=7&repo=sclo-rh'})
changed: [10.10.10.10] => (item={'name': 'nginx', 'description': 'Nginx Official Yum Repo', 'skip_if_unavailable': True, 'gpgcheck': False, 'baseurl': 'http://nginx.org/packages/centos/$releasever/$basearch/'})
changed: [10.10.10.10] => (item={'name': 'haproxy', 'description': 'Copr repo for haproxy', 'skip_if_unavailable': True, 'gpgcheck': False, 'baseurl': 'https://download.copr.fedorainfracloud.org/results/roidelapluie/haproxy/epel-$releasever-$basearch/'})
changed: [10.10.10.10] => (item={'name': 'harbottle', 'description': 'Copr repo for main owned by harbottle', 'skip_if_unavailable': True, 'gpgcheck': False, 'baseurl': 'https://download.copr.fedorainfracloud.org/results/harbottle/main/epel-$releasever-$basearch/'})
TASK [repo : Check repo pkgs cache exists] ***************************************************************************************************************************************************************
ok: [10.10.10.10]
TASK [repo : Set fact whether repo_exists] ***************************************************************************************************************************************************************
ok: [10.10.10.10]
TASK [repo : Move upstream repo to backup] ***************************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [repo : Add local file system repos] ****************************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [repo : Remake yum cache if not exists] *************************************************************************************************************************************************************
[WARNING]: Consider using the yum module rather than running 'yum'. If you need to use command because yum is insufficient you can add 'warn: false' to this command task or set
'command_warnings=False' in ansible.cfg to get rid of this message.
changed: [10.10.10.10]
TASK [repo : Install repo bootstrap packages] ************************************************************************************************************************************************************
changed: [10.10.10.10] => (item=['yum-utils', 'createrepo', 'ansible', 'nginx', 'wget'])
TASK [repo : Render repo nginx server files] *************************************************************************************************************************************************************
changed: [10.10.10.10] => (item={'src': 'index.html.j2', 'dest': '/www/index.html'})
changed: [10.10.10.10] => (item={'src': 'default.conf.j2', 'dest': '/etc/nginx/conf.d/default.conf'})
changed: [10.10.10.10] => (item={'src': 'local.repo.j2', 'dest': '/www/pigsty.repo'})
changed: [10.10.10.10] => (item={'src': 'nginx.conf.j2', 'dest': '/etc/nginx/nginx.conf'})
TASK [repo : Disable selinux for repo server] ************************************************************************************************************************************************************
[WARNING]: SELinux state temporarily changed from 'enforcing' to 'permissive'. State change will take effect next reboot.
changed: [10.10.10.10]
TASK [repo : Launch repo nginx server] *******************************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [repo : Waits repo server online] *******************************************************************************************************************************************************************
ok: [10.10.10.10]
TASK [repo : Download web url packages] ******************************************************************************************************************************************************************
skipping: [10.10.10.10] => (item=https://github.com/Vonng/pg_exporter/releases/download/v0.3.2/pg_exporter-0.3.2-1.el7.x86_64.rpm)
skipping: [10.10.10.10] => (item=https://github.com/cybertec-postgresql/vip-manager/releases/download/v0.6/vip-manager_0.6-1_amd64.rpm)
skipping: [10.10.10.10] => (item=http://guichaz.free.fr/polysh/files/polysh-0.4-1.noarch.rpm)
TASK [repo : Download repo packages] *********************************************************************************************************************************************************************
skipping: [10.10.10.10] => (item=epel-release nginx wget yum-utils yum createrepo)
skipping: [10.10.10.10] => (item=ntp chrony uuid lz4 nc pv jq vim-enhanced make patch bash lsof wget unzip git tuned)
skipping: [10.10.10.10] => (item=readline zlib openssl libyaml libxml2 libxslt perl-ExtUtils-Embed ca-certificates)
skipping: [10.10.10.10] => (item=numactl grubby sysstat dstat iotop bind-utils net-tools tcpdump socat ipvsadm telnet)
skipping: [10.10.10.10] => (item=grafana prometheus2 pushgateway alertmanager)
skipping: [10.10.10.10] => (item=node_exporter postgres_exporter nginx_exporter blackbox_exporter)
skipping: [10.10.10.10] => (item=consul consul_exporter consul-template etcd)
skipping: [10.10.10.10] => (item=ansible python python-pip python-psycopg2 audit)
skipping: [10.10.10.10] => (item=python3 python3-psycopg2 python36-requests python3-etcd python3-consul)
skipping: [10.10.10.10] => (item=python36-urllib3 python36-idna python36-pyOpenSSL python36-cryptography)
skipping: [10.10.10.10] => (item=haproxy keepalived dnsmasq)
skipping: [10.10.10.10] => (item=patroni patroni-consul patroni-etcd pgbouncer pg_cli pgbadger pg_activity)
skipping: [10.10.10.10] => (item=pgcenter boxinfo check_postgres emaj pgbconsole pg_bloat_check pgquarrel)
skipping: [10.10.10.10] => (item=barman barman-cli pgloader pgFormatter pitrery pspg pgxnclient PyGreSQL pgadmin4 tail_n_mail)
skipping: [10.10.10.10] => (item=postgresql13* postgis31* citus_13 timescaledb_13)
skipping: [10.10.10.10] => (item=pg_repack13 pg_squeeze13)
skipping: [10.10.10.10] => (item=pg_qualstats13 pg_stat_kcache13 system_stats_13 bgw_replstatus13)
skipping: [10.10.10.10] => (item=plr13 plsh13 plpgsql_check_13 plproxy13 plr13 plsh13 plpgsql_check_13 pldebugger13)
skipping: [10.10.10.10] => (item=hdfs_fdw_13 mongo_fdw13 mysql_fdw_13 ogr_fdw13 redis_fdw_13 pgbouncer_fdw13)
skipping: [10.10.10.10] => (item=wal2json13 count_distinct13 ddlx_13 geoip13 orafce13)
skipping: [10.10.10.10] => (item=rum_13 hypopg_13 ip4r13 jsquery_13 logerrors_13 periods_13 pg_auto_failover_13 pg_catcheck13)
skipping: [10.10.10.10] => (item=pg_fkpart13 pg_jobmon13 pg_partman13 pg_prioritize_13 pg_track_settings13 pgaudit15_13)
skipping: [10.10.10.10] => (item=pgcryptokey13 pgexportdoc13 pgimportdoc13 pgmemcache-13 pgmp13 pgq-13)
skipping: [10.10.10.10] => (item=pguint13 pguri13 prefix13 safeupdate_13 semver13 table_version13 tdigest13)
TASK [repo : Download repo pkg deps] *********************************************************************************************************************************************************************
skipping: [10.10.10.10] => (item=epel-release nginx wget yum-utils yum createrepo)
skipping: [10.10.10.10] => (item=ntp chrony uuid lz4 nc pv jq vim-enhanced make patch bash lsof wget unzip git tuned)
skipping: [10.10.10.10] => (item=readline zlib openssl libyaml libxml2 libxslt perl-ExtUtils-Embed ca-certificates)
skipping: [10.10.10.10] => (item=numactl grubby sysstat dstat iotop bind-utils net-tools tcpdump socat ipvsadm telnet)
skipping: [10.10.10.10] => (item=grafana prometheus2 pushgateway alertmanager)
skipping: [10.10.10.10] => (item=node_exporter postgres_exporter nginx_exporter blackbox_exporter)
skipping: [10.10.10.10] => (item=consul consul_exporter consul-template etcd)
skipping: [10.10.10.10] => (item=ansible python python-pip python-psycopg2 audit)
skipping: [10.10.10.10] => (item=python3 python3-psycopg2 python36-requests python3-etcd python3-consul)
skipping: [10.10.10.10] => (item=python36-urllib3 python36-idna python36-pyOpenSSL python36-cryptography)
skipping: [10.10.10.10] => (item=haproxy keepalived dnsmasq)
skipping: [10.10.10.10] => (item=patroni patroni-consul patroni-etcd pgbouncer pg_cli pgbadger pg_activity)
skipping: [10.10.10.10] => (item=pgcenter boxinfo check_postgres emaj pgbconsole pg_bloat_check pgquarrel)
skipping: [10.10.10.10] => (item=barman barman-cli pgloader pgFormatter pitrery pspg pgxnclient PyGreSQL pgadmin4 tail_n_mail)
skipping: [10.10.10.10] => (item=postgresql13* postgis31* citus_13 timescaledb_13)
skipping: [10.10.10.10] => (item=pg_repack13 pg_squeeze13)
skipping: [10.10.10.10] => (item=pg_qualstats13 pg_stat_kcache13 system_stats_13 bgw_replstatus13)
skipping: [10.10.10.10] => (item=plr13 plsh13 plpgsql_check_13 plproxy13 plr13 plsh13 plpgsql_check_13 pldebugger13)
skipping: [10.10.10.10] => (item=hdfs_fdw_13 mongo_fdw13 mysql_fdw_13 ogr_fdw13 redis_fdw_13 pgbouncer_fdw13)
skipping: [10.10.10.10] => (item=wal2json13 count_distinct13 ddlx_13 geoip13 orafce13)
skipping: [10.10.10.10] => (item=rum_13 hypopg_13 ip4r13 jsquery_13 logerrors_13 periods_13 pg_auto_failover_13 pg_catcheck13)
skipping: [10.10.10.10] => (item=pg_fkpart13 pg_jobmon13 pg_partman13 pg_prioritize_13 pg_track_settings13 pgaudit15_13)
skipping: [10.10.10.10] => (item=pgcryptokey13 pgexportdoc13 pgimportdoc13 pgmemcache-13 pgmp13 pgq-13)
skipping: [10.10.10.10] => (item=pguint13 pguri13 prefix13 safeupdate_13 semver13 table_version13 tdigest13)
TASK [repo : Create local repo index] ********************************************************************************************************************************************************************
skipping: [10.10.10.10]
TASK [repo : Copy bootstrap scripts] *********************************************************************************************************************************************************************
skipping: [10.10.10.10]
TASK [repo : Mark repo cache as valid] *******************************************************************************************************************************************************************
skipping: [10.10.10.10]
PLAY [Provision Node] ************************************************************************************************************************************************************************************
TASK [node : Update node hostname] ***********************************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [node : Add new hostname to /etc/hosts] *************************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [node : Write static dns records] *******************************************************************************************************************************************************************
changed: [10.10.10.10] => (item=10.10.10.10 yum.pigsty)
changed: [10.10.10.11] => (item=10.10.10.10 yum.pigsty)
changed: [10.10.10.13] => (item=10.10.10.10 yum.pigsty)
changed: [10.10.10.12] => (item=10.10.10.10 yum.pigsty)
TASK [node : Get old nameservers] ************************************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.10]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [node : Truncate resolv file] ***********************************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.10]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [node : Write resolv options] ***********************************************************************************************************************************************************************
changed: [10.10.10.11] => (item=options single-request-reopen timeout:1 rotate)
changed: [10.10.10.12] => (item=options single-request-reopen timeout:1 rotate)
changed: [10.10.10.10] => (item=options single-request-reopen timeout:1 rotate)
changed: [10.10.10.13] => (item=options single-request-reopen timeout:1 rotate)
changed: [10.10.10.11] => (item=domain service.consul)
changed: [10.10.10.12] => (item=domain service.consul)
changed: [10.10.10.13] => (item=domain service.consul)
changed: [10.10.10.10] => (item=domain service.consul)
TASK [node : Add new nameservers] ************************************************************************************************************************************************************************
changed: [10.10.10.11] => (item=10.10.10.10)
changed: [10.10.10.12] => (item=10.10.10.10)
changed: [10.10.10.10] => (item=10.10.10.10)
changed: [10.10.10.13] => (item=10.10.10.10)
TASK [node : Append old nameservers] *********************************************************************************************************************************************************************
changed: [10.10.10.11] => (item=10.0.2.3)
changed: [10.10.10.12] => (item=10.0.2.3)
changed: [10.10.10.10] => (item=10.0.2.3)
changed: [10.10.10.13] => (item=10.0.2.3)
TASK [node : Node configure disable firewall] ************************************************************************************************************************************************************
ok: [10.10.10.11]
ok: [10.10.10.10]
ok: [10.10.10.12]
ok: [10.10.10.13]
TASK [node : Node disable selinux by default] ************************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
[WARNING]: SELinux state change will take effect next reboot
ok: [10.10.10.10]
TASK [node : Backup existing repos] **********************************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.10]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [node : Install upstream repo] **********************************************************************************************************************************************************************
skipping: [10.10.10.10] => (item={'name': 'base', 'description': 'CentOS-$releasever - Base - Aliyun Mirror', 'baseurl': ['http://mirrors.aliyun.com/centos/$releasever/os/$basearch/', 'http://mirrors.aliyuncs.com/centos/$releasever/os/$basearch/', 'http://mirrors.cloud.aliyuncs.com/centos/$releasever/os/$basearch/'], 'gpgcheck': False, 'failovermethod': 'priority'})
skipping: [10.10.10.10] => (item={'name': 'updates', 'description': 'CentOS-$releasever - Updates - Aliyun Mirror', 'baseurl': ['http://mirrors.aliyun.com/centos/$releasever/updates/$basearch/', 'http://mirrors.aliyuncs.com/centos/$releasever/updates/$basearch/', 'http://mirrors.cloud.aliyuncs.com/centos/$releasever/updates/$basearch/'], 'gpgcheck': False, 'failovermethod': 'priority'})
skipping: [10.10.10.11] => (item={'name': 'base', 'description': 'CentOS-$releasever - Base - Aliyun Mirror', 'baseurl': ['http://mirrors.aliyun.com/centos/$releasever/os/$basearch/', 'http://mirrors.aliyuncs.com/centos/$releasever/os/$basearch/', 'http://mirrors.cloud.aliyuncs.com/centos/$releasever/os/$basearch/'], 'gpgcheck': False, 'failovermethod': 'priority'})
skipping: [10.10.10.10] => (item={'name': 'extras', 'description': 'CentOS-$releasever - Extras - Aliyun Mirror', 'baseurl': ['http://mirrors.aliyun.com/centos/$releasever/extras/$basearch/', 'http://mirrors.aliyuncs.com/centos/$releasever/extras/$basearch/', 'http://mirrors.cloud.aliyuncs.com/centos/$releasever/extras/$basearch/'], 'gpgcheck': False, 'failovermethod': 'priority'})
skipping: [10.10.10.11] => (item={'name': 'updates', 'description': 'CentOS-$releasever - Updates - Aliyun Mirror', 'baseurl': ['http://mirrors.aliyun.com/centos/$releasever/updates/$basearch/', 'http://mirrors.aliyuncs.com/centos/$releasever/updates/$basearch/', 'http://mirrors.cloud.aliyuncs.com/centos/$releasever/updates/$basearch/'], 'gpgcheck': False, 'failovermethod': 'priority'})
skipping: [10.10.10.10] => (item={'name': 'epel', 'description': 'CentOS $releasever - EPEL - Aliyun Mirror', 'baseurl': 'http://mirrors.aliyun.com/epel/$releasever/$basearch', 'gpgcheck': False, 'failovermethod': 'priority'})
skipping: [10.10.10.12] => (item={'name': 'base', 'description': 'CentOS-$releasever - Base - Aliyun Mirror', 'baseurl': ['http://mirrors.aliyun.com/centos/$releasever/os/$basearch/', 'http://mirrors.aliyuncs.com/centos/$releasever/os/$basearch/', 'http://mirrors.cloud.aliyuncs.com/centos/$releasever/os/$basearch/'], 'gpgcheck': False, 'failovermethod': 'priority'})
skipping: [10.10.10.11] => (item={'name': 'extras', 'description': 'CentOS-$releasever - Extras - Aliyun Mirror', 'baseurl': ['http://mirrors.aliyun.com/centos/$releasever/extras/$basearch/', 'http://mirrors.aliyuncs.com/centos/$releasever/extras/$basearch/', 'http://mirrors.cloud.aliyuncs.com/centos/$releasever/extras/$basearch/'], 'gpgcheck': False, 'failovermethod': 'priority'})
skipping: [10.10.10.10] => (item={'name': 'grafana', 'description': 'Grafana - TsingHua Mirror', 'gpgcheck': False, 'baseurl': 'https://mirrors.tuna.tsinghua.edu.cn/grafana/yum/rpm'})
skipping: [10.10.10.12] => (item={'name': 'updates', 'description': 'CentOS-$releasever - Updates - Aliyun Mirror', 'baseurl': ['http://mirrors.aliyun.com/centos/$releasever/updates/$basearch/', 'http://mirrors.aliyuncs.com/centos/$releasever/updates/$basearch/', 'http://mirrors.cloud.aliyuncs.com/centos/$releasever/updates/$basearch/'], 'gpgcheck': False, 'failovermethod': 'priority'})
skipping: [10.10.10.11] => (item={'name': 'epel', 'description': 'CentOS $releasever - EPEL - Aliyun Mirror', 'baseurl': 'http://mirrors.aliyun.com/epel/$releasever/$basearch', 'gpgcheck': False, 'failovermethod': 'priority'})
skipping: [10.10.10.13] => (item={'name': 'base', 'description': 'CentOS-$releasever - Base - Aliyun Mirror', 'baseurl': ['http://mirrors.aliyun.com/centos/$releasever/os/$basearch/', 'http://mirrors.aliyuncs.com/centos/$releasever/os/$basearch/', 'http://mirrors.cloud.aliyuncs.com/centos/$releasever/os/$basearch/'], 'gpgcheck': False, 'failovermethod': 'priority'})
skipping: [10.10.10.12] => (item={'name': 'extras', 'description': 'CentOS-$releasever - Extras - Aliyun Mirror', 'baseurl': ['http://mirrors.aliyun.com/centos/$releasever/extras/$basearch/', 'http://mirrors.aliyuncs.com/centos/$releasever/extras/$basearch/', 'http://mirrors.cloud.aliyuncs.com/centos/$releasever/extras/$basearch/'], 'gpgcheck': False, 'failovermethod': 'priority'})
skipping: [10.10.10.11] => (item={'name': 'grafana', 'description': 'Grafana - TsingHua Mirror', 'gpgcheck': False, 'baseurl': 'https://mirrors.tuna.tsinghua.edu.cn/grafana/yum/rpm'})
skipping: [10.10.10.13] => (item={'name': 'updates', 'description': 'CentOS-$releasever - Updates - Aliyun Mirror', 'baseurl': ['http://mirrors.aliyun.com/centos/$releasever/updates/$basearch/', 'http://mirrors.aliyuncs.com/centos/$releasever/updates/$basearch/', 'http://mirrors.cloud.aliyuncs.com/centos/$releasever/updates/$basearch/'], 'gpgcheck': False, 'failovermethod': 'priority'})
skipping: [10.10.10.12] => (item={'name': 'epel', 'description': 'CentOS $releasever - EPEL - Aliyun Mirror', 'baseurl': 'http://mirrors.aliyun.com/epel/$releasever/$basearch', 'gpgcheck': False, 'failovermethod': 'priority'})
skipping: [10.10.10.13] => (item={'name': 'extras', 'description': 'CentOS-$releasever - Extras - Aliyun Mirror', 'baseurl': ['http://mirrors.aliyun.com/centos/$releasever/extras/$basearch/', 'http://mirrors.aliyuncs.com/centos/$releasever/extras/$basearch/', 'http://mirrors.cloud.aliyuncs.com/centos/$releasever/extras/$basearch/'], 'gpgcheck': False, 'failovermethod': 'priority'})
skipping: [10.10.10.12] => (item={'name': 'grafana', 'description': 'Grafana - TsingHua Mirror', 'gpgcheck': False, 'baseurl': 'https://mirrors.tuna.tsinghua.edu.cn/grafana/yum/rpm'})
skipping: [10.10.10.13] => (item={'name': 'epel', 'description': 'CentOS $releasever - EPEL - Aliyun Mirror', 'baseurl': 'http://mirrors.aliyun.com/epel/$releasever/$basearch', 'gpgcheck': False, 'failovermethod': 'priority'})
skipping: [10.10.10.13] => (item={'name': 'grafana', 'description': 'Grafana - TsingHua Mirror', 'gpgcheck': False, 'baseurl': 'https://mirrors.tuna.tsinghua.edu.cn/grafana/yum/rpm'})
skipping: [10.10.10.10] => (item={'name': 'prometheus', 'description': 'Prometheus and exporters', 'gpgcheck': False, 'baseurl': 'https://packagecloud.io/prometheus-rpm/release/el/$releasever/$basearch'})
skipping: [10.10.10.10] => (item={'name': 'pgdg-common', 'description': 'PostgreSQL common RPMs for RHEL/CentOS $releasever - $basearch', 'gpgcheck': False, 'baseurl': 'http://mirrors.zju.edu.cn/postgresql/repos/yum/common/redhat/rhel-$releasever-$basearch'})
skipping: [10.10.10.11] => (item={'name': 'prometheus', 'description': 'Prometheus and exporters', 'gpgcheck': False, 'baseurl': 'https://packagecloud.io/prometheus-rpm/release/el/$releasever/$basearch'})
skipping: [10.10.10.10] => (item={'name': 'pgdg13', 'description': 'PostgreSQL 13 for RHEL/CentOS $releasever - $basearch', 'gpgcheck': False, 'baseurl': 'http://mirrors.zju.edu.cn/postgresql/repos/yum/13/redhat/rhel-$releasever-$basearch'})
skipping: [10.10.10.11] => (item={'name': 'pgdg-common', 'description': 'PostgreSQL common RPMs for RHEL/CentOS $releasever - $basearch', 'gpgcheck': False, 'baseurl': 'http://mirrors.zju.edu.cn/postgresql/repos/yum/common/redhat/rhel-$releasever-$basearch'})
skipping: [10.10.10.12] => (item={'name': 'prometheus', 'description': 'Prometheus and exporters', 'gpgcheck': False, 'baseurl': 'https://packagecloud.io/prometheus-rpm/release/el/$releasever/$basearch'})
skipping: [10.10.10.10] => (item={'name': 'centos-sclo', 'description': 'CentOS-$releasever - SCLo', 'gpgcheck': False, 'mirrorlist': 'http://mirrorlist.centos.org?arch=$basearch&release=7&repo=sclo-sclo'})
skipping: [10.10.10.11] => (item={'name': 'pgdg13', 'description': 'PostgreSQL 13 for RHEL/CentOS $releasever - $basearch', 'gpgcheck': False, 'baseurl': 'http://mirrors.zju.edu.cn/postgresql/repos/yum/13/redhat/rhel-$releasever-$basearch'})
skipping: [10.10.10.12] => (item={'name': 'pgdg-common', 'description': 'PostgreSQL common RPMs for RHEL/CentOS $releasever - $basearch', 'gpgcheck': False, 'baseurl': 'http://mirrors.zju.edu.cn/postgresql/repos/yum/common/redhat/rhel-$releasever-$basearch'})
skipping: [10.10.10.10] => (item={'name': 'centos-sclo-rh', 'description': 'CentOS-$releasever - SCLo rh', 'gpgcheck': False, 'mirrorlist': 'http://mirrorlist.centos.org?arch=$basearch&release=7&repo=sclo-rh'})
skipping: [10.10.10.11] => (item={'name': 'centos-sclo', 'description': 'CentOS-$releasever - SCLo', 'gpgcheck': False, 'mirrorlist': 'http://mirrorlist.centos.org?arch=$basearch&release=7&repo=sclo-sclo'})
skipping: [10.10.10.12] => (item={'name': 'pgdg13', 'description': 'PostgreSQL 13 for RHEL/CentOS $releasever - $basearch', 'gpgcheck': False, 'baseurl': 'http://mirrors.zju.edu.cn/postgresql/repos/yum/13/redhat/rhel-$releasever-$basearch'})
skipping: [10.10.10.10] => (item={'name': 'nginx', 'description': 'Nginx Official Yum Repo', 'skip_if_unavailable': True, 'gpgcheck': False, 'baseurl': 'http://nginx.org/packages/centos/$releasever/$basearch/'})
skipping: [10.10.10.13] => (item={'name': 'prometheus', 'description': 'Prometheus and exporters', 'gpgcheck': False, 'baseurl': 'https://packagecloud.io/prometheus-rpm/release/el/$releasever/$basearch'})
skipping: [10.10.10.11] => (item={'name': 'centos-sclo-rh', 'description': 'CentOS-$releasever - SCLo rh', 'gpgcheck': False, 'mirrorlist': 'http://mirrorlist.centos.org?arch=$basearch&release=7&repo=sclo-rh'})
skipping: [10.10.10.12] => (item={'name': 'centos-sclo', 'description': 'CentOS-$releasever - SCLo', 'gpgcheck': False, 'mirrorlist': 'http://mirrorlist.centos.org?arch=$basearch&release=7&repo=sclo-sclo'})
skipping: [10.10.10.13] => (item={'name': 'pgdg-common', 'description': 'PostgreSQL common RPMs for RHEL/CentOS $releasever - $basearch', 'gpgcheck': False, 'baseurl': 'http://mirrors.zju.edu.cn/postgresql/repos/yum/common/redhat/rhel-$releasever-$basearch'})
skipping: [10.10.10.10] => (item={'name': 'haproxy', 'description': 'Copr repo for haproxy', 'skip_if_unavailable': True, 'gpgcheck': False, 'baseurl': 'https://download.copr.fedorainfracloud.org/results/roidelapluie/haproxy/epel-$releasever-$basearch/'})
skipping: [10.10.10.11] => (item={'name': 'nginx', 'description': 'Nginx Official Yum Repo', 'skip_if_unavailable': True, 'gpgcheck': False, 'baseurl': 'http://nginx.org/packages/centos/$releasever/$basearch/'})
skipping: [10.10.10.12] => (item={'name': 'centos-sclo-rh', 'description': 'CentOS-$releasever - SCLo rh', 'gpgcheck': False, 'mirrorlist': 'http://mirrorlist.centos.org?arch=$basearch&release=7&repo=sclo-rh'})
skipping: [10.10.10.10] => (item={'name': 'harbottle', 'description': 'Copr repo for main owned by harbottle', 'skip_if_unavailable': True, 'gpgcheck': False, 'baseurl': 'https://download.copr.fedorainfracloud.org/results/harbottle/main/epel-$releasever-$basearch/'})
skipping: [10.10.10.13] => (item={'name': 'pgdg13', 'description': 'PostgreSQL 13 for RHEL/CentOS $releasever - $basearch', 'gpgcheck': False, 'baseurl': 'http://mirrors.zju.edu.cn/postgresql/repos/yum/13/redhat/rhel-$releasever-$basearch'})
skipping: [10.10.10.11] => (item={'name': 'haproxy', 'description': 'Copr repo for haproxy', 'skip_if_unavailable': True, 'gpgcheck': False, 'baseurl': 'https://download.copr.fedorainfracloud.org/results/roidelapluie/haproxy/epel-$releasever-$basearch/'})
skipping: [10.10.10.12] => (item={'name': 'nginx', 'description': 'Nginx Official Yum Repo', 'skip_if_unavailable': True, 'gpgcheck': False, 'baseurl': 'http://nginx.org/packages/centos/$releasever/$basearch/'})
skipping: [10.10.10.13] => (item={'name': 'centos-sclo', 'description': 'CentOS-$releasever - SCLo', 'gpgcheck': False, 'mirrorlist': 'http://mirrorlist.centos.org?arch=$basearch&release=7&repo=sclo-sclo'})
skipping: [10.10.10.11] => (item={'name': 'harbottle', 'description': 'Copr repo for main owned by harbottle', 'skip_if_unavailable': True, 'gpgcheck': False, 'baseurl': 'https://download.copr.fedorainfracloud.org/results/harbottle/main/epel-$releasever-$basearch/'})
skipping: [10.10.10.12] => (item={'name': 'haproxy', 'description': 'Copr repo for haproxy', 'skip_if_unavailable': True, 'gpgcheck': False, 'baseurl': 'https://download.copr.fedorainfracloud.org/results/roidelapluie/haproxy/epel-$releasever-$basearch/'})
skipping: [10.10.10.13] => (item={'name': 'centos-sclo-rh', 'description': 'CentOS-$releasever - SCLo rh', 'gpgcheck': False, 'mirrorlist': 'http://mirrorlist.centos.org?arch=$basearch&release=7&repo=sclo-rh'})
skipping: [10.10.10.12] => (item={'name': 'harbottle', 'description': 'Copr repo for main owned by harbottle', 'skip_if_unavailable': True, 'gpgcheck': False, 'baseurl': 'https://download.copr.fedorainfracloud.org/results/harbottle/main/epel-$releasever-$basearch/'})
skipping: [10.10.10.13] => (item={'name': 'nginx', 'description': 'Nginx Official Yum Repo', 'skip_if_unavailable': True, 'gpgcheck': False, 'baseurl': 'http://nginx.org/packages/centos/$releasever/$basearch/'})
skipping: [10.10.10.13] => (item={'name': 'haproxy', 'description': 'Copr repo for haproxy', 'skip_if_unavailable': True, 'gpgcheck': False, 'baseurl': 'https://download.copr.fedorainfracloud.org/results/roidelapluie/haproxy/epel-$releasever-$basearch/'})
skipping: [10.10.10.13] => (item={'name': 'harbottle', 'description': 'Copr repo for main owned by harbottle', 'skip_if_unavailable': True, 'gpgcheck': False, 'baseurl': 'https://download.copr.fedorainfracloud.org/results/harbottle/main/epel-$releasever-$basearch/'})
TASK [node : Install local repo] *************************************************************************************************************************************************************************
changed: [10.10.10.13] => (item=http://yum.pigsty/pigsty.repo)
changed: [10.10.10.12] => (item=http://yum.pigsty/pigsty.repo)
changed: [10.10.10.11] => (item=http://yum.pigsty/pigsty.repo)
changed: [10.10.10.10] => (item=http://yum.pigsty/pigsty.repo)
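In this run the upstream repo definitions above are skipped and every node instead installs the repo file served from the local yum server on the meta node (http://yum.pigsty/pigsty.repo). A minimal sketch of such a task, assuming the stock get_url module; the destination path and variable name are illustrative, not the actual role source:
- name: Install local repo
  get_url:
    url: "{{ item }}"                           # e.g. http://yum.pigsty/pigsty.repo, as in the log above
    dest: /etc/yum.repos.d/{{ item | basename }}  # assumed destination for the repo file
  with_items: "{{ repo_url_list }}"             # assumed variable holding the local repo URL(s)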
TASK [node : Install node basic packages] ****************************************************************************************************************************************************************
skipping: [10.10.10.10] => (item=[])
skipping: [10.10.10.11] => (item=[])
skipping: [10.10.10.12] => (item=[])
skipping: [10.10.10.13] => (item=[])
TASK [node : Install node extra packages] ****************************************************************************************************************************************************************
skipping: [10.10.10.10] => (item=[])
skipping: [10.10.10.11] => (item=[])
skipping: [10.10.10.12] => (item=[])
skipping: [10.10.10.13] => (item=[])
TASK [node : Install meta specific packages] *************************************************************************************************************************************************************
skipping: [10.10.10.10] => (item=[])
skipping: [10.10.10.11] => (item=[])
skipping: [10.10.10.12] => (item=[])
skipping: [10.10.10.13] => (item=[])
TASK [node : Install node basic packages] ****************************************************************************************************************************************************************
changed: [10.10.10.10] => (item=['wget,yum-utils,ntp,chrony,tuned,uuid,lz4,vim-minimal,make,patch,bash,lsof,wget,unzip,git,readline,zlib,openssl', 'numactl,grubby,sysstat,dstat,iotop,bind-utils,net-tools,tcpdump,socat,ipvsadm,telnet,tuned,pv,jq', 'python3,python3-psycopg2,python36-requests,python3-etcd,python3-consul', 'python36-urllib3,python36-idna,python36-pyOpenSSL,python36-cryptography', 'node_exporter,consul,consul-template,etcd,haproxy,keepalived,vip-manager'])
changed: [10.10.10.13] => (item=['wget,yum-utils,ntp,chrony,tuned,uuid,lz4,vim-minimal,make,patch,bash,lsof,wget,unzip,git,readline,zlib,openssl', 'numactl,grubby,sysstat,dstat,iotop,bind-utils,net-tools,tcpdump,socat,ipvsadm,telnet,tuned,pv,jq', 'python3,python3-psycopg2,python36-requests,python3-etcd,python3-consul', 'python36-urllib3,python36-idna,python36-pyOpenSSL,python36-cryptography', 'node_exporter,consul,consul-template,etcd,haproxy,keepalived,vip-manager'])
changed: [10.10.10.11] => (item=['wget,yum-utils,ntp,chrony,tuned,uuid,lz4,vim-minimal,make,patch,bash,lsof,wget,unzip,git,readline,zlib,openssl', 'numactl,grubby,sysstat,dstat,iotop,bind-utils,net-tools,tcpdump,socat,ipvsadm,telnet,tuned,pv,jq', 'python3,python3-psycopg2,python36-requests,python3-etcd,python3-consul', 'python36-urllib3,python36-idna,python36-pyOpenSSL,python36-cryptography', 'node_exporter,consul,consul-template,etcd,haproxy,keepalived,vip-manager'])
changed: [10.10.10.12] => (item=['wget,yum-utils,ntp,chrony,tuned,uuid,lz4,vim-minimal,make,patch,bash,lsof,wget,unzip,git,readline,zlib,openssl', 'numactl,grubby,sysstat,dstat,iotop,bind-utils,net-tools,tcpdump,socat,ipvsadm,telnet,tuned,pv,jq', 'python3,python3-psycopg2,python36-requests,python3-etcd,python3-consul', 'python36-urllib3,python36-idna,python36-pyOpenSSL,python36-cryptography', 'node_exporter,consul,consul-template,etcd,haproxy,keepalived,vip-manager'])
TASK [node : Install node extra packages] ****************************************************************************************************************************************************************
changed: [10.10.10.11] => (item=['patroni,patroni-consul,patroni-etcd,pgbouncer,pgbadger,pg_activity'])
changed: [10.10.10.12] => (item=['patroni,patroni-consul,patroni-etcd,pgbouncer,pgbadger,pg_activity'])
changed: [10.10.10.13] => (item=['patroni,patroni-consul,patroni-etcd,pgbouncer,pgbadger,pg_activity'])
changed: [10.10.10.10] => (item=['patroni,patroni-consul,patroni-etcd,pgbouncer,pgbadger,pg_activity'])
TASK [node : Install meta specific packages] *************************************************************************************************************************************************************
skipping: [10.10.10.11] => (item=[])
skipping: [10.10.10.12] => (item=[])
skipping: [10.10.10.13] => (item=[])
changed: [10.10.10.10] => (item=['grafana,prometheus2,alertmanager,nginx_exporter,blackbox_exporter,pushgateway', 'dnsmasq,nginx,ansible,pgbadger,polysh'])
TASK [node : Node configure disable numa] ****************************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [node : Node configure disable swap] ****************************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [node : Node configure unmount swap] ****************************************************************************************************************************************************************
skipping: [10.10.10.10] => (item=swap)
skipping: [10.10.10.10] => (item=none)
skipping: [10.10.10.11] => (item=swap)
skipping: [10.10.10.11] => (item=none)
skipping: [10.10.10.12] => (item=swap)
skipping: [10.10.10.12] => (item=none)
skipping: [10.10.10.13] => (item=swap)
skipping: [10.10.10.13] => (item=none)
TASK [node : Node setup static network] ******************************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
changed: [10.10.10.10]
TASK [node : Node configure disable firewall] ************************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.10]
changed: [10.10.10.13]
TASK [node : Node configure disk prefetch] ***************************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [node : Enable linux kernel modules] ****************************************************************************************************************************************************************
changed: [10.10.10.13] => (item=softdog)
changed: [10.10.10.12] => (item=softdog)
changed: [10.10.10.11] => (item=softdog)
changed: [10.10.10.10] => (item=softdog)
changed: [10.10.10.13] => (item=br_netfilter)
changed: [10.10.10.12] => (item=br_netfilter)
changed: [10.10.10.11] => (item=br_netfilter)
changed: [10.10.10.10] => (item=br_netfilter)
changed: [10.10.10.12] => (item=ip_vs)
changed: [10.10.10.13] => (item=ip_vs)
changed: [10.10.10.11] => (item=ip_vs)
changed: [10.10.10.10] => (item=ip_vs)
changed: [10.10.10.13] => (item=ip_vs_rr)
changed: [10.10.10.12] => (item=ip_vs_rr)
changed: [10.10.10.11] => (item=ip_vs_rr)
changed: [10.10.10.10] => (item=ip_vs_rr)
ok: [10.10.10.13] => (item=ip_vs_rr)
ok: [10.10.10.12] => (item=ip_vs_rr)
ok: [10.10.10.11] => (item=ip_vs_rr)
ok: [10.10.10.10] => (item=ip_vs_rr)
changed: [10.10.10.13] => (item=ip_vs_wrr)
changed: [10.10.10.12] => (item=ip_vs_wrr)
changed: [10.10.10.11] => (item=ip_vs_wrr)
changed: [10.10.10.10] => (item=ip_vs_wrr)
changed: [10.10.10.13] => (item=ip_vs_sh)
changed: [10.10.10.12] => (item=ip_vs_sh)
changed: [10.10.10.11] => (item=ip_vs_sh)
changed: [10.10.10.10] => (item=ip_vs_sh)
changed: [10.10.10.13] => (item=nf_conntrack_ipv4)
changed: [10.10.10.12] => (item=nf_conntrack_ipv4)
changed: [10.10.10.11] => (item=nf_conntrack_ipv4)
changed: [10.10.10.10] => (item=nf_conntrack_ipv4)
TASK [node : Enable kernel module on reboot] *************************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.13]
changed: [10.10.10.12]
changed: [10.10.10.10]
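The two tasks above load the listed kernel modules immediately and persist them across reboots. A minimal sketch of the pattern, assuming the stock modprobe and copy modules; the variable name and file path are illustrative, not the actual role source:
- name: Enable linux kernel modules
  modprobe:
    name: "{{ item }}"
    state: present
  with_items: "{{ node_kernel_modules }}"       # softdog, br_netfilter, ip_vs, ip_vs_rr, ip_vs_wrr, ip_vs_sh, nf_conntrack_ipv4 as shown above

- name: Enable kernel module on reboot
  copy:
    dest: /etc/modules-load.d/pigsty.conf       # assumed path read by systemd-modules-load at boot
    content: '{{ node_kernel_modules | join("\n") }}'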
TASK [node : Get config parameter page count] ************************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.10]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [node : Get config parameter page size] *************************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.10]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [node : Tune shmmax and shmall via mem] *************************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [node : Create tuned profiles] **********************************************************************************************************************************************************************
changed: [10.10.10.11] => (item=oltp)
changed: [10.10.10.12] => (item=oltp)
changed: [10.10.10.10] => (item=oltp)
changed: [10.10.10.13] => (item=oltp)
changed: [10.10.10.11] => (item=olap)
changed: [10.10.10.12] => (item=olap)
changed: [10.10.10.13] => (item=olap)
changed: [10.10.10.10] => (item=olap)
changed: [10.10.10.11] => (item=crit)
changed: [10.10.10.12] => (item=crit)
changed: [10.10.10.13] => (item=crit)
changed: [10.10.10.10] => (item=crit)
changed: [10.10.10.11] => (item=tiny)
changed: [10.10.10.12] => (item=tiny)
changed: [10.10.10.13] => (item=tiny)
changed: [10.10.10.10] => (item=tiny)
TASK [node : Render tuned profiles] **********************************************************************************************************************************************************************
changed: [10.10.10.11] => (item=oltp)
changed: [10.10.10.12] => (item=oltp)
changed: [10.10.10.13] => (item=oltp)
changed: [10.10.10.10] => (item=oltp)
changed: [10.10.10.12] => (item=olap)
changed: [10.10.10.11] => (item=olap)
changed: [10.10.10.13] => (item=olap)
changed: [10.10.10.10] => (item=olap)
changed: [10.10.10.12] => (item=crit)
changed: [10.10.10.11] => (item=crit)
changed: [10.10.10.13] => (item=crit)
changed: [10.10.10.10] => (item=crit)
changed: [10.10.10.11] => (item=tiny)
changed: [10.10.10.12] => (item=tiny)
changed: [10.10.10.13] => (item=tiny)
changed: [10.10.10.10] => (item=tiny)
TASK [node : Active tuned profile] ***********************************************************************************************************************************************************************
changed: [10.10.10.13]
changed: [10.10.10.10]
changed: [10.10.10.12]
changed: [10.10.10.11]
TASK [node : Change additional sysctl params] ************************************************************************************************************************************************************
changed: [10.10.10.13] => (item={'key': 'net.bridge.bridge-nf-call-iptables', 'value': 1})
changed: [10.10.10.12] => (item={'key': 'net.bridge.bridge-nf-call-iptables', 'value': 1})
changed: [10.10.10.11] => (item={'key': 'net.bridge.bridge-nf-call-iptables', 'value': 1})
changed: [10.10.10.10] => (item={'key': 'net.bridge.bridge-nf-call-iptables', 'value': 1})
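Beyond the tuned profile, one extra sysctl entry is applied here: net.bridge.bridge-nf-call-iptables = 1 (this key only exists after the br_netfilter module loaded above). A sketch of how such a key/value map can be applied with the stock sysctl module; the variable name is an assumption:
- name: Change additional sysctl params
  sysctl:
    name: "{{ item.key }}"
    value: "{{ item.value }}"
    state: present
    sysctl_set: yes
  with_dict: "{{ node_sysctl_params }}"         # e.g. { net.bridge.bridge-nf-call-iptables: 1 }, matching the items above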
TASK [node : Copy default user bash profile] *************************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
changed: [10.10.10.10]
TASK [node : Setup node default pam ulimits] *************************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
changed: [10.10.10.10]
TASK [node : Create os user group admin] *****************************************************************************************************************************************************************
changed: [10.10.10.12]
changed: [10.10.10.11]
changed: [10.10.10.13]
changed: [10.10.10.10]
TASK [node : Create os user admin] ***********************************************************************************************************************************************************************
changed: [10.10.10.13]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.10]
TASK [node : Grant admin group nopass sudo] **************************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
changed: [10.10.10.10]
TASK [node : Add no host checking to ssh config] *********************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
changed: [10.10.10.10]
TASK [node : Add admin ssh no host checking] *************************************************************************************************************************************************************
ok: [10.10.10.11]
ok: [10.10.10.10]
ok: [10.10.10.12]
ok: [10.10.10.13]
TASK [node : Fetch all admin public keys] ****************************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.10]
changed: [10.10.10.13]
TASK [node : Exchange all admin ssh keys] ****************************************************************************************************************************************************************
changed: [10.10.10.10 -> meta] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDfXbkp7ATV3rIzcpCwxcwpumIjnjldzDp9qfu65d4W5gSNumN/wvOORnG17rB2y/msyjstu1C42v2V60yho/XjPNIqqPWPtM/bc6MHNeNJJxvEEtDsY530z3n37QTcVI1kg3zRqnzm8HDKEE+BAll+iyXjzTFoGHc39syDRF8r5sZpG0qiNY2QaqEnByASsoHM4RQ3Jw2D2SbA78wFBz1zqsdz5VympAcc9wcfuUqhwk0ExL+AtrPNUeyEXwgRr1Br6JXVHjT6EHLsZburTD7uT94Jqzixd3LXRwsmuCrPIssASrYvfnWVQ29MxhiZqrmLcwp4ImjQetcZE2EgfzEp ansible-generated on meta', '10.10.10.10'])
changed: [10.10.10.13 -> meta] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDbkD6WQhs9KAv9HTYtZ+q2Nfxqhj72YbP16m0mTrEOS2evd4MWDBhVgAE6qK4gvAhVBdEdNaHc3f2W/wDpKvvbvCbwy+HZldUCTVUe1W3sycm1ZwP7m9Xr7Rg0Dd1Nom87CWsqmlmN6afPYyvJV3wCl4ZuqrAMQ5oCrR4D1B8yZBL7rj55JpzggnNJYv7+ueIeUYoPzE6mu32k9wPxEa2qXcdVelgL7dwjTAt1nsNukWAufuAI1nZcJahsNjj1B2XEEwgA1mHUzDPpemn5alCNeCb+Hdb0Y12No/Wo2Gcn3b5vh9pOamLCm3CGrrsAXZ2B8tQPGFObhGkSOB6pddkT ansible-generated on node-3', '10.10.10.10'])
changed: [10.10.10.12 -> meta] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC3IAopnkVwQ779/Hk5MceAVZbhb/y3YaUu7ZROI87TaY/XK5WKJjplfNlLBC2vXGNkYMirbW+Qmmz/XIsyL7qvKmQfcMGP3ILD4FtMMlJMWLwBTIw5ORxvoZGxaWfw0bcZSIw5rv9rBA4UJR9JfZhpUkBMj7cq8jNDyIrLpoJ+hlnJa5G5zyiMWBqe7VKOoiBo7d2WBIauhRgHY3G79H9pVxJti6JJOeQ1tsUI5UtOMCRO+dbmsuRWruac4jWOj864RG/EjFveWEfCTagMFakqaxPTgF3RHAwPVBjbMm3+2lBiVNd2Zt2g/2gPdkEbIE+xXXP/f5kh21gXFea4ENsV ansible-generated on node-2', '10.10.10.10'])
changed: [10.10.10.11 -> meta] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC2TJItJzBUEZ452k7ADL6mIQsGk7gb4AUqvN0pAHwR06pVv1XUmpCI5Wb0RUOoNFwmSBVTUXoXCnK7SB44ftpzD29cpxw3tlLEphYeY1wfrd2lblhpn2KxzBhyJZ27lK2qcZk7Ik20pZDhQZRuZuhb6HufYn7FGOutB8kgQChrcpqr9zRhjZOe4Y8tLR2lmEAVrp6ZsS04rjiBJ65TDCWCNSnin8DVbM1EerJ6Pvxy1cOY+B00EYMHlMni/3orzcrlnZqpkR/NRpgs9+lo+DZ4SCuEtIEOzpPzcm/O4oLhxSnTMJKTFwcc+bgmE0t1LMxvIKOQTwhIX+KoBE/syxh9 ansible-generated on node-1', '10.10.10.10'])
changed: [10.10.10.10 -> node-1] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDfXbkp7ATV3rIzcpCwxcwpumIjnjldzDp9qfu65d4W5gSNumN/wvOORnG17rB2y/msyjstu1C42v2V60yho/XjPNIqqPWPtM/bc6MHNeNJJxvEEtDsY530z3n37QTcVI1kg3zRqnzm8HDKEE+BAll+iyXjzTFoGHc39syDRF8r5sZpG0qiNY2QaqEnByASsoHM4RQ3Jw2D2SbA78wFBz1zqsdz5VympAcc9wcfuUqhwk0ExL+AtrPNUeyEXwgRr1Br6JXVHjT6EHLsZburTD7uT94Jqzixd3LXRwsmuCrPIssASrYvfnWVQ29MxhiZqrmLcwp4ImjQetcZE2EgfzEp ansible-generated on meta', '10.10.10.11'])
changed: [10.10.10.12 -> node-1] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC3IAopnkVwQ779/Hk5MceAVZbhb/y3YaUu7ZROI87TaY/XK5WKJjplfNlLBC2vXGNkYMirbW+Qmmz/XIsyL7qvKmQfcMGP3ILD4FtMMlJMWLwBTIw5ORxvoZGxaWfw0bcZSIw5rv9rBA4UJR9JfZhpUkBMj7cq8jNDyIrLpoJ+hlnJa5G5zyiMWBqe7VKOoiBo7d2WBIauhRgHY3G79H9pVxJti6JJOeQ1tsUI5UtOMCRO+dbmsuRWruac4jWOj864RG/EjFveWEfCTagMFakqaxPTgF3RHAwPVBjbMm3+2lBiVNd2Zt2g/2gPdkEbIE+xXXP/f5kh21gXFea4ENsV ansible-generated on node-2', '10.10.10.11'])
changed: [10.10.10.13 -> node-1] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDbkD6WQhs9KAv9HTYtZ+q2Nfxqhj72YbP16m0mTrEOS2evd4MWDBhVgAE6qK4gvAhVBdEdNaHc3f2W/wDpKvvbvCbwy+HZldUCTVUe1W3sycm1ZwP7m9Xr7Rg0Dd1Nom87CWsqmlmN6afPYyvJV3wCl4ZuqrAMQ5oCrR4D1B8yZBL7rj55JpzggnNJYv7+ueIeUYoPzE6mu32k9wPxEa2qXcdVelgL7dwjTAt1nsNukWAufuAI1nZcJahsNjj1B2XEEwgA1mHUzDPpemn5alCNeCb+Hdb0Y12No/Wo2Gcn3b5vh9pOamLCm3CGrrsAXZ2B8tQPGFObhGkSOB6pddkT ansible-generated on node-3', '10.10.10.11'])
changed: [10.10.10.11 -> node-1] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC2TJItJzBUEZ452k7ADL6mIQsGk7gb4AUqvN0pAHwR06pVv1XUmpCI5Wb0RUOoNFwmSBVTUXoXCnK7SB44ftpzD29cpxw3tlLEphYeY1wfrd2lblhpn2KxzBhyJZ27lK2qcZk7Ik20pZDhQZRuZuhb6HufYn7FGOutB8kgQChrcpqr9zRhjZOe4Y8tLR2lmEAVrp6ZsS04rjiBJ65TDCWCNSnin8DVbM1EerJ6Pvxy1cOY+B00EYMHlMni/3orzcrlnZqpkR/NRpgs9+lo+DZ4SCuEtIEOzpPzcm/O4oLhxSnTMJKTFwcc+bgmE0t1LMxvIKOQTwhIX+KoBE/syxh9 ansible-generated on node-1', '10.10.10.11'])
changed: [10.10.10.10 -> node-2] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDfXbkp7ATV3rIzcpCwxcwpumIjnjldzDp9qfu65d4W5gSNumN/wvOORnG17rB2y/msyjstu1C42v2V60yho/XjPNIqqPWPtM/bc6MHNeNJJxvEEtDsY530z3n37QTcVI1kg3zRqnzm8HDKEE+BAll+iyXjzTFoGHc39syDRF8r5sZpG0qiNY2QaqEnByASsoHM4RQ3Jw2D2SbA78wFBz1zqsdz5VympAcc9wcfuUqhwk0ExL+AtrPNUeyEXwgRr1Br6JXVHjT6EHLsZburTD7uT94Jqzixd3LXRwsmuCrPIssASrYvfnWVQ29MxhiZqrmLcwp4ImjQetcZE2EgfzEp ansible-generated on meta', '10.10.10.12'])
changed: [10.10.10.13 -> node-2] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDbkD6WQhs9KAv9HTYtZ+q2Nfxqhj72YbP16m0mTrEOS2evd4MWDBhVgAE6qK4gvAhVBdEdNaHc3f2W/wDpKvvbvCbwy+HZldUCTVUe1W3sycm1ZwP7m9Xr7Rg0Dd1Nom87CWsqmlmN6afPYyvJV3wCl4ZuqrAMQ5oCrR4D1B8yZBL7rj55JpzggnNJYv7+ueIeUYoPzE6mu32k9wPxEa2qXcdVelgL7dwjTAt1nsNukWAufuAI1nZcJahsNjj1B2XEEwgA1mHUzDPpemn5alCNeCb+Hdb0Y12No/Wo2Gcn3b5vh9pOamLCm3CGrrsAXZ2B8tQPGFObhGkSOB6pddkT ansible-generated on node-3', '10.10.10.12'])
changed: [10.10.10.12 -> node-2] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC3IAopnkVwQ779/Hk5MceAVZbhb/y3YaUu7ZROI87TaY/XK5WKJjplfNlLBC2vXGNkYMirbW+Qmmz/XIsyL7qvKmQfcMGP3ILD4FtMMlJMWLwBTIw5ORxvoZGxaWfw0bcZSIw5rv9rBA4UJR9JfZhpUkBMj7cq8jNDyIrLpoJ+hlnJa5G5zyiMWBqe7VKOoiBo7d2WBIauhRgHY3G79H9pVxJti6JJOeQ1tsUI5UtOMCRO+dbmsuRWruac4jWOj864RG/EjFveWEfCTagMFakqaxPTgF3RHAwPVBjbMm3+2lBiVNd2Zt2g/2gPdkEbIE+xXXP/f5kh21gXFea4ENsV ansible-generated on node-2', '10.10.10.12'])
changed: [10.10.10.11 -> node-2] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC2TJItJzBUEZ452k7ADL6mIQsGk7gb4AUqvN0pAHwR06pVv1XUmpCI5Wb0RUOoNFwmSBVTUXoXCnK7SB44ftpzD29cpxw3tlLEphYeY1wfrd2lblhpn2KxzBhyJZ27lK2qcZk7Ik20pZDhQZRuZuhb6HufYn7FGOutB8kgQChrcpqr9zRhjZOe4Y8tLR2lmEAVrp6ZsS04rjiBJ65TDCWCNSnin8DVbM1EerJ6Pvxy1cOY+B00EYMHlMni/3orzcrlnZqpkR/NRpgs9+lo+DZ4SCuEtIEOzpPzcm/O4oLhxSnTMJKTFwcc+bgmE0t1LMxvIKOQTwhIX+KoBE/syxh9 ansible-generated on node-1', '10.10.10.12'])
changed: [10.10.10.10 -> node-3] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDfXbkp7ATV3rIzcpCwxcwpumIjnjldzDp9qfu65d4W5gSNumN/wvOORnG17rB2y/msyjstu1C42v2V60yho/XjPNIqqPWPtM/bc6MHNeNJJxvEEtDsY530z3n37QTcVI1kg3zRqnzm8HDKEE+BAll+iyXjzTFoGHc39syDRF8r5sZpG0qiNY2QaqEnByASsoHM4RQ3Jw2D2SbA78wFBz1zqsdz5VympAcc9wcfuUqhwk0ExL+AtrPNUeyEXwgRr1Br6JXVHjT6EHLsZburTD7uT94Jqzixd3LXRwsmuCrPIssASrYvfnWVQ29MxhiZqrmLcwp4ImjQetcZE2EgfzEp ansible-generated on meta', '10.10.10.13'])
changed: [10.10.10.13 -> node-3] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDbkD6WQhs9KAv9HTYtZ+q2Nfxqhj72YbP16m0mTrEOS2evd4MWDBhVgAE6qK4gvAhVBdEdNaHc3f2W/wDpKvvbvCbwy+HZldUCTVUe1W3sycm1ZwP7m9Xr7Rg0Dd1Nom87CWsqmlmN6afPYyvJV3wCl4ZuqrAMQ5oCrR4D1B8yZBL7rj55JpzggnNJYv7+ueIeUYoPzE6mu32k9wPxEa2qXcdVelgL7dwjTAt1nsNukWAufuAI1nZcJahsNjj1B2XEEwgA1mHUzDPpemn5alCNeCb+Hdb0Y12No/Wo2Gcn3b5vh9pOamLCm3CGrrsAXZ2B8tQPGFObhGkSOB6pddkT ansible-generated on node-3', '10.10.10.13'])
changed: [10.10.10.11 -> node-3] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC2TJItJzBUEZ452k7ADL6mIQsGk7gb4AUqvN0pAHwR06pVv1XUmpCI5Wb0RUOoNFwmSBVTUXoXCnK7SB44ftpzD29cpxw3tlLEphYeY1wfrd2lblhpn2KxzBhyJZ27lK2qcZk7Ik20pZDhQZRuZuhb6HufYn7FGOutB8kgQChrcpqr9zRhjZOe4Y8tLR2lmEAVrp6ZsS04rjiBJ65TDCWCNSnin8DVbM1EerJ6Pvxy1cOY+B00EYMHlMni/3orzcrlnZqpkR/NRpgs9+lo+DZ4SCuEtIEOzpPzcm/O4oLhxSnTMJKTFwcc+bgmE0t1LMxvIKOQTwhIX+KoBE/syxh9 ansible-generated on node-1', '10.10.10.13'])
changed: [10.10.10.12 -> node-3] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC3IAopnkVwQ779/Hk5MceAVZbhb/y3YaUu7ZROI87TaY/XK5WKJjplfNlLBC2vXGNkYMirbW+Qmmz/XIsyL7qvKmQfcMGP3ILD4FtMMlJMWLwBTIw5ORxvoZGxaWfw0bcZSIw5rv9rBA4UJR9JfZhpUkBMj7cq8jNDyIrLpoJ+hlnJa5G5zyiMWBqe7VKOoiBo7d2WBIauhRgHY3G79H9pVxJti6JJOeQ1tsUI5UtOMCRO+dbmsuRWruac4jWOj864RG/EjFveWEfCTagMFakqaxPTgF3RHAwPVBjbMm3+2lBiVNd2Zt2g/2gPdkEbIE+xXXP/f5kh21gXFea4ENsV ansible-generated on node-2', '10.10.10.13'])
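The exchange above takes each node's freshly generated admin public key and installs it on every host, so the admin user can ssh freely between any pair of machines. A minimal sketch of the pattern, assuming the authorized_key module and illustrative variable names; the item shape ([public_key, target_host]) mirrors the log:
- name: Exchange all admin ssh keys
  authorized_key:
    user: "{{ node_admin_username }}"           # e.g. admin, created a few tasks earlier
    key: "{{ item.0 }}"                         # this node's admin public key ...
  delegate_to: "{{ item.1 }}"                   # ... installed on every host in the play
  with_nested:
    - [ "{{ admin_public_key }}" ]              # assumed: the key fetched from this node above
    - "{{ play_hosts }}"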
TASK [node : Install public keys] ************************************************************************************************************************************************************************
changed: [10.10.10.11] => (item=ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAAAgQC7IMAMNavYtWwzAJajKqwdn3ar5BhvcwCnBTxxEkXhGlCO2vfgosSAQMEflfgvkiI5nM1HIFQ8KINlx1XLO7SdL5KdInG5LIJjAFh0pujS4kNCT9a5IGvSq1BrzGqhbEcwWYdju1ZPYBcJm/MG+JD0dYCh8vfrYB/cYMD0SOmNkQ== vagrant@pigsty.com)
changed: [10.10.10.10] => (item=ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAAAgQC7IMAMNavYtWwzAJajKqwdn3ar5BhvcwCnBTxxEkXhGlCO2vfgosSAQMEflfgvkiI5nM1HIFQ8KINlx1XLO7SdL5KdInG5LIJjAFh0pujS4kNCT9a5IGvSq1BrzGqhbEcwWYdju1ZPYBcJm/MG+JD0dYCh8vfrYB/cYMD0SOmNkQ== vagrant@pigsty.com)
changed: [10.10.10.12] => (item=ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAAAgQC7IMAMNavYtWwzAJajKqwdn3ar5BhvcwCnBTxxEkXhGlCO2vfgosSAQMEflfgvkiI5nM1HIFQ8KINlx1XLO7SdL5KdInG5LIJjAFh0pujS4kNCT9a5IGvSq1BrzGqhbEcwWYdju1ZPYBcJm/MG+JD0dYCh8vfrYB/cYMD0SOmNkQ== vagrant@pigsty.com)
changed: [10.10.10.13] => (item=ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAAAgQC7IMAMNavYtWwzAJajKqwdn3ar5BhvcwCnBTxxEkXhGlCO2vfgosSAQMEflfgvkiI5nM1HIFQ8KINlx1XLO7SdL5KdInG5LIJjAFh0pujS4kNCT9a5IGvSq1BrzGqhbEcwWYdju1ZPYBcJm/MG+JD0dYCh8vfrYB/cYMD0SOmNkQ== vagrant@pigsty.com)
TASK [node : Install ntp package] ************************************************************************************************************************************************************************
ok: [10.10.10.11]
ok: [10.10.10.12]
ok: [10.10.10.10]
ok: [10.10.10.13]
TASK [node : Install chrony package] *********************************************************************************************************************************************************************
ok: [10.10.10.11]
ok: [10.10.10.12]
ok: [10.10.10.13]
ok: [10.10.10.10]
TASK [node : Setup default node timezone] ****************************************************************************************************************************************************************
changed: [10.10.10.13]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.10]
TASK [node : Copy the ntp.conf file] *********************************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
changed: [10.10.10.10]
TASK [node : Copy the chrony.conf template] **************************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
changed: [10.10.10.10]
TASK [node : Launch ntpd service] ************************************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [node : Launch chronyd service] *********************************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
PLAY [Init meta service] *********************************************************************************************************************************************************************************
TASK [ca : Create local ca directory] ********************************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [ca : Copy ca cert from local files] ****************************************************************************************************************************************************************
skipping: [10.10.10.10] => (item=ca.key)
skipping: [10.10.10.10] => (item=ca.crt)
TASK [ca : Check ca key cert exists] *********************************************************************************************************************************************************************
ok: [10.10.10.10]
TASK [ca : Create self-signed CA key-cert] ***************************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [nameserver : Make sure dnsmasq package installed] **************************************************************************************************************************************************
ok: [10.10.10.10]
TASK [nameserver : Copy dnsmasq /etc/dnsmasq.d/config] ***************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [nameserver : Add dynamic dns records to meta] ******************************************************************************************************************************************************
changed: [10.10.10.10] => (item=10.10.10.2 pg-meta)
changed: [10.10.10.10] => (item=10.10.10.3 pg-test)
changed: [10.10.10.10] => (item=10.10.10.10 meta-1)
changed: [10.10.10.10] => (item=10.10.10.11 node-1)
changed: [10.10.10.10] => (item=10.10.10.12 node-2)
changed: [10.10.10.10] => (item=10.10.10.13 node-3)
changed: [10.10.10.10] => (item=10.10.10.10 pigsty)
changed: [10.10.10.10] => (item=10.10.10.10 y.pigsty yum.pigsty)
changed: [10.10.10.10] => (item=10.10.10.10 c.pigsty consul.pigsty)
changed: [10.10.10.10] => (item=10.10.10.10 g.pigsty grafana.pigsty)
changed: [10.10.10.10] => (item=10.10.10.10 p.pigsty prometheus.pigsty)
changed: [10.10.10.10] => (item=10.10.10.10 a.pigsty alertmanager.pigsty)
changed: [10.10.10.10] => (item=10.10.10.10 n.pigsty ntp.pigsty)
changed: [10.10.10.10] => (item=10.10.10.10 h.pigsty haproxy.pigsty)
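The records added above are plain hosts-file entries (an address followed by one or more names) covering the two cluster names (pg-meta, pg-test), the four nodes, and the pigsty web endpoints, which dnsmasq then serves to the whole environment. A sketch of the step, assuming lineinfile and an illustrative file wired to dnsmasq via addn-hosts:
- name: Add dynamic dns records to meta
  lineinfile:
    path: /etc/hosts.dnsmasq                    # assumed file referenced by addn-hosts= in the dnsmasq config
    line: "{{ item }}"
    create: yes
  with_items: "{{ dns_records }}"               # e.g. '10.10.10.2 pg-meta', '10.10.10.10 g.pigsty grafana.pigsty'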
TASK [nameserver : Launch meta dnsmasq service] **********************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [nameserver : Wait for meta dnsmasq online] *********************************************************************************************************************************************************
ok: [10.10.10.10]
TASK [nameserver : Register consul dnsmasq service] ******************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [nameserver : Reload consul] ************************************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [nginx : Make sure nginx installed] *****************************************************************************************************************************************************************
ok: [10.10.10.10]
TASK [nginx : Create local html directory] ***************************************************************************************************************************************************************
ok: [10.10.10.10]
TASK [nginx : Create nginx config directory] *************************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [nginx : Update default nginx index page] ***********************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [nginx : Copy nginx default config] *****************************************************************************************************************************************************************
ok: [10.10.10.10]
TASK [nginx : Copy nginx upstream conf] ******************************************************************************************************************************************************************
changed: [10.10.10.10] => (item={'name': 'home', 'host': 'pigsty', 'url': '127.0.0.1:3000'})
changed: [10.10.10.10] => (item={'name': 'consul', 'host': 'c.pigsty', 'url': '127.0.0.1:8500'})
changed: [10.10.10.10] => (item={'name': 'grafana', 'host': 'g.pigsty', 'url': '127.0.0.1:3000'})
changed: [10.10.10.10] => (item={'name': 'prometheus', 'host': 'p.pigsty', 'url': '127.0.0.1:9090'})
changed: [10.10.10.10] => (item={'name': 'alertmanager', 'host': 'a.pigsty', 'url': '127.0.0.1:9093'})
changed: [10.10.10.10] => (item={'name': 'haproxy', 'host': 'h.pigsty', 'url': '127.0.0.1:9091'})
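Each upstream entry above binds a virtual host name to a local service port on the meta node: the home page and Grafana on 3000, Consul on 8500, Prometheus on 9090, Alertmanager on 9093, and the HAProxy index on 9091. A sketch of the templating loop, with the template name and destination path as assumptions:
- name: Copy nginx upstream conf
  template:
    src: nginx-upstream.conf.j2                 # assumed template name
    dest: /etc/nginx/conf.d/{{ item.name }}.conf
  with_items: "{{ nginx_upstream }}"            # entries of the form {name, host, url}, as shown above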
TASK [nginx : Templating /etc/nginx/haproxy.conf] ********************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [nginx : Render haproxy upstream in cluster mode] ***************************************************************************************************************************************************
changed: [10.10.10.10] => (item=pg-meta)
changed: [10.10.10.10] => (item=pg-test)
TASK [nginx : Render haproxy location in cluster mode] ***************************************************************************************************************************************************
changed: [10.10.10.10] => (item=pg-meta)
changed: [10.10.10.10] => (item=pg-test)
TASK [nginx : Templating haproxy cluster index] **********************************************************************************************************************************************************
changed: [10.10.10.10] => (item=pg-meta)
changed: [10.10.10.10] => (item=pg-test)
TASK [nginx : Templating haproxy cluster index] **********************************************************************************************************************************************************
changed: [10.10.10.10] => (item=pg-meta)
ok: [10.10.10.10] => (item=pg-test)
TASK [nginx : Restart meta nginx service] ****************************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [nginx : Wait for nginx service online] *************************************************************************************************************************************************************
ok: [10.10.10.10]
TASK [nginx : Make sure nginx exporter installed] ********************************************************************************************************************************************************
ok: [10.10.10.10]
TASK [nginx : Config nginx_exporter options] *************************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [nginx : Restart nginx_exporter service] ************************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [nginx : Wait for nginx exporter online] ************************************************************************************************************************************************************
ok: [10.10.10.10]
TASK [nginx : Register consul nginx service] ******************************************************************************************************************
changed: [10.10.10.10]
TASK [nginx : Register consul nginx-exporter service] ****************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [nginx : Reload consul] *****************************************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [prometheus : Install prometheus and alertmanager] **************************************************************************************************************************************************
ok: [10.10.10.10] => (item=prometheus2)
ok: [10.10.10.10] => (item=alertmanager)
TASK [prometheus : Wipe out prometheus config dir] *******************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [prometheus : Wipe out existing prometheus data] ****************************************************************************************************************************************************
ok: [10.10.10.10]
TASK [prometheus : Create prometheus directory structure] *****************************************************************************************************
changed: [10.10.10.10] => (item=/etc/prometheus)
changed: [10.10.10.10] => (item=/etc/prometheus/bin)
changed: [10.10.10.10] => (item=/etc/prometheus/rules)
changed: [10.10.10.10] => (item=/etc/prometheus/targets)
changed: [10.10.10.10] => (item=/export/prometheus/data)
TASK [prometheus : Copy prometheus bin scripts] **********************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [prometheus : Copy prometheus rules scripts] ********************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [prometheus : Copy alertmanager config] ******************************************************************************************************************
changed: [10.10.10.10]
TASK [prometheus : Render prometheus config] *************************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [prometheus : Config /etc/prometheus opts] **********************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [prometheus : Launch prometheus service] ************************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [prometheus : Launch alertmanager service] **********************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [prometheus : Wait for prometheus online] ***********************************************************************************************************************************************************
ok: [10.10.10.10]
TASK [prometheus : Wait for alertmanager online] *********************************************************************************************************************************************************
ok: [10.10.10.10]
TASK [prometheus : Render prometheus targets in cluster mode] ********************************************************************************************************************************************
changed: [10.10.10.10] => (item=pg-meta)
changed: [10.10.10.10] => (item=pg-test)
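One service-discovery target file is rendered per cluster into /etc/prometheus/targets (the directory created earlier in this play). A hypothetical example of what such a file could look like, using the standard Prometheus file_sd format; the label names and exporter ports are assumptions rather than values taken from this log:
# e.g. /etc/prometheus/targets/pg-meta.yml (illustrative content only)
- labels: { cls: pg-meta }
  targets:
    - 10.10.10.10:9100                          # node_exporter (assumed port)
    - 10.10.10.10:9630                          # pg_exporter (assumed port)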
TASK [prometheus : Reload prometheus service] ************************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [prometheus : Copy prometheus service definition] ***************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [prometheus : Copy alertmanager service definition] *************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [prometheus : Reload consul to register prometheus] *************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [grafana : Make sure grafana is installed] **********************************************************************************************************************************************************
ok: [10.10.10.10]
TASK [grafana : Check grafana plugin cache exists] *******************************************************************************************************************************************************
ok: [10.10.10.10]
TASK [grafana : Provision grafana plugins via cache] *****************************************************************************************************************************************************
[WARNING]: Consider using the file module with state=absent rather than running 'rm'. If you need to use command because file is insufficient you can add 'warn: false' to this command task or set
'command_warnings=False' in ansible.cfg to get rid of this message.
changed: [10.10.10.10]
TASK [grafana : Download grafana plugins from web] *******************************************************************************************************************************************************
skipping: [10.10.10.10] => (item=redis-datasource)
skipping: [10.10.10.10] => (item=simpod-json-datasource)
skipping: [10.10.10.10] => (item=fifemon-graphql-datasource)
skipping: [10.10.10.10] => (item=sbueringer-consul-datasource)
skipping: [10.10.10.10] => (item=camptocamp-prometheus-alertmanager-datasource)
skipping: [10.10.10.10] => (item=ryantxu-ajax-panel)
skipping: [10.10.10.10] => (item=marcusolsson-hourly-heatmap-panel)
skipping: [10.10.10.10] => (item=michaeldmoore-multistat-panel)
skipping: [10.10.10.10] => (item=marcusolsson-treemap-panel)
skipping: [10.10.10.10] => (item=pr0ps-trackmap-panel)
skipping: [10.10.10.10] => (item=dalvany-image-panel)
skipping: [10.10.10.10] => (item=magnesium-wordcloud-panel)
skipping: [10.10.10.10] => (item=cloudspout-button-panel)
skipping: [10.10.10.10] => (item=speakyourcode-button-panel)
skipping: [10.10.10.10] => (item=jdbranham-diagram-panel)
skipping: [10.10.10.10] => (item=grafana-piechart-panel)
skipping: [10.10.10.10] => (item=snuids-radar-panel)
skipping: [10.10.10.10] => (item=digrich-bubblechart-panel)
TASK [grafana : Download grafana plugins from web] *******************************************************************************************************************************************************
skipping: [10.10.10.10] => (item=https://github.com/Vonng/grafana-echarts)
TASK [grafana : Create grafana plugins cache] ************************************************************************************************************************************************************
skipping: [10.10.10.10]
TASK [grafana : Copy /etc/grafana/grafana.ini] ***********************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [grafana : Remove grafana provision dir] ************************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [grafana : Copy provisioning content] ***************************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [grafana : Copy pigsty dashboards] ******************************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [grafana : Copy pigsty icon image] ******************************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [grafana : Replace grafana icon with pigsty] ********************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [grafana : Launch grafana service] ******************************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [grafana : Wait for grafana online] *****************************************************************************************************************************************************************
ok: [10.10.10.10]
TASK [grafana : Update grafana default preferences] ******************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [grafana : Register consul grafana service] *********************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [grafana : Reload consul] ***************************************************************************************************************************************************************************
changed: [10.10.10.10]
PLAY [Init dcs] ******************************************************************************************************************************************************************************************
TASK [consul : Check for existing consul] ****************************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [consul : Consul exists flag fact set] **************************************************************************************************************************************************************
ok: [10.10.10.10]
ok: [10.10.10.11]
ok: [10.10.10.12]
ok: [10.10.10.13]
TASK [consul : Abort due to consul exists] ***************************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [consul : Clean existing consul instance] ***********************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [consul : Stop any running consul instance] *********************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [consul : Remove existing consul dir] ***************************************************************************************************************************************************************
changed: [10.10.10.10] => (item=/etc/consul.d)
changed: [10.10.10.11] => (item=/etc/consul.d)
changed: [10.10.10.12] => (item=/etc/consul.d)
changed: [10.10.10.13] => (item=/etc/consul.d)
changed: [10.10.10.10] => (item=/var/lib/consul)
changed: [10.10.10.11] => (item=/var/lib/consul)
changed: [10.10.10.12] => (item=/var/lib/consul)
changed: [10.10.10.13] => (item=/var/lib/consul)
TASK [consul : Recreate consul dir] **********************************************************************************************************************************************************************
changed: [10.10.10.10] => (item=/etc/consul.d)
changed: [10.10.10.11] => (item=/etc/consul.d)
changed: [10.10.10.12] => (item=/etc/consul.d)
changed: [10.10.10.13] => (item=/etc/consul.d)
changed: [10.10.10.10] => (item=/var/lib/consul)
changed: [10.10.10.11] => (item=/var/lib/consul)
changed: [10.10.10.13] => (item=/var/lib/consul)
changed: [10.10.10.12] => (item=/var/lib/consul)
TASK [consul : Make sure consul is installed] ************************************************************************************************************************************************************
ok: [10.10.10.11]
ok: [10.10.10.10]
ok: [10.10.10.12]
ok: [10.10.10.13]
TASK [consul : Make sure consul dir exists] **************************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [consul : Get dcs server node names] ****************************************************************************************************************************************************************
ok: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [consul : Get dcs node name from var] ***************************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [consul : Get dcs node name from var] ***************************************************************************************************************************************************************
skipping: [10.10.10.10]
ok: [10.10.10.11]
ok: [10.10.10.12]
ok: [10.10.10.13]
TASK [consul : Fetch hostname as dcs node name] **********************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [consul : Get dcs name from hostname] ***************************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [consul : Copy /etc/consul.d/consul.json] ***********************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [consul : Copy consul agent service] ****************************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
changed: [10.10.10.10]
TASK [consul : Get dcs bootstrap expect quorum] **********************************************************************************************************************************************************
ok: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [consul : Copy consul server service unit] **********************************************************************************************************************************************************
changed: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [consul : Launch consul server service] *************************************************************************************************************************************************************
changed: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [consul : Wait for consul server online] ************************************************************************************************************************************************************
ok: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [consul : Launch consul agent service] **************************************************************************************************************************************************************
skipping: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [consul : Wait for consul agent online] *************************************************************************************************************************************************************
skipping: [10.10.10.10]
ok: [10.10.10.11]
ok: [10.10.10.12]
ok: [10.10.10.13]
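After this play, the meta node runs a single consul server and the three other nodes run agents joined to it. Membership can be double-checked with the python-consul package (installed later in this run for patroni); the sketch below assumes the local agent's HTTP API on its default port 8500.

# List consul nodes and services known to the local agent.
# Assumptions: consul HTTP API on 127.0.0.1:8500; python-consul available.
import consul

c = consul.Consul(host="127.0.0.1", port=8500)
_, nodes = c.catalog.nodes()
for node in nodes:
    print(node["Node"], node["Address"])    # expect the meta node plus node-1..3

_, services = c.catalog.services()
print(sorted(services))                     # services registered so far, e.g. grafana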
PLAY [Init database cluster] *****************************************************************************************************************************************************************************
TASK [postgres : Create os group postgres] ***************************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [postgres : Make sure dcs group exists] *************************************************************************************************************************************************************
ok: [10.10.10.10] => (item=consul)
ok: [10.10.10.11] => (item=consul)
ok: [10.10.10.12] => (item=consul)
ok: [10.10.10.13] => (item=consul)
ok: [10.10.10.11] => (item=etcd)
ok: [10.10.10.10] => (item=etcd)
ok: [10.10.10.12] => (item=etcd)
ok: [10.10.10.13] => (item=etcd)
TASK [postgres : Create dbsu postgres] *******************************************************************************************************************************************************************
changed: [10.10.10.13]
changed: [10.10.10.12]
changed: [10.10.10.10]
changed: [10.10.10.11]
TASK [postgres : Grant dbsu nopass sudo] *****************************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [postgres : Grant dbsu all sudo] ********************************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [postgres : Grant dbsu limited sudo] ****************************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [postgres : Config patroni watchdog support] ********************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [postgres : Add dbsu ssh no host checking] **********************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [postgres : Fetch dbsu public keys] *****************************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.10]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [postgres : Exchange dbsu ssh keys] *****************************************************************************************************************************************************************
skipping: [10.10.10.10] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC8ahlH3Yo0nTb1hhd7SGTF1sCwnjEVA/yGra2ktQcZ/i8S/2tfumVomxtnNTeOZqNeQygVUbRgIH77lABXrXwBOimw+J0EmoekPsW7q/NCT5EJgqfoDe5vWBpyhrCe1ixCxESlP2GfpaJYGqeMW2G8HiFU6ieDZcfGcFn1q9JBjtrrV851Htw+Ik/fed93ipGgWzzZnu4NOjz7tpmrsmE3/1J/RvPQdRT7Pjuy2pLn+oCjMkQHJezvUKruVTVwxjObaWO7WFlvQCy2dRez1GBxEK80LRbsZfmgkfIQPzmqHOaacqNBAHe+OeYlBh3fMMbpALzJHnhgJSW5GpdRwiUJ ansible-generated on meta', '10.10.10.10'])
skipping: [10.10.10.10] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC8ahlH3Yo0nTb1hhd7SGTF1sCwnjEVA/yGra2ktQcZ/i8S/2tfumVomxtnNTeOZqNeQygVUbRgIH77lABXrXwBOimw+J0EmoekPsW7q/NCT5EJgqfoDe5vWBpyhrCe1ixCxESlP2GfpaJYGqeMW2G8HiFU6ieDZcfGcFn1q9JBjtrrV851Htw+Ik/fed93ipGgWzzZnu4NOjz7tpmrsmE3/1J/RvPQdRT7Pjuy2pLn+oCjMkQHJezvUKruVTVwxjObaWO7WFlvQCy2dRez1GBxEK80LRbsZfmgkfIQPzmqHOaacqNBAHe+OeYlBh3fMMbpALzJHnhgJSW5GpdRwiUJ ansible-generated on meta', '10.10.10.11'])
skipping: [10.10.10.10] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC8ahlH3Yo0nTb1hhd7SGTF1sCwnjEVA/yGra2ktQcZ/i8S/2tfumVomxtnNTeOZqNeQygVUbRgIH77lABXrXwBOimw+J0EmoekPsW7q/NCT5EJgqfoDe5vWBpyhrCe1ixCxESlP2GfpaJYGqeMW2G8HiFU6ieDZcfGcFn1q9JBjtrrV851Htw+Ik/fed93ipGgWzzZnu4NOjz7tpmrsmE3/1J/RvPQdRT7Pjuy2pLn+oCjMkQHJezvUKruVTVwxjObaWO7WFlvQCy2dRez1GBxEK80LRbsZfmgkfIQPzmqHOaacqNBAHe+OeYlBh3fMMbpALzJHnhgJSW5GpdRwiUJ ansible-generated on meta', '10.10.10.12'])
skipping: [10.10.10.10] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC8ahlH3Yo0nTb1hhd7SGTF1sCwnjEVA/yGra2ktQcZ/i8S/2tfumVomxtnNTeOZqNeQygVUbRgIH77lABXrXwBOimw+J0EmoekPsW7q/NCT5EJgqfoDe5vWBpyhrCe1ixCxESlP2GfpaJYGqeMW2G8HiFU6ieDZcfGcFn1q9JBjtrrV851Htw+Ik/fed93ipGgWzzZnu4NOjz7tpmrsmE3/1J/RvPQdRT7Pjuy2pLn+oCjMkQHJezvUKruVTVwxjObaWO7WFlvQCy2dRez1GBxEK80LRbsZfmgkfIQPzmqHOaacqNBAHe+OeYlBh3fMMbpALzJHnhgJSW5GpdRwiUJ ansible-generated on meta', '10.10.10.13'])
skipping: [10.10.10.11] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDCIr/IW4qyd4Ls8dztCJyYHt354iPFbhLAUUiEK9R3A5W8UOSiJK/WVwlxMazH8QUaMWHuQAlTtW66kW1DDU+fsJ4xGxrNjEnwUbmWfj3BBnoANJQHYOid8iLJwWZuykvz0EIdGMDVpUpIx/qqm3/ZlC+cD0iukXQyEyAw3Qgts/Twqr5IJGeQOFy9Z4rmqSXtz/8tS0YOHCHVC5GGsUpD5+GLqhwPd64xCbWnvpYY61IX45Hzf+zO80xGqPeQLqF9HULs5wi2i6plKrSRl76VWCq9T7QMQMKJJSLUabnrXrKm+sr21LImgpSxSbqbBVVNUVS+adQvvylWb6yaFWov ansible-generated on node-1', '10.10.10.10'])
skipping: [10.10.10.11] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDCIr/IW4qyd4Ls8dztCJyYHt354iPFbhLAUUiEK9R3A5W8UOSiJK/WVwlxMazH8QUaMWHuQAlTtW66kW1DDU+fsJ4xGxrNjEnwUbmWfj3BBnoANJQHYOid8iLJwWZuykvz0EIdGMDVpUpIx/qqm3/ZlC+cD0iukXQyEyAw3Qgts/Twqr5IJGeQOFy9Z4rmqSXtz/8tS0YOHCHVC5GGsUpD5+GLqhwPd64xCbWnvpYY61IX45Hzf+zO80xGqPeQLqF9HULs5wi2i6plKrSRl76VWCq9T7QMQMKJJSLUabnrXrKm+sr21LImgpSxSbqbBVVNUVS+adQvvylWb6yaFWov ansible-generated on node-1', '10.10.10.11'])
skipping: [10.10.10.11] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDCIr/IW4qyd4Ls8dztCJyYHt354iPFbhLAUUiEK9R3A5W8UOSiJK/WVwlxMazH8QUaMWHuQAlTtW66kW1DDU+fsJ4xGxrNjEnwUbmWfj3BBnoANJQHYOid8iLJwWZuykvz0EIdGMDVpUpIx/qqm3/ZlC+cD0iukXQyEyAw3Qgts/Twqr5IJGeQOFy9Z4rmqSXtz/8tS0YOHCHVC5GGsUpD5+GLqhwPd64xCbWnvpYY61IX45Hzf+zO80xGqPeQLqF9HULs5wi2i6plKrSRl76VWCq9T7QMQMKJJSLUabnrXrKm+sr21LImgpSxSbqbBVVNUVS+adQvvylWb6yaFWov ansible-generated on node-1', '10.10.10.12'])
skipping: [10.10.10.11] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDCIr/IW4qyd4Ls8dztCJyYHt354iPFbhLAUUiEK9R3A5W8UOSiJK/WVwlxMazH8QUaMWHuQAlTtW66kW1DDU+fsJ4xGxrNjEnwUbmWfj3BBnoANJQHYOid8iLJwWZuykvz0EIdGMDVpUpIx/qqm3/ZlC+cD0iukXQyEyAw3Qgts/Twqr5IJGeQOFy9Z4rmqSXtz/8tS0YOHCHVC5GGsUpD5+GLqhwPd64xCbWnvpYY61IX45Hzf+zO80xGqPeQLqF9HULs5wi2i6plKrSRl76VWCq9T7QMQMKJJSLUabnrXrKm+sr21LImgpSxSbqbBVVNUVS+adQvvylWb6yaFWov ansible-generated on node-1', '10.10.10.13'])
skipping: [10.10.10.12] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQChMymmlyxGn7PnUvAUvh968/gxTnwGZhhMhIc2+aiuA0QP/D8CSmKfzRYoMVP6/nm3cJsYXM28wzWZ1X/sLp33rYYxbwWpj5n8oBalzqKmSzK0HI5CePKAlWlEeLRDxvKpZYhZwXmro5Ov9lfp63kNHU84nAP7BPBOlufFyydn50bUwP1xKEsG1BC9Xqd4XqB5+eRLjkQDuC743bgxFc3FM8fij1/MuvxtG3HvL6DgEvCo3Lx4qkiVO3akR6Lo3bQEkf76Gq94cFbecAAnYZzdkPHR5LqJiIGS0DYj0yZQXrdN+DtjpyIBfZzi+TFdcVW1Agy1IUQ7Lrt29HJw+/sD ansible-generated on node-2', '10.10.10.10'])
skipping: [10.10.10.12] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQChMymmlyxGn7PnUvAUvh968/gxTnwGZhhMhIc2+aiuA0QP/D8CSmKfzRYoMVP6/nm3cJsYXM28wzWZ1X/sLp33rYYxbwWpj5n8oBalzqKmSzK0HI5CePKAlWlEeLRDxvKpZYhZwXmro5Ov9lfp63kNHU84nAP7BPBOlufFyydn50bUwP1xKEsG1BC9Xqd4XqB5+eRLjkQDuC743bgxFc3FM8fij1/MuvxtG3HvL6DgEvCo3Lx4qkiVO3akR6Lo3bQEkf76Gq94cFbecAAnYZzdkPHR5LqJiIGS0DYj0yZQXrdN+DtjpyIBfZzi+TFdcVW1Agy1IUQ7Lrt29HJw+/sD ansible-generated on node-2', '10.10.10.11'])
skipping: [10.10.10.12] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQChMymmlyxGn7PnUvAUvh968/gxTnwGZhhMhIc2+aiuA0QP/D8CSmKfzRYoMVP6/nm3cJsYXM28wzWZ1X/sLp33rYYxbwWpj5n8oBalzqKmSzK0HI5CePKAlWlEeLRDxvKpZYhZwXmro5Ov9lfp63kNHU84nAP7BPBOlufFyydn50bUwP1xKEsG1BC9Xqd4XqB5+eRLjkQDuC743bgxFc3FM8fij1/MuvxtG3HvL6DgEvCo3Lx4qkiVO3akR6Lo3bQEkf76Gq94cFbecAAnYZzdkPHR5LqJiIGS0DYj0yZQXrdN+DtjpyIBfZzi+TFdcVW1Agy1IUQ7Lrt29HJw+/sD ansible-generated on node-2', '10.10.10.12'])
skipping: [10.10.10.12] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQChMymmlyxGn7PnUvAUvh968/gxTnwGZhhMhIc2+aiuA0QP/D8CSmKfzRYoMVP6/nm3cJsYXM28wzWZ1X/sLp33rYYxbwWpj5n8oBalzqKmSzK0HI5CePKAlWlEeLRDxvKpZYhZwXmro5Ov9lfp63kNHU84nAP7BPBOlufFyydn50bUwP1xKEsG1BC9Xqd4XqB5+eRLjkQDuC743bgxFc3FM8fij1/MuvxtG3HvL6DgEvCo3Lx4qkiVO3akR6Lo3bQEkf76Gq94cFbecAAnYZzdkPHR5LqJiIGS0DYj0yZQXrdN+DtjpyIBfZzi+TFdcVW1Agy1IUQ7Lrt29HJw+/sD ansible-generated on node-2', '10.10.10.13'])
skipping: [10.10.10.13] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCo9KBPH2DVYQrM/WZ4CO4Ipvr+5L6FhqWBr1A6C0Ms+qi77aKHwFEIbrxKqj7wZFbHWoTPt/cbWkXhZgnkfDBR81/wBImnFz0QfuL0tNDN0/YP/4cePo5bQERGcnBI6vkjmXMyGGpRQobNRj71fX/Wt5WMw6dM+d4XjfgUKHIJxEKnz8HYnkiwWm5Flc9EHKTWN+87vZ9B6cdi7gxLQu8LL3x+4e2ArRoz9u5yZIajUTvexqD2IIReqsFt+QObpinLaTc/g7Q+w/no1hAZERS3pImx9l0GF6Ktdp/HMHH1vk2cwnyogrk+OLw1WccI1YkBes/xdzBFTWOwUX3w/vBt ansible-generated on node-3', '10.10.10.10'])
skipping: [10.10.10.13] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCo9KBPH2DVYQrM/WZ4CO4Ipvr+5L6FhqWBr1A6C0Ms+qi77aKHwFEIbrxKqj7wZFbHWoTPt/cbWkXhZgnkfDBR81/wBImnFz0QfuL0tNDN0/YP/4cePo5bQERGcnBI6vkjmXMyGGpRQobNRj71fX/Wt5WMw6dM+d4XjfgUKHIJxEKnz8HYnkiwWm5Flc9EHKTWN+87vZ9B6cdi7gxLQu8LL3x+4e2ArRoz9u5yZIajUTvexqD2IIReqsFt+QObpinLaTc/g7Q+w/no1hAZERS3pImx9l0GF6Ktdp/HMHH1vk2cwnyogrk+OLw1WccI1YkBes/xdzBFTWOwUX3w/vBt ansible-generated on node-3', '10.10.10.11'])
skipping: [10.10.10.13] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCo9KBPH2DVYQrM/WZ4CO4Ipvr+5L6FhqWBr1A6C0Ms+qi77aKHwFEIbrxKqj7wZFbHWoTPt/cbWkXhZgnkfDBR81/wBImnFz0QfuL0tNDN0/YP/4cePo5bQERGcnBI6vkjmXMyGGpRQobNRj71fX/Wt5WMw6dM+d4XjfgUKHIJxEKnz8HYnkiwWm5Flc9EHKTWN+87vZ9B6cdi7gxLQu8LL3x+4e2ArRoz9u5yZIajUTvexqD2IIReqsFt+QObpinLaTc/g7Q+w/no1hAZERS3pImx9l0GF6Ktdp/HMHH1vk2cwnyogrk+OLw1WccI1YkBes/xdzBFTWOwUX3w/vBt ansible-generated on node-3', '10.10.10.12'])
skipping: [10.10.10.13] => (item=['ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCo9KBPH2DVYQrM/WZ4CO4Ipvr+5L6FhqWBr1A6C0Ms+qi77aKHwFEIbrxKqj7wZFbHWoTPt/cbWkXhZgnkfDBR81/wBImnFz0QfuL0tNDN0/YP/4cePo5bQERGcnBI6vkjmXMyGGpRQobNRj71fX/Wt5WMw6dM+d4XjfgUKHIJxEKnz8HYnkiwWm5Flc9EHKTWN+87vZ9B6cdi7gxLQu8LL3x+4e2ArRoz9u5yZIajUTvexqD2IIReqsFt+QObpinLaTc/g7Q+w/no1hAZERS3pImx9l0GF6Ktdp/HMHH1vk2cwnyogrk+OLw1WccI1YkBes/xdzBFTWOwUX3w/vBt ansible-generated on node-3', '10.10.10.13'])
TASK [postgres : Install official pgdg yum repo] *********************************************************************************************************************************************************
skipping: [10.10.10.10] => (item=postgresql${pg_version}*)
skipping: [10.10.10.10] => (item=postgis31_${pg_version}*)
skipping: [10.10.10.10] => (item=pgbouncer patroni pg_exporter pgbadger)
skipping: [10.10.10.11] => (item=postgresql${pg_version}*)
skipping: [10.10.10.10] => (item=patroni patroni-consul patroni-etcd pgbouncer pgbadger pg_activity)
skipping: [10.10.10.11] => (item=postgis31_${pg_version}*)
skipping: [10.10.10.10] => (item=python3 python3-psycopg2 python36-requests python3-etcd python3-consul)
skipping: [10.10.10.11] => (item=pgbouncer patroni pg_exporter pgbadger)
skipping: [10.10.10.12] => (item=postgresql${pg_version}*)
skipping: [10.10.10.10] => (item=python36-urllib3 python36-idna python36-pyOpenSSL python36-cryptography)
skipping: [10.10.10.11] => (item=patroni patroni-consul patroni-etcd pgbouncer pgbadger pg_activity)
skipping: [10.10.10.12] => (item=postgis31_${pg_version}*)
skipping: [10.10.10.11] => (item=python3 python3-psycopg2 python36-requests python3-etcd python3-consul)
skipping: [10.10.10.12] => (item=pgbouncer patroni pg_exporter pgbadger)
skipping: [10.10.10.13] => (item=postgresql${pg_version}*)
skipping: [10.10.10.11] => (item=python36-urllib3 python36-idna python36-pyOpenSSL python36-cryptography)
skipping: [10.10.10.12] => (item=patroni patroni-consul patroni-etcd pgbouncer pgbadger pg_activity)
skipping: [10.10.10.13] => (item=postgis31_${pg_version}*)
skipping: [10.10.10.12] => (item=python3 python3-psycopg2 python36-requests python3-etcd python3-consul)
skipping: [10.10.10.13] => (item=pgbouncer patroni pg_exporter pgbadger)
skipping: [10.10.10.12] => (item=python36-urllib3 python36-idna python36-pyOpenSSL python36-cryptography)
skipping: [10.10.10.13] => (item=patroni patroni-consul patroni-etcd pgbouncer pgbadger pg_activity)
skipping: [10.10.10.13] => (item=python3 python3-psycopg2 python36-requests python3-etcd python3-consul)
skipping: [10.10.10.13] => (item=python36-urllib3 python36-idna python36-pyOpenSSL python36-cryptography)
TASK [postgres : Install pg packages] ********************************************************************************************************************************************************************
changed: [10.10.10.10] => (item=['postgresql13*', 'postgis31_13*', 'pgbouncer,patroni,pg_exporter,pgbadger', 'patroni,patroni-consul,patroni-etcd,pgbouncer,pgbadger,pg_activity', 'python3,python3-psycopg2,python36-requests,python3-etcd,python3-consul', 'python36-urllib3,python36-idna,python36-pyOpenSSL,python36-cryptography'])
changed: [10.10.10.11] => (item=['postgresql13*', 'postgis31_13*', 'pgbouncer,patroni,pg_exporter,pgbadger', 'patroni,patroni-consul,patroni-etcd,pgbouncer,pgbadger,pg_activity', 'python3,python3-psycopg2,python36-requests,python3-etcd,python3-consul', 'python36-urllib3,python36-idna,python36-pyOpenSSL,python36-cryptography'])
changed: [10.10.10.13] => (item=['postgresql13*', 'postgis31_13*', 'pgbouncer,patroni,pg_exporter,pgbadger', 'patroni,patroni-consul,patroni-etcd,pgbouncer,pgbadger,pg_activity', 'python3,python3-psycopg2,python36-requests,python3-etcd,python3-consul', 'python36-urllib3,python36-idna,python36-pyOpenSSL,python36-cryptography'])
changed: [10.10.10.12] => (item=['postgresql13*', 'postgis31_13*', 'pgbouncer,patroni,pg_exporter,pgbadger', 'patroni,patroni-consul,patroni-etcd,pgbouncer,pgbadger,pg_activity', 'python3,python3-psycopg2,python36-requests,python3-etcd,python3-consul', 'python36-urllib3,python36-idna,python36-pyOpenSSL,python36-cryptography'])
TASK [postgres : Install pg extensions] ******************************************************************************************************************************************************************
changed: [10.10.10.11] => (item=['pg_repack13,pg_qualstats13,pg_stat_kcache13,wal2json13'])
changed: [10.10.10.10] => (item=['pg_repack13,pg_qualstats13,pg_stat_kcache13,wal2json13'])
changed: [10.10.10.13] => (item=['pg_repack13,pg_qualstats13,pg_stat_kcache13,wal2json13'])
changed: [10.10.10.12] => (item=['pg_repack13,pg_qualstats13,pg_stat_kcache13,wal2json13'])
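The two install tasks above pull the PostgreSQL 13 server, PostGIS and the extension packages onto every node. PGDG RPMs conventionally install under /usr/pgsql-13, which is what the next task links to /usr/pgsql; a minimal sanity check of the binaries (the path is an assumption based on that convention):

# Confirm the freshly installed PostgreSQL 13 binaries are present and report their version.
# Assumption: PGDG packages installed under /usr/pgsql-13 (their conventional prefix).
import subprocess

out = subprocess.run(["/usr/pgsql-13/bin/postgres", "--version"],
                     stdout=subprocess.PIPE, universal_newlines=True, check=True)
print(out.stdout.strip())                   # e.g. "postgres (PostgreSQL) 13.x"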
TASK [postgres : Link /usr/pgsql to current version] *****************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.10]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [postgres : Add pg bin dir to profile path] *********************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.13]
changed: [10.10.10.12]
TASK [postgres : Fix directory ownership] ****************************************************************************************************************************************************************
ok: [10.10.10.10]
ok: [10.10.10.11]
ok: [10.10.10.12]
ok: [10.10.10.13]
TASK [postgres : Remove default postgres service] ********************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [postgres : Check necessary variables exists] *******************************************************************************************************************************************************
ok: [10.10.10.10] => {
"changed": false,
"msg": "All assertions passed"
}
ok: [10.10.10.11] => {
"changed": false,
"msg": "All assertions passed"
}
ok: [10.10.10.12] => {
"changed": false,
"msg": "All assertions passed"
}
ok: [10.10.10.13] => {
"changed": false,
"msg": "All assertions passed"
}
TASK [postgres : Fetch variables via pg_cluster] *********************************************************************************************************************************************************
ok: [10.10.10.10]
ok: [10.10.10.11]
ok: [10.10.10.12]
ok: [10.10.10.13]
TASK [postgres : Set cluster basic facts for hosts] ******************************************************************************************************************************************************
ok: [10.10.10.10]
ok: [10.10.10.11]
ok: [10.10.10.12]
ok: [10.10.10.13]
TASK [postgres : Assert cluster primary singleton] *******************************************************************************************************************************************************
ok: [10.10.10.10] => {
"changed": false,
"msg": "All assertions passed"
}
ok: [10.10.10.11] => {
"changed": false,
"msg": "All assertions passed"
}
ok: [10.10.10.12] => {
"changed": false,
"msg": "All assertions passed"
}
ok: [10.10.10.13] => {
"changed": false,
"msg": "All assertions passed"
}
TASK [postgres : Setup cluster primary ip address] *******************************************************************************************************************************************************
ok: [10.10.10.10]
ok: [10.10.10.11]
ok: [10.10.10.12]
ok: [10.10.10.13]
TASK [postgres : Setup repl upstream for primary] ********************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [postgres : Setup repl upstream for replicas] *******************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
ok: [10.10.10.12]
ok: [10.10.10.13]
TASK [postgres : Debug print instance summary] ***********************************************************************************************************************************************************
ok: [10.10.10.10] => {
"msg": "cluster=pg-meta service=pg-meta-primary instance=pg-meta-1 replication=[primary:itself]->10.10.10.10"
}
ok: [10.10.10.11] => {
"msg": "cluster=pg-test service=pg-test-primary instance=pg-test-1 replication=[primary:itself]->10.10.10.11"
}
ok: [10.10.10.12] => {
"msg": "cluster=pg-test service=pg-test-replica instance=pg-test-2 replication=[primary:itself]->10.10.10.12"
}
ok: [10.10.10.13] => {
"msg": "cluster=pg-test service=pg-test-offline instance=pg-test-3 replication=[primary:itself]->10.10.10.13"
}
TASK [postgres : Check for existing postgres instance] ***************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.10]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [postgres : Set fact whether pg port is open] *******************************************************************************************************************************************************
ok: [10.10.10.10]
ok: [10.10.10.11]
ok: [10.10.10.12]
ok: [10.10.10.13]
TASK [postgres : Abort due to existing postgres instance] ************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [postgres : Clean existing postgres instance] *******************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [postgres : Shutdown existing postgres service] *****************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [postgres : Remove registered consul service] *******************************************************************************************************************************************************
changed: [10.10.10.12]
changed: [10.10.10.13]
changed: [10.10.10.11]
changed: [10.10.10.10]
TASK [postgres : Remove postgres metadata in consul] *****************************************************************************************************************************************************
skipping: [10.10.10.12]
skipping: [10.10.10.13]
changed: [10.10.10.11]
changed: [10.10.10.10]
TASK [postgres : Remove existing postgres data] **********************************************************************************************************************************************************
ok: [10.10.10.10] => (item=/pg)
ok: [10.10.10.11] => (item=/pg)
ok: [10.10.10.12] => (item=/pg)
ok: [10.10.10.13] => (item=/pg)
ok: [10.10.10.10] => (item=/export/postgres)
ok: [10.10.10.11] => (item=/export/postgres)
ok: [10.10.10.12] => (item=/export/postgres)
ok: [10.10.10.13] => (item=/export/postgres)
ok: [10.10.10.10] => (item=/var/backups/postgres)
ok: [10.10.10.11] => (item=/var/backups/postgres)
ok: [10.10.10.12] => (item=/var/backups/postgres)
ok: [10.10.10.13] => (item=/var/backups/postgres)
changed: [10.10.10.10] => (item=/etc/pgbouncer)
changed: [10.10.10.11] => (item=/etc/pgbouncer)
changed: [10.10.10.13] => (item=/etc/pgbouncer)
changed: [10.10.10.12] => (item=/etc/pgbouncer)
changed: [10.10.10.10] => (item=/var/log/pgbouncer)
changed: [10.10.10.11] => (item=/var/log/pgbouncer)
changed: [10.10.10.13] => (item=/var/log/pgbouncer)
changed: [10.10.10.12] => (item=/var/log/pgbouncer)
changed: [10.10.10.10] => (item=/var/run/pgbouncer)
changed: [10.10.10.11] => (item=/var/run/pgbouncer)
changed: [10.10.10.13] => (item=/var/run/pgbouncer)
changed: [10.10.10.12] => (item=/var/run/pgbouncer)
TASK [postgres : Make sure main and backup dir exists] ***************************************************************************************************************************************************
changed: [10.10.10.11] => (item=/export)
changed: [10.10.10.12] => (item=/export)
changed: [10.10.10.13] => (item=/export)
changed: [10.10.10.10] => (item=/export)
changed: [10.10.10.11] => (item=/var/backups)
changed: [10.10.10.12] => (item=/var/backups)
changed: [10.10.10.13] => (item=/var/backups)
changed: [10.10.10.10] => (item=/var/backups)
TASK [postgres : Create postgres directory structure] ****************************************************************************************************************************************************
changed: [10.10.10.10] => (item=/export/postgres)
changed: [10.10.10.11] => (item=/export/postgres)
changed: [10.10.10.12] => (item=/export/postgres)
changed: [10.10.10.13] => (item=/export/postgres)
changed: [10.10.10.10] => (item=/export/postgres/pg-meta-13)
changed: [10.10.10.11] => (item=/export/postgres/pg-test-13)
changed: [10.10.10.12] => (item=/export/postgres/pg-test-13)
changed: [10.10.10.13] => (item=/export/postgres/pg-test-13)
changed: [10.10.10.10] => (item=/export/postgres/pg-meta-13/bin)
changed: [10.10.10.12] => (item=/export/postgres/pg-test-13/bin)
changed: [10.10.10.11] => (item=/export/postgres/pg-test-13/bin)
changed: [10.10.10.13] => (item=/export/postgres/pg-test-13/bin)
changed: [10.10.10.10] => (item=/export/postgres/pg-meta-13/log)
changed: [10.10.10.12] => (item=/export/postgres/pg-test-13/log)
changed: [10.10.10.11] => (item=/export/postgres/pg-test-13/log)
changed: [10.10.10.13] => (item=/export/postgres/pg-test-13/log)
changed: [10.10.10.10] => (item=/export/postgres/pg-meta-13/tmp)
changed: [10.10.10.12] => (item=/export/postgres/pg-test-13/tmp)
changed: [10.10.10.11] => (item=/export/postgres/pg-test-13/tmp)
changed: [10.10.10.13] => (item=/export/postgres/pg-test-13/tmp)
changed: [10.10.10.10] => (item=/export/postgres/pg-meta-13/conf)
changed: [10.10.10.12] => (item=/export/postgres/pg-test-13/conf)
changed: [10.10.10.11] => (item=/export/postgres/pg-test-13/conf)
changed: [10.10.10.13] => (item=/export/postgres/pg-test-13/conf)
changed: [10.10.10.10] => (item=/export/postgres/pg-meta-13/data)
changed: [10.10.10.12] => (item=/export/postgres/pg-test-13/data)
changed: [10.10.10.11] => (item=/export/postgres/pg-test-13/data)
changed: [10.10.10.13] => (item=/export/postgres/pg-test-13/data)
changed: [10.10.10.10] => (item=/export/postgres/pg-meta-13/meta)
changed: [10.10.10.12] => (item=/export/postgres/pg-test-13/meta)
changed: [10.10.10.11] => (item=/export/postgres/pg-test-13/meta)
changed: [10.10.10.13] => (item=/export/postgres/pg-test-13/meta)
changed: [10.10.10.10] => (item=/export/postgres/pg-meta-13/stat)
changed: [10.10.10.12] => (item=/export/postgres/pg-test-13/stat)
changed: [10.10.10.11] => (item=/export/postgres/pg-test-13/stat)
changed: [10.10.10.13] => (item=/export/postgres/pg-test-13/stat)
changed: [10.10.10.10] => (item=/export/postgres/pg-meta-13/change)
changed: [10.10.10.12] => (item=/export/postgres/pg-test-13/change)
changed: [10.10.10.11] => (item=/export/postgres/pg-test-13/change)
changed: [10.10.10.13] => (item=/export/postgres/pg-test-13/change)
changed: [10.10.10.10] => (item=/var/backups/postgres/pg-meta-13/postgres)
changed: [10.10.10.12] => (item=/var/backups/postgres/pg-test-13/postgres)
changed: [10.10.10.11] => (item=/var/backups/postgres/pg-test-13/postgres)
changed: [10.10.10.13] => (item=/var/backups/postgres/pg-test-13/postgres)
changed: [10.10.10.10] => (item=/var/backups/postgres/pg-meta-13/arcwal)
changed: [10.10.10.12] => (item=/var/backups/postgres/pg-test-13/arcwal)
changed: [10.10.10.11] => (item=/var/backups/postgres/pg-test-13/arcwal)
changed: [10.10.10.13] => (item=/var/backups/postgres/pg-test-13/arcwal)
changed: [10.10.10.10] => (item=/var/backups/postgres/pg-meta-13/backup)
changed: [10.10.10.12] => (item=/var/backups/postgres/pg-test-13/backup)
changed: [10.10.10.11] => (item=/var/backups/postgres/pg-test-13/backup)
changed: [10.10.10.13] => (item=/var/backups/postgres/pg-test-13/backup)
changed: [10.10.10.10] => (item=/var/backups/postgres/pg-meta-13/remote)
changed: [10.10.10.12] => (item=/var/backups/postgres/pg-test-13/remote)
changed: [10.10.10.11] => (item=/var/backups/postgres/pg-test-13/remote)
changed: [10.10.10.13] => (item=/var/backups/postgres/pg-test-13/remote)
TASK [postgres : Create pgbouncer directory structure] ***************************************************************************************************************************************************
changed: [10.10.10.10] => (item=/etc/pgbouncer)
changed: [10.10.10.11] => (item=/etc/pgbouncer)
changed: [10.10.10.12] => (item=/etc/pgbouncer)
changed: [10.10.10.13] => (item=/etc/pgbouncer)
changed: [10.10.10.11] => (item=/var/log/pgbouncer)
changed: [10.10.10.10] => (item=/var/log/pgbouncer)
changed: [10.10.10.12] => (item=/var/log/pgbouncer)
changed: [10.10.10.13] => (item=/var/log/pgbouncer)
changed: [10.10.10.11] => (item=/var/run/pgbouncer)
changed: [10.10.10.10] => (item=/var/run/pgbouncer)
changed: [10.10.10.12] => (item=/var/run/pgbouncer)
changed: [10.10.10.13] => (item=/var/run/pgbouncer)
TASK [postgres : Create links from pgbkup to pgroot] *****************************************************************************************************************************************************
changed: [10.10.10.10] => (item=arcwal)
changed: [10.10.10.11] => (item=arcwal)
changed: [10.10.10.12] => (item=arcwal)
changed: [10.10.10.13] => (item=arcwal)
changed: [10.10.10.10] => (item=backup)
changed: [10.10.10.11] => (item=backup)
changed: [10.10.10.12] => (item=backup)
changed: [10.10.10.13] => (item=backup)
changed: [10.10.10.10] => (item=remote)
changed: [10.10.10.11] => (item=remote)
changed: [10.10.10.12] => (item=remote)
changed: [10.10.10.13] => (item=remote)
TASK [postgres : Create links from current cluster] ******************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.12]
changed: [10.10.10.13]
changed: [10.10.10.11]
TASK [postgres : Copy pg_cluster to /pg/meta/cluster] ****************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [postgres : Copy pg_version to /pg/meta/version] ****************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [postgres : Copy pg_instance to /pg/meta/instance] **************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [postgres : Copy pg_seq to /pg/meta/sequence] *******************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [postgres : Copy pg_role to /pg/meta/role] **********************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.10]
changed: [10.10.10.12]
changed: [10.10.10.13]
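The five copy tasks above persist each instance's identity under /pg/meta, one value per file, so that later tooling and operators can read it back directly. A minimal sketch, assuming it is run on a cluster node after these tasks:

# Read back the identity files written under /pg/meta by the tasks above.
# Assumption: executed on a cluster node once the copy tasks have run.
from pathlib import Path

meta = Path("/pg/meta")
for name in ("cluster", "version", "instance", "sequence", "role"):
    print(name, "=", (meta / name).read_text().strip())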
TASK [postgres : Copy postgres scripts to /pg/bin/] ******************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.13]
changed: [10.10.10.12]
TASK [postgres : Copy alias profile to /etc/profile.d] ***************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.13]
changed: [10.10.10.12]
TASK [postgres : Copy psqlrc to postgres home] ***********************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [postgres : Setup hostname to pg instance name] *****************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [postgres : Copy consul node-meta definition] *******************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.13]
changed: [10.10.10.12]
TASK [postgres : Restart consul to load new node-meta] ***************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.12]
changed: [10.10.10.11]
changed: [10.10.10.13]
TASK [postgres : Config patroni watchdog support] ********************************************************************************************************************************************************
ok: [10.10.10.10]
ok: [10.10.10.11]
ok: [10.10.10.12]
ok: [10.10.10.13]
TASK [postgres : Get config parameter page count] ********************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.10]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [postgres : Get config parameter page size] *********************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
changed: [10.10.10.10]
TASK [postgres : Tune shared buffer and work mem] ********************************************************************************************************************************************************
ok: [10.10.10.10]
ok: [10.10.10.11]
ok: [10.10.10.12]
ok: [10.10.10.13]
TASK [postgres : Handle small size mem occasion] *********************************************************************************************************************************************************
ok: [10.10.10.10]
ok: [10.10.10.11]
ok: [10.10.10.12]
ok: [10.10.10.13]
TASK [postgres : Calculate postgres mem params] **********************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
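The tasks above read each node's physical page count and page size and derive PostgreSQL memory parameters from them; the actual formula lives in the role and is not visible in this output. The sketch below only illustrates the general idea (RAM = pages × page size, with shared_buffers commonly set to roughly a quarter of RAM up to a cap), so treat the ratio and the cap as placeholders rather than Pigsty's real values.

# Illustrative only: rough memory parameters from page count x page size.
# The 25% ratio and 32GB cap are common heuristics, not necessarily what the role applies.
import os

page_count = os.sysconf("SC_PHYS_PAGES")
page_size  = os.sysconf("SC_PAGE_SIZE")
total_mem  = page_count * page_size                   # physical RAM in bytes

shared_buffers = min(total_mem // 4, 32 * 1024 ** 3)  # ~25% of RAM, capped at 32GB
print(f"total_mem={total_mem >> 20}MB shared_buffers={shared_buffers >> 20}MB")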
TASK [postgres : create patroni config dir] **************************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [postgres : use predefined patroni template] ********************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [postgres : Render default /pg/conf/patroni.yml] ****************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [postgres : Link /pg/conf/patroni to /pg/bin/] ******************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [postgres : Link /pg/bin/patroni.yml to /etc/patroni/] **********************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [postgres : Config patroni watchdog support] ********************************************************************************************************************************************************
ok: [10.10.10.10]
ok: [10.10.10.11]
ok: [10.10.10.12]
ok: [10.10.10.13]
TASK [postgres : Copy patroni systemd service file] ******************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.10]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [postgres : create patroni systemd drop-in dir] *****************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.13]
changed: [10.10.10.12]
TASK [postgres : Copy postgres systemd service file] *****************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [postgres : Drop-In consul dependency for patroni] **************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [postgres : Render default initdb scripts] **********************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.12]
changed: [10.10.10.11]
changed: [10.10.10.13]
TASK [postgres : Launch patroni on primary instance] *****************************************************************************************************************************************************
skipping: [10.10.10.12]
skipping: [10.10.10.13]
changed: [10.10.10.10]
changed: [10.10.10.11]
TASK [postgres : Wait for patroni primary online] ********************************************************************************************************************************************************
skipping: [10.10.10.12]
skipping: [10.10.10.13]
ok: [10.10.10.10]
ok: [10.10.10.11]
TASK [postgres : Wait for postgres primary online] *******************************************************************************************************************************************************
skipping: [10.10.10.12]
skipping: [10.10.10.13]
ok: [10.10.10.10]
ok: [10.10.10.11]
TASK [postgres : Check primary postgres service ready] ***************************************************************************************************************************************************
skipping: [10.10.10.12]
skipping: [10.10.10.13]
[WARNING]: Module remote_tmp /var/lib/pgsql/.ansible/tmp did not exist and was created with a mode of 0700, this may cause issues when running as another user. To avoid this, create the remote_tmp dir
with the correct permissions manually
changed: [10.10.10.10]
changed: [10.10.10.11]
TASK [postgres : Check replication connectivity to primary] **********************************************************************************************************************************************
skipping: [10.10.10.12]
skipping: [10.10.10.13]
changed: [10.10.10.10]
changed: [10.10.10.11]
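With the primaries up, patroni's REST API gives an authoritative view of each instance's role and state. The sketch below assumes Patroni's default REST port 8008 on the pg-meta primary (the port is not shown in this output).

# Query Patroni's REST API for the pg-meta primary's status.
# Assumption: Patroni REST API listening on 10.10.10.10:8008 (Patroni's default).
import requests

status = requests.get("http://10.10.10.10:8008/patroni", timeout=5).json()
print(status.get("role"), status.get("state"))        # expect a running primary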
TASK [postgres : Render init roles sql] ******************************************************************************************************************************************************************
skipping: [10.10.10.12]
skipping: [10.10.10.13]
changed: [10.10.10.10]
changed: [10.10.10.11]
TASK [postgres : Render init template sql] ***************************************************************************************************************************************************************
skipping: [10.10.10.12]
skipping: [10.10.10.13]
changed: [10.10.10.10]
changed: [10.10.10.11]
TASK [postgres : Render default pg-init scripts] *********************************************************************************************************************************************************
skipping: [10.10.10.12]
skipping: [10.10.10.13]
changed: [10.10.10.11]
changed: [10.10.10.10]
TASK [postgres : Execute initialization scripts] *********************************************************************************************************************************************************
skipping: [10.10.10.12]
skipping: [10.10.10.13]
changed: [10.10.10.10]
changed: [10.10.10.11]
TASK [postgres : Check primary instance ready] ***********************************************************************************************************************************************************
skipping: [10.10.10.12]
skipping: [10.10.10.13]
changed: [10.10.10.10]
changed: [10.10.10.11]
TASK [postgres : Add dbsu password to pgpass if exists] **************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [postgres : Add system user to pgpass] **************************************************************************************************************************************************************
changed: [10.10.10.10] => (item={'username': 'replicator', 'password': 'DBUser.Replicator'})
changed: [10.10.10.11] => (item={'username': 'replicator', 'password': 'DBUser.Replicator'})
changed: [10.10.10.12] => (item={'username': 'replicator', 'password': 'DBUser.Replicator'})
changed: [10.10.10.13] => (item={'username': 'replicator', 'password': 'DBUser.Replicator'})
changed: [10.10.10.11] => (item={'username': 'dbuser_monitor', 'password': 'DBUser.Monitor'})
changed: [10.10.10.10] => (item={'username': 'dbuser_monitor', 'password': 'DBUser.Monitor'})
changed: [10.10.10.13] => (item={'username': 'dbuser_monitor', 'password': 'DBUser.Monitor'})
changed: [10.10.10.12] => (item={'username': 'dbuser_monitor', 'password': 'DBUser.Monitor'})
changed: [10.10.10.13] => (item={'username': 'dbuser_admin', 'password': 'DBUser.Admin'})
changed: [10.10.10.12] => (item={'username': 'dbuser_admin', 'password': 'DBUser.Admin'})
changed: [10.10.10.10] => (item={'username': 'dbuser_admin', 'password': 'DBUser.Admin'})
changed: [10.10.10.11] => (item={'username': 'dbuser_admin', 'password': 'DBUser.Admin'})
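The .pgpass entries written above are what let these system accounts authenticate without passwords appearing on command lines: libpq (and anything built on it, such as psycopg2) matches host:port:database:username against ~/.pgpass and supplies the stored password itself. A minimal sketch of the effect, assuming it runs as the postgres OS user on a cluster node:

# No password argument: libpq resolves it from the ~/.pgpass entries written above.
# Assumption: run as the postgres OS user whose ~/.pgpass was just populated.
import psycopg2

conn = psycopg2.connect(host="10.10.10.10", port=5432,
                        dbname="postgres", user="dbuser_monitor")
with conn.cursor() as cur:
    cur.execute("SELECT current_user, pg_is_in_recovery()")
    print(cur.fetchone())
conn.close()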
TASK [postgres : Check replication connectivity to primary] **********************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [postgres : Launch patroni on replica instances] ****************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [postgres : Wait for patroni replica online] ********************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
ok: [10.10.10.12]
ok: [10.10.10.13]
TASK [postgres : Wait for postgres replica online] *******************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
ok: [10.10.10.12]
ok: [10.10.10.13]
TASK [postgres : Check replica postgres service ready] ***************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
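Once the replicas report ready, streaming replication on pg-test can be confirmed from its primary via pg_stat_replication. The sketch below uses psycopg2 with the dbuser_monitor credentials added to .pgpass earlier and assumes that account has enough privilege (e.g. pg_monitor) to see the view's details.

# Verify the pg-test replicas are streaming from the primary at 10.10.10.11.
# Assumptions: dbuser_monitor/DBUser.Monitor may connect and read pg_stat_replication.
import psycopg2

conn = psycopg2.connect(host="10.10.10.11", port=5432, dbname="postgres",
                        user="dbuser_monitor", password="DBUser.Monitor")
with conn.cursor() as cur:
    cur.execute("SELECT application_name, client_addr, state, sync_state "
                "FROM pg_stat_replication")
    for row in cur.fetchall():
        print(row)                           # expect one row per replica (pg-test-2/3)
conn.close()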
TASK [postgres : Render hba rules] ***********************************************************************************************************************************************************************
changed: [10.10.10.13]
changed: [10.10.10.10]
changed: [10.10.10.12]
changed: [10.10.10.11]
TASK [postgres : Reload hba rules] ***********************************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.13]
changed: [10.10.10.12]
TASK [postgres : Pause patroni] **************************************************************************************************************************************************************************
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
changed: [10.10.10.10]
TASK [postgres : Stop patroni on replica instance] *******************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [postgres : Stop patroni on primary instance] *******************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [postgres : Launch raw postgres on primary] *********************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [postgres : Launch raw postgres on primary] *********************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [postgres : Wait for postgres online] ***************************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [postgres : Check pgbouncer is installed] ***********************************************************************************************************************************************************
changed: [10.10.10.12]
changed: [10.10.10.11]
changed: [10.10.10.10]
changed: [10.10.10.13]
TASK [postgres : Stop existing pgbouncer service] ********************************************************************************************************************************************************
ok: [10.10.10.11]
ok: [10.10.10.10]
ok: [10.10.10.12]
ok: [10.10.10.13]
TASK [postgres : Remove existing pgbouncer dirs] *********************************************************************************************************************************************************
changed: [10.10.10.10] => (item=/etc/pgbouncer)
changed: [10.10.10.12] => (item=/etc/pgbouncer)
changed: [10.10.10.13] => (item=/etc/pgbouncer)
changed: [10.10.10.11] => (item=/etc/pgbouncer)
changed: [10.10.10.10] => (item=/var/log/pgbouncer)
changed: [10.10.10.12] => (item=/var/log/pgbouncer)
changed: [10.10.10.13] => (item=/var/log/pgbouncer)
changed: [10.10.10.11] => (item=/var/log/pgbouncer)
changed: [10.10.10.10] => (item=/var/run/pgbouncer)
changed: [10.10.10.12] => (item=/var/run/pgbouncer)
changed: [10.10.10.13] => (item=/var/run/pgbouncer)
changed: [10.10.10.11] => (item=/var/run/pgbouncer)
TASK [postgres : Recreate dirs with owner postgres] ******************************************************************************************************************************************************
changed: [10.10.10.10] => (item=/etc/pgbouncer)
changed: [10.10.10.11] => (item=/etc/pgbouncer)
changed: [10.10.10.12] => (item=/etc/pgbouncer)
changed: [10.10.10.13] => (item=/etc/pgbouncer)
changed: [10.10.10.10] => (item=/var/log/pgbouncer)
changed: [10.10.10.12] => (item=/var/log/pgbouncer)
changed: [10.10.10.11] => (item=/var/log/pgbouncer)
changed: [10.10.10.13] => (item=/var/log/pgbouncer)
changed: [10.10.10.10] => (item=/var/run/pgbouncer)
changed: [10.10.10.12] => (item=/var/run/pgbouncer)
changed: [10.10.10.11] => (item=/var/run/pgbouncer)
changed: [10.10.10.13] => (item=/var/run/pgbouncer)
TASK [postgres : Copy /etc/pgbouncer/pgbouncer.ini] ******************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.12]
changed: [10.10.10.11]
changed: [10.10.10.13]
TASK [postgres : Copy /etc/pgbouncer/pgb_hba.conf] *******************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.13]
changed: [10.10.10.12]
TASK [postgres : Touch userlist and database list] *******************************************************************************************************************************************************
changed: [10.10.10.10] => (item=database.txt)
changed: [10.10.10.11] => (item=database.txt)
changed: [10.10.10.12] => (item=database.txt)
changed: [10.10.10.13] => (item=database.txt)
changed: [10.10.10.10] => (item=userlist.txt)
changed: [10.10.10.11] => (item=userlist.txt)
changed: [10.10.10.12] => (item=userlist.txt)
changed: [10.10.10.13] => (item=userlist.txt)
TASK [postgres : Add default users to pgbouncer] *********************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
changed: [10.10.10.10]
TASK [postgres : Copy pgbouncer systemd service] *********************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.12]
changed: [10.10.10.11]
changed: [10.10.10.13]
TASK [postgres : Launch pgbouncer pool service] **********************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.12]
changed: [10.10.10.11]
changed: [10.10.10.13]
TASK [postgres : Wait for pgbouncer service online] ******************************************************************************************************************************************************
ok: [10.10.10.10]
ok: [10.10.10.11]
ok: [10.10.10.12]
ok: [10.10.10.13]
TASK [postgres : Check pgbouncer service is ready] *******************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [postgres : include_tasks] **************************************************************************************************************************************************************************
included: /private/tmp/pigsty/roles/postgres/tasks/createuser.yml for 10.10.10.10 => (item={'name': 'dbuser_meta', 'password': 'DBUser.Meta', 'login': True, 'superuser': False, 'createdb': False, 'createrole': False, 'inherit': True, 'replication': False, 'bypassrls': False, 'connlimit': -1, 'expire_at': '2030-12-31', 'expire_in': 365, 'roles': ['dbrole_readwrite'], 'pgbouncer': True, 'parameters': {'search_path': 'public'}, 'comment': 'test user'})
included: /private/tmp/pigsty/roles/postgres/tasks/createuser.yml for 10.10.10.10 => (item={'name': 'dbuser_vonng2', 'password': 'DBUser.Vonng', 'roles': ['dbrole_offline'], 'expire_in': 365, 'pgbouncer': False, 'comment': 'example personal user for interactive queries'})
included: /private/tmp/pigsty/roles/postgres/tasks/createuser.yml for 10.10.10.11, 10.10.10.12, 10.10.10.13 => (item={'name': 'test', 'password': 'test', 'roles': ['dbrole_readwrite'], 'pgbouncer': True, 'comment': 'default test user for production usage'})
TASK [postgres : Render user dbuser_meta creation sql] ***************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [postgres : Execute user dbuser_meta creation sql on primary] ***************************************************************************************************************************************
changed: [10.10.10.10]
TASK [postgres : Add user to pgbouncer] ******************************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [postgres : Render user dbuser_vonng2 creation sql] *************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [postgres : Execute user dbuser_vonng2 creation sql on primary] *************************************************************************************************************************************
changed: [10.10.10.10]
TASK [postgres : Add user to pgbouncer] ******************************************************************************************************************************************************************
skipping: [10.10.10.10]
TASK [postgres : Render user test creation sql] **********************************************************************************************************************************************************
skipping: [10.10.10.12]
skipping: [10.10.10.13]
changed: [10.10.10.11]
TASK [postgres : Execute user test creation sql on primary] **********************************************************************************************************************************************
skipping: [10.10.10.12]
skipping: [10.10.10.13]
changed: [10.10.10.11]
TASK [postgres : Add user to pgbouncer] ******************************************************************************************************************************************************************
changed: [10.10.10.12]
changed: [10.10.10.11]
changed: [10.10.10.13]
TASK [postgres : include_tasks] **************************************************************************************************************************************************************************
included: /private/tmp/pigsty/roles/postgres/tasks/createdb.yml for 10.10.10.10 => (item={'name': 'meta', 'allowconn': True, 'revokeconn': False, 'connlimit': -1, 'extensions': [{'name': 'postgis', 'schema': 'public'}], 'parameters': {'enable_partitionwise_join': True}, 'pgbouncer': True, 'comment': 'pigsty meta database'})
included: /private/tmp/pigsty/roles/postgres/tasks/createdb.yml for 10.10.10.11, 10.10.10.12, 10.10.10.13 => (item={'name': 'test'})
TASK [postgres : debug] **********************************************************************************************************************************************************************************
ok: [10.10.10.10] => {
"msg": {
"allowconn": true,
"comment": "pigsty meta database",
"connlimit": -1,
"extensions": [
{
"name": "postgis",
"schema": "public"
}
],
"name": "meta",
"parameters": {
"enable_partitionwise_join": true
},
"pgbouncer": true,
"revokeconn": false
}
}
TASK [postgres : Render database meta creation sql] ******************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [postgres : Render database meta baseline sql] ******************************************************************************************************************************************************
skipping: [10.10.10.10]
TASK [postgres : Execute database meta creation command] *************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [postgres : Execute database meta creation sql] *****************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [postgres : Execute database meta creation sql] *****************************************************************************************************************************************************
skipping: [10.10.10.10]
TASK [postgres : Add pgbouncer busniess database] ********************************************************************************************************************************************************
changed: [10.10.10.10]
TASK [postgres : debug] **********************************************************************************************************************************************************************************
ok: [10.10.10.11] => {
"msg": {
"name": "test"
}
}
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [postgres : Render database test creation sql] ******************************************************************************************************************************************************
skipping: [10.10.10.12]
skipping: [10.10.10.13]
changed: [10.10.10.11]
TASK [postgres : Render database test baseline sql] ******************************************************************************************************************************************************
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [postgres : Execute database test creation command] *************************************************************************************************************************************************
skipping: [10.10.10.12]
skipping: [10.10.10.13]
changed: [10.10.10.11]
TASK [postgres : Execute database test creation sql] *****************************************************************************************************************************************************
skipping: [10.10.10.12]
skipping: [10.10.10.13]
changed: [10.10.10.11]
TASK [postgres : Execute database test creation sql] *****************************************************************************************************************************************************
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [postgres : Add pgbouncer busniess database] ********************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.13]
changed: [10.10.10.12]
TASK [postgres : Reload pgbouncer to add db and users] ***************************************************************************************************************************************************
changed: [10.10.10.13]
changed: [10.10.10.12]
changed: [10.10.10.10]
changed: [10.10.10.11]
TASK [postgres : Copy pg service definition to consul] ***************************************************************************************************************************************************
changed: [10.10.10.10] => (item=postgres)
changed: [10.10.10.11] => (item=postgres)
changed: [10.10.10.12] => (item=postgres)
changed: [10.10.10.13] => (item=postgres)
changed: [10.10.10.10] => (item=pgbouncer)
changed: [10.10.10.11] => (item=pgbouncer)
changed: [10.10.10.12] => (item=pgbouncer)
changed: [10.10.10.13] => (item=pgbouncer)
changed: [10.10.10.10] => (item=patroni)
changed: [10.10.10.11] => (item=patroni)
changed: [10.10.10.12] => (item=patroni)
changed: [10.10.10.13] => (item=patroni)
TASK [postgres : Reload postgres consul service] *********************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
changed: [10.10.10.10]
TASK [postgres : Render grafana datasource definition] ***************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [postgres : Register datasource to grafana] *********************************************************************************************************************************************************
[WARNING]: Consider using the get_url or uri module rather than running 'curl'. If you need to use command because get_url or uri is insufficient you can add 'warn: false' to this command task or set
'command_warnings=False' in ansible.cfg to get rid of this message.
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.13]
changed: [10.10.10.12]
TASK [monitor : Install exporter yum repo] ***************************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [monitor : Install node_exporter and pg_exporter] ***************************************************************************************************************************************************
skipping: [10.10.10.10] => (item=node_exporter)
skipping: [10.10.10.10] => (item=pg_exporter)
skipping: [10.10.10.11] => (item=node_exporter)
skipping: [10.10.10.11] => (item=pg_exporter)
skipping: [10.10.10.12] => (item=node_exporter)
skipping: [10.10.10.12] => (item=pg_exporter)
skipping: [10.10.10.13] => (item=node_exporter)
skipping: [10.10.10.13] => (item=pg_exporter)
TASK [monitor : Copy node_exporter binary] ***************************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [monitor : Copy pg_exporter binary] *****************************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [monitor : Create /etc/pg_exporter conf dir] ********************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [monitor : Copy default pg_exporter.yaml] ***********************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.10]
changed: [10.10.10.13]
TASK [monitor : Config /etc/default/pg_exporter] *********************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.10]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [monitor : Config pg_exporter service unit] *********************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.10]
changed: [10.10.10.13]
TASK [monitor : Launch pg_exporter systemd service] ******************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
changed: [10.10.10.10]
TASK [monitor : Wait for pg_exporter service online] *****************************************************************************************************************************************************
ok: [10.10.10.10]
ok: [10.10.10.12]
ok: [10.10.10.11]
ok: [10.10.10.13]
TASK [monitor : Register pg-exporter consul service] *****************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [monitor : Reload pg-exporter consul service] *******************************************************************************************************************************************************
changed: [10.10.10.12]
changed: [10.10.10.11]
changed: [10.10.10.13]
changed: [10.10.10.10]
TASK [monitor : Config pgbouncer_exporter opts] **********************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [monitor : Config pgbouncer_exporter service] *******************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [monitor : Launch pgbouncer_exporter service] *******************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.13]
changed: [10.10.10.11]
changed: [10.10.10.12]
TASK [monitor : Wait for pgbouncer_exporter online] ******************************************************************************************************************************************************
ok: [10.10.10.10]
ok: [10.10.10.11]
ok: [10.10.10.12]
ok: [10.10.10.13]
TASK [monitor : Register pgb-exporter consul service] ****************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.13]
changed: [10.10.10.11]
changed: [10.10.10.12]
TASK [monitor : Reload pgb-exporter consul service] ******************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.10]
changed: [10.10.10.13]
TASK [monitor : Copy node_exporter systemd service] ******************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.12]
changed: [10.10.10.11]
changed: [10.10.10.13]
TASK [monitor : Config default node_exporter options] ****************************************************************************************************************************************************
changed: [10.10.10.12]
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.13]
TASK [monitor : Launch node_exporter service unit] *******************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.12]
changed: [10.10.10.13]
changed: [10.10.10.11]
TASK [monitor : Wait for node_exporter online] ***********************************************************************************************************************************************************
ok: [10.10.10.10]
ok: [10.10.10.11]
ok: [10.10.10.12]
ok: [10.10.10.13]
TASK [monitor : Register node-exporter service to consul] ************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.13]
changed: [10.10.10.12]
TASK [monitor : Reload node-exporter consul service] *****************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.10]
changed: [10.10.10.13]
TASK [service : Make sure haproxy is installed] **********************************************************************************************************************************************************
ok: [10.10.10.10]
ok: [10.10.10.11]
ok: [10.10.10.12]
ok: [10.10.10.13]
TASK [service : Create haproxy directory] ****************************************************************************************************************************************************************
ok: [10.10.10.10]
ok: [10.10.10.12]
ok: [10.10.10.13]
ok: [10.10.10.11]
TASK [service : Copy haproxy systemd service file] *******************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.12]
changed: [10.10.10.13]
changed: [10.10.10.11]
TASK [service : Fetch postgres cluster memberships] ******************************************************************************************************************************************************
ok: [10.10.10.10]
ok: [10.10.10.11]
ok: [10.10.10.12]
ok: [10.10.10.13]
TASK [service : Templating /etc/haproxy/haproxy.cfg] *****************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.13]
changed: [10.10.10.11]
changed: [10.10.10.12]
TASK [service : Launch haproxy load balancer service] ****************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.12]
changed: [10.10.10.13]
changed: [10.10.10.11]
TASK [service : Wait for haproxy load balancer online] ***************************************************************************************************************************************************
ok: [10.10.10.10]
ok: [10.10.10.12]
ok: [10.10.10.11]
ok: [10.10.10.13]
TASK [service : Reload haproxy load balancer service] ****************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.13]
changed: [10.10.10.12]
changed: [10.10.10.11]
TASK [service : Copy haproxy exporter definition] ********************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.10]
changed: [10.10.10.13]
changed: [10.10.10.12]
TASK [service : Copy haproxy service definition] *********************************************************************************************************************************************************
changed: [10.10.10.12] => (item={'name': 'primary', 'src_ip': '*', 'src_port': 5433, 'dst_port': 'pgbouncer', 'check_url': '/primary', 'selector': '[]'})
changed: [10.10.10.10] => (item={'name': 'primary', 'src_ip': '*', 'src_port': 5433, 'dst_port': 'pgbouncer', 'check_url': '/primary', 'selector': '[]'})
changed: [10.10.10.11] => (item={'name': 'primary', 'src_ip': '*', 'src_port': 5433, 'dst_port': 'pgbouncer', 'check_url': '/primary', 'selector': '[]'})
changed: [10.10.10.13] => (item={'name': 'primary', 'src_ip': '*', 'src_port': 5433, 'dst_port': 'pgbouncer', 'check_url': '/primary', 'selector': '[]'})
changed: [10.10.10.10] => (item={'name': 'replica', 'src_ip': '*', 'src_port': 5434, 'dst_port': 'pgbouncer', 'check_url': '/read-only', 'selector': '[]', 'selector_backup': '[? pg_role == `primary`]'})
changed: [10.10.10.12] => (item={'name': 'replica', 'src_ip': '*', 'src_port': 5434, 'dst_port': 'pgbouncer', 'check_url': '/read-only', 'selector': '[]', 'selector_backup': '[? pg_role == `primary`]'})
changed: [10.10.10.13] => (item={'name': 'replica', 'src_ip': '*', 'src_port': 5434, 'dst_port': 'pgbouncer', 'check_url': '/read-only', 'selector': '[]', 'selector_backup': '[? pg_role == `primary`]'})
changed: [10.10.10.11] => (item={'name': 'replica', 'src_ip': '*', 'src_port': 5434, 'dst_port': 'pgbouncer', 'check_url': '/read-only', 'selector': '[]', 'selector_backup': '[? pg_role == `primary`]'})
changed: [10.10.10.10] => (item={'name': 'default', 'src_ip': '*', 'src_port': 5436, 'dst_port': 'postgres', 'check_method': 'http', 'check_port': 'patroni', 'check_url': '/primary', 'check_code': 200, 'selector': '[]', 'haproxy': {'maxconn': 3000, 'balance': 'roundrobin', 'default_server_options': 'inter 3s fastinter 1s downinter 5s rise 3 fall 3 on-marked-down shutdown-sessions slowstart 30s maxconn 3000 maxqueue 128 weight 100'}})
changed: [10.10.10.12] => (item={'name': 'default', 'src_ip': '*', 'src_port': 5436, 'dst_port': 'postgres', 'check_method': 'http', 'check_port': 'patroni', 'check_url': '/primary', 'check_code': 200, 'selector': '[]', 'haproxy': {'maxconn': 3000, 'balance': 'roundrobin', 'default_server_options': 'inter 3s fastinter 1s downinter 5s rise 3 fall 3 on-marked-down shutdown-sessions slowstart 30s maxconn 3000 maxqueue 128 weight 100'}})
changed: [10.10.10.11] => (item={'name': 'default', 'src_ip': '*', 'src_port': 5436, 'dst_port': 'postgres', 'check_method': 'http', 'check_port': 'patroni', 'check_url': '/primary', 'check_code': 200, 'selector': '[]', 'haproxy': {'maxconn': 3000, 'balance': 'roundrobin', 'default_server_options': 'inter 3s fastinter 1s downinter 5s rise 3 fall 3 on-marked-down shutdown-sessions slowstart 30s maxconn 3000 maxqueue 128 weight 100'}})
changed: [10.10.10.13] => (item={'name': 'default', 'src_ip': '*', 'src_port': 5436, 'dst_port': 'postgres', 'check_method': 'http', 'check_port': 'patroni', 'check_url': '/primary', 'check_code': 200, 'selector': '[]', 'haproxy': {'maxconn': 3000, 'balance': 'roundrobin', 'default_server_options': 'inter 3s fastinter 1s downinter 5s rise 3 fall 3 on-marked-down shutdown-sessions slowstart 30s maxconn 3000 maxqueue 128 weight 100'}})
changed: [10.10.10.10] => (item={'name': 'offline', 'src_ip': '*', 'src_port': 5438, 'dst_port': 'postgres', 'check_url': '/replica', 'selector': '[? pg_role == `offline` || pg_offline_query ]', 'selector_backup': '[? pg_role == `replica` && !pg_offline_query]'})
changed: [10.10.10.12] => (item={'name': 'offline', 'src_ip': '*', 'src_port': 5438, 'dst_port': 'postgres', 'check_url': '/replica', 'selector': '[? pg_role == `offline` || pg_offline_query ]', 'selector_backup': '[? pg_role == `replica` && !pg_offline_query]'})
changed: [10.10.10.11] => (item={'name': 'offline', 'src_ip': '*', 'src_port': 5438, 'dst_port': 'postgres', 'check_url': '/replica', 'selector': '[? pg_role == `offline` || pg_offline_query ]', 'selector_backup': '[? pg_role == `replica` && !pg_offline_query]'})
changed: [10.10.10.13] => (item={'name': 'offline', 'src_ip': '*', 'src_port': 5438, 'dst_port': 'postgres', 'check_url': '/replica', 'selector': '[? pg_role == `offline` || pg_offline_query ]', 'selector_backup': '[? pg_role == `replica` && !pg_offline_query]'})
TASK [service : Reload haproxy consul service] ***********************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.13]
changed: [10.10.10.12]
TASK [service : Make sure vip-manager is installed] ******************************************************************************************************************************************************
ok: [10.10.10.10]
ok: [10.10.10.13]
ok: [10.10.10.11]
ok: [10.10.10.12]
TASK [service : Copy vip-manager systemd service file] ***************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.12]
changed: [10.10.10.11]
changed: [10.10.10.13]
TASK [service : create vip-manager systemd drop-in dir] **************************************************************************************************************************************************
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.10]
changed: [10.10.10.13]
TASK [service : create vip-manager systemd drop-in file] *************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.13]
changed: [10.10.10.12]
changed: [10.10.10.11]
TASK [service : Templating /etc/default/vip-manager.yml] *************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.12]
changed: [10.10.10.13]
TASK [service : Launch vip-manager] **********************************************************************************************************************************************************************
changed: [10.10.10.10]
changed: [10.10.10.11]
changed: [10.10.10.13]
changed: [10.10.10.12]
TASK [service : Fetch postgres cluster memberships] ******************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
TASK [service : Render L4 VIP configs] *******************************************************************************************************************************************************************
skipping: [10.10.10.10] => (item={'name': 'primary', 'src_ip': '*', 'src_port': 5433, 'dst_port': 'pgbouncer', 'check_url': '/primary', 'selector': '[]'})
skipping: [10.10.10.10] => (item={'name': 'replica', 'src_ip': '*', 'src_port': 5434, 'dst_port': 'pgbouncer', 'check_url': '/read-only', 'selector': '[]', 'selector_backup': '[? pg_role == `primary`]'})
skipping: [10.10.10.10] => (item={'name': 'default', 'src_ip': '*', 'src_port': 5436, 'dst_port': 'postgres', 'check_method': 'http', 'check_port': 'patroni', 'check_url': '/primary', 'check_code': 200, 'selector': '[]', 'haproxy': {'maxconn': 3000, 'balance': 'roundrobin', 'default_server_options': 'inter 3s fastinter 1s downinter 5s rise 3 fall 3 on-marked-down shutdown-sessions slowstart 30s maxconn 3000 maxqueue 128 weight 100'}})
skipping: [10.10.10.11] => (item={'name': 'primary', 'src_ip': '*', 'src_port': 5433, 'dst_port': 'pgbouncer', 'check_url': '/primary', 'selector': '[]'})
skipping: [10.10.10.10] => (item={'name': 'offline', 'src_ip': '*', 'src_port': 5438, 'dst_port': 'postgres', 'check_url': '/replica', 'selector': '[? pg_role == `offline` || pg_offline_query ]', 'selector_backup': '[? pg_role == `replica` && !pg_offline_query]'})
skipping: [10.10.10.11] => (item={'name': 'replica', 'src_ip': '*', 'src_port': 5434, 'dst_port': 'pgbouncer', 'check_url': '/read-only', 'selector': '[]', 'selector_backup': '[? pg_role == `primary`]'})
skipping: [10.10.10.11] => (item={'name': 'default', 'src_ip': '*', 'src_port': 5436, 'dst_port': 'postgres', 'check_method': 'http', 'check_port': 'patroni', 'check_url': '/primary', 'check_code': 200, 'selector': '[]', 'haproxy': {'maxconn': 3000, 'balance': 'roundrobin', 'default_server_options': 'inter 3s fastinter 1s downinter 5s rise 3 fall 3 on-marked-down shutdown-sessions slowstart 30s maxconn 3000 maxqueue 128 weight 100'}})
skipping: [10.10.10.12] => (item={'name': 'primary', 'src_ip': '*', 'src_port': 5433, 'dst_port': 'pgbouncer', 'check_url': '/primary', 'selector': '[]'})
skipping: [10.10.10.11] => (item={'name': 'offline', 'src_ip': '*', 'src_port': 5438, 'dst_port': 'postgres', 'check_url': '/replica', 'selector': '[? pg_role == `offline` || pg_offline_query ]', 'selector_backup': '[? pg_role == `replica` && !pg_offline_query]'})
skipping: [10.10.10.12] => (item={'name': 'replica', 'src_ip': '*', 'src_port': 5434, 'dst_port': 'pgbouncer', 'check_url': '/read-only', 'selector': '[]', 'selector_backup': '[? pg_role == `primary`]'})
skipping: [10.10.10.13] => (item={'name': 'primary', 'src_ip': '*', 'src_port': 5433, 'dst_port': 'pgbouncer', 'check_url': '/primary', 'selector': '[]'})
skipping: [10.10.10.12] => (item={'name': 'default', 'src_ip': '*', 'src_port': 5436, 'dst_port': 'postgres', 'check_method': 'http', 'check_port': 'patroni', 'check_url': '/primary', 'check_code': 200, 'selector': '[]', 'haproxy': {'maxconn': 3000, 'balance': 'roundrobin', 'default_server_options': 'inter 3s fastinter 1s downinter 5s rise 3 fall 3 on-marked-down shutdown-sessions slowstart 30s maxconn 3000 maxqueue 128 weight 100'}})
skipping: [10.10.10.13] => (item={'name': 'replica', 'src_ip': '*', 'src_port': 5434, 'dst_port': 'pgbouncer', 'check_url': '/read-only', 'selector': '[]', 'selector_backup': '[? pg_role == `primary`]'})
skipping: [10.10.10.12] => (item={'name': 'offline', 'src_ip': '*', 'src_port': 5438, 'dst_port': 'postgres', 'check_url': '/replica', 'selector': '[? pg_role == `offline` || pg_offline_query ]', 'selector_backup': '[? pg_role == `replica` && !pg_offline_query]'})
skipping: [10.10.10.13] => (item={'name': 'default', 'src_ip': '*', 'src_port': 5436, 'dst_port': 'postgres', 'check_method': 'http', 'check_port': 'patroni', 'check_url': '/primary', 'check_code': 200, 'selector': '[]', 'haproxy': {'maxconn': 3000, 'balance': 'roundrobin', 'default_server_options': 'inter 3s fastinter 1s downinter 5s rise 3 fall 3 on-marked-down shutdown-sessions slowstart 30s maxconn 3000 maxqueue 128 weight 100'}})
skipping: [10.10.10.13] => (item={'name': 'offline', 'src_ip': '*', 'src_port': 5438, 'dst_port': 'postgres', 'check_url': '/replica', 'selector': '[? pg_role == `offline` || pg_offline_query ]', 'selector_backup': '[? pg_role == `replica` && !pg_offline_query]'})
TASK [service : include_tasks] ***************************************************************************************************************************************************************************
skipping: [10.10.10.10]
skipping: [10.10.10.11]
skipping: [10.10.10.12]
skipping: [10.10.10.13]
PLAY RECAP ***********************************************************************************************************************************************************************************************
10.10.10.10 : ok=264 changed=205 unreachable=0 failed=0 skipped=62 rescued=0 ignored=0
10.10.10.11 : ok=182 changed=146 unreachable=0 failed=0 skipped=55 rescued=0 ignored=0
10.10.10.12 : ok=171 changed=135 unreachable=0 failed=0 skipped=66 rescued=0 ignored=0
10.10.10.13 : ok=171 changed=135 unreachable=0 failed=0 skipped=66 rescued=0 ignored=0
It is strongly recommended to run the make cache command after the first initialization completes. It packs the downloaded software into an offline cache package placed at files/pkg.tgz. The next time a new Pigsty environment is created, as long as the host operating system is the same, this offline package can be reused directly, saving a great deal of download time.
mon-view
After initialization completes, you can visit http://pigsty in a browser to reach the monitoring system home page. The default username and password are both admin.
If DNS is not configured, or the default IP address is not used, you can also reach the monitoring home page directly at http://meta_ip_address:3000.
$ make mon-view
open -n 'http://g.pigsty/'
7 - PG Exporter
PG Exporter
Prometheus exporter for PostgreSQL metrics. Gives you complete insight into your favourite elephant!
The latest binaries & RPMs can be found on the release page. Supported versions: PostgreSQL 9.4+ & Pgbouncer 1.8+. The default collector definitions are compatible with PostgreSQL 10, 11, 12, and 13.
Latest pg_exporter version: 0.3.1
Features
- Supports both Postgres & Pgbouncer (Pgbouncer mode is enabled when the target dbname is pgbouncer).
- Flexible: almost all metrics are defined in customizable, SQL-style configuration files (see the collector sketch after this list).
- Fine-grained execution control (tag filter, fact filter, version filter, timeout, cache, etc.).
- Dynamic planning: users may provide multiple branches of a metric query; only the branches matching the server version, facts, and tags are actually installed.
- Configurable caching policy & query timeout.
- Rich metrics about pg_exporter itself.
- Auto-discovers multiple databases within the same cluster (multi-database scrape TBD).
- Tested and verified in real-world production environments for years (200+ nodes).
- Overwhelming metrics: complete insight into your favourite elephant!
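To illustrate the SQL-style configuration, here is a hedged sketch of what a single collector branch might look like; the branch name, query, and metric columns below are invented for this example, so consult the pg_exporter.yaml shipped with the release for the authoritative schema:
pg_database_age:
  name: pg_database_age                # collector branch name (illustrative)
  desc: per-database age statistics    # human-readable description
  query: SELECT datname, age(datfrozenxid) AS age FROM pg_database;
  ttl: 10                              # cache the result for 10 seconds
  timeout: 0.1                         # cancel the query after 100ms
  min_version: 100000                  # only install on PostgreSQL 10+
  tags: [cluster]                      # run once per cluster
  metrics:
    - datname:
        usage: LABEL
        description: database name
    - age:
        usage: GAUGE
        description: age of the database frozen xid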
Performance
In extreme scenarios (hundreds of thousands of tables and tens of thousands of distinct queries), a single scrape may take on the order of seconds.
Fortunately, every metric collector can be disabled individually, and pg_exporter allows a proactive timeout cancellation to be configured per collector (100ms by default).
Self-monitoring
The exporter also exposes metrics about the monitoring component itself, including:
- whether the exporter is alive, its uptime, and how many times per minute it is scraped;
- the duration of each monitoring query, along with the number of metrics and errors it produces.
Prometheus configuration
A Prometheus scrape interval of 10~15 seconds is recommended, together with an appropriate scrape timeout.
For demos or special cases, a finer interval (e.g. 2 or 5 seconds) can also be configured.
A single Prometheus node can monitor a few hundred instances, roughly several million time series (Dell R740, 64 cores / 400GB memory / 3TB PCI-E SSD).
Larger deployments can be scaled out via Prometheus cascading, federation, or sharding, e.g. deploying one Prometheus per database cluster and using an upper-level Prometheus to coordinate scraping and compute derived metrics.
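For reference, a minimal sketch of such scrape settings in prometheus.yml; the concrete values and the layout of Pigsty's shipped prometheus.yml may differ:
global:
  scrape_interval: 10s      # 10~15s recommended for production
  scrape_timeout: 8s        # keep the timeout below the scrape interval
  evaluation_interval: 10s  # evaluate recording/alerting rules at the same cadence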
8 - Prometheus Service Discovery
When the static-file service discovery mode prometheus_sd_method == 'static' is used, Prometheus discovers its targets from static files. By default, the target configuration files are all of the yml files under the /etc/prometheus/targets/ directory.
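A minimal sketch of how such a targets directory might be wired into prometheus.yml with file-based service discovery; the job name is illustrative, and the actual jobs and relabeling shipped by Pigsty may differ:
scrape_configs:
  - job_name: pg                            # postgres / pgbouncer / node / haproxy exporters
    metrics_path: /metrics
    file_sd_configs:
      - files: [/etc/prometheus/targets/*.yml]
        refresh_interval: 10s               # re-read the target files every 10 seconds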
Centralized configuration
When prometheus_sd_target is set to batch mode, Pigsty manages Prometheus monitoring targets in a centralized fashion:
all monitoring targets are defined in a single configuration file, /etc/prometheus/targets/all.yml.
#==============================================================#
# File : targets/all.yml
# Ctime : 2021-02-18
# Mtime : 2021-02-18
# Atime : 2021-03-01 16:46
# Note : Managed by Ansible
# Desc : Prometheus Static Monitoring Targets Definition
# Path : /etc/prometheus/targets/all.yml
# Copyright (C) 2018-2021 Ruohang Feng
#==============================================================#
# static monitor targets, batch version
#======> pg-meta-1 [primary]
- labels: {cls: pg-meta, ins: pg-meta-1, ip: 10.10.10.10, role: primary, svc: pg-meta-primary}
targets: [10.10.10.10:9630, 10.10.10.10:9100, 10.10.10.10:9631, 10.10.10.10:9101]
#======> pg-test-1 [primary]
- labels: {cls: pg-test, ins: pg-test-1, ip: 10.10.10.11, role: primary, svc: pg-test-primary}
targets: [10.10.10.11:9630, 10.10.10.11:9100, 10.10.10.11:9631, 10.10.10.11:9101]
#======> pg-test-2 [replica]
- labels: {cls: pg-test, ins: pg-test-2, ip: 10.10.10.12, role: replica, svc: pg-test-replica}
targets: [10.10.10.12:9630, 10.10.10.12:9100, 10.10.10.12:9631, 10.10.10.12:9101]
#======> pg-test-3 [replica]
- labels: {cls: pg-test, ins: pg-test-3, ip: 10.10.10.13, role: replica, svc: pg-test-replica}
targets: [10.10.10.13:9630, 10.10.10.13:9100, 10.10.10.13:9631, 10.10.10.13:9101]
Decentralized configuration
When prometheus_sd_target is set to single mode, Pigsty manages Prometheus monitoring targets separately:
each monitored instance gets its own dedicated configuration file at /etc/prometheus/targets/{{ pg_instance }}.yml.
Taking the pg-meta-1 instance as an example, its configuration file is /etc/prometheus/targets/pg-meta-1.yml, with the following content:
# pg-meta-1 [primary]
- labels: {cls: pg-meta, ins: pg-meta-1, ip: 10.10.10.10, role: primary, svc: pg-meta-primary}
targets: [10.10.10.10:9630, 10.10.10.10:9100, 10.10.10.10:9631, 10.10.10.10:9101]
9 - Tuned Template
9.1 - OLTP
The Tuned OLTP template is optimized primarily for latency. It targets nodes such as a Dell R740 with 64 cores / 400GB memory and PCI-E SSD; adjust it to match your actual hardware.
# tuned configuration
#==============================================================#
# File : tuned.conf
# Mtime : 2020-06-29
# Desc : Tune operating system to oltp mode
# Path : /etc/tuned/oltp/tuned.conf
# Author : Vonng(fengruohang@outlook.com)
# Copyright (C) 2018-2021 Ruohang Feng
#==============================================================#
[main]
summary=Optimize for PostgreSQL OLTP System
include=network-latency
[cpu]
force_latency=1
governor=performance
energy_perf_bias=performance
min_perf_pct=100
[vm]
# disable transparent hugepages
transparent_hugepages=never
[sysctl]
#-------------------------------------------------------------#
# KERNEL #
#-------------------------------------------------------------#
# disable numa balancing
kernel.numa_balancing=0
# total shmem size in bytes: $(expr $(getconf _PHYS_PAGES) / 2 \* $(getconf PAGE_SIZE))
{% if param_shmall is defined and param_shmall != '' %}
kernel.shmall = {{ param_shmall }}
{% endif %}
# total shmem size in pages: $(expr $(getconf _PHYS_PAGES) / 2)
{% if param_shmmax is defined and param_shmmax != '' %}
kernel.shmmax = {{ param_shmmax }}
{% endif %}
# total shmem segs 4096 -> 8192
kernel.shmmni=8192
# total msg queue number, set to mem size in MB
kernel.msgmni=32768
# max length of message queue
kernel.msgmnb=65536
# max size of message
kernel.msgmax=65536
kernel.pid_max=131072
# max(Sem in Set)=2048, max(Sem)=max(Sem in Set) x max(SemSet) , max(Sem per Ops)=2048, max(SemSet)=65536
kernel.sem=2048 134217728 2048 65536
# do not sched postgres process in group
kernel.sched_autogroup_enabled = 0
# total time the scheduler will consider a migrated process cache hot and, thus, less likely to be remigrated
# default = 0.5ms (500000ns), update to 5ms, depending on your typical query (e.g. < 1ms)
kernel.sched_migration_cost_ns=5000000
#-------------------------------------------------------------#
# VM #
#-------------------------------------------------------------#
# try not using swap
vm.swappiness=0
# disable when most mem are for file cache
vm.zone_reclaim_mode=0
# overcommit threshold = 80%
vm.overcommit_memory=2
vm.overcommit_ratio=80
# vm.dirty_background_bytes=67108864 # 64MB mem (2xRAID cache) wake the bgwriter
vm.dirty_background_ratio=3 # latency-performance default
vm.dirty_ratio=10 # latency-performance default
# deny access on 0x00000 - 0x10000
vm.mmap_min_addr=65536
#-------------------------------------------------------------#
# Filesystem #
#-------------------------------------------------------------#
# max open files: 382589 -> 167772160
fs.file-max=167772160
# max concurrent unfinished async io, should be larger than 1M. 65536->1M
fs.aio-max-nr=1048576
#-------------------------------------------------------------#
# Network #
#-------------------------------------------------------------#
# max connection in listen queue (triggers retrans if full)
net.core.somaxconn=65535
net.core.netdev_max_backlog=8192
# tcp receive/transmit buffer default = 256KiB
net.core.rmem_default=262144
net.core.wmem_default=262144
# receive/transmit buffer limit = 4MiB
net.core.rmem_max=4194304
net.core.wmem_max=4194304
# ip options
net.ipv4.ip_forward=1
net.ipv4.ip_nonlocal_bind=1
net.ipv4.ip_local_port_range=32768 65000
# tcp options
net.ipv4.tcp_timestamps=1
net.ipv4.tcp_tw_reuse=1
net.ipv4.tcp_tw_recycle=0
net.ipv4.tcp_syncookies=0
net.ipv4.tcp_synack_retries=1
net.ipv4.tcp_syn_retries=1
# tcp read/write buffer
net.ipv4.tcp_rmem="4096 87380 16777216"
net.ipv4.tcp_wmem="4096 16384 16777216"
net.ipv4.udp_mem="3145728 4194304 16777216"
# tcp probe fail interval: 75s -> 20s
net.ipv4.tcp_keepalive_intvl=20
# tcp break after 3 * 20s = 1m
net.ipv4.tcp_keepalive_probes=3
# probe period = 1 min
net.ipv4.tcp_keepalive_time=60
net.ipv4.tcp_fin_timeout=5
net.ipv4.tcp_max_tw_buckets=262144
net.ipv4.tcp_max_syn_backlog=8192
net.ipv4.neigh.default.gc_thresh1=80000
net.ipv4.neigh.default.gc_thresh2=90000
net.ipv4.neigh.default.gc_thresh3=100000
net.bridge.bridge-nf-call-iptables=1
net.bridge.bridge-nf-call-ip6tables=1
net.bridge.bridge-nf-call-arptables=1
# max connection tracking number
net.netfilter.nf_conntrack_max=1048576
9.2 - TINY
The Tuned TINY template is optimized for very low-spec virtual machines.
Its typical target is a 1-core / 1GB virtual machine node; adjust it to match your actual hardware.
# tuned configuration
#==============================================================#
# File : tuned.conf
# Mtime : 2020-06-29
# Desc : Tune operating system to tiny mode
# Path : /etc/tuned/tiny/tuned.conf
# Author : Vonng(fengruohang@outlook.com)
# Copyright (C) 2018-2021 Ruohang Feng
#==============================================================#
[main]
summary=Optimize for PostgreSQL TINY System
# include=virtual-guest
[vm]
# disable transparent hugepages
transparent_hugepages=never
[sysctl]
#-------------------------------------------------------------#
# KERNEL #
#-------------------------------------------------------------#
# disable numa balancing
kernel.numa_balancing=0
# If a workload mostly uses anonymous memory and it hits this limit, the entire
# working set is buffered for I/O, and any more write buffering would require
# swapping, so it's time to throttle writes until I/O can catch up. Workloads
# that mostly use file mappings may be able to use even higher values.
#
# The generator of dirty data starts writeback at this percentage (system default
# is 20%)
vm.dirty_ratio = 40
# Filesystem I/O is usually much more efficient than swapping, so try to keep
# swapping low. It's usually safe to go even lower than this on systems with
# server-grade storage.
vm.swappiness = 30
#-------------------------------------------------------------#
# Network #
#-------------------------------------------------------------#
# tcp options
net.ipv4.tcp_timestamps=1
net.ipv4.tcp_tw_reuse=1
net.ipv4.tcp_tw_recycle=0
net.ipv4.tcp_syncookies=0
net.ipv4.tcp_synack_retries=1
net.ipv4.tcp_syn_retries=1
# tcp probe fail interval: 75s -> 20s
net.ipv4.tcp_keepalive_intvl=20
# tcp break after 3 * 20s = 1m
net.ipv4.tcp_keepalive_probes=3
# probe period = 1 min
net.ipv4.tcp_keepalive_time=60
9.3 - OLAP
The Tuned OLAP template is optimized primarily for throughput and computational parallelism.
It targets nodes such as a Dell R740 with 64 cores / 400GB memory and PCI-E SSD; adjust it to match your actual hardware.
# tuned configuration
#==============================================================#
# File : tuned.conf
# Mtime : 2020-09-18
# Desc : Tune operating system to olap mode
# Path : /etc/tuned/olap/tuned.conf
# Author : Vonng(fengruohang@outlook.com)
# Copyright (C) 2018-2021 Ruohang Feng
#==============================================================#
[main]
summary=Optimize for PostgreSQL OLAP System
include=network-throughput
[cpu]
force_latency=1
governor=performance
energy_perf_bias=performance
min_perf_pct=100
[vm]
# disable transparent hugepages
transparent_hugepages=never
[sysctl]
#-------------------------------------------------------------#
# KERNEL #
#-------------------------------------------------------------#
# disable numa balancing
kernel.numa_balancing=0
# total shmem size in bytes: $(expr $(getconf _PHYS_PAGES) / 2 \* $(getconf PAGE_SIZE))
{% if param_shmall is defined and param_shmall != '' %}
kernel.shmall = {{ param_shmall }}
{% endif %}
# total shmem size in pages: $(expr $(getconf _PHYS_PAGES) / 2)
{% if param_shmmax is defined and param_shmmax != '' %}
kernel.shmmax = {{ param_shmmax }}
{% endif %}
# total shmem segs 4096 -> 8192
kernel.shmmni=8192
# total msg queue number, set to mem size in MB
kernel.msgmni=32768
# max length of message queue
kernel.msgmnb=65536
# max size of message
kernel.msgmax=65536
kernel.pid_max=131072
# max(Sem in Set)=2048, max(Sem)=max(Sem in Set) x max(SemSet) , max(Sem per Ops)=2048, max(SemSet)=65536
kernel.sem=2048 134217728 2048 65536
# do not sched postgres process in group
kernel.sched_autogroup_enabled = 0
# total time the scheduler will consider a migrated process cache hot and, thus, less likely to be remigrated
# default = 0.5ms (500000ns), update to 5ms, depending on your typical query (e.g. < 1ms)
kernel.sched_migration_cost_ns=5000000
#-------------------------------------------------------------#
# VM #
#-------------------------------------------------------------#
# try not using swap
# vm.swappiness=10
# disable when most mem are for file cache
vm.zone_reclaim_mode=0
# overcommit threshold = 80%
vm.overcommit_memory=2
vm.overcommit_ratio=80
vm.dirty_background_ratio = 10 # throughput-performance default
vm.dirty_ratio=80 # throughput-performance default 40 -> 80
# deny access on 0x00000 - 0x10000
vm.mmap_min_addr=65536
#-------------------------------------------------------------#
# Filesystem #
#-------------------------------------------------------------#
# max open files: 382589 -> 167772160
fs.file-max=167772160
# max concurrent unfinished async io, should be larger than 1M. 65536->1M
fs.aio-max-nr=1048576
#-------------------------------------------------------------#
# Network #
#-------------------------------------------------------------#
# max connection in listen queue (triggers retrans if full)
net.core.somaxconn=65535
net.core.netdev_max_backlog=8192
# tcp receive/transmit buffer default = 256KiB
net.core.rmem_default=262144
net.core.wmem_default=262144
# receive/transmit buffer limit = 4MiB
net.core.rmem_max=4194304
net.core.wmem_max=4194304
# ip options
net.ipv4.ip_forward=1
net.ipv4.ip_nonlocal_bind=1
net.ipv4.ip_local_port_range=32768 65000
# tcp options
net.ipv4.tcp_timestamps=1
net.ipv4.tcp_tw_reuse=1
net.ipv4.tcp_tw_recycle=0
net.ipv4.tcp_syncookies=0
net.ipv4.tcp_synack_retries=1
net.ipv4.tcp_syn_retries=1
# tcp read/write buffer
net.ipv4.tcp_rmem="4096 87380 16777216"
net.ipv4.tcp_wmem="4096 16384 16777216"
net.ipv4.udp_mem="3145728 4194304 16777216"
# tcp probe fail interval: 75s -> 20s
net.ipv4.tcp_keepalive_intvl=20
# tcp break after 3 * 20s = 1m
net.ipv4.tcp_keepalive_probes=3
# probe period = 1 min
net.ipv4.tcp_keepalive_time=60
net.ipv4.tcp_fin_timeout=5
net.ipv4.tcp_max_tw_buckets=262144
net.ipv4.tcp_max_syn_backlog=8192
net.ipv4.neigh.default.gc_thresh1=80000
net.ipv4.neigh.default.gc_thresh2=90000
net.ipv4.neigh.default.gc_thresh3=100000
net.bridge.bridge-nf-call-iptables=1
net.bridge.bridge-nf-call-ip6tables=1
net.bridge.bridge-nf-call-arptables=1
# max connection tracking number
net.netfilter.nf_conntrack_max=1048576
9.4 - CRIT
The Tuned CRIT template is optimized primarily for RPO, minimizing the amount of dirty data held in memory.
It targets nodes such as a Dell R740 with 64 cores / 400GB memory and PCI-E SSD; adjust it to match your actual hardware.
# tuned configuration
#==============================================================#
# File : tuned.conf
# Mtime : 2020-06-29
# Desc : Tune operating system to crit mode
# Path : /etc/tuned/crit/tuned.conf
# Author : Vonng(fengruohang@outlook.com)
# Copyright (C) 2018-2021 Ruohang Feng
#==============================================================#
[main]
summary=Optimize for PostgreSQL CRIT System
include=network-latency
[cpu]
force_latency=1
governor=performance
energy_perf_bias=performance
min_perf_pct=100
[vm]
# disable transparent hugepages
transparent_hugepages=never
[sysctl]
#-------------------------------------------------------------#
# KERNEL #
#-------------------------------------------------------------#
# disable numa balancing
kernel.numa_balancing=0
# total shmem size in bytes: $(expr $(getconf _PHYS_PAGES) / 2 \* $(getconf PAGE_SIZE))
{% if param_shmall is defined and param_shmall != '' %}
kernel.shmall = {{ param_shmall }}
{% endif %}
# total shmem size in pages: $(expr $(getconf _PHYS_PAGES) / 2)
{% if param_shmmax is defined and param_shmmax != '' %}
kernel.shmmax = {{ param_shmmax }}
{% endif %}
# total shmem segs 4096 -> 8192
kernel.shmmni=8192
# total msg queue number, set to mem size in MB
kernel.msgmni=32768
# max length of message queue
kernel.msgmnb=65536
# max size of message
kernel.msgmax=65536
kernel.pid_max=131072
# max(Sem in Set)=2048, max(Sem)=max(Sem in Set) x max(SemSet) , max(Sem per Ops)=2048, max(SemSet)=65536
kernel.sem=2048 134217728 2048 65536
# do not sched postgres process in group
kernel.sched_autogroup_enabled = 0
# total time the scheduler will consider a migrated process cache hot and, thus, less likely to be remigrated
# default = 0.5ms (500000ns), update to 5ms, depending on your typical query (e.g. < 1ms)
kernel.sched_migration_cost_ns=5000000
#-------------------------------------------------------------#
# VM #
#-------------------------------------------------------------#
# try not using swap
vm.swappiness=0
# disable when most mem are for file cache
vm.zone_reclaim_mode=0
# overcommit threshold = 80%
vm.overcommit_memory=2
vm.overcommit_ratio=100
# 64MB mem (2xRAID cache) wake the bgwriter
vm.dirty_background_bytes=67108864
# vm.dirty_background_ratio=3 # latency-performance default
vm.dirty_ratio=6 # latency-performance default
# deny access on 0x00000 - 0x10000
vm.mmap_min_addr=65536
#-------------------------------------------------------------#
# Filesystem #
#-------------------------------------------------------------#
# max open files: 382589 -> 167772160
fs.file-max=167772160
# max concurrent unfinished async io, should be larger than 1M. 65536->1M
fs.aio-max-nr=1048576
#-------------------------------------------------------------#
# Network #
#-------------------------------------------------------------#
# max connection in listen queue (triggers retrans if full)
net.core.somaxconn=65535
net.core.netdev_max_backlog=8192
# tcp receive/transmit buffer default = 256KiB
net.core.rmem_default=262144
net.core.wmem_default=262144
# receive/transmit buffer limit = 4MiB
net.core.rmem_max=4194304
net.core.wmem_max=4194304
# ip options
net.ipv4.ip_forward=1
net.ipv4.ip_nonlocal_bind=1
net.ipv4.ip_local_port_range=32768 65000
# tcp options
net.ipv4.tcp_timestamps=1
net.ipv4.tcp_tw_reuse=1
net.ipv4.tcp_tw_recycle=0
net.ipv4.tcp_syncookies=0
net.ipv4.tcp_synack_retries=1
net.ipv4.tcp_syn_retries=1
# tcp read/write buffer
net.ipv4.tcp_rmem="4096 87380 16777216"
net.ipv4.tcp_wmem="4096 16384 16777216"
net.ipv4.udp_mem="3145728 4194304 16777216"
# tcp probe fail interval: 75s -> 20s
net.ipv4.tcp_keepalive_intvl=20
# tcp break after 3 * 20s = 1m
net.ipv4.tcp_keepalive_probes=3
# probe period = 1 min
net.ipv4.tcp_keepalive_time=60
net.ipv4.tcp_fin_timeout=5
net.ipv4.tcp_max_tw_buckets=262144
net.ipv4.tcp_max_syn_backlog=8192
net.ipv4.neigh.default.gc_thresh1=80000
net.ipv4.neigh.default.gc_thresh2=90000
net.ipv4.neigh.default.gc_thresh3=100000
net.bridge.bridge-nf-call-iptables=1
net.bridge.bridge-nf-call-ip6tables=1
net.bridge.bridge-nf-call-arptables=1
# max connection tracking number
net.netfilter.nf_conntrack_max=1048576
10 - Patroni Template
Pigsty uses Patroni to manage and initialize Postgres database clusters.
Pigsty relies on Patroni for the bulk of the provisioning work: even if the user chooses the Patroni-less mode, the database cluster is still brought up by Patroni, and the Patroni component is removed once creation completes.
Most PostgreSQL cluster customization can be done through the Patroni configuration file; for details of its format, see the official Patroni documentation.
Predefined templates
Pigsty ships with four predefined initialization templates. An initialization template is the definition file used to initialize a database cluster, located by default under roles/postgres/templates/. They are:
- oltp.yml: OLTP template, the default configuration, optimizing latency and performance for production hardware.
- olap.yml: OLAP template, increasing parallelism and optimizing for throughput and long-running queries.
- crit.yml: critical-business template, based on the OLTP template but optimized for RPO, security, and data integrity, enabling synchronous replication and data checksums.
- tiny.yml: micro database template, optimized for low-resource scenarios such as demo database clusters running in virtual machines.
The pg_conf parameter specifies the path of the template to use; when using a predefined template, only the template file name needs to be filled in.
When using a customized Patroni configuration template, you should usually also apply the matching node tuning template to the machine nodes, as shown in the sketch below.
For more detailed configuration information, see PG Provisioning.
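A minimal sketch of such a pairing in the inventory; the cluster name and IP address here are hypothetical:
pg-olap:                         # hypothetical OLAP cluster
  hosts:
    10.10.10.21: {pg_seq: 1, pg_role: primary}
  vars:
    pg_cluster: pg-olap
    pg_conf: olap.yml            # use the predefined OLAP patroni template
    node_tune: olap              # pair it with the matching node tuning template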
10.1 - OLTP
The Patroni OLTP template is optimized primarily for latency. It targets nodes such as a Dell R740 with 64 cores / 400GB memory and PCI-E SSD; adjust it to match your actual hardware.
#!/usr/bin/env patroni
#==============================================================#
# File : patroni.yml
# Ctime : 2020-04-08
# Mtime : 2020-12-22
# Desc : patroni cluster definition for {{ pg_cluster }} (oltp)
# Path : /pg/bin/patroni.yml
# Real Path : /pg/conf/{{ pg_instance }}.yml
# Link : /pg/bin/patroni.yml -> /pg/conf/{{ pg_instance}}.yml
# Note : Transactional Database Cluster Template
# Doc : https://patroni.readthedocs.io/en/latest/SETTINGS.html
# Copyright (C) 2018-2021 Ruohang Feng
#==============================================================#
# OLTP databases are optimized for performance and RT latency
# typical spec: 64 Core | 400 GB RAM | PCI-E SSD xTB
---
#------------------------------------------------------------------------------
# identity
#------------------------------------------------------------------------------
namespace: {{ pg_namespace }}/ # namespace
scope: {{ pg_cluster }} # cluster name
name: {{ pg_instance }} # instance name
#------------------------------------------------------------------------------
# log
#------------------------------------------------------------------------------
log:
level: INFO # NOTSET|DEBUG|INFO|WARNING|ERROR|CRITICAL
dir: /pg/log/ # default log file: /pg/log/patroni.log
file_size: 100000000 # 100MB log triggers a log rotate
# format: '%(asctime)s %(levelname)s: %(message)s'
#------------------------------------------------------------------------------
# dcs
#------------------------------------------------------------------------------
consul:
host: 127.0.0.1:8500
consistency: default # default|consistent|stale
register_service: true
service_check_interval: 15s
service_tags:
- {{ pg_cluster }}
#------------------------------------------------------------------------------
# api
#------------------------------------------------------------------------------
# how to expose patroni service
# listen on all ipv4, connect via public ip, use same credential as dbuser_monitor
restapi:
listen: 0.0.0.0:{{ patroni_port }}
connect_address: {{ inventory_hostname }}:{{ patroni_port }}
authentication:
verify_client: none # none|optional|required
username: {{ pg_monitor_username }}
password: '{{ pg_monitor_password }}'
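# note: the haproxy services defined earlier (primary/replica/default/offline) health-check
# this REST API, probing URLs such as /primary, /read-only and /replica referenced in their
# check_url settings, to decide where traffic should be routed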
#------------------------------------------------------------------------------
# ctl
#------------------------------------------------------------------------------
ctl:
optional:
insecure: true
# cacert: '/path/to/ca/cert'
# certfile: '/path/to/cert/file'
# keyfile: '/path/to/key/file'
#------------------------------------------------------------------------------
# tags
#------------------------------------------------------------------------------
tags:
nofailover: false
clonefrom: true
noloadbalance: false
nosync: false
{% if pg_upstream is defined %}
replicatefrom: {{ pg_upstream }} # clone from another replica rather than primary
{% endif %}
#------------------------------------------------------------------------------
# watchdog
#------------------------------------------------------------------------------
# available mode: off|automatic|required
watchdog:
mode: {{ patroni_watchdog_mode }}
device: /dev/watchdog
# safety_margin: 10s
#------------------------------------------------------------------------------
# bootstrap
#------------------------------------------------------------------------------
bootstrap:
#----------------------------------------------------------------------------
# bootstrap method
#----------------------------------------------------------------------------
method: initdb
# add custom bootstrap method here
# default bootstrap method: initdb
initdb:
- locale: C
- encoding: UTF8
# - data-checksums # enable data-checksum
#----------------------------------------------------------------------------
# bootstrap users
#---------------------------------------------------------------------------
# additional users which need to be created after initializing new cluster
# replication user and monitor user are required
users:
{{ pg_replication_username }}:
password: '{{ pg_replication_password }}'
{{ pg_monitor_username }}:
password: '{{ pg_monitor_password }}'
{{ pg_admin_username }}:
password: '{{ pg_admin_password }}'
# bootstrap hba, allow local and intranet password access & replication
# will be overwritten later
pg_hba:
- local all postgres ident
- local all all md5
- host all all 0.0.0.0/0 md5
- local replication postgres ident
- local replication all md5
- host replication all 0.0.0.0/0 md5
#----------------------------------------------------------------------------
# template
#---------------------------------------------------------------------------
# post_init: /pg/bin/pg-init
#----------------------------------------------------------------------------
# bootstrap config
#---------------------------------------------------------------------------
# this section will be written to /{{ pg_namespace }}/{{ pg_cluster }}/config
# it will NOT take any effect after cluster bootstrap
dcs:
{% if pg_role == 'primary' and pg_upstream is defined %}
#----------------------------------------------------------------------------
# standby cluster definition
#---------------------------------------------------------------------------
standby_cluster:
host: {{ pg_upstream }}
port: {{ pg_port }}
# primary_slot_name: patroni # must be created manually on the upstream server, if specified
create_replica_methods:
- basebackup
{% endif %}
#----------------------------------------------------------------------------
# important parameters
#---------------------------------------------------------------------------
# constraint: ttl >= loop_wait + retry_timeout * 2
# the number of seconds the loop will sleep. Default value: 10
# this is patroni check loop interval
loop_wait: 10
# the TTL to acquire the leader lock (in seconds). Think of it as the length of time before initiation of the automatic failover process. Default value: 30
# config this according to your network condition to avoid false-positive failover
ttl: 30
# timeout for DCS and PostgreSQL operation retries (in seconds). DCS or network issues shorter than this will not cause Patroni to demote the leader. Default value: 10
retry_timeout: 10
# the amount of time a master is allowed to recover from failures before failover is triggered (in seconds)
# Max RTO: 2 loop wait + master_start_timeout
master_start_timeout: 10
# important: a candidate will not be promoted if its replication lag is higher than this
# maximum RPO: 1MB
maximum_lag_on_failover: 1048576
# The number of seconds Patroni is allowed to wait when stopping Postgres and effective only when synchronous_mode is enabled
master_stop_timeout: 30
# turns on synchronous replication mode. In this mode a replica will be chosen as synchronous and only the latest leader and synchronous replica are able to participate in leader election
# set to true for RPO mode
synchronous_mode: false
# prevents disabling synchronous replication if no synchronous replicas are available, blocking all client writes to the master
synchronous_mode_strict: false
#----------------------------------------------------------------------------
# postgres parameters
#---------------------------------------------------------------------------
postgresql:
use_slots: true
use_pg_rewind: true
remove_data_directory_on_rewind_failure: true
parameters:
#----------------------------------------------------------------------
# IMPORTANT PARAMETERS
#----------------------------------------------------------------------
max_connections: 400 # 100 -> 400
superuser_reserved_connections: 10 # reserve 10 connection for su
max_locks_per_transaction: 128 # 64 -> 128
max_prepared_transactions: 0 # 0 disable 2PC
track_commit_timestamp: on # enabled xact timestamp
max_worker_processes: 8 # default 8, set to cpu core
wal_level: logical # logical
wal_log_hints: on # wal log hints to support rewind
max_wal_senders: 16 # 10 -> 16
max_replication_slots: 16 # 10 -> 16
wal_keep_size: 100GB # keep at least 100GB WAL
password_encryption: md5 # use traditional md5 auth
#----------------------------------------------------------------------
# RESOURCE USAGE (except WAL)
#----------------------------------------------------------------------
# memory: shared_buffers and maintenance_work_mem will be dynamically set
shared_buffers: {{ pg_shared_buffers }}
maintenance_work_mem: {{ pg_maintenance_work_mem }}
work_mem: 32MB # 4MB -> 32MB
huge_pages: try # try huge pages
temp_file_limit: 100GB # 0 -> 100GB
vacuum_cost_delay: 2ms # wait 2ms per 10000 cost
vacuum_cost_limit: 10000 # 10000 cost each round
bgwriter_delay: 10ms # check dirty page every 10ms
bgwriter_lru_maxpages: 800 # 100 -> 800
bgwriter_lru_multiplier: 5.0 # 2.0 -> 5.0 more cushion buffer
#----------------------------------------------------------------------
# WAL
#----------------------------------------------------------------------
wal_buffers: 16MB # max to 16MB
wal_writer_delay: 20ms # wait period
wal_writer_flush_after: 1MB # max allowed data loss
min_wal_size: 100GB # at least 100GB WAL
max_wal_size: 400GB # at most 400GB WAL
commit_delay: 20 # group commit delay: 20 microseconds (default 0)
commit_siblings: 10 # 5 -> 10
checkpoint_timeout: 60min # checkpoint 5min -> 1h
checkpoint_completion_target: 0.95 # 0.5 -> 0.95
archive_mode: on
archive_command: 'wal_dir=/pg/arcwal; [[ $(date +%H%M) == 1200 ]] && rm -rf ${wal_dir}/$(date -d"yesterday" +%Y%m%d); /bin/mkdir -p ${wal_dir}/$(date +%Y%m%d) && /usr/bin/lz4 -q -z %p > ${wal_dir}/$(date +%Y%m%d)/%f.lz4'
#----------------------------------------------------------------------
# REPLICATION
#----------------------------------------------------------------------
# synchronous_standby_names: ''
vacuum_defer_cleanup_age: 50000 # 0->50000 last 50000 xact changes will not be vacuumed
promote_trigger_file: promote.signal # default promote trigger file path
max_standby_archive_delay: 10min # max delay before canceling queries when reading WAL from archive;
max_standby_streaming_delay: 3min # max delay before canceling queries when reading streaming WAL;
wal_receiver_status_interval: 1s # send replies at least this often
hot_standby_feedback: on # send info from standby to prevent query conflicts
wal_receiver_timeout: 60s # terminate inactive replication connections after 60s
max_logical_replication_workers: 8 # 4 -> 8
max_sync_workers_per_subscription: 8 # 4 -> 8
#----------------------------------------------------------------------
# QUERY TUNING
#----------------------------------------------------------------------
# planner
# enable_partitionwise_join: on
random_page_cost: 1.1 # 4 for HDD, 1.1 for SSD
effective_cache_size: 320GB # max mem - shared buffer
default_statistics_target: 1000 # stat bucket 100 -> 1000
#----------------------------------------------------------------------
# REPORTING AND LOGGING
#----------------------------------------------------------------------
log_destination: csvlog # use standard csv log
logging_collector: on # enable csvlog
log_directory: log # default log dir: /pg/data/log
# log_filename: 'postgresql-%a.log' # weekly auto-recycle
log_filename: 'postgresql-%Y-%m-%d.log' # YYYY-MM-DD full log retention
log_checkpoints: on # log checkpoint info
log_lock_waits: on # log lock wait info
log_replication_commands: on # log replication info
log_statement: ddl # log ddl change
log_min_duration_statement: 100 # log slow query (>100ms)
#----------------------------------------------------------------------
# STATISTICS
#----------------------------------------------------------------------
track_io_timing: on # collect io statistics
track_functions: all # track all functions (none|pl|all)
track_activity_query_size: 8192 # max query length in pg_stat_activity
#----------------------------------------------------------------------
# AUTOVACUUM
#----------------------------------------------------------------------
log_autovacuum_min_duration: 1s # log autovacuum activity taking more than 1s
autovacuum_max_workers: 3 # default autovacuum worker 3
autovacuum_naptime: 1min # default autovacuum naptime 1min
autovacuum_vacuum_scale_factor: 0.08 # fraction of table size before vacuum 20% -> 8%
autovacuum_analyze_scale_factor: 0.04 # fraction of table size before analyze 10% -> 4%
autovacuum_vacuum_cost_delay: -1 # default vacuum cost delay: same as vacuum_cost_delay
autovacuum_vacuum_cost_limit: -1 # default vacuum cost limit: same as vacuum_cost_limit
autovacuum_freeze_max_age: 100000000 # age > 100 million triggers forced vacuum
#----------------------------------------------------------------------
# CLIENT
#----------------------------------------------------------------------
deadlock_timeout: 50ms # 50ms for deadlock
idle_in_transaction_session_timeout: 10min # 10min timeout for idle in transaction
#----------------------------------------------------------------------
# CUSTOMIZED OPTIONS
#----------------------------------------------------------------------
# extensions
shared_preload_libraries: '{{ pg_shared_libraries | default("pg_stat_statements, auto_explain") }}'
# auto_explain
auto_explain.log_min_duration: 1s # auto explain query slower than 1s
auto_explain.log_analyze: true # explain analyze
auto_explain.log_verbose: true # explain verbose
auto_explain.log_timing: true # explain timing
auto_explain.log_nested_statements: true
# pg_stat_statements
pg_stat_statements.max: 10000 # 5000 -> 10000 queries
pg_stat_statements.track: all # track all statements (all|top|none)
pg_stat_statements.track_utility: off # do not track query other than CRUD
pg_stat_statements.track_planning: off # do not track planning metrics
#------------------------------------------------------------------------------
# postgres
#------------------------------------------------------------------------------
postgresql:
#----------------------------------------------------------------------------
# how to connect to postgres
#----------------------------------------------------------------------------
bin_dir: {{ pg_bin_dir }}
data_dir: {{ pg_data }}
config_dir: {{ pg_data }}
pgpass: {{ pg_dbsu_home }}/.pgpass
listen: {{ pg_listen }}:{{ pg_port }}
connect_address: {{ inventory_hostname }}:{{ pg_port }}
use_unix_socket: true # default: /var/run/postgresql, /tmp
#----------------------------------------------------------------------------
# how to authenticate to postgres
#----------------------------------------------------------------------------
authentication:
superuser:
username: {{ pg_dbsu }}
replication:
username: {{ pg_replication_username }}
password: '{{ pg_replication_password }}'
rewind:
username: {{ pg_replication_username }}
password: '{{ pg_replication_password }}'
#----------------------------------------------------------------------------
# how to react to database operations
#----------------------------------------------------------------------------
# event callback script log: /pg/log/callback.log
callbacks:
on_start: /pg/bin/pg-failover-callback
on_stop: /pg/bin/pg-failover-callback
on_reload: /pg/bin/pg-failover-callback
on_restart: /pg/bin/pg-failover-callback
on_role_change: /pg/bin/pg-failover-callback
# rewind policy: data checksum should be enabled before using rewind
use_pg_rewind: true
remove_data_directory_on_rewind_failure: true
remove_data_directory_on_diverged_timelines: false
#----------------------------------------------------------------------------
# how to create replica
#----------------------------------------------------------------------------
# create replica method: default pg_basebackup
create_replica_methods:
- basebackup
basebackup:
- max-rate: '1000M'
- checkpoint: fast
- status-interval: 1s
- verbose
- progress
#----------------------------------------------------------------------------
# ad hoc parameters (override the defaults above)
#----------------------------------------------------------------------------
# parameters:
#----------------------------------------------------------------------------
# host based authentication, overwrite default pg_hba.conf
#----------------------------------------------------------------------------
# pg_hba:
# - local all postgres ident
# - local all all md5
# - host all all 0.0.0.0/0 md5
# - local replication postgres ident
# - local replication all md5
# - host replication all 0.0.0.0/0 md5
...
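Note that the bootstrap.dcs block above is written to /{{ pg_namespace }}/{{ pg_cluster }}/config only when the cluster is first initialized; editing the template afterwards does not change a running cluster. Below is a minimal sketch of adjusting those parameters post-bootstrap with patronictl (the cluster name pg-test is a placeholder, and the flags assume a reasonably recent patronictl release):
# show the live DCS configuration of cluster pg-test
patronictl -c /pg/bin/patroni.yml show-config pg-test
# change patroni-level keys and postgresql parameters without opening an editor
patronictl -c /pg/bin/patroni.yml edit-config pg-test \
    --set ttl=30 --set loop_wait=10 \
    --pg max_connections=400 --force
# parameters flagged as pending restart can then be applied per member
patronictl -c /pg/bin/patroni.yml restart pg-test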
10.2 - TINY
The Patroni TINY template is optimized for very low-spec virtual machines;
its typical target is a 1-core / 1 GB VM node. Adjust it to match your actual hardware.
#!/usr/bin/env patroni
#==============================================================#
# File : patroni.yml
# Ctime : 2020-04-08
# Mtime : 2020-12-22
# Desc : patroni cluster definition for {{ pg_cluster }} (tiny)
# Path : /pg/bin/patroni.yml
# Real Path : /pg/conf/{{ pg_instance }}.yml
# Link : /pg/bin/patroni.yml -> /pg/conf/{{ pg_instance}}.yml
# Note : Tiny Database Cluster Template
# Doc : https://patroni.readthedocs.io/en/latest/SETTINGS.html
# Copyright (C) 2018-2021 Ruohang Feng
#==============================================================#
# TINY databases are optimized for low-resource situations (e.g. 1 core / 1 GB)
# typical spec: 1 Core | 1-4 GB RAM | Normal SSD 10x GB
---
#------------------------------------------------------------------------------
# identity
#------------------------------------------------------------------------------
namespace: {{ pg_namespace }}/ # namespace
scope: {{ pg_cluster }} # cluster name
name: {{ pg_instance }} # instance name
#------------------------------------------------------------------------------
# log
#------------------------------------------------------------------------------
log:
level: INFO # NOTEST|DEBUG|INFO|WARNING|ERROR|CRITICAL
dir: /pg/log/ # default log file: /pg/log/patroni.log
file_size: 100000000 # 100MB log triggers a log rotate
# format: '%(asctime)s %(levelname)s: %(message)s'
#------------------------------------------------------------------------------
# dcs
#------------------------------------------------------------------------------
consul:
host: 127.0.0.1:8500
consistency: default # default|consistent|stale
register_service: true
service_check_interval: 15s
service_tags:
- {{ pg_cluster }}
#------------------------------------------------------------------------------
# api
#------------------------------------------------------------------------------
# how to expose patroni service
# listen on all ipv4, connect via public ip, use same credential as dbuser_monitor
restapi:
listen: 0.0.0.0:{{ patroni_port }}
connect_address: {{ inventory_hostname }}:{{ patroni_port }}
authentication:
verify_client: none # none|optional|required
username: {{ pg_monitor_username }}
password: '{{ pg_monitor_password }}'
#------------------------------------------------------------------------------
# ctl
#------------------------------------------------------------------------------
ctl:
optional:
insecure: true
# cacert: '/path/to/ca/cert'
# certfile: '/path/to/cert/file'
# keyfile: '/path/to/key/file'
#------------------------------------------------------------------------------
# tags
#------------------------------------------------------------------------------
tags:
nofailover: false
clonefrom: true
noloadbalance: false
nosync: false
{% if pg_upstream is defined %}
replicatefrom: {{ pg_upstream }} # clone from another replica rather than primary
{% endif %}
#------------------------------------------------------------------------------
# watchdog
#------------------------------------------------------------------------------
# available mode: off|automatic|required
watchdog:
mode: {{ patroni_watchdog_mode }}
device: /dev/watchdog
# safety_margin: 10s
#------------------------------------------------------------------------------
# bootstrap
#------------------------------------------------------------------------------
bootstrap:
#----------------------------------------------------------------------------
# bootstrap method
#----------------------------------------------------------------------------
method: initdb
# add custom bootstrap method here
# default bootstrap method: initdb
initdb:
- locale: C
- encoding: UTF8
- data-checksums # enable data-checksum
#----------------------------------------------------------------------------
# bootstrap users
#---------------------------------------------------------------------------
# additional users which need to be created after initializing new cluster
# replication user and monitor user are required
users:
{{ pg_replication_username }}:
password: '{{ pg_replication_password }}'
{{ pg_monitor_username }}:
password: '{{ pg_monitor_password }}'
# bootstrap hba, allow local and intranet password access & replication
# will be overwritten later
pg_hba:
- local all postgres ident
- local all all md5
- host all all 0.0.0.0/0 md5
- local replication postgres ident
- local replication all md5
- host replication all 0.0.0.0/0 md5
#----------------------------------------------------------------------------
# customization
#---------------------------------------------------------------------------
# post_init: /pg/bin/pg-init
#----------------------------------------------------------------------------
# bootstrap config
#---------------------------------------------------------------------------
# this section will be written to /{{ pg_namespace }}/{{ pg_cluster }}/config
# it will NOT take any effect after cluster bootstrap
dcs:
{% if pg_role == 'primary' and pg_upstream is defined %}
#----------------------------------------------------------------------------
# standby cluster definition
#---------------------------------------------------------------------------
standby_cluster:
host: {{ pg_upstream }}
port: {{ pg_port }}
# primary_slot_name: patroni # must be created manually on the upstream server, if specified
create_replica_methods:
- basebackup
{% endif %}
#----------------------------------------------------------------------------
# important parameters
#---------------------------------------------------------------------------
# constraint: ttl >= loop_wait + retry_timeout * 2
# the number of seconds the loop will sleep. Default value: 10
# this is patroni check loop interval
loop_wait: 10
# the TTL to acquire the leader lock (in seconds). Think of it as the length of time before initiation of the automatic failover process. Default value: 30
# config this according to your network condition to avoid false-positive failover
ttl: 30
# timeout for DCS and PostgreSQL operation retries (in seconds). DCS or network issues shorter than this will not cause Patroni to demote the leader. Default value: 10
retry_timeout: 10
# the amount of time a master is allowed to recover from failures before failover is triggered (in seconds)
# Max RTO: 2 loop wait + master_start_timeout
master_start_timeout: 10
# important: candidate will not be promoted if replication lag is higher than this
# maximum RPO: 1MB
maximum_lag_on_failover: 1048576
# The number of seconds Patroni is allowed to wait when stopping Postgres and effective only when synchronous_mode is enabled
master_stop_timeout: 30
# turns on synchronous replication mode. In this mode a replica will be chosen as synchronous and only the latest leader and synchronous replica are able to participate in leader election
# set to true for RPO mode
synchronous_mode: false
# prevents disabling synchronous replication if no synchronous replicas are available, blocking all client writes to the master
synchronous_mode_strict: false
#----------------------------------------------------------------------------
# postgres parameters
#---------------------------------------------------------------------------
postgresql:
use_slots: true
use_pg_rewind: true
remove_data_directory_on_rewind_failure: true
parameters:
#----------------------------------------------------------------------
# IMPORTANT PARAMETERS
#----------------------------------------------------------------------
max_connections: 50 # default 100 -> 50
superuser_reserved_connections: 10 # reserve 10 connection for su
max_locks_per_transaction: 64 # default 64
max_prepared_transactions: 0 # 0 disable 2PC
track_commit_timestamp: on # enabled xact timestamp
max_worker_processes: 1 # default 8 -> 1 (set to cpu core)
wal_level: logical # logical
wal_log_hints: on # wal log hints to support rewind
max_wal_senders: 10 # default 10
max_replication_slots: 10 # default 10
wal_keep_size: 1GB # keep at least 1GB WAL
password_encryption: md5 # use traditional md5 auth
#----------------------------------------------------------------------
# RESOURCE USAGE (except WAL)
#----------------------------------------------------------------------
# memory: shared_buffers and maintenance_work_mem will be dynamically set
shared_buffers: {{ pg_shared_buffers }}
maintenance_work_mem: {{ pg_maintenance_work_mem }}
work_mem: 4MB # default 4MB
huge_pages: try # try huge pages
temp_file_limit: 40GB # 0 -> 40GB (according to your disk)
vacuum_cost_delay: 5ms # wait 5ms per 10000 cost
vacuum_cost_limit: 10000 # 10000 cost each round
bgwriter_delay: 10ms # check dirty page every 10ms
bgwriter_lru_maxpages: 800 # 100 -> 800
bgwriter_lru_multiplier: 5.0 # 2.0 -> 5.0 more cushion buffer
#----------------------------------------------------------------------
# WAL
#----------------------------------------------------------------------
wal_buffers: 16MB # max to 16MB
wal_writer_delay: 20ms # wait period
wal_writer_flush_after: 1MB # max allowed data loss
min_wal_size: 100GB # at least 100GB WAL
max_wal_size: 400GB # at most 400GB WAL
commit_delay: 20 # group commit delay: 20 microseconds (default 0)
commit_siblings: 10 # 5 -> 10
checkpoint_timeout: 15min # checkpoint 5min -> 15min
checkpoint_completion_target: 0.80 # 0.5 -> 0.8
archive_mode: on
archive_command: 'wal_dir=/pg/arcwal; [[ $(date +%H%M) == 1200 ]] && rm -rf ${wal_dir}/$(date -d"yesterday" +%Y%m%d); /bin/mkdir -p ${wal_dir}/$(date +%Y%m%d) && /usr/bin/lz4 -q -z %p > ${wal_dir}/$(date +%Y%m%d)/%f.lz4'
#----------------------------------------------------------------------
# REPLICATION
#----------------------------------------------------------------------
# synchronous_standby_names: ''
vacuum_defer_cleanup_age: 50000 # 0->50000 last 50000 xact changes will not be vacuumed
promote_trigger_file: promote.signal # default promote trigger file path
max_standby_archive_delay: 10min # max delay before canceling queries when reading WAL from archive;
max_standby_streaming_delay: 3min # max delay before canceling queries when reading streaming WAL;
wal_receiver_status_interval: 1s # send replies at least this often
hot_standby_feedback: on # send info from standby to prevent query conflicts
wal_receiver_timeout: 60s # terminate inactive replication connections after 60s
max_logical_replication_workers: 8 # default 4, set according to your cpu cores
max_sync_workers_per_subscription: 8 # default 2, set according to your cpu cores
#----------------------------------------------------------------------
# QUERY TUNING
#----------------------------------------------------------------------
# planner
# enable_partitionwise_join: on
random_page_cost: 1.1 # 4 for HDD, 1.1 for SSD
effective_cache_size: 2GB # max mem - shared buffer
default_statistics_target: 200 # stat bucket 100 -> 200
#----------------------------------------------------------------------
# REPORTING AND LOGGING
#----------------------------------------------------------------------
log_destination: csvlog # use standard csv log
logging_collector: on # enable csvlog
log_directory: log # default log dir: /pg/data/log
# log_filename: 'postgresql-%a.log' # weekly auto-recycle
log_filename: 'postgresql-%Y-%m-%d.log' # YYYY-MM-DD full log retention
log_checkpoints: on # log checkpoint info
log_lock_waits: on # log lock wait info
log_replication_commands: on # log replication info
log_statement: ddl # log ddl change
log_min_duration_statement: 100 # log slow query (>100ms)
#----------------------------------------------------------------------
# STATISTICS
#----------------------------------------------------------------------
track_io_timing: on # collect io statistics
track_functions: all # track all functions (none|pl|all)
track_activity_query_size: 8192 # max query length in pg_stat_activity
#----------------------------------------------------------------------
# AUTOVACUUM
#----------------------------------------------------------------------
log_autovacuum_min_duration: 1s # log autovacuum activity taking more than 1s
autovacuum_max_workers: 1 # default autovacuum worker 3 -> 1
autovacuum_naptime: 1min # default autovacuum naptime 1min
autovacuum_vacuum_scale_factor: 0.08 # fraction of table size before vacuum 20% -> 8%
autovacuum_analyze_scale_factor: 0.04 # fraction of table size before analyze 10% -> 4%
autovacuum_vacuum_cost_delay: -1 # default vacuum cost delay: same as vacuum_cost_delay
autovacuum_vacuum_cost_limit: -1 # default vacuum cost limit: same as vacuum_cost_limit
autovacuum_freeze_max_age: 100000000 # age > 100 million triggers forced vacuum
#----------------------------------------------------------------------
# CLIENT
#----------------------------------------------------------------------
deadlock_timeout: 50ms # 50ms for deadlock
idle_in_transaction_session_timeout: 10min # 10min timeout for idle in transaction
#----------------------------------------------------------------------
# CUSTOMIZED OPTIONS
#----------------------------------------------------------------------
# extensions
shared_preload_libraries: '{{ pg_shared_libraries | default("pg_stat_statements, auto_explain") }}'
# auto_explain
auto_explain.log_min_duration: 1s # auto explain query slower than 1s
auto_explain.log_analyze: true # explain analyze
auto_explain.log_verbose: true # explain verbose
auto_explain.log_timing: true # explain timing
auto_explain.log_nested_statements: true
# pg_stat_statements
pg_stat_statements.max: 3000 # 5000 -> 3000 queries
pg_stat_statements.track: all # track all statements (all|top|none)
pg_stat_statements.track_utility: off # do not track query other than CRUD
pg_stat_statements.track_planning: off # do not track planning metrics
#------------------------------------------------------------------------------
# postgres
#------------------------------------------------------------------------------
postgresql:
#----------------------------------------------------------------------------
# how to connect to postgres
#----------------------------------------------------------------------------
bin_dir: {{ pg_bin_dir }}
data_dir: {{ pg_data }}
config_dir: {{ pg_data }}
pgpass: {{ pg_dbsu_home }}/.pgpass
listen: {{ pg_listen }}:{{ pg_port }}
connect_address: {{ inventory_hostname }}:{{ pg_port }}
use_unix_socket: true # default: /var/run/postgresql, /tmp
#----------------------------------------------------------------------------
# how to authenticate to postgres
#----------------------------------------------------------------------------
authentication:
superuser:
username: {{ pg_dbsu }}
replication:
username: {{ pg_replication_username }}
password: '{{ pg_replication_password }}'
rewind:
username: {{ pg_replication_username }}
password: '{{ pg_replication_password }}'
#----------------------------------------------------------------------------
# how to react to database operations
#----------------------------------------------------------------------------
# event callback script log: /pg/log/callback.log
callbacks:
on_start: /pg/bin/pg-failover-callback
on_stop: /pg/bin/pg-failover-callback
on_reload: /pg/bin/pg-failover-callback
on_restart: /pg/bin/pg-failover-callback
on_role_change: /pg/bin/pg-failover-callback
# rewind policy: data checksum should be enabled before using rewind
use_pg_rewind: true
remove_data_directory_on_rewind_failure: true
remove_data_directory_on_diverged_timelines: false
#----------------------------------------------------------------------------
# how to create replica
#----------------------------------------------------------------------------
# create replica method: default pg_basebackup
create_replica_methods:
- basebackup
basebackup:
- max-rate: '1000M'
- checkpoint: fast
- status-interval: 1s
- verbose
- progress
#----------------------------------------------------------------------------
# ad hoc parameters (override the defaults above)
#----------------------------------------------------------------------------
# parameters:
#----------------------------------------------------------------------------
# host based authentication, overwrite default pg_hba.conf
#----------------------------------------------------------------------------
# pg_hba:
# - local all postgres ident
# - local all all md5
# - host all all 0.0.0.0/0 md5
# - local replication postgres ident
# - local replication all md5
# - host replication all 0.0.0.0/0 md5
...
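The archive_command shared by these templates lz4-compresses every WAL segment into a per-day directory under /pg/arcwal and, when a segment happens to be archived at exactly 12:00, removes the previous day's directory. The templates do not include the matching restore side; the following is a rough sketch of what recovery from that layout could look like (the date directory, segment name and restore_command line are illustrative assumptions, not part of the template):
# decompress one archived segment back into pg_wal by hand
/usr/bin/lz4 -d -q /pg/arcwal/20210316/000000010000000000000001.lz4 \
    /pg/data/pg_wal/000000010000000000000001
# or let postgres fetch segments itself during recovery, searching all date directories
# restore_command = 'f=$(ls /pg/arcwal/*/%f.lz4 2>/dev/null | head -n1) && lz4 -d -q "$f" "%p"'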
10.3 - OLAP
The Patroni OLAP template is optimized for throughput and computational parallelism.
Its target hardware is a Dell R740 node with 64 cores, 400 GB RAM and PCI-E SSD storage. Adjust it to match your actual hardware.
#!/usr/bin/env patroni
#==============================================================#
# File : patroni.yml
# Ctime : 2020-04-08
# Mtime : 2020-12-22
# Desc : patroni cluster definition for {{ pg_cluster }} (olap)
# Path : /pg/bin/patroni.yml
# Real Path : /pg/conf/{{ pg_instance }}.yml
# Link : /pg/bin/patroni.yml -> /pg/conf/{{ pg_instance}}.yml
# Note : Analysis Database Cluster Template
# Doc : https://patroni.readthedocs.io/en/latest/SETTINGS.html
# Copyright (C) 2018-2021 Ruohang Feng
#==============================================================#
# OLAP databases are optimized for analytical throughput and parallelism
# typical spec: 64 Core | 400 GB RAM | PCI-E SSD xTB
---
#------------------------------------------------------------------------------
# identity
#------------------------------------------------------------------------------
namespace: {{ pg_namespace }}/ # namespace
scope: {{ pg_cluster }} # cluster name
name: {{ pg_instance }} # instance name
#------------------------------------------------------------------------------
# log
#------------------------------------------------------------------------------
log:
level: INFO # NOTEST|DEBUG|INFO|WARNING|ERROR|CRITICAL
dir: /pg/log/ # default log file: /pg/log/patroni.log
file_size: 100000000 # 100MB log triggers a log rotate
# format: '%(asctime)s %(levelname)s: %(message)s'
#------------------------------------------------------------------------------
# dcs
#------------------------------------------------------------------------------
consul:
host: 127.0.0.1:8500
consistency: default # default|consistent|stale
register_service: true
service_check_interval: 15s
service_tags:
- {{ pg_cluster }}
#------------------------------------------------------------------------------
# api
#------------------------------------------------------------------------------
# how to expose patroni service
# listen on all ipv4, connect via public ip, use same credential as dbuser_monitor
restapi:
listen: 0.0.0.0:{{ patroni_port }}
connect_address: {{ inventory_hostname }}:{{ patroni_port }}
authentication:
verify_client: none # none|optional|required
username: {{ pg_monitor_username }}
password: '{{ pg_monitor_password }}'
#------------------------------------------------------------------------------
# ctl
#------------------------------------------------------------------------------
ctl:
optional:
insecure: true
# cacert: '/path/to/ca/cert'
# certfile: '/path/to/cert/file'
# keyfile: '/path/to/key/file'
#------------------------------------------------------------------------------
# tags
#------------------------------------------------------------------------------
tags:
nofailover: false
clonefrom: true
noloadbalance: false
nosync: false
{% if pg_upstream is defined %}
replicatefrom: {{ pg_upstream }} # clone from another replica rather than primary
{% endif %}
#------------------------------------------------------------------------------
# watchdog
#------------------------------------------------------------------------------
# available mode: off|automatic|required
watchdog:
mode: {{ patroni_watchdog_mode }}
device: /dev/watchdog
# safety_margin: 10s
#------------------------------------------------------------------------------
# bootstrap
#------------------------------------------------------------------------------
bootstrap:
#----------------------------------------------------------------------------
# bootstrap method
#----------------------------------------------------------------------------
method: initdb
# add custom bootstrap method here
# default bootstrap method: initdb
initdb:
- locale: C
- encoding: UTF8
# - data-checksums # enable data-checksum
#----------------------------------------------------------------------------
# bootstrap users
#---------------------------------------------------------------------------
# additional users which need to be created after initializing new cluster
# replication user and monitor user are required
users:
{{ pg_replication_username }}:
password: '{{ pg_replication_password }}'
{{ pg_monitor_username }}:
password: '{{ pg_monitor_password }}'
{{ pg_admin_username }}:
password: '{{ pg_admin_password }}'
# bootstrap hba, allow local and intranet password access & replication
# will be overwritten later
pg_hba:
- local all postgres ident
- local all all md5
- host all all 0.0.0.0/0 md5
- local replication postgres ident
- local replication all md5
- host replication all 0.0.0.0/0 md5
#----------------------------------------------------------------------------
# template
#---------------------------------------------------------------------------
# post_init: /pg/bin/pg-init
#----------------------------------------------------------------------------
# bootstrap config
#---------------------------------------------------------------------------
# this section will be written to /{{ pg_namespace }}/{{ pg_cluster }}/config
# it will NOT take any effect after cluster bootstrap
dcs:
{% if pg_role == 'primary' and pg_upstream is defined %}
#----------------------------------------------------------------------------
# standby cluster definition
#---------------------------------------------------------------------------
standby_cluster:
host: {{ pg_upstream }}
port: {{ pg_port }}
# primary_slot_name: patroni # must be created manually on the upstream server, if specified
create_replica_methods:
- basebackup
{% endif %}
#----------------------------------------------------------------------------
# important parameters
#---------------------------------------------------------------------------
# constraint: ttl >= loop_wait + retry_timeout * 2
# the number of seconds the loop will sleep. Default value: 10
# this is patroni check loop interval
loop_wait: 10
# the TTL to acquire the leader lock (in seconds). Think of it as the length of time before initiation of the automatic failover process. Default value: 30
# config this according to your network condition to avoid false-positive failover
ttl: 30
# timeout for DCS and PostgreSQL operation retries (in seconds). DCS or network issues shorter than this will not cause Patroni to demote the leader. Default value: 10
retry_timeout: 10
# the amount of time a master is allowed to recover from failures before failover is triggered (in seconds)
# Max RTO: 2 loop wait + master_start_timeout
master_start_timeout: 10
# important: candidate will not be promoted if replication lag is higher than this
# maximum RPO: 16MB (analysis tolerate more data loss)
maximum_lag_on_failover: 16777216
# The number of seconds Patroni is allowed to wait when stopping Postgres and effective only when synchronous_mode is enabled
master_stop_timeout: 30
# turns on synchronous replication mode. In this mode a replica will be chosen as synchronous and only the latest leader and synchronous replica are able to participate in leader election
# set to true for RPO mode
synchronous_mode: false
# prevents disabling synchronous replication if no synchronous replicas are available, blocking all client writes to the master
synchronous_mode_strict: false
#----------------------------------------------------------------------------
# postgres parameters
#---------------------------------------------------------------------------
postgresql:
use_slots: true
use_pg_rewind: true
remove_data_directory_on_rewind_failure: true
parameters:
#----------------------------------------------------------------------
# IMPORTANT PARAMETERS
#----------------------------------------------------------------------
max_connections: 400 # 100 -> 400
superuser_reserved_connections: 10 # reserve 10 connection for su
max_locks_per_transaction: 256 # 64 -> 256 (analysis)
max_prepared_transactions: 0 # 0 disable 2PC
track_commit_timestamp: on # enabled xact timestamp
max_worker_processes: 64 # default 8 -> 64, SET THIS ACCORDING TO YOUR CPU CORES
wal_level: logical # logical
wal_log_hints: on # wal log hints to support rewind
max_wal_senders: 16 # 10 -> 16
max_replication_slots: 16 # 10 -> 16
wal_keep_size: 100GB # keep at least 100GB WAL
password_encryption: md5 # use traditional md5 auth
#----------------------------------------------------------------------
# RESOURCE USAGE (except WAL)
#----------------------------------------------------------------------
# memory: shared_buffers and maintenance_work_mem will be dynamically set
shared_buffers: {{ pg_shared_buffers }}
maintenance_work_mem: {{ pg_maintenance_work_mem }}
work_mem: 128MB # 4MB -> 128MB (analysis)
huge_pages: try # try huge pages
temp_file_limit: 500GB # 0 -> 500GB (analysis)
vacuum_cost_delay: 2ms # wait 2ms per 10000 cost
vacuum_cost_limit: 10000 # 10000 cost each round
bgwriter_delay: 10ms # check dirty page every 10ms
bgwriter_lru_maxpages: 1600 # 100 -> 1600 (analysis)
bgwriter_lru_multiplier: 5.0 # 2.0 -> 5.0 more cushion buffer
max_parallel_workers: 64 # SET THIS ACCORDING TO YOUR CPU CORES
max_parallel_workers_per_gather: 64 # SET THIS ACCORDING TO YOUR CPU CORES
max_parallel_maintenance_workers: 4 # 2 -> 4
#----------------------------------------------------------------------
# WAL
#----------------------------------------------------------------------
wal_buffers: 16MB # max to 16MB
wal_writer_delay: 20ms # wait period
wal_writer_flush_after: 16MB # max allowed data loss (analysis)
min_wal_size: 100GB # at least 100GB WAL
max_wal_size: 400GB # at most 400GB WAL
commit_delay: 20 # group commit delay: 20 microseconds (default 0)
commit_siblings: 10 # 5 -> 10
checkpoint_timeout: 60min # checkpoint 5min -> 1h
checkpoint_completion_target: 0.95 # 0.5 -> 0.95
archive_mode: on
archive_command: 'wal_dir=/pg/arcwal; [[ $(date +%H%M) == 1200 ]] && rm -rf ${wal_dir}/$(date -d"yesterday" +%Y%m%d); /bin/mkdir -p ${wal_dir}/$(date +%Y%m%d) && /usr/bin/lz4 -q -z %p > ${wal_dir}/$(date +%Y%m%d)/%f.lz4'
#----------------------------------------------------------------------
# REPLICATION
#----------------------------------------------------------------------
# synchronous_standby_names: ''
vacuum_defer_cleanup_age: 0 # 0 (default)
promote_trigger_file: promote.signal # default promote trigger file path
max_standby_archive_delay: 10min # max delay before canceling queries when reading WAL from archive;
max_standby_streaming_delay: 3min # max delay before canceling queries when reading streaming WAL;
wal_receiver_status_interval: 1s # send replies at least this often
hot_standby_feedback: on # send info from standby to prevent query conflicts
wal_receiver_timeout: 60s # terminate inactive replication connections after 60s
max_logical_replication_workers: 8 # 4 -> 8
max_sync_workers_per_subscription: 8 # 4 -> 8
#----------------------------------------------------------------------
# QUERY TUNING
#----------------------------------------------------------------------
# planner
enable_partitionwise_join: on # enable on analysis
random_page_cost: 1.1 # 4 for HDD, 1.1 for SSD
effective_cache_size: 320GB # max mem - shared buffer
default_statistics_target: 1000 # stat bucket 100 -> 1000
jit: on # default on
jit_above_cost: 100000 # default jit threshold
#----------------------------------------------------------------------
# REPORTING AND LOGGING
#----------------------------------------------------------------------
log_destination: csvlog # use standard csv log
logging_collector: on # enable csvlog
log_directory: log # default log dir: /pg/data/log
# log_filename: 'postgresql-%a.log' # weekly auto-recycle
log_filename: 'postgresql-%Y-%m-%d.log' # YYYY-MM-DD full log retention
log_checkpoints: on # log checkpoint info
log_lock_waits: on # log lock wait info
log_replication_commands: on # log replication info
log_statement: ddl # log ddl change
log_min_duration_statement: 1000 # log slow query (>1s)
#----------------------------------------------------------------------
# STATISTICS
#----------------------------------------------------------------------
track_io_timing: on # collect io statistics
track_functions: all # track all functions (none|pl|all)
track_activity_query_size: 8192 # max query length in pg_stat_activity
#----------------------------------------------------------------------
# AUTOVACUUM
#----------------------------------------------------------------------
log_autovacuum_min_duration: 1s # log autovacuum activity taking more than 1s
autovacuum_max_workers: 3 # default autovacuum worker 3
autovacuum_naptime: 1min # default autovacuum naptime 1min
autovacuum_vacuum_scale_factor: 0.08 # fraction of table size before vacuum 20% -> 8%
autovacuum_analyze_scale_factor: 0.04 # fraction of table size before analyze 10% -> 4%
autovacuum_vacuum_cost_delay: -1 # default vacuum cost delay: same as vacuum_cost_delay
autovacuum_vacuum_cost_limit: -1 # default vacuum cost limit: same as vacuum_cost_limit
autovacuum_freeze_max_age: 100000000 # age > 100 million triggers forced vacuum
#----------------------------------------------------------------------
# CLIENT
#----------------------------------------------------------------------
deadlock_timeout: 50ms # 50ms for deadlock
idle_in_transaction_session_timeout: 0 # Disable idle in xact timeout in analysis database
#----------------------------------------------------------------------
# CUSTOMIZED OPTIONS
#----------------------------------------------------------------------
# extensions
shared_preload_libraries: '{{ pg_shared_libraries | default("pg_stat_statements, auto_explain") }}'
# auto_explain
auto_explain.log_min_duration: 1s # auto explain query slower than 1s
auto_explain.log_analyze: true # explain analyze
auto_explain.log_verbose: true # explain verbose
auto_explain.log_timing: true # explain timing
auto_explain.log_nested_statements: true
# pg_stat_statements
pg_stat_statements.max: 10000 # 5000 -> 10000 queries
pg_stat_statements.track: all # track all statements (all|top|none)
pg_stat_statements.track_utility: off # do not track query other than CRUD
pg_stat_statements.track_planning: off # do not track planning metrics
#------------------------------------------------------------------------------
# postgres
#------------------------------------------------------------------------------
postgresql:
#----------------------------------------------------------------------------
# how to connect to postgres
#----------------------------------------------------------------------------
bin_dir: {{ pg_bin_dir }}
data_dir: {{ pg_data }}
config_dir: {{ pg_data }}
pgpass: {{ pg_dbsu_home }}/.pgpass
listen: {{ pg_listen }}:{{ pg_port }}
connect_address: {{ inventory_hostname }}:{{ pg_port }}
use_unix_socket: true # default: /var/run/postgresql, /tmp
#----------------------------------------------------------------------------
# how to authenticate to postgres
#----------------------------------------------------------------------------
authentication:
superuser:
username: {{ pg_dbsu }}
replication:
username: {{ pg_replication_username }}
password: '{{ pg_replication_password }}'
rewind:
username: {{ pg_replication_username }}
password: '{{ pg_replication_password }}'
#----------------------------------------------------------------------------
# how to react to database operations
#----------------------------------------------------------------------------
# event callback script log: /pg/log/callback.log
callbacks:
on_start: /pg/bin/pg-failover-callback
on_stop: /pg/bin/pg-failover-callback
on_reload: /pg/bin/pg-failover-callback
on_restart: /pg/bin/pg-failover-callback
on_role_change: /pg/bin/pg-failover-callback
# rewind policy: data checksum should be enabled before using rewind
use_pg_rewind: true
remove_data_directory_on_rewind_failure: true
remove_data_directory_on_diverged_timelines: false
#----------------------------------------------------------------------------
# how to create replica
#----------------------------------------------------------------------------
# create replica method: default pg_basebackup
create_replica_methods:
- basebackup
basebackup:
- max-rate: '1000M'
- checkpoint: fast
- status-interval: 1s
- verbose
- progress
#----------------------------------------------------------------------------
# ad hoc parameters (override the defaults above)
#----------------------------------------------------------------------------
# parameters:
#----------------------------------------------------------------------------
# host based authentication, overwrite default pg_hba.conf
#----------------------------------------------------------------------------
# pg_hba:
# - local all postgres ident
# - local all all md5
# - host all all 0.0.0.0/0 md5
# - local replication postgres ident
# - local replication all md5
# - host replication all 0.0.0.0/0 md5
...
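The OLAP template pins max_worker_processes, max_parallel_workers and max_parallel_workers_per_gather to the node's 64 cores ("SET THIS ACCORDING TO YOUR CPU CORES"). A quick way to inspect the live values and try a lower per-session parallelism cap is sketched below with psql (connection parameters and the value 16 are placeholders):
psql -h 127.0.0.1 -p 5432 -U postgres -d postgres <<'SQL'
SHOW max_worker_processes;
SHOW max_parallel_workers;
SHOW max_parallel_workers_per_gather;
-- cap parallel workers per gather node for the current session only
SET max_parallel_workers_per_gather = 16;
SQL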
10.4 - CRIT
The Patroni CRIT template is optimized for RPO: it uses synchronous replication to ensure no data is lost on failure.
Its target hardware is a Dell R740 node with 64 cores, 400 GB RAM and PCI-E SSD storage. Adjust it to match your actual hardware.
#!/usr/bin/env patroni
#==============================================================#
# File : patroni.yml
# Ctime : 2020-04-08
# Mtime : 2020-12-22
# Desc : patroni cluster definition for {{ pg_cluster }} (crit)
# Path : /pg/bin/patroni.yml
# Real Path : /pg/conf/{{ pg_instance }}.yml
# Link : /pg/bin/patroni.yml -> /pg/conf/{{ pg_instance}}.yml
# Note : Critical Database Cluster Template
# Doc : https://patroni.readthedocs.io/en/latest/SETTINGS.html
# Copyright (C) 2018-2021 Ruohang Feng
#==============================================================#
# CRIT databases are optimized for security, integrity, and RPO
# typical spec: 64 Core | 400 GB RAM | PCI-E SSD xTB
---
#------------------------------------------------------------------------------
# identity
#------------------------------------------------------------------------------
namespace: {{ pg_namespace }}/ # namespace
scope: {{ pg_cluster }} # cluster name
name: {{ pg_instance }} # instance name
#------------------------------------------------------------------------------
# log
#------------------------------------------------------------------------------
log:
level: INFO # NOTEST|DEBUG|INFO|WARNING|ERROR|CRITICAL
dir: /pg/log/ # default log file: /pg/log/patroni.log
file_size: 100000000 # 100MB log triggers a log rotate
# format: '%(asctime)s %(levelname)s: %(message)s'
#------------------------------------------------------------------------------
# dcs
#------------------------------------------------------------------------------
consul:
host: 127.0.0.1:8500
consistency: default # default|consistent|stale
register_service: true
service_check_interval: 15s
service_tags:
- {{ pg_cluster }}
#------------------------------------------------------------------------------
# api
#------------------------------------------------------------------------------
# how to expose patroni service
# listen on all ipv4, connect via public ip, use same credential as dbuser_monitor
restapi:
listen: 0.0.0.0:{{ patroni_port }}
connect_address: {{ inventory_hostname }}:{{ patroni_port }}
authentication:
verify_client: none # none|optional|required
username: {{ pg_monitor_username }}
password: '{{ pg_monitor_password }}'
#------------------------------------------------------------------------------
# ctl
#------------------------------------------------------------------------------
ctl:
optional:
insecure: true
# cacert: '/path/to/ca/cert'
# certfile: '/path/to/cert/file'
# keyfile: '/path/to/key/file'
#------------------------------------------------------------------------------
# tags
#------------------------------------------------------------------------------
tags:
nofailover: false
clonefrom: true
noloadbalance: false
nosync: false
{% if pg_upstream is defined %}
replicatefrom: {{ pg_upstream }} # clone from another replica rather than primary
{% endif %}
#------------------------------------------------------------------------------
# watchdog
#------------------------------------------------------------------------------
# available mode: off|automatic|required
watchdog:
mode: {{ patroni_watchdog_mode }}
device: /dev/watchdog
# safety_margin: 10s
#------------------------------------------------------------------------------
# bootstrap
#------------------------------------------------------------------------------
bootstrap:
#----------------------------------------------------------------------------
# bootstrap method
#----------------------------------------------------------------------------
method: initdb
# add custom bootstrap method here
# default bootstrap method: initdb
initdb:
- locale: C
- encoding: UTF8
# - data-checksums # enable data-checksum
#----------------------------------------------------------------------------
# bootstrap users
#---------------------------------------------------------------------------
# additional users which need to be created after initializing new cluster
# replication user and monitor user are required
users:
{{ pg_replication_username }}:
password: '{{ pg_replication_password }}'
{{ pg_monitor_username }}:
password: '{{ pg_monitor_password }}'
{{ pg_admin_username }}:
password: '{{ pg_admin_password }}'
# bootstrap hba, allow local and intranet password access & replication
# will be overwritten later
pg_hba:
- local all postgres ident
- local all all md5
- host all all 0.0.0.0/0 md5
- local replication postgres ident
- local replication all md5
- host replication all 0.0.0.0/0 md5
#----------------------------------------------------------------------------
# template
#---------------------------------------------------------------------------
# post_init: /pg/bin/pg-init
#----------------------------------------------------------------------------
# bootstrap config
#---------------------------------------------------------------------------
# this section will be written to /{{ pg_namespace }}/{{ pg_cluster }}/config
# it will NOT take any effect after cluster bootstrap
dcs:
{% if pg_role == 'primary' and pg_upstream is defined %}
#----------------------------------------------------------------------------
# standby cluster definition
#---------------------------------------------------------------------------
standby_cluster:
host: {{ pg_upstream }}
port: {{ pg_port }}
# primary_slot_name: patroni # must be created manually on the upstream server, if specified
create_replica_methods:
- basebackup
{% endif %}
#----------------------------------------------------------------------------
# important parameters
#---------------------------------------------------------------------------
# constraint: ttl >= loop_wait + retry_timeout * 2
# the number of seconds the loop will sleep. Default value: 10
# this is patroni check loop interval
loop_wait: 10
# the TTL to acquire the leader lock (in seconds). Think of it as the length of time before initiation of the automatic failover process. Default value: 30
# config this according to your network condition to avoid false-positive failover
ttl: 30
# timeout for DCS and PostgreSQL operation retries (in seconds). DCS or network issues shorter than this will not cause Patroni to demote the leader. Default value: 10
retry_timeout: 10
# the amount of time a master is allowed to recover from failures before failover is triggered (in seconds)
# Max RTO: 2 loop wait + master_start_timeout
master_start_timeout: 120 # more patient on critical database
# important: candidate will not be promoted if replication lag is higher than this
# maximum RPO: 0 for critical database
maximum_lag_on_failover: 1
# The number of seconds Patroni is allowed to wait when stopping Postgres and effective only when synchronous_mode is enabled
master_stop_timeout: 10 # stop postgres more aggressively on critical database
# turns on synchronous replication mode. In this mode a replica will be chosen as synchronous and only the latest leader and synchronous replica are able to participate in leader election
# set to true for RPO mode
synchronous_mode: true # use sync replication on critical database
# prevents disabling synchronous replication if no synchronous replicas are available, blocking all client writes to the master
synchronous_mode_strict: false
#----------------------------------------------------------------------------
# postgres parameters
#---------------------------------------------------------------------------
postgresql:
use_slots: true
use_pg_rewind: true
remove_data_directory_on_rewind_failure: true
parameters:
#----------------------------------------------------------------------
# IMPORTANT PARAMETERS
#----------------------------------------------------------------------
max_connections: 400 # 100 -> 400
superuser_reserved_connections: 10 # reserve 10 connection for su
max_locks_per_transaction: 128 # 64 -> 128
max_prepared_transactions: 0 # 0 disable 2PC
track_commit_timestamp: on # enabled xact timestamp
max_worker_processes: 8 # default 8, set to cpu core
wal_level: logical # logical
wal_log_hints: on # wal log hints to support rewind
max_wal_senders: 16 # 10 -> 16
max_replication_slots: 16 # 10 -> 16
wal_keep_size: 100GB # keep at least 100GB WAL
password_encryption: md5 # use traditional md5 auth
#----------------------------------------------------------------------
# RESOURCE USAGE (except WAL)
#----------------------------------------------------------------------
# memory: shared_buffers and maintenance_work_mem will be dynamically set
shared_buffers: {{ pg_shared_buffers }}
maintenance_work_mem: {{ pg_maintenance_work_mem }}
work_mem: 32MB # 4MB -> 32MB
huge_pages: try # try huge pages
temp_file_limit: 100GB # 0 -> 100GB
vacuum_cost_delay: 2ms # wait 2ms per 10000 cost
vacuum_cost_limit: 10000 # 10000 cost each round
bgwriter_delay: 10ms # check dirty page every 10ms
bgwriter_lru_maxpages: 800 # 100 -> 800
bgwriter_lru_multiplier: 5.0 # 2.0 -> 5.0 more cushion buffer
#----------------------------------------------------------------------
# WAL
#----------------------------------------------------------------------
wal_buffers: 16MB # max to 16MB
wal_writer_delay: 20ms # wait period
wal_writer_flush_after: 1MB # max allowed data loss
min_wal_size: 100GB # at least 100GB WAL
max_wal_size: 400GB # at most 400GB WAL
commit_delay: 20 # group commit delay: 20 microseconds (default 0)
commit_siblings: 10 # 5 -> 10
checkpoint_timeout: 60min # checkpoint 5min -> 1h
checkpoint_completion_target: 0.95 # 0.5 -> 0.95
archive_mode: on
archive_command: 'wal_dir=/pg/arcwal; [[ $(date +%H%M) == 1200 ]] && rm -rf ${wal_dir}/$(date -d"yesterday" +%Y%m%d); /bin/mkdir -p ${wal_dir}/$(date +%Y%m%d) && /usr/bin/lz4 -q -z %p > ${wal_dir}/$(date +%Y%m%d)/%f.lz4'
#----------------------------------------------------------------------
# REPLICATION
#----------------------------------------------------------------------
# synchronous_standby_names: ''
vacuum_defer_cleanup_age: 50000 # 0->50000 last 50000 xact changes will not be vacuumed
promote_trigger_file: promote.signal # default promote trigger file path
max_standby_archive_delay: 10min # max delay before canceling queries when reading WAL from archive;
max_standby_streaming_delay: 3min # max delay before canceling queries when reading streaming WAL;
wal_receiver_status_interval: 1s # send replies at least this often
hot_standby_feedback: on # send info from standby to prevent query conflicts
wal_receiver_timeout: 60s # terminate inactive replication connections after 60s
max_logical_replication_workers: 8 # 4 -> 8
max_sync_workers_per_subscription: 8 # 4 -> 8
#----------------------------------------------------------------------
# QUERY TUNING
#----------------------------------------------------------------------
# planner
# enable_partitionwise_join: on
random_page_cost: 1.1 # 4 for HDD, 1.1 for SSD
effective_cache_size: 320GB # max mem - shared buffer
default_statistics_target: 1000 # stat bucket 100 -> 1000
#----------------------------------------------------------------------
# REPORTING AND LOGGING
#----------------------------------------------------------------------
log_destination: csvlog # use standard csv log
logging_collector: on # enable csvlog
log_directory: log # default log dir: /pg/data/log
# log_filename: 'postgresql-%a.log' # weekly auto-recycle
log_filename: 'postgresql-%Y-%m-%d.log' # YYYY-MM-DD full log retention
log_checkpoints: on # log checkpoint info
log_lock_waits: on # log lock wait info
log_replication_commands: on # log replication info
log_statement: ddl # log ddl change
log_min_duration_statement: 100 # log slow query (>100ms)
#----------------------------------------------------------------------
# STATISTICS
#----------------------------------------------------------------------
track_io_timing: on # collect io statistics
track_functions: all # track all functions (none|pl|all)
track_activity_query_size: 32768 # show full query on critical database
#----------------------------------------------------------------------
# AUTOVACUUM
#----------------------------------------------------------------------
log_autovacuum_min_duration: 1s # log autovacuum activity taking more than 1s
autovacuum_max_workers: 3 # default autovacuum worker 3
autovacuum_naptime: 1min # default autovacuum naptime 1min
autovacuum_vacuum_scale_factor: 0.08 # fraction of table size before vacuum 20% -> 8%
autovacuum_analyze_scale_factor: 0.04 # fraction of table size before analyze 10% -> 4%
autovacuum_vacuum_cost_delay: -1 # default vacuum cost delay: same as vacuum_cost_delay
autovacuum_vacuum_cost_limit: -1 # default vacuum cost limit: same as vacuum_cost_limit
autovacuum_freeze_max_age: 100000000 # age > 100 million triggers forced vacuum
#----------------------------------------------------------------------
# CLIENT
#----------------------------------------------------------------------
deadlock_timeout: 50ms # 50ms for deadlock
idle_in_transaction_session_timeout: 1min # 1min timeout for idle in transaction in critical database
#----------------------------------------------------------------------
# CUSTOMIZED OPTIONS
#----------------------------------------------------------------------
# extensions
shared_preload_libraries: '{{ pg_shared_libraries | default("pg_stat_statements, auto_explain") }}'
# auto_explain
auto_explain.log_min_duration: 1s # auto explain query slower than 1s
auto_explain.log_analyze: true # explain analyze
auto_explain.log_verbose: true # explain verbose
auto_explain.log_timing: true # explain timing
auto_explain.log_nested_statements: true
# pg_stat_statements
pg_stat_statements.max: 10000 # 5000 -> 10000 queries
pg_stat_statements.track: all # track all statements (all|top|none)
pg_stat_statements.track_utility: on # TRACK all queries on critical database
pg_stat_statements.track_planning: off # do not track planning metrics
#------------------------------------------------------------------------------
# postgres
#------------------------------------------------------------------------------
postgresql:
#----------------------------------------------------------------------------
# how to connect to postgres
#----------------------------------------------------------------------------
bin_dir: {{ pg_bin_dir }}
data_dir: {{ pg_data }}
config_dir: {{ pg_data }}
pgpass: {{ pg_dbsu_home }}/.pgpass
listen: {{ pg_listen }}:{{ pg_port }}
connect_address: {{ inventory_hostname }}:{{ pg_port }}
use_unix_socket: true # default: /var/run/postgresql, /tmp
#----------------------------------------------------------------------------
# how to authenticate to postgres
#----------------------------------------------------------------------------
authentication:
superuser:
username: {{ pg_dbsu }}
replication:
username: {{ pg_replication_username }}
password: '{{ pg_replication_password }}'
rewind:
username: {{ pg_replication_username }}
password: '{{ pg_replication_password }}'
#----------------------------------------------------------------------------
# how to react to database operations
#----------------------------------------------------------------------------
# event callback script log: /pg/log/callback.log
callbacks:
on_start: /pg/bin/pg-failover-callback
on_stop: /pg/bin/pg-failover-callback
on_reload: /pg/bin/pg-failover-callback
on_restart: /pg/bin/pg-failover-callback
on_role_change: /pg/bin/pg-failover-callback
# rewind policy: data checksum should be enabled before using rewind
use_pg_rewind: true
remove_data_directory_on_rewind_failure: true
remove_data_directory_on_diverged_timelines: false
#----------------------------------------------------------------------------
# how to create replica
#----------------------------------------------------------------------------
# create replica method: default pg_basebackup
create_replica_methods:
- basebackup
basebackup:
- max-rate: '1000M'
- checkpoint: fast
- status-interval: 1s
- verbose
- progress
#----------------------------------------------------------------------------
# ad hoc parameters (override the defaults above)
#----------------------------------------------------------------------------
# parameters:
#----------------------------------------------------------------------------
# host based authentication, overwrite default pg_hba.conf
#----------------------------------------------------------------------------
# pg_hba:
# - local all postgres ident
# - local all all md5
# - host all all 0.0.0.0/0 md5
# - local replication postgres ident
# - local replication all md5
# - host replication all 0.0.0.0/0 md5
...
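With synchronous_mode: true, Patroni picks one replica as the synchronous standby and manages synchronous_standby_names itself, which is why that parameter is left commented out in the template. A minimal sketch for verifying this on a running CRIT cluster (cluster name and connection parameters are placeholders):
# member list; recent Patroni versions show the chosen member with the Sync Standby role
patronictl -c /pg/bin/patroni.yml list pg-test
# on the primary, exactly one walsender should report sync_state = 'sync'
psql -h 127.0.0.1 -p 5432 -U postgres -d postgres \
    -c "SELECT application_name, sync_state FROM pg_stat_replication;"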