Kernel Optimize

Pigsty parameter tuning for the OS kernel

Pigsty uses tuned to adjust operating system configuration. tuned is the system tuning tool that ships with CentOS 7.

Pigsty Tuned Configuration

By default, Pigsty installs four tuned profiles on the operating system:

  • OLTP: for regular business databases, optimized for latency
  • OLAP: for analytical databases, optimized for throughput
  • CRIT: for mission-critical databases, optimized for RPO
  • TINY: for micro instances and virtual machines
tuned-adm profile oltp    # enable OLTP mode
tuned-adm profile olap    # enable OLAP mode
tuned-adm profile crit    # enable CRIT mode
tuned-adm profile tiny    # enable TINY mode
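
Each profile is rendered to /etc/tuned/<profile>/tuned.conf (see the Path header in the files listed below). A quick way to confirm what is installed and which profile is currently in effect, assuming a default tuned installation:

ls /etc/tuned/                   # the oltp, olap, crit and tiny profile directories should be present
cat /etc/tuned/active_profile    # plain-text record of the active profile
tuned-adm active                 # same information, via the tuned-adm CLI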

Basic Tuned Operations

# To start tuned, run the following command as root:
systemctl start tuned

# To activate tuned every time the machine boots, enter the following command:
systemctl enable tuned

# For other tuned controls, such as profile selection, use:
tuned-adm

# To list the installed profiles that are available (requires the tuned service to be running):
tuned-adm list

# To show the currently active profile, run:
tuned-adm active

# To select or activate a profile, run:
tuned-adm profile <profile_name>
# For example:
tuned-adm profile powersave

# To let tuned recommend the profile best suited to your system, without changing any
# existing profile and without using the logic applied during installation, run:
tuned-adm recommend

# To disable all tuning:
tuned-adm off
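
When managing many nodes it can be convenient to wrap the profile switch in a small helper that maps a node role to one of the four Pigsty profiles. The script below is a hypothetical sketch (its name and ROLE argument are not part of Pigsty); it only calls the tuned-adm commands shown above and must run as root:

#!/bin/bash
# set-tuned-profile.sh {oltp|olap|crit|tiny} -- hypothetical helper
ROLE=${1:-oltp}                       # default to the oltp profile
case "$ROLE" in
  oltp|olap|crit|tiny)
    tuned-adm profile "$ROLE" ;;      # apply the matching Pigsty profile
  *)
    echo "usage: $0 {oltp|olap|crit|tiny}" >&2; exit 1 ;;
esac
tuned-adm active                      # confirm the switch took effect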

OLTP Configuration

# tuned configuration
#==============================================================#
# File      :   tuned.conf
# Mtime     :   2020-06-29
# Desc      :   Tune operating system to oltp mode
# Path      :   /etc/tuned/oltp/tuned.conf
# Author    :   Vonng(fengruohang@outlook.com)
# Copyright (C) 2019-2020 Ruohang Feng
#==============================================================#

[main]
summary=Optimize for PostgreSQL OLTP System
include=network-latency

[cpu]
force_latency=1
governor=performance
energy_perf_bias=performance
min_perf_pct=100

[vm]
# disable transparent hugepages
transparent_hugepages=never

[sysctl]
#-------------------------------------------------------------#
#                           KERNEL                            #
#-------------------------------------------------------------#
# disable numa balancing
kernel.numa_balancing=0

# total shmem size in pages: $(expr $(getconf _PHYS_PAGES) / 2)
{% if param_shmall is defined and param_shmall != '' %}
kernel.shmall = {{ param_shmall }}
{% endif %}

# max shmem segment size in bytes: $(expr $(getconf _PHYS_PAGES) / 2 \* $(getconf PAGE_SIZE))
{% if param_shmmax is defined and param_shmmax != '' %}
kernel.shmmax = {{ param_shmmax }}
{% endif %}

# total shmem segs 4096 -> 8192
kernel.shmmni=8192

# total msg queue number, set to mem size in MB
kernel.msgmni=32768

# max length of message queue
kernel.msgmnb=65536

# max size of message
kernel.msgmax=65536

kernel.pid_max=131072

# max(Sem in Set)=2048, max(Sem)=max(Sem in Set) x max(SemSet) , max(Sem per Ops)=2048, max(SemSet)=65536
kernel.sem=2048 134217728 2048 65536

# do not sched postgres process in group
kernel.sched_autogroup_enabled = 0

# total time the scheduler will consider a migrated process cache hot and, thus, less likely to be remigrated
# default = 0.5ms (500000ns), raised to 5ms, depending on your typical query duration (e.g. < 1ms)
kernel.sched_migration_cost_ns=5000000

#-------------------------------------------------------------#
#                             VM                              #
#-------------------------------------------------------------#
# try not using swap
vm.swappiness=0

# disable when most mem are for file cache
vm.zone_reclaim_mode=0

# overcommit threshold = 80%
vm.overcommit_memory=2
vm.overcommit_ratio=80

# vm.dirty_background_bytes=67108864 # 64MB mem (2xRAID cache) wake the bgwriter
vm.dirty_background_ratio=3       # latency-performance default
vm.dirty_ratio=10                 # latency-performance default

# deny access on 0x00000 - 0x10000
vm.mmap_min_addr=65536

#-------------------------------------------------------------#
#                        Filesystem                           #
#-------------------------------------------------------------#
# max open files: 382589 -> 167772160
fs.file-max=167772160

# max concurrent unfinished async I/O requests; should be at least 1M (65536 -> 1048576)
fs.aio-max-nr=1048576


#-------------------------------------------------------------#
#                          Network                            #
#-------------------------------------------------------------#
# max connection in listen queue (triggers retrans if full)
net.core.somaxconn=65535
net.core.netdev_max_backlog=8192
# tcp receive/transmit buffer default = 256KiB
net.core.rmem_default=262144
net.core.wmem_default=262144
# receive/transmit buffer limit = 4MiB
net.core.rmem_max=4194304
net.core.wmem_max=4194304

# ip options
net.ipv4.ip_forward=1
net.ipv4.ip_nonlocal_bind=1
net.ipv4.ip_local_port_range=32768 65000

# tcp options
net.ipv4.tcp_timestamps=1
net.ipv4.tcp_tw_reuse=1
net.ipv4.tcp_tw_recycle=0
net.ipv4.tcp_syncookies=0
net.ipv4.tcp_synack_retries=1
net.ipv4.tcp_syn_retries=1

# tcp read/write buffer
net.ipv4.tcp_rmem="4096 87380 16777216"
net.ipv4.tcp_wmem="4096 16384 16777216"
net.ipv4.udp_mem="3145728 4194304 16777216"

# tcp probe fail interval: 75s -> 20s
net.ipv4.tcp_keepalive_intvl=20
# tcp break after 3 * 20s = 1m
net.ipv4.tcp_keepalive_probes=3
# probe period = 1 min
net.ipv4.tcp_keepalive_time=60

net.ipv4.tcp_fin_timeout=5
net.ipv4.tcp_max_tw_buckets=262144
net.ipv4.tcp_max_syn_backlog=8192
net.ipv4.neigh.default.gc_thresh1=80000
net.ipv4.neigh.default.gc_thresh2=90000
net.ipv4.neigh.default.gc_thresh3=100000

net.bridge.bridge-nf-call-iptables=1
net.bridge.bridge-nf-call-ip6tables=1
net.bridge.bridge-nf-call-arptables=1

# max connection tracking number
net.netfilter.nf_conntrack_max=1048576
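
After activating this profile you can spot-check a few of the values above to confirm they took effect. This is a minimal sketch; the expected numbers are simply the ones set in the file:

tuned-adm profile oltp
sysctl -n kernel.sched_migration_cost_ns           # expect 5000000
sysctl -n vm.swappiness                            # expect 0
sysctl -n net.ipv4.tcp_keepalive_time              # expect 60
cat /sys/kernel/mm/transparent_hugepage/enabled    # expect [never] to be selected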

OLAP Configuration

# tuned configuration
#==============================================================#
# File      :   tuned.conf
# Mtime     :   2020-09-18
# Desc      :   Tune operating system to olap mode
# Path      :   /etc/tuned/olap/tuned.conf
# Author    :   Vonng(fengruohang@outlook.com)
# Copyright (C) 2019-2020 Ruohang Feng
#==============================================================#

[main]
summary=Optimize for PostgreSQL OLAP System
include=network-throughput

[cpu]
force_latency=1
governor=performance
energy_perf_bias=performance
min_perf_pct=100

[vm]
# disable transparent hugepages
transparent_hugepages=never

[sysctl]
#-------------------------------------------------------------#
#                           KERNEL                            #
#-------------------------------------------------------------#
# disable numa balancing
kernel.numa_balancing=0

# total shmem size in pages: $(expr $(getconf _PHYS_PAGES) / 2)
{% if param_shmall is defined and param_shmall != '' %}
kernel.shmall = {{ param_shmall }}
{% endif %}

# max shmem segment size in bytes: $(expr $(getconf _PHYS_PAGES) / 2 \* $(getconf PAGE_SIZE))
{% if param_shmmax is defined and param_shmmax != '' %}
kernel.shmmax = {{ param_shmmax }}
{% endif %}

# total shmem segs 4096 -> 8192
kernel.shmmni=8192

# total msg queue number, set to mem size in MB
kernel.msgmni=32768

# max length of message queue
kernel.msgmnb=65536

# max size of message
kernel.msgmax=65536

kernel.pid_max=131072

# max(Sem in Set)=2048, max(Sem)=max(Sem in Set) x max(SemSet) , max(Sem per Ops)=2048, max(SemSet)=65536
kernel.sem=2048 134217728 2048 65536

# do not sched postgres process in group
kernel.sched_autogroup_enabled = 0

# total time the scheduler will consider a migrated process cache hot and, thus, less likely to be remigrated
# default = 0.5ms (500000ns), raised to 5ms, depending on your typical query duration (e.g. < 1ms)
kernel.sched_migration_cost_ns=5000000

#-------------------------------------------------------------#
#                             VM                              #
#-------------------------------------------------------------#
# try not using swap
# vm.swappiness=10

# disable when most mem are for file cache
vm.zone_reclaim_mode=0

# overcommit threshold = 80%
vm.overcommit_memory=2
vm.overcommit_ratio=80

vm.dirty_background_ratio = 10    # throughput-performance default
vm.dirty_ratio=80                 # throughput-performance default 40 -> 80

# deny access on 0x00000 - 0x10000
vm.mmap_min_addr=65536

#-------------------------------------------------------------#
#                        Filesystem                           #
#-------------------------------------------------------------#
# max open files: 382589 -> 167772160
fs.file-max=167772160

# max concurrent unfinished async I/O requests; should be at least 1M (65536 -> 1048576)
fs.aio-max-nr=1048576


#-------------------------------------------------------------#
#                          Network                            #
#-------------------------------------------------------------#
# max connection in listen queue (triggers retrans if full)
net.core.somaxconn=65535
net.core.netdev_max_backlog=8192
# tcp receive/transmit buffer default = 256KiB
net.core.rmem_default=262144
net.core.wmem_default=262144
# receive/transmit buffer limit = 4MiB
net.core.rmem_max=4194304
net.core.wmem_max=4194304

# ip options
net.ipv4.ip_forward=1
net.ipv4.ip_nonlocal_bind=1
net.ipv4.ip_local_port_range=32768 65000

# tcp options
net.ipv4.tcp_timestamps=1
net.ipv4.tcp_tw_reuse=1
net.ipv4.tcp_tw_recycle=0
net.ipv4.tcp_syncookies=0
net.ipv4.tcp_synack_retries=1
net.ipv4.tcp_syn_retries=1

# tcp read/write buffer
net.ipv4.tcp_rmem="4096 87380 16777216"
net.ipv4.tcp_wmem="4096 16384 16777216"
net.ipv4.udp_mem="3145728 4194304 16777216"

# tcp probe fail interval: 75s -> 20s
net.ipv4.tcp_keepalive_intvl=20
# tcp break after 3 * 20s = 1m
net.ipv4.tcp_keepalive_probes=3
# probe period = 1 min
net.ipv4.tcp_keepalive_time=60

net.ipv4.tcp_fin_timeout=5
net.ipv4.tcp_max_tw_buckets=262144
net.ipv4.tcp_max_syn_backlog=8192
net.ipv4.neigh.default.gc_thresh1=80000
net.ipv4.neigh.default.gc_thresh2=90000
net.ipv4.neigh.default.gc_thresh3=100000

net.bridge.bridge-nf-call-iptables=1
net.bridge.bridge-nf-call-ip6tables=1
net.bridge.bridge-nf-call-arptables=1

# max connection tracking number
net.netfilter.nf_conntrack_max=1048576
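
With vm.overcommit_memory=2 the kernel enforces a hard commit limit of roughly SwapTotal + MemTotal * overcommit_ratio / 100 (hugetlb pages excluded), so overcommit_ratio=80 caps allocations at about 80% of RAM plus swap. A quick way to inspect the resulting limit and current usage after applying the profile (values in kB):

grep -E 'MemTotal|SwapTotal|CommitLimit|Committed_AS' /proc/meminfo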

CRIT Configuration

# tuned configuration
#==============================================================#
# File      :   tuned.conf
# Mtime     :   2020-06-29
# Desc      :   Tune operating system to crit mode
# Path      :   /etc/tuned/crit/tuned.conf
# Author    :   Vonng(fengruohang@outlook.com)
# Copyright (C) 2019-2020 Ruohang Feng
#==============================================================#

[main]
summary=Optimize for PostgreSQL CRIT System
include=network-latency

[cpu]
force_latency=1
governor=performance
energy_perf_bias=performance
min_perf_pct=100

[vm]
# disable transparent hugepages
transparent_hugepages=never

[sysctl]
#-------------------------------------------------------------#
#                           KERNEL                            #
#-------------------------------------------------------------#
# disable numa balancing
kernel.numa_balancing=0

# total shmem size in pages: $(expr $(getconf _PHYS_PAGES) / 2)
{% if param_shmall is defined and param_shmall != '' %}
kernel.shmall = {{ param_shmall }}
{% endif %}

# max shmem segment size in bytes: $(expr $(getconf _PHYS_PAGES) / 2 \* $(getconf PAGE_SIZE))
{% if param_shmmax is defined and param_shmmax != '' %}
kernel.shmmax = {{ param_shmmax }}
{% endif %}

# total shmem segs 4096 -> 8192
kernel.shmmni=8192

# total msg queue number, set to mem size in MB
kernel.msgmni=32768

# max length of message queue
kernel.msgmnb=65536

# max size of message
kernel.msgmax=65536

kernel.pid_max=131072

# max(Sem in Set)=2048, max(Sem)=max(Sem in Set) x max(SemSet) , max(Sem per Ops)=2048, max(SemSet)=65536
kernel.sem=2048 134217728 2048 65536

# do not sched postgres process in group
kernel.sched_autogroup_enabled = 0

# total time the scheduler will consider a migrated process cache hot and, thus, less likely to be remigrated
# default = 0.5ms (500000ns), raised to 5ms, depending on your typical query duration (e.g. < 1ms)
kernel.sched_migration_cost_ns=5000000

#-------------------------------------------------------------#
#                             VM                              #
#-------------------------------------------------------------#
# try not using swap
vm.swappiness=0

# disable when most mem are for file cache
vm.zone_reclaim_mode=0

# overcommit threshold = 100%
vm.overcommit_memory=2
vm.overcommit_ratio=100

# 64MB mem (2xRAID cache) wake the bgwriter
vm.dirty_background_bytes=67108864
# vm.dirty_background_ratio=3       # latency-performance default
vm.dirty_ratio=6                    # lower than the latency-performance default (10)

# deny access on 0x00000 - 0x10000
vm.mmap_min_addr=65536

#-------------------------------------------------------------#
#                        Filesystem                           #
#-------------------------------------------------------------#
# max open files: 382589 -> 167772160
fs.file-max=167772160

# max concurrent unfinished async I/O requests; should be at least 1M (65536 -> 1048576)
fs.aio-max-nr=1048576


#-------------------------------------------------------------#
#                          Network                            #
#-------------------------------------------------------------#
# max connection in listen queue (triggers retrans if full)
net.core.somaxconn=65535
net.core.netdev_max_backlog=8192
# tcp receive/transmit buffer default = 256KiB
net.core.rmem_default=262144
net.core.wmem_default=262144
# receive/transmit buffer limit = 4MiB
net.core.rmem_max=4194304
net.core.wmem_max=4194304

# ip options
net.ipv4.ip_forward=1
net.ipv4.ip_nonlocal_bind=1
net.ipv4.ip_local_port_range=32768 65000

# tcp options
net.ipv4.tcp_timestamps=1
net.ipv4.tcp_tw_reuse=1
net.ipv4.tcp_tw_recycle=0
net.ipv4.tcp_syncookies=0
net.ipv4.tcp_synack_retries=1
net.ipv4.tcp_syn_retries=1

# tcp read/write buffer
net.ipv4.tcp_rmem="4096 87380 16777216"
net.ipv4.tcp_wmem="4096 16384 16777216"
net.ipv4.udp_mem="3145728 4194304 16777216"

# tcp probe fail interval: 75s -> 20s
net.ipv4.tcp_keepalive_intvl=20
# tcp break after 3 * 20s = 1m
net.ipv4.tcp_keepalive_probes=3
# probe period = 1 min
net.ipv4.tcp_keepalive_time=60

net.ipv4.tcp_fin_timeout=5
net.ipv4.tcp_max_tw_buckets=262144
net.ipv4.tcp_max_syn_backlog=8192
net.ipv4.neigh.default.gc_thresh1=80000
net.ipv4.neigh.default.gc_thresh2=90000
net.ipv4.neigh.default.gc_thresh3=100000

net.bridge.bridge-nf-call-iptables=1
net.bridge.bridge-nf-call-ip6tables=1
net.bridge.bridge-nf-call-arptables=1

# max connection tracking number
net.netfilter.nf_conntrack_max=1048576
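
Note that vm.dirty_background_bytes and vm.dirty_background_ratio are mutually exclusive: setting one resets the other to 0. After activating this profile the ratio should therefore read 0 while the byte threshold is 64MB; a quick check:

tuned-adm profile crit
sysctl vm.dirty_background_bytes vm.dirty_background_ratio vm.dirty_ratio
# expected output, per the settings above:
# vm.dirty_background_bytes = 67108864
# vm.dirty_background_ratio = 0
# vm.dirty_ratio = 6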

TINY Configuration

# tuned configuration
#==============================================================#
# File      :   tuned.conf
# Mtime     :   2020-06-29
# Desc      :   Tune operating system to tiny mode
# Path      :   /etc/tuned/tiny/tuned.conf
# Author    :   Vonng(fengruohang@outlook.com)
# Copyright (C) 2019-2020 Ruohang Feng
#==============================================================#

[main]
summary=Optimize for PostgreSQL TINY System
# include=virtual-guest

[vm]
# disable transparent hugepages
transparent_hugepages=never

[sysctl]
#-------------------------------------------------------------#
#                           KERNEL                            #
#-------------------------------------------------------------#
# disable numa balancing
kernel.numa_balancing=0

# If a workload mostly uses anonymous memory and it hits this limit, the entire
# working set is buffered for I/O, and any more write buffering would require
# swapping, so it's time to throttle writes until I/O can catch up.  Workloads
# that mostly use file mappings may be able to use even higher values.
#
# The generator of dirty data starts writeback at this percentage (system default
# is 20%)
vm.dirty_ratio = 40

# Filesystem I/O is usually much more efficient than swapping, so try to keep
# swapping low.  It's usually safe to go even lower than this on systems with
# server-grade storage.
vm.swappiness = 30

#-------------------------------------------------------------#
#                          Network                            #
#-------------------------------------------------------------#
# tcp options
net.ipv4.tcp_timestamps=1
net.ipv4.tcp_tw_reuse=1
net.ipv4.tcp_tw_recycle=0
net.ipv4.tcp_syncookies=0
net.ipv4.tcp_synack_retries=1
net.ipv4.tcp_syn_retries=1

# tcp probe fail interval: 75s -> 20s
net.ipv4.tcp_keepalive_intvl=20
# tcp break after 3 * 20s = 1m
net.ipv4.tcp_keepalive_probes=3
# probe period = 1 min
net.ipv4.tcp_keepalive_time=60
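
The tiny profile only adjusts a handful of parameters, which makes it easy to verify on a small VM. A minimal sketch:

tuned-adm profile tiny
sysctl -n vm.swappiness vm.dirty_ratio kernel.numa_balancing    # expect 30, 40 and 0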

Database Kernel Tuning Reference

# Database kernel optimisation
fs.aio-max-nr = 1048576 # limit on concurrent unfinished async I/O requests; should be at least 1M
fs.file-max = 16777216  # allow up to 16M open files

# kernel
kernel.shmall = 485058      # total shared memory in pages: $(expr $(getconf _PHYS_PAGES) / 2)
kernel.shmmax = 1986797568  # max shared memory segment size in bytes: $(expr $(getconf _PHYS_PAGES) / 2 \* $(getconf PAGE_SIZE))
kernel.shmmni = 16384       # max number of shared memory segments system-wide: 4096 -> 16384
kernel.msgmni = 32768       # number of message queue identifiers system-wide (affects how many agents can run); set to memory size in MB
kernel.msgmnb = 65536       # max total bytes in a single message queue
kernel.msgmax = 65536       # max size of a single message
kernel.numa_balancing = 0   # disable NUMA balancing
kernel.sched_migration_cost_ns = 5000000 # within 5ms the scheduler still considers a migrated process cache-hot
kernel.sem = 2048 134217728 2048 65536   # 2048 semaphores per set, 134217728 semaphores system-wide, 2048 operations per semop call, 65536 semaphore sets

# vm
vm.dirty_ratio = 80                       # hard limit: block writes and flush to disk when dirty pages exceed 80%
vm.dirty_background_bytes = 268435456     # wake the background flusher threads at 256MB of dirty data
vm.dirty_expire_centisecs = 6000          # dirty data older than 1 minute is eligible for writeback
vm.dirty_writeback_centisecs= 500         # flusher threads run every 5 seconds
vm.mmap_min_addr = 65536                  # deny access to memory below 0x10000
vm.zone_reclaim_mode = 0                  # disable NUMA zone reclaim

# vm swap
vm.swappiness = 0                         # avoid swap as much as possible (swapping may still occur under memory pressure)
vm.overcommit_memory = 2                  # strict overcommit accounting: the kernel enforces a commit limit
vm.overcommit_ratio = 50                  # allowed overcommit ratio: $((($mem - $swap) * 100 / $mem))

# tcp memory
net.ipv4.tcp_rmem = 8192 65536 16777216     # tcp read buffer (min/default/max): 8KB / 64KB / 16MB
net.ipv4.tcp_wmem = 8192 65536 16777216     # tcp write buffer (min/default/max): 8KB / 64KB / 16MB
net.ipv4.tcp_mem = 131072 262144 16777216   # tcp total memory in pages: 512MB / 1GB / 64GB
net.core.rmem_default = 262144              # default receive buffer size: 256KB
net.core.rmem_max = 4194304                 # max receive buffer size: 4MB
net.core.wmem_default = 262144              # default send buffer size: 256KB
net.core.wmem_max = 4194304                 # max send buffer size: 4MB
# tcp keepalive
net.ipv4.tcp_keepalive_intvl = 20   # interval between unacknowledged keepalive probes: default 75s -> 20s
net.ipv4.tcp_keepalive_probes = 3   # 3 * 20s = drop the connection after 1 minute
net.ipv4.tcp_keepalive_time = 60    # keepalive idle time: 1 minute
# tcp port reuse
net.ipv4.tcp_tw_reuse = 1           # allow reuse of TIME_WAIT sockets for new TCP connections; default 0
net.ipv4.tcp_tw_recycle = 0         # fast TIME_WAIT recycling is deprecated; keep disabled
net.ipv4.tcp_fin_timeout = 5        # seconds to hold a socket in FIN-WAIT-2 state
net.ipv4.tcp_timestamps = 1
# tcp anti-flood
net.ipv4.tcp_syncookies = 1         # send SYN cookies when the SYN backlog overflows, mitigating SYN flood attacks
net.ipv4.tcp_synack_retries = 1     # number of SYN+ACK retransmissions for half-open connections (default 5)
net.ipv4.tcp_syn_retries = 1        # number of SYN packets sent before the kernel gives up establishing a connection
# tcp load-balancer
net.ipv4.ip_forward = 1                     # enable IP forwarding
net.ipv4.ip_nonlocal_bind = 1               # allow binding to non-local addresses
net.netfilter.nf_conntrack_max = 1048576    # max tracked connections
net.ipv4.ip_local_port_range = 10000 65535  # local port range
net.ipv4.tcp_max_tw_buckets = 262144        # allow up to 256K sockets in TIME_WAIT
net.core.somaxconn = 65535                  # max listen queue backlog; overflow triggers retransmission
net.ipv4.tcp_max_syn_backlog = 8192         # SYN backlog size: 1024 -> 8192
net.core.netdev_max_backlog = 8192          # queue length when the NIC receives packets faster than the kernel can process
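
These reference values can also be applied without tuned, for example by dropping a subset into a sysctl fragment and reloading it. The file name below is only an illustration; review and adjust the values to your hardware before using them:

cat > /etc/sysctl.d/90-postgres-tuning.conf <<'EOF'
kernel.numa_balancing = 0
vm.swappiness = 0
vm.zone_reclaim_mode = 0
net.ipv4.tcp_keepalive_time = 60
net.ipv4.tcp_keepalive_intvl = 20
net.ipv4.tcp_keepalive_probes = 3
EOF
sysctl --system    # reload all sysctl configuration files, including the new fragment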
