mirror of
https://github.com/zulip/zulip.git
synced 2026-06-24 21:08:25 +08:00
puppet: Move monitoring of pg replication to the pg hosts.
Instead of SSH'ing around to them, run the check directly on the database hosts. This means that the replicas do not know how many bytes behind they are in _receiving_ the WAL logs; thus, the monitoring also extends to the primary database, which knows that information for each replica. This also allows for detecting when there are too few active replicas.
This commit is contained in:
parent
05455f432e
commit
7d4a370a57
@ -1,13 +1,13 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""Nagios plugin to check the difference between the primary and
|
||||
replica Postgres servers' xlog location. Requires that the user this
|
||||
connects to postgres as has been granted the `pg_monitor` role.
|
||||
|
||||
"""
|
||||
Nagios plugin to check the difference between the primary and
|
||||
secondary Postgres servers' xlog location.
|
||||
"""
|
||||
import configparser
|
||||
import re
|
||||
import subprocess
|
||||
from typing import NoReturn
|
||||
from typing import Dict, List
|
||||
|
||||
states = {
|
||||
"OK": 0,
|
||||
@ -16,18 +16,38 @@ states = {
|
||||
"UNKNOWN": 3,
|
||||
}
|
||||
|
||||
def report(state: str, msg: str) -> "NoReturn":
    """Print a ``STATE: message`` status line and terminate the plugin.

    The process exit code is looked up in the module-level ``states``
    mapping, so this function never returns to its caller.
    """
    status_line = f"{state}: {msg}"
    print(status_line)
    exit(states[state])
|
||||
MAXSTATE = 0
|
||||
|
||||
def get_loc_over_ssh(host: str, func: str) -> str:
|
||||
|
||||
def report(state: str, msg: str) -> None:
    """Print a ``STATE: message`` status line and remember its severity.

    Unlike a fail-fast reporter, this does not exit: it only raises the
    module-level ``MAXSTATE`` to the numeric code of ``state`` (looked up in
    ``states``) when that code is higher than anything reported so far, so
    the script can emit several lines and exit with the worst one.
    """
    global MAXSTATE
    print(f"{state}: {msg}")
    severity = states[state]
    if severity > MAXSTATE:
        MAXSTATE = severity
|
||||
|
||||
|
||||
def run_sql_query(query: str) -> List[List[str]]:
# Run `SELECT <query>` through the local `psql` client against the `zulip`
# database and return the result as a list of rows, each row being a list of
# column strings.  Returns [] when psql prints nothing.  If psql exits with a
# non-zero status, a CRITICAL line is reported and the script exits with
# MAXSTATE instead of returning.
|
||||
command = [
|
||||
'psql',
|
||||
'-t', # Omit header line
|
||||
'-A', # Don't pad with spaces
|
||||
'-z', # Separate columns with nulls
|
||||
'-v', 'ON_ERROR_STOP=1',
|
||||
'zulip',
|
||||
'-c', f'SELECT {query}',
|
||||
]
|
||||
try:
|
||||
# NOTE(review): the ssh-based check_output call on the next four code lines
# looks like the pre-commit get_loc_over_ssh body that this diff removes;
# the `output = ...` call after it is the replacement -- confirm against the repo.
return subprocess.check_output(['ssh', host,
|
||||
f'psql -v ON_ERROR_STOP=1 zulip -t -c "SELECT {func}()"'],
|
||||
stderr=subprocess.STDOUT,
|
||||
universal_newlines=True)
|
||||
output = subprocess.check_output(
|
||||
command,
|
||||
stderr=subprocess.STDOUT,
|
||||
universal_newlines=True
|
||||
).strip()
|
||||
# Nothing left after stripping whitespace: the query matched no rows.
if not output:
|
||||
return []
|
||||
# One row per output line; columns are NUL-separated because of -z above.
return [x.split('\0') for x in output.split('\n')]
|
||||
except subprocess.CalledProcessError as e:
|
||||
# NOTE(review): the 'ssh failed' line appears to be the removed pre-commit
# handler; the 'psql failed' line below it is its replacement -- confirm.
report('CRITICAL', f'ssh failed: {str(e)}: {e.output}')
|
||||
report('CRITICAL', f'psql failed: {str(e)}: {e.output}')
|
||||
# report() only records the severity, so exit explicitly with the worst one seen.
exit(MAXSTATE)
|
||||
|
||||
|
||||
def loc_to_abs_offset(loc_str: str) -> int:
|
||||
m = re.match(r'^\s*([0-9a-fA-F]+)/([0-9a-fA-F]+)\s*$', loc_str)
|
||||
@ -48,43 +68,59 @@ def loc_to_abs_offset(loc_str: str) -> int:
|
||||
# Since XLOG_SEG_SIZE is normally 16MB, XLogFileSize comes out to 0xFF000000
|
||||
return 0xFF000000 * int(xlog_file, 16) + int(file_offset, 16)
|
||||
|
||||
config_file = configparser.RawConfigParser()
|
||||
config_file.read("/etc/zulip/zulip.conf")
|
||||
hosts_domain = config_file.get('nagios', 'hosts_domain')
|
||||
primary_server = config_file.get('nagios', 'hosts_postgres_primary') + "." + hosts_domain
|
||||
secondary_servers = config_file.get('nagios', 'hosts_postgres_secondary').split(',')
|
||||
|
||||
# TODO: Make this a loop
|
||||
secondary_server = secondary_servers[0] + "." + hosts_domain
|
||||
replication_info = run_sql_query(
|
||||
'sender_host, status, pg_last_wal_replay_lsn(), pg_last_wal_receive_lsn()'
|
||||
' from pg_stat_wal_receiver'
|
||||
)
|
||||
|
||||
secondary_replay_loc = get_loc_over_ssh(secondary_server, 'pg_last_wal_replay_lsn')
|
||||
secondary_recv_loc = get_loc_over_ssh(secondary_server, 'pg_last_wal_receive_lsn')
|
||||
primary_loc = get_loc_over_ssh(primary_server, 'pg_current_wal_lsn')
|
||||
if replication_info:
|
||||
(primary_server, state, replay_loc, recv_loc) = replication_info[0]
|
||||
|
||||
primary_offset = loc_to_abs_offset(primary_loc)
|
||||
secondary_recv_offset = loc_to_abs_offset(secondary_recv_loc)
|
||||
secondary_replay_offset = loc_to_abs_offset(secondary_replay_loc)
|
||||
recv_offset = loc_to_abs_offset(recv_loc)
|
||||
replay_lag = recv_offset - loc_to_abs_offset(replay_loc)
|
||||
|
||||
recv_diff = primary_offset - secondary_recv_offset
|
||||
replay_diff = secondary_recv_offset - secondary_replay_offset
|
||||
if state != 'streaming':
|
||||
report('CRITICAL', f'replica is in state {state}, not streaming')
|
||||
|
||||
# xlog segments are normally 16MB each. These thresholds are pretty arbitrary.
|
||||
if recv_diff > 5 * 16 * 1024**2:
|
||||
report('CRITICAL', f'secondary is {recv_diff} bytes behind on receiving xlog')
|
||||
msg = f'replica is {replay_lag} bytes behind in replay of WAL logs from {primary_server}'
|
||||
if replay_lag > 5 * 16 * 1024**2:
|
||||
report('CRITICAL', msg)
|
||||
elif replay_lag > 16 * 1024**2:
|
||||
report('WARNING', msg)
|
||||
else:
|
||||
report('OK', msg)
|
||||
|
||||
if replay_diff > 5 * 16 * 1024**2:
|
||||
report('CRITICAL', f'secondary is {replay_diff} bytes behind on applying received xlog')
|
||||
else:
|
||||
replication_info = run_sql_query(
|
||||
'client_addr, state, sent_lsn, write_lsn, flush_lsn, replay_lsn'
|
||||
' from pg_stat_replication'
|
||||
)
|
||||
# Grade the number of replicas attached to this primary.
# run_sql_query() returns a list of rows ([] when psql prints nothing), so
# test the list for emptiness: comparing it to the integer 0 is never true,
# which let a primary with no replicas fall through to the 'OK' branch.
if not replication_info:
    report('CRITICAL', 'No replicas!')
elif len(replication_info) == 1:
    report('WARNING', 'Only one replica!')
else:
    report('OK', f'Found {len(replication_info)} replicas')
|
||||
|
||||
if recv_diff < 0:
|
||||
report('CRITICAL', f'secondary is {recv_diff} bytes ahead on receiving xlog')
|
||||
for replica in replication_info:
|
||||
(client_addr, state, sent_lsn, write_lsn, flush_lsn, replay_lsn) = replica
|
||||
if state != 'streaming':
|
||||
report('CRITICAL', f'replica {client_addr} is in state {state}, not streaming')
|
||||
|
||||
if replay_diff < 0:
|
||||
report('CRITICAL', f'secondary is {replay_diff} bytes ahead on applying received xlog')
|
||||
sent_offset = loc_to_abs_offset(sent_lsn)
|
||||
lag: Dict[str, int] = {}
|
||||
lag['write'] = sent_offset - loc_to_abs_offset(write_lsn)
|
||||
lag['flush'] = sent_offset - loc_to_abs_offset(flush_lsn)
|
||||
lag['replay'] = sent_offset - loc_to_abs_offset(replay_lsn)
|
||||
for lag_type in ('write', 'flush', 'replay'):
|
||||
lag_bytes = lag[lag_type]
|
||||
msg = f'replica {client_addr} is {lag_bytes} bytes behind in {lag_type} of WAL logs'
|
||||
if lag_bytes > 5 * 16 * 1024**2:
|
||||
report('CRITICAL', msg)
|
||||
elif lag_bytes > 16 * 1024**2:
|
||||
report('WARNING', msg)
|
||||
else:
|
||||
report('OK', msg)
|
||||
|
||||
if recv_diff > 16 * 1024**2:
|
||||
report('WARNING', f'secondary is {recv_diff} bytes behind on receiving xlog')
|
||||
|
||||
if replay_diff > 16 * 1024**2:
|
||||
report('WARNING', f'secondary is {replay_diff} bytes behind on applying received xlog')
|
||||
|
||||
report('OK', f'secondary is {recv_diff} bytes behind on receiving and {replay_diff} bytes behind on applying xlog')
|
||||
exit(MAXSTATE)
|
||||
|
||||
@ -250,6 +250,14 @@ define service {
|
||||
contact_groups admins
|
||||
}
|
||||
|
||||
define service{
|
||||
use generic-service
|
||||
service_description Check postgres replication lag
|
||||
check_command check_postgres_replication_lag
|
||||
hostgroup postgres_appdb
|
||||
contact_groups admins
|
||||
}
|
||||
|
||||
define service {
|
||||
use generic-service
|
||||
service_description Check last Postgres backup time
|
||||
|
||||
@ -34,13 +34,6 @@ define service{
|
||||
check_command check_load!7.0!6.0!5.0!10.0!8.0!6.0
|
||||
}
|
||||
|
||||
define service{
|
||||
use generic-service
|
||||
host_name nagios
|
||||
service_description Check postgres replication lag
|
||||
check_command check_postgres_replication_lag
|
||||
}
|
||||
|
||||
define service{
|
||||
use generic-service
|
||||
host_name nagios
|
||||
|
||||
Loading…
Reference in New Issue
Block a user