puppet: Move monitoring of pg replication to the pg hosts.

Instead of SSH'ing around to them, run directly on the database hosts.
This means that the replicas do not know how many bytes behind they
are in _receiving_ the WAL logs; thus, the monitoring also extends to
the primary database, which knows that information for each replica.
This also allows for detecting when there are too few active replicas.
This commit is contained in:
Alex Vandiver 2020-06-15 14:56:55 -07:00 committed by Tim Abbott
parent 05455f432e
commit 7d4a370a57
3 changed files with 89 additions and 52 deletions

View File

@ -1,13 +1,13 @@
#!/usr/bin/env python3
"""Nagios plugin to check the difference between the primary and
replica Postgres servers' xlog location. Requires that the user this
connects to postgres as has been granted the `pg_monitor` role.
"""
Nagios plugin to check the difference between the primary and
secondary Postgres servers' xlog location.
"""
import configparser
import re
import subprocess
from typing import NoReturn
from typing import Dict, List
states = {
"OK": 0,
@ -16,18 +16,38 @@ states = {
"UNKNOWN": 3,
}
def report(state: str, msg: str) -> "NoReturn":
print(f"{state}: {msg}")
exit(states[state])
MAXSTATE = 0
def get_loc_over_ssh(host: str, func: str) -> str:
# Print one "<state>: <msg>" line and remember the highest numeric code
# (looked up in `states`) reported so far in MAXSTATE; the script later
# calls exit(MAXSTATE), so a single run can emit several status lines.
def report(state: str, msg: str) -> None:
global MAXSTATE
print(f"{state}: {msg}")
# Never lowers MAXSTATE: a later 'OK' does not mask an earlier 'CRITICAL'.
MAXSTATE = max(MAXSTATE, states[state])
def run_sql_query(query: str) -> List[List[str]]:
command = [
'psql',
'-t', # Omit header line
'-A', # Don't pad with spaces
'-z', # Separate columns with nulls
'-v', 'ON_ERROR_STOP=1',
'zulip',
'-c', f'SELECT {query}',
]
try:
return subprocess.check_output(['ssh', host,
f'psql -v ON_ERROR_STOP=1 zulip -t -c "SELECT {func}()"'],
stderr=subprocess.STDOUT,
universal_newlines=True)
output = subprocess.check_output(
command,
stderr=subprocess.STDOUT,
universal_newlines=True
).strip()
if not output:
return []
return [x.split('\0') for x in output.split('\n')]
except subprocess.CalledProcessError as e:
report('CRITICAL', f'ssh failed: {str(e)}: {e.output}')
report('CRITICAL', f'psql failed: {str(e)}: {e.output}')
exit(MAXSTATE)
def loc_to_abs_offset(loc_str: str) -> int:
m = re.match(r'^\s*([0-9a-fA-F]+)/([0-9a-fA-F]+)\s*$', loc_str)
@ -48,43 +68,59 @@ def loc_to_abs_offset(loc_str: str) -> int:
# Since XLOG_SEG_SIZE is normally 16MB, XLogFileSize comes out to 0xFF000000
return 0xFF000000 * int(xlog_file, 16) + int(file_offset, 16)
config_file = configparser.RawConfigParser()
config_file.read("/etc/zulip/zulip.conf")
hosts_domain = config_file.get('nagios', 'hosts_domain')
primary_server = config_file.get('nagios', 'hosts_postgres_primary') + "." + hosts_domain
secondary_servers = config_file.get('nagios', 'hosts_postgres_secondary').split(',')
# TODO: Make this a loop
secondary_server = secondary_servers[0] + "." + hosts_domain
replication_info = run_sql_query(
'sender_host, status, pg_last_wal_replay_lsn(), pg_last_wal_receive_lsn()'
' from pg_stat_wal_receiver'
)
secondary_replay_loc = get_loc_over_ssh(secondary_server, 'pg_last_wal_replay_lsn')
secondary_recv_loc = get_loc_over_ssh(secondary_server, 'pg_last_wal_receive_lsn')
primary_loc = get_loc_over_ssh(primary_server, 'pg_current_wal_lsn')
if replication_info:
(primary_server, state, replay_loc, recv_loc) = replication_info[0]
primary_offset = loc_to_abs_offset(primary_loc)
secondary_recv_offset = loc_to_abs_offset(secondary_recv_loc)
secondary_replay_offset = loc_to_abs_offset(secondary_replay_loc)
recv_offset = loc_to_abs_offset(recv_loc)
replay_lag = recv_offset - loc_to_abs_offset(replay_loc)
recv_diff = primary_offset - secondary_recv_offset
replay_diff = secondary_recv_offset - secondary_replay_offset
if state != 'streaming':
report('CRITICAL', f'replica is in state {state}, not streaming')
# xlog segments are normally 16MB each. These thresholds are pretty arbitrary.
if recv_diff > 5 * 16 * 1024**2:
report('CRITICAL', f'secondary is {recv_diff} bytes behind on receiving xlog')
msg = f'replica is {replay_lag} bytes behind in replay of WAL logs from {primary_server}'
if replay_lag > 5 * 16 * 1024**2:
report('CRITICAL', msg)
elif replay_lag > 16 * 1024**2:
report('WARNING', msg)
else:
report('OK', msg)
if replay_diff > 5 * 16 * 1024**2:
report('CRITICAL', f'secondary is {replay_diff} bytes behind on applying received xlog')
else:
replication_info = run_sql_query(
'client_addr, state, sent_lsn, write_lsn, flush_lsn, replay_lsn'
' from pg_stat_replication'
)
# replication_info is the list of rows returned by run_sql_query(); test
# whether it is empty rather than comparing it to 0, because a list never
# equals 0 and an empty result would otherwise be reported as 'OK'.
if not replication_info:
    report('CRITICAL', 'No replicas!')
elif len(replication_info) == 1:
    report('WARNING', 'Only one replica!')
else:
    report('OK', f'Found {len(replication_info)} replicas')
if recv_diff < 0:
report('CRITICAL', f'secondary is {recv_diff} bytes ahead on receiving xlog')
for replica in replication_info:
(client_addr, state, sent_lsn, write_lsn, flush_lsn, replay_lsn) = replica
if state != 'streaming':
report('CRITICAL', f'replica {client_addr} is in state {state}, not streaming')
if replay_diff < 0:
report('CRITICAL', f'secondary is {replay_diff} bytes ahead on applying received xlog')
# Each lag figure is the distance from this replica's sent_lsn to its
# write_lsn / flush_lsn / replay_lsn, after converting every LSN string to
# an absolute integer offset with loc_to_abs_offset().
sent_offset = loc_to_abs_offset(sent_lsn)
lag: Dict[str, int] = {}
lag['write'] = sent_offset - loc_to_abs_offset(write_lsn)
lag['flush'] = sent_offset - loc_to_abs_offset(flush_lsn)
lag['replay'] = sent_offset - loc_to_abs_offset(replay_lsn)
# Report every stage separately: more than 5 * 16 MiB behind is CRITICAL,
# more than 16 MiB is WARNING, anything else is OK.
for lag_type in ('write', 'flush', 'replay'):
lag_bytes = lag[lag_type]
msg = f'replica {client_addr} is {lag_bytes} bytes behind in {lag_type} of WAL logs'
if lag_bytes > 5 * 16 * 1024**2:
report('CRITICAL', msg)
elif lag_bytes > 16 * 1024**2:
report('WARNING', msg)
else:
report('OK', msg)
if recv_diff > 16 * 1024**2:
report('WARNING', f'secondary is {recv_diff} bytes behind on receiving xlog')
if replay_diff > 16 * 1024**2:
report('WARNING', f'secondary is {replay_diff} bytes behind on applying received xlog')
report('OK', f'secondary is {recv_diff} bytes behind on receiving and {replay_diff} bytes behind on applying xlog')
exit(MAXSTATE)

View File

@ -250,6 +250,14 @@ define service {
contact_groups admins
}
define service{
use generic-service
service_description Check postgres replication lag
check_command check_postgres_replication_lag
hostgroup postgres_appdb
contact_groups admins
}
define service {
use generic-service
service_description Check last Postgres backup time

View File

@ -34,13 +34,6 @@ define service{
check_command check_load!7.0!6.0!5.0!10.0!8.0!6.0
}
define service{
use generic-service
host_name nagios
service_description Check postgres replication lag
check_command check_postgres_replication_lag
}
define service{
use generic-service
host_name nagios