puppet: Move monitoring of pg replication to the pg hosts.

Instead of SSH'ing around to them, run the check directly on the database hosts. A replica cannot tell how many bytes behind it is in _receiving_ the WAL logs; thus, the monitoring also extends to the primary database, which knows that information for each replica. This also allows for detecting when there are too few active replicas.
2026-06-24 21:08:25 +08:00 · 2020-06-15 14:56:55 -07:00 · 2020-06-15 14:56:55 -07:00 · 7d4a370a57
commit 7d4a370a57
parent 05455f432e
3 changed files with 89 additions and 52 deletions
--- a/puppet/zulip/files/nagios_plugins/zulip_nagios_server/check_postgres_replication_lag
+++ b/puppet/zulip/files/nagios_plugins/zulip_nagios_server/check_postgres_replication_lag
@ -1,13 +1,13 @@
 #!/usr/bin/env python3

+"""Nagios plugin to check the difference between the primary and
+replica Postgres servers' xlog location.  Requires that the user this
+connects to postgres as has been granted the `pg_monitor` role.
+
 """
-Nagios plugin to check the difference between the primary and
-secondary Postgres servers' xlog location.
-"""
-import configparser
 import re
 import subprocess
-from typing import NoReturn
+from typing import Dict, List

 states = {
    "OK": 0,
@ -16,18 +16,38 @@ states = {
    "UNKNOWN": 3,
 }

-def report(state: str, msg: str) -> "NoReturn":
-    print(f"{state}: {msg}")
-    exit(states[state])
+MAXSTATE = 0

-def get_loc_over_ssh(host: str, func: str) -> str:
+
+def report(state: str, msg: str) -> None:
+    """Print one "<state>: <msg>" status line and remember the worst state.
+
+    `state` must be a key of `states`.  This does not exit; it only
+    raises MAXSTATE to the numeric value of `state` when that is higher
+    than anything reported so far, so several findings can be printed
+    before the script exits with MAXSTATE.
+    """
+    global MAXSTATE
+    print(f"{state}: {msg}")
+    MAXSTATE = max(MAXSTATE, states[state])
+
+
+def run_sql_query(query: str) -> List[List[str]]:
+    """Run `SELECT <query>` via psql against the `zulip` database.
+
+    `query` is everything after the SELECT keyword.  Returns one list
+    of column strings per output row, or an empty list when psql
+    prints nothing.  If psql exits non-zero, a CRITICAL line is
+    reported and the script exits with MAXSTATE.
+    """
+    command = [
+        'psql',
+        '-t',  # Omit header line
+        '-A',  # Don't pad with spaces
+        '-z',  # Separate columns with nulls
+        '-v', 'ON_ERROR_STOP=1',
+        'zulip',
+        '-c', f'SELECT {query}',
+    ]
    try:
-        return subprocess.check_output(['ssh', host,
-                                        f'psql -v ON_ERROR_STOP=1 zulip -t -c "SELECT {func}()"'],
-                                       stderr=subprocess.STDOUT,
-                                       universal_newlines=True)
+        output = subprocess.check_output(
+            command,
+            stderr=subprocess.STDOUT,
+            universal_newlines=True
+        ).strip()
+        # No rows: return an empty list rather than [['']], so callers
+        # can test the result for emptiness.
+        if not output:
+            return []
+        # Rows are newline-separated; columns within a row are
+        # NUL-separated because of the -z flag above.
+        return [x.split('\0') for x in output.split('\n')]
    except subprocess.CalledProcessError as e:
-        report('CRITICAL', f'ssh failed: {str(e)}: {e.output}')
+        report('CRITICAL', f'psql failed: {str(e)}: {e.output}')
+        # report() only records the state; exit explicitly since there
+        # is no query result to continue with.
+        exit(MAXSTATE)
+

 def loc_to_abs_offset(loc_str: str) -> int:
    m = re.match(r'^\s*([0-9a-fA-F]+)/([0-9a-fA-F]+)\s*$', loc_str)
@ -48,43 +68,59 @@ def loc_to_abs_offset(loc_str: str) -> int:
    # Since XLOG_SEG_SIZE is normally 16MB, XLogFileSize comes out to 0xFF000000
    return 0xFF000000 * int(xlog_file, 16) + int(file_offset, 16)

-config_file = configparser.RawConfigParser()
-config_file.read("/etc/zulip/zulip.conf")
-hosts_domain = config_file.get('nagios', 'hosts_domain')
-primary_server = config_file.get('nagios', 'hosts_postgres_primary') + "." + hosts_domain
-secondary_servers = config_file.get('nagios', 'hosts_postgres_secondary').split(',')

-# TODO: Make this a loop
-secondary_server = secondary_servers[0] + "." + hosts_domain
+replication_info = run_sql_query(
+    'sender_host, status, pg_last_wal_replay_lsn(), pg_last_wal_receive_lsn()'
+    ' from pg_stat_wal_receiver'
+)

-secondary_replay_loc = get_loc_over_ssh(secondary_server, 'pg_last_wal_replay_lsn')
-secondary_recv_loc   = get_loc_over_ssh(secondary_server, 'pg_last_wal_receive_lsn')
-primary_loc          = get_loc_over_ssh(primary_server, 'pg_current_wal_lsn')
+if replication_info:
+    (primary_server, state, replay_loc, recv_loc) = replication_info[0]

-primary_offset = loc_to_abs_offset(primary_loc)
-secondary_recv_offset = loc_to_abs_offset(secondary_recv_loc)
-secondary_replay_offset = loc_to_abs_offset(secondary_replay_loc)
+    recv_offset = loc_to_abs_offset(recv_loc)
+    replay_lag = recv_offset - loc_to_abs_offset(replay_loc)

-recv_diff = primary_offset - secondary_recv_offset
-replay_diff = secondary_recv_offset - secondary_replay_offset
+    if state != 'streaming':
+        report('CRITICAL', f'replica is in state {state}, not streaming')

-# xlog segments are normally 16MB each.  These thresholds are pretty arbitrary.
-if recv_diff > 5 * 16 * 1024**2:
-    report('CRITICAL', f'secondary is {recv_diff} bytes behind on receiving xlog')
+    msg = f'replica is {replay_lag} bytes behind in replay of WAL logs from {primary_server}'
+    if replay_lag > 5 * 16 * 1024**2:
+        report('CRITICAL', msg)
+    elif replay_lag > 16 * 1024**2:
+        report('WARNING', msg)
+    else:
+        report('OK', msg)

-if replay_diff > 5 * 16 * 1024**2:
-    report('CRITICAL', f'secondary is {replay_diff} bytes behind on applying received xlog')
+else:
+    replication_info = run_sql_query(
+        'client_addr, state, sent_lsn, write_lsn, flush_lsn, replay_lsn'
+        ' from pg_stat_replication'
+    )
+    # run_sql_query() returns a list of rows, so test it for emptiness.
+    # Comparing the list itself to the integer 0 is never true, which
+    # would let a primary with no replicas fall through to the "OK"
+    # branch below and report "Found 0 replicas".
+    if not replication_info:
+        report('CRITICAL', 'No replicas!')
+    elif len(replication_info) == 1:
+        report('WARNING', 'Only one replica!')
+    else:
+        report('OK', f'Found {len(replication_info)} replicas')

-if recv_diff < 0:
-    report('CRITICAL', f'secondary is {recv_diff} bytes ahead on receiving xlog')
+    for replica in replication_info:
+        (client_addr, state, sent_lsn, write_lsn, flush_lsn, replay_lsn) = replica
+        if state != 'streaming':
+            report('CRITICAL', f'replica {client_addr} is in state {state}, not streaming')

-if replay_diff < 0:
-    report('CRITICAL', f'secondary is {replay_diff} bytes ahead on applying received xlog')
+        sent_offset = loc_to_abs_offset(sent_lsn)
+        lag: Dict[str, int] = {}
+        lag['write'] = sent_offset - loc_to_abs_offset(write_lsn)
+        lag['flush'] = sent_offset - loc_to_abs_offset(flush_lsn)
+        lag['replay'] = sent_offset - loc_to_abs_offset(replay_lsn)
+        for lag_type in ('write', 'flush', 'replay'):
+            lag_bytes = lag[lag_type]
+            msg = f'replica {client_addr} is {lag_bytes} bytes behind in {lag_type} of WAL logs'
+            if lag_bytes > 5 * 16 * 1024**2:
+                report('CRITICAL', msg)
+            elif lag_bytes > 16 * 1024**2:
+                report('WARNING', msg)
+            else:
+                report('OK', msg)

-if recv_diff > 16 * 1024**2:
-    report('WARNING', f'secondary is {recv_diff} bytes behind on receiving xlog')
-
-if replay_diff > 16 * 1024**2:
-    report('WARNING', f'secondary is {replay_diff} bytes behind on applying received xlog')
-
-report('OK', f'secondary is {recv_diff} bytes behind on receiving and {replay_diff} bytes behind on applying xlog')
+exit(MAXSTATE)
--- a/puppet/zulip_ops/files/nagios3/conf.d/services.cfg
+++ b/puppet/zulip_ops/files/nagios3/conf.d/services.cfg
@ -250,6 +250,14 @@ define service {
        contact_groups                  admins
 }

+define service{
+        use                             generic-service
+        service_description             Check postgres replication lag
+        check_command                   check_postgres_replication_lag
+        hostgroup                       postgres_appdb
+        contact_groups                  admins
+}
+
 define service {
        use                             generic-service
        service_description             Check last Postgres backup time
--- a/puppet/zulip_ops/templates/nagios3/localhost.cfg.template.erb
+++ b/puppet/zulip_ops/templates/nagios3/localhost.cfg.template.erb
@ -34,13 +34,6 @@ define service{
        check_command                   check_load!7.0!6.0!5.0!10.0!8.0!6.0
        }

-define service{
-        use                             generic-service
-        host_name                       nagios
-        service_description             Check postgres replication lag
-        check_command                   check_postgres_replication_lag
-        }
-
 define service{
        use                             generic-service
        host_name                       nagios