emulator: bounded dep wait with per-service diagnostics

wait-for-deps used to loop forever on each service, so any single
dep that failed to start (e.g. a service crash-looping under TCG)
hung the build until the outer 6000s provision timeout.

Rewrite as a wait_for helper with:
- Hard 1500s budget across the full dep wait (overridable via
  STACK_DEPS_TIMEOUT). On timeout, dump docker ps -a, last 300 lines
  of the deps container, and per-service reachability, then exit 1
  so provision-build's cleanup trap fires and the VM shuts down fast.
- "<service> ready (Ns)" log lines on each service so successful
  runs show which service was the bottleneck.
- 30s heartbeat per service so long-running waits don't look frozen.

amd64 is unaffected — services come up in ~1s each under KVM, which
is well inside any threshold here.
This commit is contained in:
Bilal Godil 2026-04-10 12:16:45 -07:00
parent 6c5615b931
commit c76c8da7a6

View File

@@ -155,13 +155,62 @@ write_files:
permissions: '0755'
content: |
#!/bin/bash
# NOTE(review): this view is a diff with the +/- markers stripped, so both
# `set` lines appear back to back. Presumably the first (with -e) is the
# removed line and the second (without -e) is its replacement — confirm
# against the real file.
set -euo pipefail
set -uo pipefail
# NOTE(review): the five `until` loops below look like the pre-change dep
# wait (the commit message says the old waits looped forever) — confirm they
# are removed lines. Each loop re-runs its probe once per second and has no
# upper bound, so a dependency that never comes up blocks here indefinitely.
# Port 5432 (labelled postgres elsewhere in this script): TCP connect check.
until nc -z 127.0.0.1 5432 >/dev/null 2>&1; do sleep 1; done
# Port 8123 (clickhouse): passes once /ping stops returning an HTTP error.
until curl -sf http://127.0.0.1:8123/ping >/dev/null 2>&1; do sleep 1; done
# Port 8071 (svix): passes once the health endpoint stops returning an error.
until curl -sf http://127.0.0.1:8071/api/v1/health/ >/dev/null 2>&1; do sleep 1; done
# Port 9090 (minio): passes once the liveness endpoint stops returning an error.
until curl -sf http://127.0.0.1:9090/minio/health/live >/dev/null 2>&1; do sleep 1; done
# Port 8080 (qstash): passes only when the root URL answers with HTTP 401;
# `|| true` keeps a failed curl from aborting the command substitution.
until [ "$(curl -s -o /dev/null -w '%{http_code}' http://127.0.0.1:8080/ 2>/dev/null || true)" = "401" ]; do sleep 1; done
# Hard upper bound across the whole dep wait. Under TCG every service
# init is 5-20x slower than native, so we allow a generous budget, but
# if we cross it something is genuinely stuck and we need to surface it.

# sanitize_timeout VALUE FALLBACK
# Prints VALUE when it is a plain decimal integer of at most 9 digits,
# otherwise prints FALLBACK. The timeout is later compared with `[ ... -ge ]`,
# which fails (status 2, treated as "false") on anything that is not an
# integer; an unvalidated value such as "25m" would therefore silently disable
# the hard cap and bring back the unbounded wait.
sanitize_timeout() {
  case "$1" in
    '' | *[!0-9]* | ??????????*) printf '%s\n' "$2" ;;
    *) printf '%s\n' "$1" ;;
  esac
}

DEPS_TIMEOUT_DEFAULT=1500
# Overridable via STACK_DEPS_TIMEOUT (seconds); invalid values fall back.
DEPS_TIMEOUT="$(sanitize_timeout "${STACK_DEPS_TIMEOUT:-$DEPS_TIMEOUT_DEFAULT}" "$DEPS_TIMEOUT_DEFAULT")"
# Container whose logs are dumped when the wait times out.
DEPS_CONTAINER="${STACK_DEPS_CONTAINER:-stack-build-init}"
# Reference point for the overall budget; SECONDS is bash's run-time counter.
start=$SECONDS
# All output goes through the provisioning logger with a common prefix.
log() { /usr/local/bin/log-provision "wait-for-deps: $*"; }

# Tell the operator when their override was rejected instead of failing silently.
if [ -n "${STACK_DEPS_TIMEOUT:-}" ] && [ "$DEPS_TIMEOUT" != "$STACK_DEPS_TIMEOUT" ]; then
  log "ignoring invalid STACK_DEPS_TIMEOUT='${STACK_DEPS_TIMEOUT}'; using ${DEPS_TIMEOUT}s"
fi
# report_reachable LABEL CMD [ARGS...]
# Runs CMD silently and logs exactly one line: "LABEL reachable" on success,
# "LABEL NOT reachable" otherwise. An explicit if/else is used so a failing
# logger on the success path can never trigger the failure message as well.
report_reachable() {
  local label="$1"
  shift
  if "$@" >/dev/null 2>&1; then
    log "${label} reachable"
  else
    log "${label} NOT reachable"
  fi
}

# dump_diagnostics
# Best-effort snapshot for a stuck dep wait, written line by line through
# `log`: the `docker ps -a` listing, the last 300 log lines of the container
# named by DEPS_CONTAINER, and one reachability verdict per probed port.
# Globals:   DEPS_CONTAINER (read)
# Arguments: none
# Outputs:   everything goes through log(); nothing is printed directly
dump_diagnostics() {
  local code
  log "dumping diagnostics for stuck dep wait..."
  log "--- docker ps -a ---"
  # `|| true`: a docker failure must not stop the rest of the dump.
  docker ps -a 2>&1 | while IFS= read -r line; do log "ps: $line"; done || true
  log "--- docker logs ${DEPS_CONTAINER} (last 300 lines) ---"
  docker logs --tail 300 "$DEPS_CONTAINER" 2>&1 | while IFS= read -r line; do log "deps: $line"; done || true
  log "--- per-service probes ---"
  report_reachable "postgres:5432" nc -z 127.0.0.1 5432
  # --max-time keeps a hung endpoint from stalling the dump itself.
  report_reachable "clickhouse:8123" curl -sf --max-time 3 http://127.0.0.1:8123/ping
  report_reachable "svix:8071" curl -sf --max-time 3 http://127.0.0.1:8071/api/v1/health/
  report_reachable "minio:9090" curl -sf --max-time 3 http://127.0.0.1:9090/minio/health/live
  # This endpoint counts as up only when it answers 401, so compare the
  # status code instead of relying on curl's exit status.
  code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 3 http://127.0.0.1:8080/ 2>/dev/null || true)
  if [ "$code" = "401" ]; then
    log "qstash:8080 reachable (401)"
  else
    log "qstash:8080 NOT reachable (code=${code:-none})"
  fi
}
# wait_for NAME PROBE
# Re-runs PROBE (a shell snippet, evaluated with its output discarded) until
# it succeeds, then logs "NAME ready (Ns)" and returns 0.
# While waiting:
#   - logs a "still waiting" heartbeat once at least 30s have passed since
#     the previous heartbeat (or since this wait began);
#   - if the time since the global `start` mark reaches DEPS_TIMEOUT, logs a
#     TIMEOUT line, calls dump_diagnostics and exits the whole script with 1;
#   - otherwise sleeps 2s before the next attempt.
# Globals:   start, DEPS_TIMEOUT (read)
# Arguments: $1 - service name used in log lines
#            $2 - probe command string
# `name` and `probe` keep their names because the eval'd snippet runs inside
# this function's scope and can see them.
wait_for() {
  local name="$1" probe="$2" waited
  local began=$SECONDS
  local heartbeat_due=$((began + 30))
  until eval "$probe" >/dev/null 2>&1; do
    if ((SECONDS >= heartbeat_due)); then
      waited=$((SECONDS - began))
      log "still waiting for ${name} (${waited}s elapsed)"
      heartbeat_due=$((SECONDS + 30))
    fi
    # The budget is measured from the script-wide start, not this service's.
    waited=$((SECONDS - start))
    if [ "$waited" -ge "$DEPS_TIMEOUT" ]; then
      log "TIMEOUT waiting for ${name} after ${waited}s (hard cap ${DEPS_TIMEOUT}s)"
      dump_diagnostics
      exit 1
    fi
    sleep 2
  done
  log "${name} ready ($((SECONDS - began))s)"
  return 0
}
# Wait for each dependency in turn; all waits share the single DEPS_TIMEOUT
# budget measured from `start`.
# Every curl probe carries --max-time: wait_for only checks the budget between
# probe attempts, so a service that accepts the connection but never answers
# would otherwise block inside curl and the hard cap would never be evaluated.
# The limit is deliberately looser than the 3s used for the timeout
# diagnostics so slow-but-alive services are not misreported as down.
log "starting dep wait (timeout=${DEPS_TIMEOUT}s)"
wait_for "postgres" 'nc -z 127.0.0.1 5432'
wait_for "clickhouse" 'curl -sf --max-time 15 http://127.0.0.1:8123/ping'
wait_for "svix" 'curl -sf --max-time 15 http://127.0.0.1:8071/api/v1/health/'
wait_for "minio" 'curl -sf --max-time 15 http://127.0.0.1:9090/minio/health/live'
# This endpoint is considered up when it answers 401, so match on the status
# code; `|| true` keeps a failed curl from breaking the substitution.
wait_for "qstash" '[ "$(curl -s -o /dev/null -w "%{http_code}" --max-time 15 http://127.0.0.1:8080/ 2>/dev/null || true)" = "401" ]'
log "all deps ready ($((SECONDS - start))s total)"
- path: /etc/stack-build-computed.env
content: |