mirror of
https://github.com/stack-auth/stack.git
synced 2026-06-04 21:04:37 +08:00
emulator: fail-fast on provision errors, diagnose smoke test failures
Provisioning used to silently wait out the full 6000s timeout on any guest-side failure because the cleanup trap only logged the error. Now the trap writes the STACK_CLOUD_INIT_FAILED marker and shuts the VM down, and the host-side waiter breaks out of its loop on that marker and reports the failure distinctly from an early exit or a timeout. Also: bump the smoke test timeout from 120s to 300s; dump `docker ps -a`, the smoke-test container logs, `free -m`, and a verbose curl of /health?db=1 when the smoke test fails; log the QEMU command prefix (which shows the acceleration path); and enable /dev/kvm on the CI runner so the VM isn't stuck in TCG.
This commit is contained in:
parent
cd087c516d
commit
784f17cc2a
15
.github/workflows/qemu-emulator-build.yaml
vendored
15
.github/workflows/qemu-emulator-build.yaml
vendored
@ -47,7 +47,20 @@ jobs:
|
||||
- name: Install QEMU dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y qemu-system-x86 qemu-system-arm qemu-utils genisoimage socat qemu-efi-aarch64
|
||||
sudo apt-get install -y qemu-system-x86 qemu-system-arm qemu-kvm qemu-utils genisoimage socat qemu-efi-aarch64
|
||||
|
||||
- name: Enable KVM access
|
||||
run: |
|
||||
echo 'KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"' \
|
||||
| sudo tee /etc/udev/rules.d/99-kvm4all.rules
|
||||
sudo udevadm control --reload-rules
|
||||
sudo udevadm trigger --name-match=kvm || true
|
||||
ls -la /dev/kvm || echo "no /dev/kvm present"
|
||||
if [ -w /dev/kvm ]; then
|
||||
echo "KVM is writable — hardware acceleration will be used"
|
||||
else
|
||||
echo "WARNING: /dev/kvm is not writable — will fall back to TCG (very slow)"
|
||||
fi
|
||||
|
||||
- name: Build QEMU image
|
||||
run: |
|
||||
|
||||
@ -237,6 +237,7 @@ build_one() {
|
||||
local qemu_base pid elapsed total_build_lines
|
||||
local last_build_lines=0
|
||||
local guest_exited=false
|
||||
local guest_failed=false
|
||||
local start_time=$SECONDS
|
||||
|
||||
cp "$base_img" "$tmp_img"
|
||||
@ -258,6 +259,7 @@ build_one() {
|
||||
: > "$serial_log"
|
||||
: > "$provision_log"
|
||||
qemu_base="$(qemu_cmd_prefix_for_arch "$arch")"
|
||||
log "QEMU command prefix (${arch}): $qemu_base"
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
$qemu_base \
|
||||
@ -282,6 +284,11 @@ build_one() {
|
||||
break
|
||||
fi
|
||||
|
||||
if contains_provision_marker "$provision_log" "$serial_log" "STACK_CLOUD_INIT_FAILED"; then
|
||||
guest_failed=true
|
||||
break
|
||||
fi
|
||||
|
||||
if [ -f "$provision_log" ]; then
|
||||
total_build_lines="$(line_count "$provision_log")"
|
||||
if [ "$total_build_lines" -gt "$last_build_lines" ]; then
|
||||
@ -308,7 +315,9 @@ build_one() {
|
||||
echo ""
|
||||
|
||||
if ! contains_provision_marker "$provision_log" "$serial_log" "STACK_CLOUD_INIT_DONE"; then
|
||||
if [ "$guest_exited" = true ]; then
|
||||
if [ "$guest_failed" = true ]; then
|
||||
err "Guest provisioning reported failure for emulator (${arch})"
|
||||
elif [ "$guest_exited" = true ]; then
|
||||
err "Provisioning exited before completion for emulator (${arch})"
|
||||
else
|
||||
err "Provisioning timed out for emulator (${arch})"
|
||||
|
||||
@ -273,7 +273,7 @@ write_files:
|
||||
-v stack-inbucket-data:/data/inbucket \
|
||||
-d stack-local-emulator-slim
|
||||
|
||||
smoke_timeout=120
|
||||
smoke_timeout=300
|
||||
smoke_elapsed=0
|
||||
smoke_passed=false
|
||||
while [ "$smoke_elapsed" -lt "$smoke_timeout" ]; do
|
||||
@ -286,13 +286,22 @@ write_files:
|
||||
smoke_elapsed=$((smoke_elapsed + 2))
|
||||
done
|
||||
|
||||
docker stop smoke-test 2>/dev/null || true
|
||||
sleep 2
|
||||
|
||||
if [ "$smoke_passed" = "false" ]; then
|
||||
log "SMOKE TEST FAILED: backend /health?db=1 did not return 200 within ${smoke_timeout}s"
|
||||
log "--- docker ps -a ---"
|
||||
docker ps -a 2>&1 | while IFS= read -r line; do log "ps: $line"; done || true
|
||||
log "--- smoke-test container logs (last 200 lines) ---"
|
||||
docker logs --tail 200 smoke-test 2>&1 | while IFS= read -r line; do log "smoke-test: $line"; done || true
|
||||
log "--- free -m ---"
|
||||
free -m 2>&1 | while IFS= read -r line; do log "mem: $line"; done || true
|
||||
log "--- curl -v /health?db=1 ---"
|
||||
curl -v --max-time 5 http://127.0.0.1:8102/health?db=1 2>&1 | while IFS= read -r line; do log "curl: $line"; done || true
|
||||
docker stop smoke-test 2>/dev/null || true
|
||||
exit 1
|
||||
fi
|
||||
|
||||
docker stop smoke-test 2>/dev/null || true
|
||||
sleep 2
|
||||
log "Smoke test passed (${smoke_elapsed}s)."
|
||||
|
||||
log "Flattening image (docker export/import)..."
|
||||
@ -363,8 +372,17 @@ write_files:
|
||||
|
||||
cleanup() {
|
||||
local status=$?
|
||||
if [ "$status" -ne 0 ] && [ -n "${STACK_PROVISION_LOG_FILE:-}" ]; then
|
||||
printf 'ERROR: provision-build exited with code %s\n' "$status" >> "$STACK_PROVISION_LOG_FILE"
|
||||
if [ "$status" -ne 0 ]; then
|
||||
if [ -n "${STACK_PROVISION_LOG_FILE:-}" ]; then
|
||||
printf 'ERROR: provision-build exited with code %s\n' "$status" >> "$STACK_PROVISION_LOG_FILE"
|
||||
printf '%s\n' "STACK_CLOUD_INIT_FAILED" >> "$STACK_PROVISION_LOG_FILE"
|
||||
fi
|
||||
for dev in /dev/console /dev/ttyAMA0 /dev/ttyS0; do
|
||||
echo "STACK_CLOUD_INIT_FAILED" > "$dev" 2>/dev/null || true
|
||||
done
|
||||
sync || true
|
||||
(sleep 2 && shutdown -P now) &
|
||||
(sleep 15 && poweroff -f) &
|
||||
fi
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
Loading…
Reference in New Issue
Block a user