emulator: fail-fast on provision errors, diagnose smoke test failures

Provisioning used to silently wait out the full 6000s timeout on any
guest-side failure because the cleanup trap only logged the error. Now
the trap also writes a STACK_CLOUD_INIT_FAILED marker (to the provision
log and the console devices) and shuts the VM down; the host-side wait
loop breaks on that marker and reports the failure distinctly.

Also bump smoke test timeout 120s->300s, dump docker ps / container
logs / free -m / verbose curl when it fails, log the qemu accel path,
and enable /dev/kvm on the CI runner so the VM isn't stuck in TCG.
This commit is contained in:
Bilal Godil 2026-04-10 09:35:27 -07:00
parent cd087c516d
commit 784f17cc2a
3 changed files with 48 additions and 8 deletions

View File

@ -47,7 +47,20 @@ jobs:
- name: Install QEMU dependencies
run: |
sudo apt-get update
sudo apt-get install -y qemu-system-x86 qemu-system-arm qemu-utils genisoimage socat qemu-efi-aarch64
sudo apt-get install -y qemu-system-x86 qemu-system-arm qemu-kvm qemu-utils genisoimage socat qemu-efi-aarch64
# CI step: loosen permissions on /dev/kvm and report whether it is usable.
# A udev rule assigns the kvm device to group "kvm" with mode 0666; the rules
# are then reloaded and re-triggered for the kvm device (a failing trigger is
# tolerated via `|| true`). The trailing `ls` and writability test only print
# status: when /dev/kvm is missing or not writable the step just emits the
# warning below rather than failing.
- name: Enable KVM access
run: |
echo 'KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"' \
| sudo tee /etc/udev/rules.d/99-kvm4all.rules
sudo udevadm control --reload-rules
sudo udevadm trigger --name-match=kvm || true
ls -la /dev/kvm || echo "no /dev/kvm present"
if [ -w /dev/kvm ]; then
echo "KVM is writable — hardware acceleration will be used"
else
echo "WARNING: /dev/kvm is not writable — will fall back to TCG (very slow)"
fi
- name: Build QEMU image
run: |

View File

@ -237,6 +237,7 @@ build_one() {
local qemu_base pid elapsed total_build_lines
local last_build_lines=0
local guest_exited=false
local guest_failed=false
local start_time=$SECONDS
cp "$base_img" "$tmp_img"
@ -258,6 +259,7 @@ build_one() {
: > "$serial_log"
: > "$provision_log"
qemu_base="$(qemu_cmd_prefix_for_arch "$arch")"
log "QEMU command prefix (${arch}): $qemu_base"
# shellcheck disable=SC2086
$qemu_base \
@ -282,6 +284,11 @@ build_one() {
break
fi
if contains_provision_marker "$provision_log" "$serial_log" "STACK_CLOUD_INIT_FAILED"; then
guest_failed=true
break
fi
if [ -f "$provision_log" ]; then
total_build_lines="$(line_count "$provision_log")"
if [ "$total_build_lines" -gt "$last_build_lines" ]; then
@ -308,7 +315,9 @@ build_one() {
echo ""
if ! contains_provision_marker "$provision_log" "$serial_log" "STACK_CLOUD_INIT_DONE"; then
if [ "$guest_exited" = true ]; then
if [ "$guest_failed" = true ]; then
err "Guest provisioning reported failure for emulator (${arch})"
elif [ "$guest_exited" = true ]; then
err "Provisioning exited before completion for emulator (${arch})"
else
err "Provisioning timed out for emulator (${arch})"

View File

@ -273,7 +273,7 @@ write_files:
-v stack-inbucket-data:/data/inbucket \
-d stack-local-emulator-slim
smoke_timeout=120
smoke_timeout=300
smoke_elapsed=0
smoke_passed=false
while [ "$smoke_elapsed" -lt "$smoke_timeout" ]; do
@ -286,13 +286,22 @@ write_files:
smoke_elapsed=$((smoke_elapsed + 2))
done
docker stop smoke-test 2>/dev/null || true
sleep 2
# Smoke-test failure path: when the health check never succeeded, emit
# diagnostics through `log`, stop the smoke-test container, and exit 1.
# Every diagnostic pipeline is best-effort (`|| true`) and its output is
# re-logged line by line with a short prefix identifying the source command.
if [ "$smoke_passed" = "false" ]; then
  log "SMOKE TEST FAILED: backend /health?db=1 did not return 200 within ${smoke_timeout}s"
  log "--- docker ps -a ---"
  docker ps -a 2>&1 | while IFS= read -r line; do log "ps: $line"; done || true
  # Container logs are read before the container is stopped below.
  log "--- smoke-test container logs (last 200 lines) ---"
  docker logs --tail 200 smoke-test 2>&1 | while IFS= read -r line; do log "smoke-test: $line"; done || true
  log "--- free -m ---"
  free -m 2>&1 | while IFS= read -r line; do log "mem: $line"; done || true
  log "--- curl -v /health?db=1 ---"
  # The URL is quoted so the '?' is never subject to pathname expansion
  # (an unquoted '?' is a glob character and misbehaves under nullglob/failglob).
  curl -v --max-time 5 "http://127.0.0.1:8102/health?db=1" 2>&1 | while IFS= read -r line; do log "curl: $line"; done || true
  docker stop smoke-test 2>/dev/null || true
  exit 1
fi
docker stop smoke-test 2>/dev/null || true
sleep 2
log "Smoke test passed (${smoke_elapsed}s)."
log "Flattening image (docker export/import)..."
@ -363,8 +372,17 @@ write_files:
# Exit handler (registered with `trap cleanup EXIT` right after this function).
# On a non-zero exit status it records the failure, emits the
# STACK_CLOUD_INIT_FAILED marker, and schedules a guest power-off.
# On a zero exit status it does nothing.
cleanup() {
# Capture the script's exit status first, before any other command resets $?.
local status=$?
# NOTE(review): the next two lines look like the pre-change condition/printf
# kept by the diff rendering; the `if` that follows them is the new version.
if [ "$status" -ne 0 ] && [ -n "${STACK_PROVISION_LOG_FILE:-}" ]; then
printf 'ERROR: provision-build exited with code %s\n' "$status" >> "$STACK_PROVISION_LOG_FILE"
if [ "$status" -ne 0 ]; then
# Only append to the provision log when its path is set and non-empty.
if [ -n "${STACK_PROVISION_LOG_FILE:-}" ]; then
printf 'ERROR: provision-build exited with code %s\n' "$status" >> "$STACK_PROVISION_LOG_FILE"
printf '%s\n' "STACK_CLOUD_INIT_FAILED" >> "$STACK_PROVISION_LOG_FILE"
fi
# Also write the marker to each candidate console device; write errors are
# ignored. Presumably this lets the marker reach the host's serial log even
# when the provision log is unavailable -- confirm against the host waiter.
for dev in /dev/console /dev/ttyAMA0 /dev/ttyS0; do
echo "STACK_CLOUD_INIT_FAILED" > "$dev" 2>/dev/null || true
done
sync || true
# Power off from background subshells so the handler itself returns at once:
# a regular shutdown after 2s, then a forced poweroff after 15s (presumably a
# fallback in case the regular shutdown hangs).
(sleep 2 && shutdown -P now) &
(sleep 15 && poweroff -f) &
fi
}
trap cleanup EXIT