fix PCI slot mismatch in snapshot capture + stale runtime ISO on direct start

- build-image.sh: move runtime.iso drive before netdev so its virtio-blk
  slot precedes virtio-net-pci, matching run-emulator.sh's resume argv.
  Previously, migrate-incoming against CI's savevm hit a device-tree
  mismatch and only looked green because snapshot_fallback_to_cold_boot
  silently retried as a cold boot.
- run-emulator.sh: drop early-return in ensure_runtime_config_iso so
  PORT_PREFIX/EMULATOR_*_PORT changes take effect on every start; the
  preserved ISO from a prior run would otherwise silently override the
  host-forward ports picked up by QEMU's netdev.
- common.sh: fix backslash-escaped JSON in capture_vm_state's migrate-
  timeout diagnostic; single-quoted printf was emitting literal
  backslashes, so QMP replied with a parse error instead of the real
  query-migrate status.
This commit is contained in:
Bilal Godil 2026-04-16 12:31:53 -07:00
parent 7db9fe405e
commit 510ef38015
3 changed files with 22 additions and 20 deletions

View File

@@ -333,19 +333,25 @@ build_one() {
local monitor_sock="$tmp_dir/monitor.sock" local monitor_sock="$tmp_dir/monitor.sock"
local qga_sock="$tmp_dir/qga.sock" local qga_sock="$tmp_dir/qga.sock"
local snapshot_args=() local snapshot_args=()
local runtime_disk_args=()
local virtfs_args=(-virtfs "local,path=$tmp_dir,mount_tag=hostfs,security_model=none") local virtfs_args=(-virtfs "local,path=$tmp_dir,mount_tag=hostfs,security_model=none")
if [ "$EMULATOR_BUILD_SNAPSHOT" = "1" ]; then if [ "$EMULATOR_BUILD_SNAPSHOT" = "1" ]; then
# STACKCFG runtime ISO lets stack.service start during the build — same
# disk shape render-stack-env expects at runtime. Placed before netdev
# so its virtio-blk PCI slot precedes virtio-net-pci, matching the
# resume argv order in run-emulator.sh (slots must line up or
# migrate-incoming fails the device-tree check).
runtime_disk_args=(
-drive "file=$runtime_iso,format=raw,if=virtio,readonly=on"
)
# QMP for stop/migrate/quit; virtio-serial + QGA channel so we can exec # QMP for stop/migrate/quit; virtio-serial + QGA channel so we can exec
# inside the guest post-resume (only needed at runtime but harmless here). # inside the guest post-resume (only needed at runtime but harmless here).
# STACKCFG runtime ISO lets stack.service start during the build — same
# disk shape render-stack-env expects at runtime.
snapshot_args=( snapshot_args=(
-chardev "socket,id=monitor,path=$monitor_sock,server=on,wait=off" -chardev "socket,id=monitor,path=$monitor_sock,server=on,wait=off"
-mon "chardev=monitor,mode=control" -mon "chardev=monitor,mode=control"
-chardev "socket,path=$qga_sock,server=on,wait=off,id=qga0" -chardev "socket,path=$qga_sock,server=on,wait=off,id=qga0"
-device virtio-serial -device virtio-serial
-device "virtserialport,chardev=qga0,name=org.qemu.guest_agent.0" -device "virtserialport,chardev=qga0,name=org.qemu.guest_agent.0"
-drive "file=$runtime_iso,format=raw,if=virtio,readonly=on"
# Empty PCIe root port reserved for runtime hot-plug of virtio-9p. # Empty PCIe root port reserved for runtime hot-plug of virtio-9p.
# The integrated pcie.0 bus on q35 / arm64-virt is static — hotplug # The integrated pcie.0 bus on q35 / arm64-virt is static — hotplug
# only works through a root port. Must be present at snapshot capture # only works through a root port. Must be present at snapshot capture
@@ -367,6 +373,7 @@ build_one() {
-drive "file=$tmp_img,format=qcow2,if=virtio,discard=on,detect-zeroes=unmap" \ -drive "file=$tmp_img,format=qcow2,if=virtio,discard=on,detect-zeroes=unmap" \
-drive "file=$seed_iso,format=raw,if=virtio,readonly=on" \ -drive "file=$seed_iso,format=raw,if=virtio,readonly=on" \
-drive "file=$bundle_iso,format=raw,if=virtio,readonly=on" \ -drive "file=$bundle_iso,format=raw,if=virtio,readonly=on" \
${runtime_disk_args[@]+"${runtime_disk_args[@]}"} \
-netdev user,id=net0 \ -netdev user,id=net0 \
-device virtio-net-pci,netdev=net0 \ -device virtio-net-pci,netdev=net0 \
${virtfs_args[@]+"${virtfs_args[@]}"} \ ${virtfs_args[@]+"${virtfs_args[@]}"} \

View File

@@ -193,8 +193,8 @@ capture_vm_state() {
if [ "$waited" -ge "$migrate_timeout" ]; then if [ "$waited" -ge "$migrate_timeout" ]; then
err "QMP migrate timed out after ${migrate_timeout}s" err "QMP migrate timed out after ${migrate_timeout}s"
err "Last query-migrate response: $({ err "Last query-migrate response: $({
printf '%s\n' '{\"execute\":\"qmp_capabilities\"}' printf '%s\n' '{"execute":"qmp_capabilities"}'
printf '%s\n' '{\"execute\":\"query-migrate\"}' printf '%s\n' '{"execute":"query-migrate"}'
} | qmp_session "$sock" 2>/dev/null || true)" } | qmp_session "$sock" 2>/dev/null || true)"
return 1 return 1
fi fi

View File

@@ -168,16 +168,12 @@ runtime_fingerprint() {
} }
ensure_runtime_config_iso() { ensure_runtime_config_iso() {
local cfg_iso # Regenerate unconditionally: port env vars (PORT_PREFIX, EMULATOR_*_PORT)
cfg_iso="$(runtime_iso_path)" # may have changed since the last run, and an ISO cached from a prior
if [ -s "$cfg_iso" ]; then # invocation would silently override them. The stack-cli path writes the
return 0 # ISO first via packages/stack-cli/src/lib/iso.ts; this re-write produces
fi # the same content for that flow (same field set + volume label) and is
# cheap enough (~ms) to run on every start.
# Fallback used when this script is invoked directly (e.g. `pnpm
# emulator:start`) rather than through the stack-cli, which generates the
# ISO via packages/stack-cli/src/lib/iso.ts. Mirrors the field set + volume
# label so the guest's render-stack-env mounts it the same way.
write_runtime_config_iso "$VM_DIR" write_runtime_config_iso "$VM_DIR"
} }
@@ -740,10 +736,9 @@ stop_vm() {
fi fi
fi fi
rm -f "$VM_DIR/qemu.pid" "$VM_DIR/monitor.sock" "$VM_DIR/qga.sock" "$VM_DIR/serial.log" rm -f "$VM_DIR/qemu.pid" "$VM_DIR/monitor.sock" "$VM_DIR/qga.sock" "$VM_DIR/serial.log"
# Do NOT remove runtime-config.iso: the CLI owns its lifecycle and run-emulator.sh # runtime-config.iso is left in place; ensure_runtime_config_iso regenerates
# cannot regenerate it. Removing here breaks the snapshot → cold-boot fallback # it on the next start. `cmd_reset` wipes $RUN_DIR entirely when a full reset
# (which calls stop_vm before recursing into cmd_start → ensure_runtime_config_iso). # is wanted.
# `cmd_reset` wipes $RUN_DIR entirely when a full reset is wanted.
} }
cmd_start() { cmd_start() {
@@ -854,7 +849,7 @@ snapshot_fallback_to_cold_boot() {
warn "Retrying with cold boot (EMULATOR_NO_SNAPSHOT=1)..." warn "Retrying with cold boot (EMULATOR_NO_SNAPSHOT=1)..."
stop_vm stop_vm
# Wipe the overlay + fingerprint so build_qemu_cmd re-creates a fresh one. # Wipe the overlay + fingerprint so build_qemu_cmd re-creates a fresh one.
# runtime-config.iso is preserved by stop_vm (the CLI owns it). # runtime-config.iso is regenerated by ensure_runtime_config_iso on recursion.
rm -f "$VM_DIR/disk.qcow2" "$VM_DIR/base-image.fingerprint" \ rm -f "$VM_DIR/disk.qcow2" "$VM_DIR/base-image.fingerprint" \
"$VM_DIR/seed.phantom" "$VM_DIR/bundle.phantom" "$VM_DIR/seed.phantom" "$VM_DIR/bundle.phantom"
EMULATOR_NO_SNAPSHOT=1 EMULATOR_NO_SNAPSHOT=1