address unresolved PR review comments on snapshot resume path

- stop_vm no longer deletes runtime-config.iso; the CLI owns its
  lifecycle and the snapshot → cold-boot fallback needs it preserved
  (cmd_reset still wipes RUN_DIR for a full reset). stop_vm now also
  removes qga.sock along with the other per-run socket/pid files.
- Write internal-pck to $VM_DIR on the host in snapshot mode. Cold boot
  publishes this via virtfs/9p; snapshot mode drops virtfs, so
  --config-file flows would otherwise hang. Handles both the rotation
  path (fresh PCK) and EMULATOR_NO_ROTATION (placeholder PCK).
- Pin RAM in snapshot mode to the build-time 4096 (overridable via
  EMULATOR_SNAPSHOT_RAM). Migration replay requires an identical -m
  value, same constraint as CPU count.
- Fail amd64 build when .savevm.zst is missing rather than shipping a
  cold-boot-only release silently. arm64 stays best-effort for now
  because it runs under TCG and can't be verified end-to-end.
- Install Node/pnpm on both arches. arm64 also runs
  generate-env-development.mjs, which otherwise relied on the runner
  image's preinstalled Node.
This commit is contained in:
Bilal Godil 2026-04-15 14:30:21 -07:00
parent cfdc88299a
commit 2c8ad4c77a
2 changed files with 57 additions and 20 deletions

View File

@ -55,13 +55,14 @@ jobs:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
# Node/pnpm are needed on both arches: arm64 also runs
# generate-env-development.mjs inside build-image.sh. amd64 additionally
# builds and runs the CLI for the verification steps below.
- uses: pnpm/action-setup@v4
if: matrix.arch == 'amd64'
with:
version: 10.23.0
- uses: actions/setup-node@v4
if: matrix.arch == 'amd64'
with:
node-version: 22
cache: pnpm
@ -177,8 +178,14 @@ jobs:
if [ -f "$SAVEVM" ]; then
cp "$SAVEVM" "stack-emulator-${{ matrix.arch }}.savevm.zst"
ls -lh "stack-emulator-${{ matrix.arch }}.savevm.zst"
elif [ "${{ matrix.arch }}" = "amd64" ]; then
# amd64 is the fast-resume contract: if the build didn't produce a
# snapshot, fail loudly rather than silently shipping a
# cold-boot-only release.
echo "ERROR: snapshot build expected to produce $SAVEVM for amd64." >&2
exit 1
else
echo "NOTE: no savevm snapshot was produced; fast-start will be unavailable for this arch."
echo "NOTE: no savevm snapshot was produced for ${{ matrix.arch }}; fast-start will be unavailable for this arch."
fi
- name: Upload image artifact

View File

@ -308,7 +308,7 @@ build_qemu_cmd() {
# build and are not needed at runtime, but their virtio-blk slots must
# exist so the migration replay matches device IDs. Runtime-only devices
# (virtfs, balloon) live at higher slots — extra at destination is fine.
local snapshot_args=() runtime_only_args=() snapshot_smp="$VM_CPUS"
local snapshot_args=() runtime_only_args=() snapshot_smp="$VM_CPUS" snapshot_ram="$VM_RAM"
if snapshot_available; then
log "Snapshot found at $savevm_file — fast-resume enabled."
# -incoming defer: QEMU starts, waits for a QMP migrate-incoming command.
@ -316,9 +316,17 @@ build_qemu_cmd() {
# which enables parallel RAM restore (~2-3x faster than streamed decode).
snapshot_args+=(-incoming defer)
snapshot_smp="${EMULATOR_SNAPSHOT_CPUS:-4}"
# RAM size is baked into the snapshot; migration replay requires an
# identical -m value. Pin to the build-time RAM (4096) and ignore
# EMULATOR_RAM — override via EMULATOR_SNAPSHOT_RAM if a different
# snapshot was produced.
snapshot_ram="${EMULATOR_SNAPSHOT_RAM:-4096}"
if [ "$snapshot_smp" != "$VM_CPUS" ]; then
log "Pinning SMP to ${snapshot_smp} for snapshot resume (build-time value)."
fi
if [ "$snapshot_ram" != "$VM_RAM" ]; then
log "Pinning RAM to ${snapshot_ram}MB for snapshot resume (ignoring EMULATOR_RAM=${VM_RAM})."
fi
# Tiny placeholder ISOs to match the seed.iso / bundle.iso slots present
# at snapshot time. Their content doesn't matter (cloud-init has already
@ -351,7 +359,7 @@ build_qemu_cmd() {
-cpu "$cpu"
"${firmware_args[@]}"
-boot order=c
-m "$VM_RAM"
-m "$snapshot_ram"
-smp "$snapshot_smp"
-drive "file=$VM_DIR/disk.qcow2,format=qcow2,if=virtio"
"${runtime_only_args[@]}"
@ -502,14 +510,17 @@ qmp_incoming_and_cont() {
return 1
}
# Generate fresh per-install secrets on the host. We pass them to the guest
# through QGA's guest-exec input-data field (base64-encoded), so no host file
# or virtfs mount is needed in the snapshot path.
generate_fresh_secrets_payload() {
printf 'STACK_SEED_INTERNAL_PROJECT_PUBLISHABLE_CLIENT_KEY=%s\n' "$(openssl rand -hex 32)"
printf 'STACK_SEED_INTERNAL_PROJECT_SECRET_SERVER_KEY=%s\n' "$(openssl rand -hex 32)"
printf 'STACK_SEED_INTERNAL_PROJECT_SUPER_SECRET_ADMIN_KEY=%s\n' "$(openssl rand -hex 32)"
printf 'CRON_SECRET=%s\n' "$(openssl rand -hex 32)"
# Placeholder PCK baked into the snapshot. Kept in sync with the value in
# docker/local-emulator/qemu/cloud-init/emulator/user-data.
SNAPSHOT_PLACEHOLDER_PCK="00000000000000000000000000000000ffffffffffffffffffffffffffffffff"
# Write the internal PCK to the host path the CLI reads (see
# readInternalPck() in packages/stack-cli/src/commands/emulator.ts). In
# cold-boot mode the guest publishes this via virtfs/9p, but snapshot mode
# drops virtfs, so the host has to write it itself.
#
# Globals:   VM_DIR (read) - directory that receives the internal-pck file
# Arguments: $1 - the PCK value, written verbatim with no trailing newline
# Returns:   0 on success, 1 if the file could not be written
#
# The value is staged in a mktemp file (always created 0600) inside VM_DIR
# and then renamed over the target. This gives two guarantees the previous
# in-place `>` redirect did not:
#   - a process polling the file never sees it empty or half-written;
#   - a pre-existing internal-pck with looser permissions is replaced
#     rather than reused (umask only affects newly created files).
write_internal_pck_for_cli() {
  local pck="$1"
  local target="$VM_DIR/internal-pck"
  local tmp
  # Same directory as the target so the final mv is a rename, not a copy.
  tmp=$(mktemp "$VM_DIR/internal-pck.XXXXXX") || return 1
  if printf '%s' "$pck" > "$tmp" && mv -f -- "$tmp" "$target"; then
    return 0
  fi
  # Don't leave the staging file behind on failure.
  rm -f -- "$tmp"
  return 1
}
# Drive qemu-guest-agent via its virtserialport socket. QGA speaks the same
@ -547,8 +558,22 @@ qga_trigger_fast_rotate() {
# message is available in serial.log. We pipe the fresh-secrets env file
# (as base64) to the script via input-data — keeps secrets off the
# filesystem and avoids needing virtfs.
local secrets_b64 resp pid
secrets_b64=$(generate_fresh_secrets_payload | base64 | tr -d '\n')
local fresh_pck fresh_ssk fresh_sak fresh_cron payload secrets_b64 resp pid
fresh_pck="$(openssl rand -hex 32)"
fresh_ssk="$(openssl rand -hex 32)"
fresh_sak="$(openssl rand -hex 32)"
fresh_cron="$(openssl rand -hex 32)"
payload=$(
printf 'STACK_SEED_INTERNAL_PROJECT_PUBLISHABLE_CLIENT_KEY=%s\n' "$fresh_pck"
printf 'STACK_SEED_INTERNAL_PROJECT_SECRET_SERVER_KEY=%s\n' "$fresh_ssk"
printf 'STACK_SEED_INTERNAL_PROJECT_SUPER_SECRET_ADMIN_KEY=%s\n' "$fresh_sak"
printf 'CRON_SECRET=%s\n' "$fresh_cron"
)
# Publish the fresh PCK to the host path the CLI reads. Writing before the
# guest-exec so a --config-file flow that polls from another process can
# pick it up the moment rotation completes.
write_internal_pck_for_cli "$fresh_pck"
secrets_b64=$(printf '%s' "$payload" | base64 | tr -d '\n')
local cmd
cmd=$(printf '{"execute":"guest-exec","arguments":{"path":"/usr/local/bin/trigger-fast-rotate","capture-output":true,"input-data":"%s"}}' "$secrets_b64")
resp=$(printf '%s\n' "$cmd" | qga_send || true)
@ -599,8 +624,11 @@ stop_vm() {
kill -9 "$pid" 2>/dev/null || true
fi
fi
rm -f "$VM_DIR/qemu.pid" "$VM_DIR/monitor.sock" "$VM_DIR/serial.log"
rm -f "$VM_DIR/runtime-config.iso"
rm -f "$VM_DIR/qemu.pid" "$VM_DIR/monitor.sock" "$VM_DIR/qga.sock" "$VM_DIR/serial.log"
# Do NOT remove runtime-config.iso: the CLI owns its lifecycle and run-emulator.sh
# cannot regenerate it. Removing here breaks the snapshot → cold-boot fallback
# (which calls stop_vm before recursing into cmd_start → ensure_runtime_config_iso).
# `cmd_reset` wipes $RUN_DIR entirely when a full reset is wanted.
}
cmd_start() {
@ -642,6 +670,9 @@ cmd_start() {
if [ "$EMULATOR_NO_ROTATION" = "1" ]; then
warn "EMULATOR_NO_ROTATION=1: snapshot's placeholder secrets are in effect — do not expose this instance."
# The placeholder PCK is live in the running image; publish it to the
# host path so --config-file flows still work.
write_internal_pck_for_cli "$SNAPSHOT_PLACEHOLDER_PCK"
if ! wait_for_condition "services" "$SNAPSHOT_READY_TIMEOUT" all_ready; then
warn "Services did not respond after resume — falling back to cold boot."
tail_vm_logs
@ -691,9 +722,8 @@ cmd_start() {
snapshot_fallback_to_cold_boot() {
warn "Retrying with cold boot (EMULATOR_NO_SNAPSHOT=1)..."
stop_vm
# Wipe the overlay + fingerprint so build_qemu_cmd re-creates a fresh one,
# but keep the CLI-generated runtime-config.iso (we can't regenerate it
# from shell — the CLI owns that).
# Wipe the overlay + fingerprint so build_qemu_cmd re-creates a fresh one.
# runtime-config.iso is preserved by stop_vm (the CLI owns it).
rm -f "$VM_DIR/disk.qcow2" "$VM_DIR/base-image.fingerprint" \
"$VM_DIR/seed.phantom" "$VM_DIR/bundle.phantom"
EMULATOR_NO_SNAPSHOT=1