capture emulator snapshot locally during pull instead of shipping from CI

QEMU migration state isn't portable across accelerators — a KVM-captured
snapshot won't resume under HVF, and `-cpu max` feature sets differ across
hosts. Instead of trying to match every (KVM/HVF/TCG, amd64/arm64)
combination in CI, capture the snapshot on the user's own machine during
`stack emulator pull`: download the qcow2, cold-boot once, wait for all
services, QMP migrate via mapped-ram + multifd, compress. Subsequent
`stack emulator start`s fast-resume in ~3-8s as before.

- Factor qmp_session + capture_vm_state out of build-image.sh into
  common.sh so run-emulator.sh can call them.
- Add cmd_capture to run-emulator.sh. build_qemu_cmd emits the
  resume-compatible device layout (phantom ISOs, no virtfs, fsdev +
  pcie-root-port, pinned 4096MB/4CPU) with -incoming defer gated on an
  actual snapshot being present, so capture mode reuses the same path.
- Capture regenerates runtime-config.iso with STACK_EMULATOR_VM_DIR_HOST
  empty — virtfs is detached for migration compat so /host isn't mounted;
  the `install internal-pck → /host/$VM_DIR_HOST` path would otherwise
  fail and restart-loop stack.service. Mirrors build-image.sh's CI
  runtime.env shape.
- stack-cli `pull` downloads only the qcow2 then invokes run-emulator.sh
  capture. Add --skip-snapshot for CI/debug. startEmulator auto-captures
  on the auto-pull fallback.
- Revert the arm64 CI split: delete qemu-emulator-build-arm64.yaml and
  restore arm64 to the unified matrix on ubicloud-standard-8 under
  cross-arch TCG (the macOS HVF runner existed only to produce a portable
  snapshot, which is no longer needed). Drop savevm.zst from the
  package/upload/publish steps and update the release notes.

Verified end-to-end on an arm64 Mac under HVF: capture 50s, fast-resume
6.5s, all services green.
This commit is contained in:
Bilal Godil 2026-04-16 11:12:42 -07:00
parent 288b80ee0d
commit d94aa661d8
6 changed files with 337 additions and 390 deletions

View File

@ -1,183 +0,0 @@
name: Build QEMU Emulator Image (arm64 / macOS)
# arm64 emulator images are built in two stages:
# 1. docker-build (Linux): builds the Docker container image for arm64 and
# exports a tarball — Docker is painful to run on macOS CI runners.
# 2. qemu-snapshot (macOS): boots the image under HVF on Apple Silicon,
# provisions it, and captures a snapshot. HVF snapshots are portable to
# developer Macs; KVM snapshots are NOT (differing -cpu max features).
on:
push:
branches:
- main
- dev
pull_request:
paths:
- 'docker/local-emulator/**'
- '.github/workflows/qemu-emulator-build-arm64.yaml'
workflow_dispatch:
concurrency:
group: qemu-arm64-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }}
env:
EMULATOR_IMAGE_NAME: stack-local-emulator
jobs:
# ---------- Stage 1: build Docker image on Linux ----------
docker-build:
name: Build Docker Image (arm64)
runs-on: ubicloud-standard-8
timeout-minutes: 60
steps:
- uses: actions/checkout@v6
- name: Set up QEMU user-mode emulation
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- uses: pnpm/action-setup@v4
with:
version: 10.23.0
- uses: actions/setup-node@v4
with:
node-version: 22
cache: pnpm
- name: Generate emulator env
run: node docker/local-emulator/generate-env-development.mjs
- name: Build arm64 Docker image
run: |
docker buildx build \
--platform linux/arm64 \
--tag "$EMULATOR_IMAGE_NAME" \
--load \
-f docker/local-emulator/Dockerfile \
.
- name: Export Docker image bundle
run: |
mkdir -p /tmp/bundle
docker save "$EMULATOR_IMAGE_NAME" | gzip -c > /tmp/bundle/emulator-arm64-docker-images.tar.gz
docker image inspect --format '{{.ID}}' "$EMULATOR_IMAGE_NAME" > /tmp/bundle/emulator-arm64-docker-images.tar.gz.image-ids
ls -lh /tmp/bundle/
- name: Upload Docker bundle
uses: actions/upload-artifact@v4
with:
name: arm64-docker-bundle
path: /tmp/bundle/
retention-days: 1
compression-level: 0
# ---------- Stage 2: QEMU provision + snapshot on macOS (HVF) ----------
qemu-snapshot:
name: QEMU Snapshot (arm64 / HVF)
needs: docker-build
runs-on: macos-15
timeout-minutes: 120
env:
EMULATOR_IMAGE_DIR: ${{ github.workspace }}/docker/local-emulator/qemu/images
EMULATOR_RUN_DIR: ${{ github.workspace }}/docker/local-emulator/qemu/run
steps:
- uses: actions/checkout@v6
- uses: pnpm/action-setup@v4
with:
version: 10.23.0
- uses: actions/setup-node@v4
with:
node-version: 22
cache: pnpm
- name: Install system dependencies
run: brew install qemu socat zstd
- name: Verify QEMU + HVF
run: |
qemu-system-aarch64 --version
if qemu-system-aarch64 -accel help 2>&1 | grep -q hvf; then
echo "HVF available — snapshot will be portable to developer Macs"
else
echo "::error::HVF not available on this runner"
exit 1
fi
- name: Download Docker bundle
uses: actions/download-artifact@v4
with:
name: arm64-docker-bundle
path: ${{ env.EMULATOR_IMAGE_DIR }}/
- name: Generate emulator env
run: node docker/local-emulator/generate-env-development.mjs
- name: Build QEMU image (provision + snapshot)
run: |
chmod +x docker/local-emulator/qemu/build-image.sh
# SKIP_DOCKER_BUILD=1 tells build-image.sh to skip the Docker
# build + export steps — we already have the bundle from stage 1.
EMULATOR_PROVISION_TIMEOUT=6000 \
SKIP_DOCKER_BUILD=1 \
docker/local-emulator/qemu/build-image.sh arm64
# HVF gives us native-speed arm64 — verify the image boots and
# services come up (previously impossible under cross-arch TCG).
- name: Build stack-cli
run: |
pnpm install --frozen-lockfile --filter '@stackframe/stack-cli...'
pnpm exec turbo run build --filter='@stackframe/stack-cli...'
- name: Start emulator and verify
env:
EMULATOR_ARCH: arm64
EMULATOR_READY_TIMEOUT: 3200
run: node packages/stack-cli/dist/index.js emulator start
- name: Verify services are healthy
env:
EMULATOR_ARCH: arm64
run: node packages/stack-cli/dist/index.js emulator status
- name: Stop emulator
if: always()
env:
EMULATOR_ARCH: arm64
run: node packages/stack-cli/dist/index.js emulator stop
- name: Print serial log on failure
if: failure()
run: tail -100 "$EMULATOR_RUN_DIR/vm/serial.log" 2>/dev/null || true
- name: Package image
run: |
BASE_IMG="$EMULATOR_IMAGE_DIR/stack-emulator-arm64.qcow2"
SAVEVM="$EMULATOR_IMAGE_DIR/stack-emulator-arm64.savevm.zst"
cp "$BASE_IMG" "stack-emulator-arm64.qcow2"
if [ -f "$SAVEVM" ]; then
cp "$SAVEVM" "stack-emulator-arm64.savevm.zst"
ls -lh "stack-emulator-arm64.savevm.zst"
else
echo "::error::Snapshot was not produced — fast-start will be unavailable"
exit 1
fi
- name: Upload image artifact
uses: actions/upload-artifact@v4
with:
name: qemu-emulator-arm64
path: |
stack-emulator-arm64.qcow2
stack-emulator-arm64.savevm.zst
if-no-files-found: error
retention-days: 30
compression-level: 0

View File

@ -34,12 +34,18 @@ jobs:
fail-fast: false
matrix:
include:
# amd64 runs natively under KVM on ubicloud's amd64 runner.
# arm64 is built in a separate workflow on a macOS runner (HVF)
# so that the snapshot is portable to developer Macs.
# See qemu-emulator-build-arm64.yaml.
# Both arches build on ubicloud's amd64 runner. amd64 uses KVM;
# arm64 runs under cross-arch TCG (slow, but only cloud-init
# provisioning has to complete — the boot/verify smoke test below
# is gated to amd64 because TCG can't boot Next.js in any
# reasonable time). Snapshots are NOT published — `stack emulator
# pull` captures one locally on first run, which is the only way
# to guarantee KVM/HVF/TCG + `-cpu max` compatibility on the
# user's machine.
- arch: amd64
runner: ubicloud-standard-8
- arch: arm64
runner: ubicloud-standard-8
steps:
- uses: actions/checkout@v6
@ -168,29 +174,15 @@ jobs:
- name: Package image
run: |
BASE_IMG="docker/local-emulator/qemu/images/stack-emulator-${{ matrix.arch }}.qcow2"
SAVEVM="docker/local-emulator/qemu/images/stack-emulator-${{ matrix.arch }}.savevm.zst"
cp "$BASE_IMG" "stack-emulator-${{ matrix.arch }}.qcow2"
if [ -f "$SAVEVM" ]; then
cp "$SAVEVM" "stack-emulator-${{ matrix.arch }}.savevm.zst"
ls -lh "stack-emulator-${{ matrix.arch }}.savevm.zst"
elif [ "${{ matrix.arch }}" = "amd64" ]; then
# amd64 is the fast-resume contract: if the build didn't produce a
# snapshot, fail loudly rather than silently shipping a
# cold-boot-only release.
echo "ERROR: snapshot build expected to produce $SAVEVM for amd64." >&2
exit 1
else
echo "NOTE: no savevm snapshot was produced for ${{ matrix.arch }}; fast-start will be unavailable for this arch."
fi
ls -lh "stack-emulator-${{ matrix.arch }}.qcow2"
- name: Upload image artifact
uses: actions/upload-artifact@v4
with:
name: qemu-emulator-${{ matrix.arch }}
path: |
stack-emulator-${{ matrix.arch }}.qcow2
stack-emulator-${{ matrix.arch }}.savevm.zst
if-no-files-found: warn
path: stack-emulator-${{ matrix.arch }}.qcow2
if-no-files-found: error
retention-days: 30
compression-level: 0
@ -266,18 +258,14 @@ jobs:
name: qemu-emulator-${{ matrix.arch }}
path: ${{ github.workspace }}/.stack-emulator-images/
- name: Place images into STACK_EMULATOR_HOME layout
- name: Place qcow2 into STACK_EMULATOR_HOME layout
run: |
mkdir -p "$HOME/.stack/emulator/images"
cp "${{ github.workspace }}/.stack-emulator-images/stack-emulator-${{ matrix.arch }}.qcow2" "$HOME/.stack/emulator/images/"
if [ -f "${{ github.workspace }}/.stack-emulator-images/stack-emulator-${{ matrix.arch }}.savevm.zst" ]; then
cp "${{ github.workspace }}/.stack-emulator-images/stack-emulator-${{ matrix.arch }}.savevm.zst" "$HOME/.stack/emulator/images/"
echo "Snapshot present — will test snapshot-resume path."
else
echo "No snapshot — will test cold-boot path."
fi
ls -lh "$HOME/.stack/emulator/images/"
# No savevm.zst artifact (users capture locally via `emulator pull`),
# so `emulator start` cold-boots the qcow2. Budget accordingly.
- name: Start emulator via CLI
run: |
EMULATOR_ARCH=${{ matrix.arch }} \
@ -336,11 +324,6 @@ jobs:
for f in artifacts/qemu-emulator-*/*.qcow2; do
cp "$f" release/
done
# savevm.zst is optional — older branches may not produce it. Skip
# missing files rather than failing the publish.
for f in artifacts/qemu-emulator-*/*.savevm.zst; do
[ -f "$f" ] && cp "$f" release/
done
cat > release-notes.md <<EOF
## QEMU Emulator Images
@ -352,11 +335,12 @@ jobs:
|------|-------------|
| \`stack-emulator-arm64.qcow2\` | ARM64 disk image |
| \`stack-emulator-amd64.qcow2\` | AMD64 disk image |
| \`stack-emulator-arm64.savevm.zst\` | ARM64 warm VM snapshot (fast-start) |
| \`stack-emulator-amd64.savevm.zst\` | AMD64 warm VM snapshot (fast-start) |
\`emulator pull\` downloads both; \`emulator start\` uses the snapshot
when present and falls back to cold-boot otherwise.
\`emulator pull\` downloads the qcow2 and captures a local fast-start
snapshot (~1-3 min). Subsequent \`emulator start\`s resume in ~3-8 s.
Snapshots are captured locally because QEMU migration state isn't
portable across accelerators (KVM / HVF / TCG) or \`-cpu max\`
feature sets.
### Usage
\`\`\`bash

View File

@ -253,142 +253,8 @@ persist_provision_logs() {
cp "$provision_log" "$IMAGE_DIR/provision-emulator-${arch}.progress.log" 2>/dev/null || true
}
# Run one QMP exchange against the monitor socket.
#
# Usage: <newline-separated JSON commands on stdin> | qmp_session <socket>
#
# All of stdin is read first, then written to a single socat connection to
# the UNIX socket; whatever QEMU answers is written to stdout. The caller is
# responsible for putting qmp_capabilities first in the payload. After the
# payload the writer sleeps 0.5s so its end of the pipe stays open long
# enough for QEMU to answer the final command before socat sees EOF.
qmp_session() {
  local monitor_sock="$1"
  local request
  request="$(cat)"
  {
    printf '%s\n' "$request"
    sleep 0.5
  } | socat -t30 - "UNIX-CONNECT:${monitor_sock}"
}
# Drive the snapshot capture over QMP:
#   1. qmp_capabilities — exit negotiation mode. Resent at the start of every
#      exchange because each qmp_session call opens a new connection.
#   2. stop — pause the VM so no more disk writes happen.
#   3. migrate-set-capabilities / migrate-set-parameters — enable mapped-ram +
#      multifd (4 channels) for fast resume.
#   4. migrate to file:<path> — writes RAM/device state to a host file
#      (uncompressed; the caller compresses it afterwards).
#   5. Poll query-migrate every 2s, for at most 600s, until status=completed.
#   6. quit — terminate QEMU.
#
# Arguments:
#   $1  path to the QMP monitor UNIX socket
#   $2  destination path used in the file: migration URI
#
# Returns 0 after the migration completed and quit was sent. Returns 1 (after
# reporting through err) when the socket is missing, a QMP reply contains an
# error, or the migration fails, is cancelled, or times out; QEMU is not
# stopped on those paths.
capture_vm_state() {
  local sock="$1"
  local guest_path="$2"
  local handshake='{"execute":"qmp_capabilities"}'

  if [ ! -S "$sock" ]; then
    err "QMP monitor socket missing: $sock"
    return 1
  fi

  log " QMP: stopping VM..."
  printf '%s\n' "$handshake" '{"execute":"stop"}' \
    | qmp_session "$sock" >/dev/null || {
    err "QMP stop failed"
    return 1
  }

  log " QMP: enabling mapped-ram + multifd for fast resume..."
  # mapped-ram: writes each RAM page to a fixed offset in the output file
  # (vs the legacy streamed format). This lets the target QEMU mmap the file
  # and fault pages lazily — and combined with multifd, load RAM in parallel.
  # multifd-channels=4 matches our pinned SMP so the channels don't starve
  # each other on the target's 4 vCPUs.
  local caps_cmd params_cmd setup_resp
  caps_cmd='{"execute":"migrate-set-capabilities","arguments":{"capabilities":[{"capability":"mapped-ram","state":true},{"capability":"multifd","state":true}]}}'
  params_cmd='{"execute":"migrate-set-parameters","arguments":{"multifd-channels":4}}'
  setup_resp=$(printf '%s\n' "$handshake" "$caps_cmd" "$params_cmd" \
    | qmp_session "$sock") || {
    err "QMP capabilities setup failed"
    return 1
  }
  if printf '%s' "$setup_resp" | grep -q '"error"[[:space:]]*:'; then
    err "QMP capabilities returned error: $setup_resp"
    return 1
  fi

  log " QMP: migrating RAM state to ${guest_path}..."
  # Use file: migration (native QEMU) instead of exec: to avoid relying on a
  # spawned shell finding zstd in PATH. Compressed as a separate host step
  # after migrate completes.
  # The path is embedded in a JSON string, so escape backslashes and double
  # quotes; otherwise such a path would produce a malformed QMP command.
  local json_path migrate_cmd migrate_resp
  json_path=$(printf '%s' "$guest_path" | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g')
  migrate_cmd=$(printf '{"execute":"migrate","arguments":{"uri":"file:%s"}}' "$json_path")
  migrate_resp=$(printf '%s\n' "$handshake" "$migrate_cmd" \
    | qmp_session "$sock") || {
    err "QMP migrate failed"
    return 1
  }
  if printf '%s' "$migrate_resp" | grep -q '"error"[[:space:]]*:'; then
    err "QMP migrate returned error: $migrate_resp"
    return 1
  fi

  # Poll migration status. Migration runs in the background after the
  # migrate command returns; we watch for "completed" or "failed".
  local migrate_timeout=600
  local waited=0
  local last_heartbeat=0
  local status_line="" status transferred
  while [ "$waited" -lt "$migrate_timeout" ]; do
    status_line=$(printf '%s\n' "$handshake" '{"execute":"query-migrate"}' \
      | qmp_session "$sock" 2>/dev/null || true)
    # grep exits non-zero when the reply carries no status field; "|| true"
    # keeps that from counting as a failure of the assignment itself.
    status="$(printf '%s\n' "$status_line" | grep -o '"status"[[:space:]]*:[[:space:]]*"[a-z-]*"' | head -1 | sed -E 's/.*"([a-z-]+)".*/\1/' || true)"
    case "$status" in
      completed)
        log " QMP: migrate completed (${waited}s)"
        break
        ;;
      failed|cancelled)
        err " QMP: migrate ended with status=$status"
        err " QMP response: $status_line"
        return 1
        ;;
      active|setup|device|"")
        # Still running (an empty status means no parsable reply yet).
        # Emit a progress line at most every 30s.
        if [ "$((waited - last_heartbeat))" -ge 30 ]; then
          transferred=$(printf '%s' "$status_line" | grep -o '"transferred"[[:space:]]*:[[:space:]]*[0-9]*' | head -1 | sed -E 's/.*:[[:space:]]*([0-9]+).*/\1/' || true)
          log " QMP: migrate in progress (${waited}s, status=${status:-init}, transferred=${transferred:-0})"
          last_heartbeat=$waited
        fi
        ;;
      *)
        log " QMP: migrate status=$status (${waited}s)"
        ;;
    esac
    sleep 2
    waited=$((waited + 2))
  done

  # "completed" breaks out before the counter is advanced, so reaching the
  # limit here can only mean the loop ran out of time.
  if [ "$waited" -ge "$migrate_timeout" ]; then
    err "QMP migrate timed out after ${migrate_timeout}s"
    # Report the reply from the final poll. The previous re-query wrote \"
    # inside single quotes, which sent literal backslashes (invalid JSON) to
    # QMP and logged a parse error instead of the migration status.
    err "Last query-migrate response: $status_line"
    return 1
  fi

  log " QMP: quitting VM..."
  # Best effort: the connection may drop while QEMU is exiting.
  printf '%s\n' "$handshake" '{"execute":"quit"}' \
    | qmp_session "$sock" >/dev/null || true
  return 0
}
# qmp_session() and capture_vm_state() live in common.sh; both build-image.sh
# (CI) and run-emulator.sh (stack emulator pull local capture) call them.
build_one() {
local arch="$1"

View File

@ -68,3 +68,142 @@ make_iso_from_dir() {
exit 1
fi
}
# Run one QMP exchange against the monitor socket.
#
# Usage: <newline-separated JSON commands on stdin> | qmp_session <socket>
#
# All of stdin is read first, then written to a single socat connection to
# the UNIX socket; QEMU's replies are written to stdout. Callers put
# qmp_capabilities first in the payload to leave negotiation mode. After the
# payload the writer sleeps 0.5s so its end of the pipe stays open long
# enough for QEMU to answer the final command before socat sees EOF.
#
# Callers: build-image.sh capture flow, run-emulator.sh cmd_capture.
qmp_session() {
  local monitor_sock="$1"
  local request
  request="$(cat)"
  {
    printf '%s\n' "$request"
    sleep 0.5
  } | socat -t30 - "UNIX-CONNECT:${monitor_sock}"
}
# Drive the snapshot capture over QMP:
#   1. qmp_capabilities — exit negotiation mode. Resent at the start of every
#      exchange because each qmp_session call opens a new connection.
#   2. stop — pause the VM so no more disk writes happen.
#   3. migrate-set-capabilities / migrate-set-parameters — enable mapped-ram +
#      multifd (4 channels) for fast resume.
#   4. migrate to file:<path> — writes RAM/device state to a host file
#      (uncompressed; the caller compresses it afterwards).
#   5. Poll query-migrate every 2s, for at most 600s, until status=completed.
#   6. quit — terminate QEMU.
#
# Arguments:
#   $1  path to the QMP monitor UNIX socket
#   $2  destination path used in the file: migration URI
#
# Returns 0 after the migration completed and quit was sent. Returns 1 (after
# reporting through err) when the socket is missing, a QMP reply contains an
# error, or the migration fails, is cancelled, or times out; QEMU is not
# stopped on those paths.
#
# Depends on log/err being defined by the sourcing script.
capture_vm_state() {
  local sock="$1"
  local guest_path="$2"
  local handshake='{"execute":"qmp_capabilities"}'

  if [ ! -S "$sock" ]; then
    err "QMP monitor socket missing: $sock"
    return 1
  fi

  log " QMP: stopping VM..."
  printf '%s\n' "$handshake" '{"execute":"stop"}' \
    | qmp_session "$sock" >/dev/null || {
    err "QMP stop failed"
    return 1
  }

  log " QMP: enabling mapped-ram + multifd for fast resume..."
  # mapped-ram: writes each RAM page to a fixed offset in the output file
  # (vs the legacy streamed format). This lets the target QEMU mmap the file
  # and fault pages lazily — and combined with multifd, load RAM in parallel.
  # multifd-channels=4 matches our pinned SMP so the channels don't starve
  # each other on the target's 4 vCPUs.
  local caps_cmd params_cmd setup_resp
  caps_cmd='{"execute":"migrate-set-capabilities","arguments":{"capabilities":[{"capability":"mapped-ram","state":true},{"capability":"multifd","state":true}]}}'
  params_cmd='{"execute":"migrate-set-parameters","arguments":{"multifd-channels":4}}'
  setup_resp=$(printf '%s\n' "$handshake" "$caps_cmd" "$params_cmd" \
    | qmp_session "$sock") || {
    err "QMP capabilities setup failed"
    return 1
  }
  if printf '%s' "$setup_resp" | grep -q '"error"[[:space:]]*:'; then
    err "QMP capabilities returned error: $setup_resp"
    return 1
  fi

  log " QMP: migrating RAM state to ${guest_path}..."
  # Use file: migration (native QEMU) instead of exec: to avoid relying on a
  # spawned shell finding zstd in PATH. Compressed as a separate host step
  # after migrate completes.
  # The path is embedded in a JSON string, so escape backslashes and double
  # quotes; otherwise such a path would produce a malformed QMP command.
  local json_path migrate_cmd migrate_resp
  json_path=$(printf '%s' "$guest_path" | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g')
  migrate_cmd=$(printf '{"execute":"migrate","arguments":{"uri":"file:%s"}}' "$json_path")
  migrate_resp=$(printf '%s\n' "$handshake" "$migrate_cmd" \
    | qmp_session "$sock") || {
    err "QMP migrate failed"
    return 1
  }
  if printf '%s' "$migrate_resp" | grep -q '"error"[[:space:]]*:'; then
    err "QMP migrate returned error: $migrate_resp"
    return 1
  fi

  # Poll migration status. Migration runs in the background after the
  # migrate command returns; we watch for "completed" or "failed".
  local migrate_timeout=600
  local waited=0
  local last_heartbeat=0
  local status_line="" status transferred
  while [ "$waited" -lt "$migrate_timeout" ]; do
    status_line=$(printf '%s\n' "$handshake" '{"execute":"query-migrate"}' \
      | qmp_session "$sock" 2>/dev/null || true)
    # grep exits non-zero when the reply carries no status field; "|| true"
    # keeps that from counting as a failure of the assignment itself.
    status="$(printf '%s\n' "$status_line" | grep -o '"status"[[:space:]]*:[[:space:]]*"[a-z-]*"' | head -1 | sed -E 's/.*"([a-z-]+)".*/\1/' || true)"
    case "$status" in
      completed)
        log " QMP: migrate completed (${waited}s)"
        break
        ;;
      failed|cancelled)
        err " QMP: migrate ended with status=$status"
        err " QMP response: $status_line"
        return 1
        ;;
      active|setup|device|"")
        # Still running (an empty status means no parsable reply yet).
        # Emit a progress line at most every 30s.
        if [ "$((waited - last_heartbeat))" -ge 30 ]; then
          transferred=$(printf '%s' "$status_line" | grep -o '"transferred"[[:space:]]*:[[:space:]]*[0-9]*' | head -1 | sed -E 's/.*:[[:space:]]*([0-9]+).*/\1/' || true)
          log " QMP: migrate in progress (${waited}s, status=${status:-init}, transferred=${transferred:-0})"
          last_heartbeat=$waited
        fi
        ;;
      *)
        log " QMP: migrate status=$status (${waited}s)"
        ;;
    esac
    sleep 2
    waited=$((waited + 2))
  done

  # "completed" breaks out before the counter is advanced, so reaching the
  # limit here can only mean the loop ran out of time.
  if [ "$waited" -ge "$migrate_timeout" ]; then
    err "QMP migrate timed out after ${migrate_timeout}s"
    # Report the reply from the final poll. The previous re-query wrote \"
    # inside single quotes, which sent literal backslashes (invalid JSON) to
    # QMP and logged a parse error instead of the migration status.
    err "Last query-migrate response: $status_line"
    return 1
  fi

  log " QMP: quitting VM..."
  # Best effort: the connection may drop while QEMU is exiting.
  printf '%s\n' "$handshake" '{"execute":"quit"}' \
    | qmp_session "$sock" >/dev/null || true
  return 0
}

View File

@ -21,6 +21,13 @@ EMULATOR_NO_SNAPSHOT="${EMULATOR_NO_SNAPSHOT:-0}"
# in place — acceptable for tests and CI that don't reach the emulator over
# a shared network. Shaves ~2-3s off `emulator start`.
EMULATOR_NO_ROTATION="${EMULATOR_NO_ROTATION:-0}"
# Internal: set to 1 by cmd_capture to build QEMU with the snapshot-compatible
# device layout (phantom ISOs, no virtfs, pcie-root-port, pinned 4096MB/4CPU)
# without the `-incoming defer` that resume mode adds. The snapshot has to be
# captured with the same device layout and RAM/SMP pinning that the resume
# path later passes to QEMU, so that the migration replay matches.
EMULATOR_CAPTURING_SNAPSHOT="${EMULATOR_CAPTURING_SNAPSHOT:-0}"
# Force re-capture even if a .savevm.zst is already present.
EMULATOR_FORCE_CAPTURE="${EMULATOR_FORCE_CAPTURE:-0}"
# Fixed host-side ports for the QEMU emulator (267xx range).
# Only user-facing services are exposed; internal deps stay inside the VM.
@ -87,7 +94,15 @@ runtime_iso_path() {
}
snapshot_available() {
[ "$EMULATOR_NO_SNAPSHOT" != "1" ] && [ -s "$(savevm_path)" ]
[ "$EMULATOR_NO_SNAPSHOT" != "1" ] && [ "$EMULATOR_CAPTURING_SNAPSHOT" != "1" ] && [ -s "$(savevm_path)" ]
}
# Succeeds when QEMU has to be launched with the snapshot-compatible device
# layout (phantom ISOs, no virtfs, pcie-root-port, pinned RAM/SMP). That is
# the case in two situations:
#   - a usable snapshot exists and will be resumed (`-incoming defer` is
#     added on top of the layout), or
#   - EMULATOR_CAPTURING_SNAPSHOT=1, i.e. a new snapshot is being captured
#     (same layout, no `-incoming defer`).
snapshot_layout() {
  if snapshot_available; then
    return 0
  fi
  [ "$EMULATOR_CAPTURING_SNAPSHOT" = "1" ]
}
# Ensure the decompressed mapped-ram cache is up-to-date with the shipped
@ -163,6 +178,16 @@ ensure_runtime_config_iso() {
# emulator:start`) rather than through the stack-cli, which generates the
# ISO via packages/stack-cli/src/lib/iso.ts. Mirrors the field set + volume
# label so the guest's render-stack-env mounts it the same way.
write_runtime_config_iso "$VM_DIR"
}
# Write a STACKCFG runtime-config.iso containing runtime.env + base.env.
# The VM_DIR_HOST arg is the path to publish internal-pck / stack.log to on
# /host; pass empty string to suppress publication (used by capture mode
# where /host isn't mounted — virtfs is detached for snapshot compatibility,
# so any host-side write would fail and restart-loop stack.service).
write_runtime_config_iso() {
local vm_dir_host="$1"
local base_env="$SCRIPT_DIR/../.env.development"
if [ ! -f "$base_env" ]; then
err "Cannot generate runtime config ISO: $base_env is missing."
@ -179,10 +204,10 @@ ensure_runtime_config_iso() {
printf "STACK_EMULATOR_BACKEND_HOST_PORT=%s\n" "$EMULATOR_BACKEND_PORT"
printf "STACK_EMULATOR_MINIO_HOST_PORT=%s\n" "$EMULATOR_MINIO_PORT"
printf "STACK_EMULATOR_INBUCKET_HOST_PORT=%s\n" "$EMULATOR_INBUCKET_PORT"
printf "STACK_EMULATOR_VM_DIR_HOST=%s\n" "$VM_DIR"
printf "STACK_EMULATOR_VM_DIR_HOST=%s\n" "$vm_dir_host"
} > "$cfg_dir/runtime.env"
cp "$base_env" "$cfg_dir/base.env"
make_iso_from_dir "$cfg_iso" "STACKCFG" "$cfg_dir"
make_iso_from_dir "$(runtime_iso_path)" "STACKCFG" "$cfg_dir"
}
service_is_up() {
@ -259,13 +284,14 @@ build_qemu_cmd() {
local current_fp
current_fp="$(runtime_fingerprint "$base_img" "$savevm_file")"
if snapshot_available; then
if snapshot_layout; then
# The savevm RAM state was captured against the base image's exact disk
# state. An overlay with writes from a previous session diverges from
# that point, so -incoming would resume RAM against inconsistent disk.
# Always start from a fresh overlay in the snapshot path; per-session
# state is not preserved. Users who want persistence can opt out with
# EMULATOR_NO_SNAPSHOT=1.
# EMULATOR_NO_SNAPSHOT=1. Capture mode also needs a clean overlay so the
# snapshot we write is taken against the base's known disk state.
if [ -f "$VM_DIR/disk.qcow2" ]; then
rm -f "$VM_DIR/disk.qcow2" "$fingerprint_file"
fi
@ -332,12 +358,16 @@ build_qemu_cmd() {
# exist so the migration replay matches device IDs. Runtime-only devices
# (virtfs, balloon) live at higher slots — extra at destination is fine.
local snapshot_args=() runtime_only_args=() snapshot_smp="$VM_CPUS" snapshot_ram="$VM_RAM"
if snapshot_available; then
log "Snapshot found at $savevm_file — fast-resume enabled."
# -incoming defer: QEMU starts, waits for a QMP migrate-incoming command.
# We use that to set mapped-ram + multifd capabilities before loading,
# which enables parallel RAM restore (~2-3x faster than streamed decode).
snapshot_args+=(-incoming defer)
if snapshot_layout; then
if snapshot_available; then
log "Snapshot found at $savevm_file — fast-resume enabled."
# -incoming defer: QEMU starts, waits for a QMP migrate-incoming command.
# We use that to set mapped-ram + multifd capabilities before loading,
# which enables parallel RAM restore (~2-3x faster than streamed decode).
snapshot_args+=(-incoming defer)
else
log "Capture mode: booting with snapshot-compatible layout (no -incoming)."
fi
snapshot_smp="${EMULATOR_SNAPSHOT_CPUS:-4}"
# RAM size is baked into the snapshot; migration replay requires an
# identical -m value. Pin to the build-time RAM (4096) and ignore
@ -374,7 +404,7 @@ build_qemu_cmd() {
)
fi
if snapshot_available; then
if snapshot_layout; then
QEMU_CMD=(
"$qemu_bin"
-machine "$machine"
@ -406,7 +436,7 @@ build_qemu_cmd() {
# via human-monitor-command (errors come back as a return string,
# not a QMP error).
-fsdev "local,id=hostfs,path=/,security_model=none"
"${snapshot_args[@]}"
${snapshot_args[@]+"${snapshot_args[@]}"}
-serial "file:$VM_DIR/serial.log"
-display none
-daemonize
@ -842,6 +872,100 @@ cmd_reset() {
log "Emulator state reset. Next start will be a fresh boot."
}
# Cold-boot the VM with the snapshot-compatible device layout, wait for all
# services to be healthy, then capture a snapshot via QMP migrate and compress
# it to .savevm.zst. Called by `stack emulator pull` so first-run users get a
# fast-resume snapshot that's guaranteed compatible with their host's QEMU
# version + accelerator (which CI-built snapshots can't guarantee across
# KVM/HVF/TCG).
#
# Environment:
#   EMULATOR_FORCE_CAPTURE=1  re-capture even if a snapshot already exists.
#
# Returns 0 when a snapshot already exists (and no force was requested) or
# after a successful capture. Exits 1 when the qcow2 is missing, the emulator
# is already running, services never become ready, the QMP capture fails, the
# captured file is empty, or compression fails. Temporary *.capture.tmp files
# are removed on every failure path so a failed run does not leave a
# RAM-sized file behind.
cmd_capture() {
  if [ ! -f "$(image_path)" ]; then
    err "Missing qcow2: $(image_path). Run 'stack emulator pull' first."
    exit 1
  fi

  if [ -s "$(savevm_path)" ] && [ "$EMULATOR_FORCE_CAPTURE" != "1" ]; then
    log "Snapshot already present at $(savevm_path); skipping capture."
    log "Pass EMULATOR_FORCE_CAPTURE=1 to rebuild it."
    return 0
  fi

  if is_running; then
    err "Emulator is already running; stop it first (stack emulator stop)."
    exit 1
  fi

  # Drop any previous snapshot before (re)capturing. snapshot_available() is
  # already forced false once EMULATOR_CAPTURING_SNAPSHOT=1 is set below, so
  # this is not what keeps QEMU out of -incoming defer mode; it makes sure a
  # failed or aborted re-capture leaves no stale snapshot behind.
  rm -f "$(savevm_path)" "$(savevm_raw_path)"

  ensure_ports_free
  mkdir -p "$RUN_DIR" "$VM_DIR"

  # Regenerate runtime-config.iso with STACK_EMULATOR_VM_DIR_HOST empty —
  # virtfs is detached in capture mode, so run-stack-container's
  # `install internal-pck → /host/$VM_DIR_HOST/...` would fail and restart-loop
  # stack.service. Mirrors build-image.sh's CI runtime.env shape.
  rm -f "$(runtime_iso_path)"
  write_runtime_config_iso ""

  info "Cold-booting VM to capture local snapshot (one-time, ~1-3 min)..."
  EMULATOR_CAPTURING_SNAPSHOT=1
  start_vm
  info "VM: 4096MB / 4 CPUs (pinned for snapshot compatibility)"

  # Cold boot with snapshot-compatible layout drops virtfs, so stack.service
  # starts without /host mounted — fine for capture; hostfs is hot-plugged on
  # resume via qmp_hotplug_9p.
  if ! wait_for_condition "all services" "$READY_TIMEOUT" all_ready; then
    tail_vm_logs
    stop_vm
    err "Services did not come up; capture aborted."
    exit 1
  fi

  # Capture and compress into *.capture.tmp first and only rename into place
  # at the end, so the final paths never hold a partially written snapshot.
  local raw tmp_raw zst tmp_zst
  raw="$(savevm_raw_path)"
  tmp_raw="${raw}.capture.tmp"
  zst="$(savevm_path)"
  tmp_zst="${zst}.capture.tmp"
  rm -f "$tmp_raw" "$tmp_zst"

  log "Capturing VM state via QMP (mapped-ram + multifd)..."
  if ! capture_vm_state "$VM_DIR/monitor.sock" "$tmp_raw"; then
    err "QMP capture failed."
    stop_vm
    # Remove the partial migration file only after QEMU has been stopped.
    rm -f "$tmp_raw"
    exit 1
  fi

  # capture_vm_state sent QMP quit; wait for QEMU to exit, then clean sockets.
  local waited=0
  while [ "$waited" -lt 30 ] && is_running; do
    sleep 1
    waited=$((waited + 1))
  done
  if is_running; then
    warn "QEMU did not exit after QMP quit; forcing."
    stop_vm
  fi
  rm -f "$VM_DIR/qemu.pid" "$VM_DIR/monitor.sock" "$VM_DIR/qga.sock"

  if [ ! -s "$tmp_raw" ]; then
    err "Captured raw file is empty: $tmp_raw"
    rm -f "$tmp_raw"
    exit 1
  fi

  log "Compressing snapshot with zstd..."
  # Check the result explicitly: a failed compression must never be renamed
  # into place as the snapshot.
  if ! zstd -1 -T0 -f -o "$tmp_zst" "$tmp_raw"; then
    err "zstd compression failed; no snapshot was installed."
    rm -f "$tmp_raw" "$tmp_zst"
    exit 1
  fi
  mv "$tmp_zst" "$zst"
  # Keep the uncompressed file too — resume reads it directly via mapped-ram,
  # and ensure_savevm_raw skips re-decompression when the raw's mtime >= zst's.
  mv "$tmp_raw" "$raw"
  touch -r "$zst" "$raw"

  local size
  size="$(du -h "$zst" | cut -f1)"
  log "Snapshot captured: $zst (${size})"
}
STATUS_FAILED=0
print_service_status() {
@ -889,12 +1013,12 @@ ACTION="start"
while [[ $# -gt 0 ]]; do
case "$1" in
start|stop|reset|status|bench)
start|stop|reset|status|bench|capture)
ACTION="$1"
shift
;;
*)
echo "Usage: $0 [start|stop|reset|status|bench]"
echo "Usage: $0 [start|stop|reset|status|bench|capture]"
exit 1
;;
esac
@ -906,4 +1030,5 @@ case "$ACTION" in
reset) cmd_reset ;;
status) cmd_status ;;
bench) cmd_bench ;;
capture) cmd_capture ;;
esac

View File

@ -239,6 +239,9 @@ async function startEmulator(arch: "arm64" | "amd64"): Promise<void> {
if (!existsSync(img)) {
console.log("No emulator image found. Pulling latest...");
await pullRelease(arch);
// Capture right after the auto-pull so this and all subsequent starts resume
// fast. This is the only place `start` captures: when an image already exists
// no capture runs, so skipping it here would mean a cold boot on every start.
await captureLocalSnapshot(arch);
}
prepareRuntimeConfigIso();
await runEmulator("start", { EMULATOR_ARCH: arch });
@ -261,25 +264,26 @@ async function pullRelease(arch: "arm64" | "amd64", opts: { repo?: string, branc
mkdirSync(imageDir, { recursive: true });
const diskAsset = `stack-emulator-${arch}.qcow2`;
// The savevm file enables the fast-resume path in run-emulator.sh. It's
// optional — older releases may not have it and the runtime cleanly falls
// back to a cold boot.
const snapshotAsset = `stack-emulator-${arch}.savevm.zst`;
const release = await ghApi<ReleaseResponse>(`/repos/${repo}/releases/tags/${tag}`);
const diskMatch = release.assets.find((a) => a.name === diskAsset);
if (!diskMatch) {
throw new CliError(`Asset ${diskAsset} not found in release ${tag}. Run 'stack emulator list-releases' to see available releases.`);
}
const snapshotMatch = release.assets.find((a) => a.name === snapshotAsset);
const token = githubToken();
await downloadReleaseAsset(diskMatch, imageDir, diskAsset, token, tag);
if (snapshotMatch) {
await downloadReleaseAsset(snapshotMatch, imageDir, snapshotAsset, token, tag);
} else {
console.log(`Snapshot asset ${snapshotAsset} not available in release ${tag}; fast-start disabled for this image.`);
}
}
/**
 * Capture a fast-resume snapshot for `arch` on this machine by running
 * `run-emulator.sh capture` (cold boot, wait for services, QMP capture,
 * compress, stop).
 *
 * Intended to run once per qcow2 download so that later
 * `stack emulator start`s resume in ~3-8s. The snapshot is always taken on the
 * user's own machine because QEMU migration state isn't portable across
 * accelerators (KVM/HVF/TCG) or `-cpu max` feature sets.
 *
 * @param arch Architecture of the emulator image to capture.
 */
async function captureLocalSnapshot(arch: "arm64" | "amd64"): Promise<void> {
  preflightForVmStart("pull", arch);
  prepareRuntimeConfigIso();

  console.log("Capturing local snapshot (first-time, ~1-3 min cold boot + capture)...");

  const captureEnv = { EMULATOR_ARCH: arch };
  await runEmulator("capture", captureEnv);
}
async function downloadReleaseAsset(
@ -491,19 +495,20 @@ export function registerEmulatorCommand(program: Command) {
emulator
.command("pull")
.description("Download an emulator image from GitHub Releases or a PR build")
.description("Download an emulator image from GitHub Releases or a PR build, then capture a local fast-start snapshot")
.option("--arch <arch>", "Target architecture (default: current system arch)")
.option("--branch <branch>", "Release branch (default: dev)")
.option("--tag <tag>", "Specific release tag (default: latest)")
.option("--repo <repo>", "GitHub repository (default: stack-auth/stack-auth)")
.option("--pr <number>", "Pull from a PR's CI artifacts")
.option("--run <id>", "Pull from a specific workflow run's artifacts")
.action(async (opts) => {
.option("--skip-snapshot", "Download only the qcow2; skip the one-time local snapshot capture")
.action(async (opts: { arch?: string, repo?: string, branch?: string, tag?: string, pr?: string, run?: string, skipSnapshot?: boolean }) => {
const arch = resolveArch(opts.arch);
const repo = opts.repo ?? DEFAULT_REPO;
if (opts.run || opts.pr) {
let runId = opts.run as string | undefined;
let runId = opts.run;
if (!runId) {
console.log(`Finding latest successful build for PR #${opts.pr}...`);
const pr = await ghApi<PullResponse>(`/repos/${repo}/pulls/${opts.pr}`);
@ -521,22 +526,33 @@ export function registerEmulatorCommand(program: Command) {
mkdirSync(imageDir, { recursive: true });
const dest = join(imageDir, `stack-emulator-${arch}.qcow2`);
const snapshotDest = join(imageDir, `stack-emulator-${arch}.savevm.zst`);
const snapshotRawDest = join(imageDir, `stack-emulator-${arch}.savevm.raw`);
if (existsSync(dest)) unlinkSync(dest);
// Stale snapshots from a previous pull would resume against the new
// qcow2 and crash; wipe them so capture rebuilds cleanly.
if (existsSync(snapshotDest)) unlinkSync(snapshotDest);
if (existsSync(snapshotRawDest)) unlinkSync(snapshotRawDest);
const downloaded = await downloadArtifactByName(repo, runId, `qemu-emulator-${arch}`, imageDir);
if (!downloaded) {
throw new CliError(`Artifact qemu-emulator-${arch} not found in workflow run ${runId}.`);
}
if (!existsSync(dest)) throw new CliError(`Expected image not found at ${dest} after download.`);
console.log(`Downloaded: ${dest}`);
if (existsSync(snapshotDest)) {
console.log(`Downloaded: ${snapshotDest}`);
} else {
console.log(`Snapshot not present in artifact for run ${runId}; fast-start disabled.`);
}
} else {
// Same stale-snapshot concern as the PR branch above.
const imageDir = emulatorImageDir();
const snapshotDest = join(imageDir, `stack-emulator-${arch}.savevm.zst`);
const snapshotRawDest = join(imageDir, `stack-emulator-${arch}.savevm.raw`);
if (existsSync(snapshotDest)) unlinkSync(snapshotDest);
if (existsSync(snapshotRawDest)) unlinkSync(snapshotRawDest);
await pullRelease(arch, { repo, branch: opts.branch, tag: opts.tag });
}
if (opts.skipSnapshot) {
console.log("--skip-snapshot: not capturing a local snapshot. First `stack emulator start` will cold-boot.");
} else {
await captureLocalSnapshot(arch);
}
});
emulator