local emulator build improvements (#1330)

… V8 --jitless

2.6 GB to 1.3 GB final image

Flip arm64 matrix back to ubicloud-standard-8 so both arches share one
runner fleet. Cross-arch TCG on an amd64 host previously SIGTRAP'd in
migrations because V8's JIT emitted arm64 instructions that QEMU's
cross-arch translator mis-handled; pair the existing -cpu cortex-a72
fallback with NODE_OPTIONS=--jitless on the migration docker exec to
force V8 to stay on the interpreter. Does not affect amd64 migrations
(KVM, no TCG).

<!--

Make sure you've read the CONTRIBUTING.md guidelines:
https://github.com/stack-auth/stack-auth/blob/dev/CONTRIBUTING.md

-->


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

* **Chores**
* Optimized emulator images with binary stripping, compression, and
preservation of standalone runtime dependencies.
* Improved multi-architecture build matrix, added optional KVM
detection/fallback, and gated certain emulator runtime steps for arm64.
* Enhanced build scripts to generate and include env files and persist
richer logs and artifacts.

* **New Features**
* Centralized provision entrypoint to streamline install → migrations →
slimming sequence.

* **Tests**
  * Added a fast QEMU serial boot test for architecture validation.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
BilalG1 2026-04-13 19:21:02 -07:00 committed by GitHub
parent 7f8e3df852
commit 5399142db9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 705 additions and 56 deletions

View File

@ -26,14 +26,23 @@ env:
jobs:
build:
name: Build QEMU Image (${{ matrix.arch }})
runs-on: ubicloud-standard-8
runs-on: ${{ matrix.runner }}
timeout-minutes: 120
strategy:
fail-fast: false
matrix:
include:
# amd64 runs natively under KVM on ubicloud's amd64 runner.
- arch: amd64
runner: ubicloud-standard-8
# arm64 runs under cross-arch TCG on ubicloud's amd64 runner.
# No KVM for arm64 guests on an amd64 host; -cpu cortex-a76 plus
# running the migrations with `node --no-opt` (and the Wasm tiering
# flags) together sidestep the SIGTRAPs that cross-arch TCG
# hits on aggressive arm64 JIT code. Smoke test is still skipped
# because the backend can't come up reliably under cross-arch
# TCG within any sane window.
- arch: arm64
runner: ubicloud-standard-8
steps:
- uses: actions/checkout@v6
@ -47,7 +56,20 @@ jobs:
- name: Install QEMU dependencies
run: |
sudo apt-get update
sudo apt-get install -y qemu-system-x86 qemu-system-arm qemu-utils genisoimage socat qemu-efi-aarch64
sudo apt-get install -y qemu-system-x86 qemu-system-arm qemu-kvm qemu-utils genisoimage socat qemu-efi-aarch64
- name: Enable KVM access
run: |
echo 'KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"' \
| sudo tee /etc/udev/rules.d/99-kvm4all.rules
sudo udevadm control --reload-rules
sudo udevadm trigger --name-match=kvm || true
ls -la /dev/kvm || echo "no /dev/kvm present"
if [ -w /dev/kvm ]; then
echo "KVM is writable — hardware acceleration will be used"
else
echo "WARNING: /dev/kvm is not writable — will fall back to TCG (very slow)"
fi
- name: Build QEMU image
run: |
@ -58,7 +80,13 @@ jobs:
- name: Generate emulator env
run: node docker/local-emulator/generate-env-development.mjs
# arm64 runs under cross-arch TCG on an amd64 runner; the backend's
# V8 TurboFan JIT re-triggers the SIGTRAPs we dodge in migrations
# with --no-opt, and even if it didn't, boot is too slow under TCG
# to verify in any sane window. amd64 KVM already exercises the
# service stack; real arm64 hosts have KVM for end-users.
- name: Start emulator and verify
if: matrix.arch == 'amd64'
run: |
chmod +x docker/local-emulator/qemu/run-emulator.sh
EMULATOR_ARCH=${{ matrix.arch }} \
@ -66,12 +94,13 @@ jobs:
docker/local-emulator/qemu/run-emulator.sh start
- name: Verify services are healthy
if: matrix.arch == 'amd64'
run: |
EMULATOR_ARCH=${{ matrix.arch }} \
docker/local-emulator/qemu/run-emulator.sh status
- name: Stop emulator
if: always()
if: always() && matrix.arch == 'amd64'
run: |
EMULATOR_ARCH=${{ matrix.arch }} \
docker/local-emulator/qemu/run-emulator.sh stop

View File

@ -103,6 +103,28 @@ RUN cp $(which qstash) /qstash-binary 2>/dev/null || \
{ echo "ERROR: qstash binary not found" >&2; exit 1; }
# ── Strip / compress service binaries (parallel stages) ──────────────────────
FROM debian:trixie-slim AS upx-compress
# binutils provides `strip`; upx-ucl provides `upx`.
RUN apt-get update && \
    apt-get install -y --no-install-recommends binutils upx-ucl && \
    rm -rf /var/lib/apt/lists/*

# Gather every service binary under /out so the final image can copy them
# from this single stage.
COPY --from=clickhouse-bin /usr/bin/clickhouse /out/clickhouse
COPY --from=svix-bin /usr/local/bin/svix-server /out/svix-server
COPY --from=minio-bin /usr/bin/minio /out/minio
COPY --from=mc-bin /usr/bin/mc /out/mc
COPY --from=qstash-bin /qstash-binary /out/qstash

# Intentionally NOT stripping /out/clickhouse. The clickhouse binary is a
# self-extracting compressed executable (a small loader with a ZSTD
# payload appended after the section table); strip rewrites the ELF and
# can invalidate the loader's "find my payload" lookup, causing the
# decompressor to spin on garbage with zero log output — the exact
# symptom seen on cross-arch TCG runs. Savings from stripping would be
# only the tiny bootstrap anyway since the payload isn't in any section.
#
# The remaining four binaries are made writable, stripped of all symbols,
# then UPX-compressed at the highest standard level.
RUN chmod u+w /out/* && \
    strip --strip-all /out/minio /out/svix-server /out/mc /out/qstash && \
    upx -9 /out/minio /out/svix-server /out/mc /out/qstash
# ── Final image ───────────────────────────────────────────────────────────────
FROM debian:trixie-slim
@ -139,20 +161,20 @@ COPY --from=node-base /usr/local/bin/node /usr/local/bin/node
# Inbucket
COPY --from=inbucket-bin /opt/inbucket /opt/inbucket
# Svix
COPY --from=svix-bin /usr/local/bin/svix-server /usr/local/bin/svix-server
# Svix (UPX-compressed)
COPY --from=upx-compress /out/svix-server /usr/local/bin/svix-server
# ClickHouse
COPY --from=clickhouse-bin /usr/bin/clickhouse /usr/bin/clickhouse
# ClickHouse (passed through the upx-compress stage unmodified: neither stripped nor UPX-compressed)
COPY --from=upx-compress /out/clickhouse /usr/bin/clickhouse
RUN ln -sf /usr/bin/clickhouse /usr/bin/clickhouse-server && \
ln -sf /usr/bin/clickhouse /usr/bin/clickhouse-client
# MinIO
COPY --from=minio-bin /usr/bin/minio /usr/local/bin/minio
COPY --from=mc-bin /usr/bin/mc /usr/local/bin/mc
# MinIO (UPX-compressed)
COPY --from=upx-compress /out/minio /usr/local/bin/minio
COPY --from=upx-compress /out/mc /usr/local/bin/mc
# QStash
COPY --from=qstash-bin --chmod=755 /qstash-binary /usr/local/bin/qstash
# QStash (UPX-compressed)
COPY --from=upx-compress --chmod=755 /out/qstash /usr/local/bin/qstash
# App
WORKDIR /app
@ -164,6 +186,10 @@ COPY --from=builder /app/apps/backend/node_modules ./apps/backend/node_modules
COPY --from=builder /app/apps/dashboard/.next/standalone ./
COPY --from=builder /app/apps/dashboard/.next/static ./apps/dashboard/.next/static
COPY --from=builder /app/apps/dashboard/public ./apps/dashboard/public
# Save the standalone-traced node_modules (runtime deps only) before the full
# migration-pruner copy overwrites it. The slim-docker-image step in the QEMU
# build restores this after migrations are baked in.
# Fall back to an empty directory only when the standalone output ships no
# node_modules at all. A copy that fails part-way must fail the build instead
# of being silently replaced by a truncated (or empty) runtime dependency tree.
RUN if [ -d /app/node_modules ]; then \
      cp -a /app/node_modules /app/node_modules.standalone; \
    else \
      mkdir -p /app/node_modules.standalone; \
    fi
COPY --from=migration-pruner /pruned-node_modules ./node_modules
COPY --from=builder /app/packages ./packages

View File

@ -112,15 +112,36 @@ qemu_cmd_prefix_for_arch() {
case "$arch" in
arm64)
local accel="tcg"
local cpu="max"
if [ "$HOST_ARCH" = "arm64" ]; then
# Same-arch: prefer hardware acceleration, keep -cpu max. If no
# accelerator is available (e.g. Azure arm64 runners with no
# nested virt) we fall through to TCG, but same-arch TCG handles
# -cpu max correctly and more named CPU models have TCG bugs
# than -cpu max does.
case "$HOST_OS" in
darwin) accel="hvf" ;;
linux) [ -w /dev/kvm ] && accel="kvm" ;;
esac
else
# Cross-arch TCG (amd64 host emulating arm64 guest) needs a CPU
# model that threads a narrow needle:
# * -cpu max advertises armv8.5+ features (PAC, BTI, SVE, LSE…)
# that V8's TurboFan then emits JIT code for; cross-arch TCG
# mistranslates some of those and node SIGTRAPs in migrations.
# * -cpu cortex-a72 (armv8.0-a) keeps V8 safe but makes
# ClickHouse SIGILL on startup because its statically-linked
# LSE atomics (armv8.1+) aren't recognized.
# cortex-a76 is armv8.2-a: it exposes LSE (ClickHouse happy)
# while predating PAC (v8.3) and BTI (v8.5), so V8's aggressive
# JIT tiers don't emit the instructions that tripped TCG. Pair
# this with `node --no-opt` on the migration exec, which keeps
# V8 in Ignition+Sparkplug only (no TurboFan/Maglev).
cpu="cortex-a76"
fi
local firmware
firmware="$(find_aarch64_firmware)"
echo "qemu-system-aarch64 -machine virt -accel $accel -cpu max -bios $firmware"
echo "qemu-system-aarch64 -machine virt -accel $accel -cpu $cpu -bios $firmware"
;;
amd64)
local accel="tcg"
@ -176,6 +197,40 @@ prepare_bundle_artifacts() {
printf "%s" "$current_ids" > "$bundle_meta"
}
# Report whether the guest has emitted an exact marker line.
#
# Arguments:
#   $1  provision_log  path to the structured progress log written by the guest
#   $2  serial_log     path to the raw QEMU serial capture
#   $3  marker         the full line to look for (fixed string, whole-line match)
#
# Returns 0 when the marker appears as a complete line in either log, 1
# otherwise. Missing files simply count as "not found".
contains_provision_marker() {
  local provision_log="$1"
  local serial_log="$2"
  local marker="$3"

  if [ -f "$provision_log" ] && grep -Fqx "$marker" "$provision_log" 2>/dev/null; then
    return 0
  fi

  # The serial capture is run through `strings` first so the whole-line match
  # is applied to printable text only.
  # The piped grep deliberately does NOT use -q: -q exits on the first match,
  # and if `strings` still has output to write it is killed by SIGPIPE. When
  # the calling script runs with `set -o pipefail` that turns a successful
  # match into a failed pipeline, i.e. a false "marker not found". Letting
  # grep drain its input keeps the pipeline status equal to grep's own.
  if [ -f "$serial_log" ] && LC_ALL=C strings -a "$serial_log" 2>/dev/null | grep -Fx "$marker" >/dev/null 2>&1; then
    return 0
  fi

  return 1
}
# Print the number of newline-terminated lines in a file.
#
# Arguments:
#   $1  path of the file to measure
#
# Prints "0" when the path is not a regular file or when counting fails.
line_count() {
  local path="$1"
  local total

  if [ ! -f "$path" ]; then
    printf '%s\n' 0
    return
  fi

  # Some `wc` implementations pad their output, so strip all whitespace to
  # leave a bare integer.
  total="$(wc -l < "$path" | tr -d '[:space:]')" || total=0
  printf '%s\n' "$total"
}
# Copy the provisioning logs for one architecture into $IMAGE_DIR.
#
# Arguments:
#   $1  arch           architecture label used in the destination file names
#   $2  serial_log     raw serial capture  -> provision-emulator-<arch>.log
#   $3  provision_log  progress log        -> provision-emulator-<arch>.progress.log
#
# Best effort: a source file that is missing or cannot be copied is ignored.
persist_provision_logs() {
  local arch="$1" serial_log="$2" provision_log="$3"
  local dest_prefix="$IMAGE_DIR/provision-emulator-${arch}"

  cp "$serial_log" "${dest_prefix}.log" 2>/dev/null || true
  cp "$provision_log" "${dest_prefix}.progress.log" 2>/dev/null || true
}
build_one() {
local arch="$1"
local base_img="$IMAGE_DIR/debian-${DEBIAN_VERSION}-base-${arch}.qcow2"
@ -192,8 +247,12 @@ build_one() {
local bundle_iso="$tmp_dir/bundle.iso"
local bundle_dir="$tmp_dir/bundle"
local serial_log="$tmp_dir/serial.log"
local provision_log="$tmp_dir/provision.log"
local pidfile="$tmp_dir/qemu.pid"
local qemu_base pid elapsed
local qemu_base pid elapsed total_build_lines
local last_build_lines=0
local guest_exited=false
local guest_failed=false
local start_time=$SECONDS
cp "$base_img" "$tmp_img"
@ -209,21 +268,28 @@ build_one() {
mkdir -p "$bundle_dir"
cp "$bundle_tgz" "$bundle_dir/img.tgz"
cp "$BUILD_ENV_FILE" "$bundle_dir/build.env"
# Tell the guest which arch it's being built for so cross-arch (TCG) builds
# can skip the smoke test, which isn't reliable under software emulation.
printf 'STACK_EMULATOR_BUILD_ARCH=%s\n' "$arch" > "$bundle_dir/build-arch.env"
make_iso_from_dir "$bundle_iso" "STACKBUNDLE" "$bundle_dir"
: > "$serial_log"
: > "$provision_log"
qemu_base="$(qemu_cmd_prefix_for_arch "$arch")"
log "QEMU command prefix (${arch}): $qemu_base"
# shellcheck disable=SC2086
$qemu_base \
-boot order=c \
-m "$RAM" \
-smp "$CPUS" \
-drive "file=$tmp_img,format=qcow2,if=virtio" \
-drive "file=$tmp_img,format=qcow2,if=virtio,discard=on,detect-zeroes=unmap" \
-drive "file=$seed_iso,format=raw,if=virtio,readonly=on" \
-drive "file=$bundle_iso,format=raw,if=virtio,readonly=on" \
-netdev user,id=net0 \
-device virtio-net-pci,netdev=net0 \
-virtfs "local,path=$tmp_dir,mount_tag=hostfs,security_model=none" \
-serial "file:$serial_log" \
-display none \
-daemonize \
@ -232,23 +298,62 @@ build_one() {
pid="$(cat "$pidfile")"
elapsed=0
while [ "$elapsed" -lt "$PROVISION_TIMEOUT" ]; do
if grep -q "STACK_CLOUD_INIT_DONE" "$serial_log" 2>/dev/null; then
if contains_provision_marker "$provision_log" "$serial_log" "STACK_CLOUD_INIT_DONE"; then
break
fi
if contains_provision_marker "$provision_log" "$serial_log" "STACK_CLOUD_INIT_FAILED"; then
guest_failed=true
break
fi
if [ -f "$provision_log" ]; then
total_build_lines="$(line_count "$provision_log")"
if [ "$total_build_lines" -gt "$last_build_lines" ]; then
echo ""
sed -n "$((last_build_lines + 1)),${total_build_lines}p" "$provision_log" 2>/dev/null | while IFS= read -r msg; do
if [ "$msg" = "STACK_CLOUD_INIT_DONE" ]; then
continue
fi
printf " [%3ds] %s\n" "$elapsed" "$msg"
done
last_build_lines="$total_build_lines"
fi
fi
if ! kill -0 "$pid" 2>/dev/null; then
guest_exited=true
break
fi
sleep 5
elapsed=$((SECONDS - start_time))
printf "\r [%3ds / %ds] provisioning emulator..." "$elapsed" "$PROVISION_TIMEOUT"
done
echo ""
if ! grep -q "STACK_CLOUD_INIT_DONE" "$serial_log" 2>/dev/null; then
err "Provisioning timed out for emulator (${arch})"
tail -50 "$serial_log" >&2 || true
if ! contains_provision_marker "$provision_log" "$serial_log" "STACK_CLOUD_INIT_DONE"; then
if [ "$guest_failed" = true ]; then
err "Guest provisioning reported failure for emulator (${arch})"
elif [ "$guest_exited" = true ]; then
err "Provisioning exited before completion for emulator (${arch})"
else
err "Provisioning timed out for emulator (${arch})"
fi
if [ -s "$provision_log" ]; then
tail -50 "$provision_log" >&2 || true
else
LC_ALL=C strings -a "$serial_log" 2>/dev/null | tail -50 >&2 || tail -50 "$serial_log" >&2 || true
fi
if kill -0 "$pid" 2>/dev/null; then
kill "$pid" 2>/dev/null || true
sleep 1
kill -9 "$pid" 2>/dev/null || true
fi
persist_provision_logs "$arch" "$serial_log" "$provision_log"
rm -rf "$tmp_dir"
exit 1
fi
@ -266,19 +371,21 @@ build_one() {
kill -9 "$pid" 2>/dev/null || true
fi
cp "$tmp_img" "$final_img"
cp "$serial_log" "$IMAGE_DIR/provision-emulator-${arch}.log"
rm -rf "$tmp_dir"
persist_provision_logs "$arch" "$serial_log" "$provision_log"
log "Compressing final image (this may take several minutes)..."
qemu-img convert -p -O qcow2 -c "$final_img" "$final_img.tmp"
mv "$final_img.tmp" "$final_img"
qemu-img convert -p -O qcow2 -c "$tmp_img" "$final_img"
rm -rf "$tmp_dir"
local size
size="$(du -h "$final_img" | cut -f1)"
log "━━━ Emulator image ready: $final_img (${size}) ━━━"
}
log "Generating emulator build env file..."
node "$REPO_ROOT/docker/local-emulator/generate-env-development.mjs"
BUILD_ENV_FILE="$REPO_ROOT/docker/local-emulator/.env.development"
for arch in "${TARGET_ARCHS[@]}"; do
local_base="$IMAGE_DIR/debian-${DEBIAN_VERSION}-base-${arch}.qcow2"
download_cloud_image "$arch" "$local_base"

View File

@ -43,6 +43,15 @@ write_files:
gzip -dc /mnt/stack-bundle/img.tgz | docker load
if [ -f /mnt/stack-bundle/build.env ]; then
cp /mnt/stack-bundle/build.env /etc/stack-build.env
fi
# build-arch.env lets the guest skip the smoke test on cross-arch TCG.
if [ -f /mnt/stack-bundle/build-arch.env ]; then
cp /mnt/stack-bundle/build-arch.env /etc/stack-build-arch.env
fi
- path: /usr/local/bin/render-stack-env
permissions: '0755'
content: |
@ -71,25 +80,33 @@ write_files:
cat /mnt/stack-runtime/runtime.env
# Computed vars — depend on port prefix or deps host
# Host-side ports (for browser URLs — browser runs on host, not in VM)
HP_BACKEND="$STACK_EMULATOR_BACKEND_HOST_PORT"
HP_DASHBOARD="$STACK_EMULATOR_DASHBOARD_HOST_PORT"
HP_MINIO="$STACK_EMULATOR_MINIO_HOST_PORT"
HP_INBUCKET="$STACK_EMULATOR_INBUCKET_HOST_PORT"
cat <<COMPUTED
STACK_SKIP_MIGRATIONS=true
STACK_SKIP_SEED_SCRIPT=true
NEXT_PUBLIC_STACK_PORT_PREFIX=${P}
STACK_RUNTIME_WORK_DIR=/app
STACK_LOCAL_EMULATOR_HOST_MOUNT_ROOT=/host
NEXT_PUBLIC_STACK_API_URL=http://localhost:${P}02
NEXT_PUBLIC_STACK_DASHBOARD_URL=http://localhost:${P}01
NEXT_PUBLIC_BROWSER_STACK_API_URL=http://localhost:${P}02
NEXT_PUBLIC_BROWSER_STACK_DASHBOARD_URL=http://localhost:${P}01
NEXT_PUBLIC_STACK_API_URL=http://localhost:${HP_BACKEND}
NEXT_PUBLIC_STACK_DASHBOARD_URL=http://localhost:${HP_DASHBOARD}
NEXT_PUBLIC_BROWSER_STACK_API_URL=http://localhost:${HP_BACKEND}
NEXT_PUBLIC_BROWSER_STACK_DASHBOARD_URL=http://localhost:${HP_DASHBOARD}
NEXT_PUBLIC_SERVER_STACK_API_URL=http://127.0.0.1:${P}02
NEXT_PUBLIC_SERVER_STACK_DASHBOARD_URL=http://127.0.0.1:${P}01
NEXT_PUBLIC_STACK_SVIX_SERVER_URL=http://localhost:${P}13
NEXT_PUBLIC_STACK_SVIX_SERVER_URL=http://localhost:${HP_BACKEND}
STACK_DATABASE_CONNECTION_STRING=postgres://postgres:PASSWORD-PLACEHOLDER--uqfEC1hmmv@${DEPS_HOST}:5432/stackframe
STACK_EMAIL_HOST=${DEPS_HOST}
STACK_SVIX_SERVER_URL=http://${DEPS_HOST}:8071
STACK_S3_ENDPOINT=http://${DEPS_HOST}:9090
STACK_S3_PUBLIC_ENDPOINT=http://localhost:${P}21/stack-storage
STACK_S3_PUBLIC_ENDPOINT=http://localhost:${HP_MINIO}/stack-storage
STACK_QSTASH_URL=http://${DEPS_HOST}:8080
STACK_CLICKHOUSE_URL=http://${DEPS_HOST}:8123
STACK_EMAIL_MONITOR_VERIFICATION_CALLBACK_URL=http://localhost:${P}01/handler/email-verification
STACK_EMAIL_MONITOR_VERIFICATION_CALLBACK_URL=http://localhost:${HP_DASHBOARD}/handler/email-verification
STACK_EMAIL_MONITOR_INBUCKET_API_URL=http://${DEPS_HOST}:9001
STACK_OAUTH_MOCK_URL=http://${HOST_SERVICES_HOST}:${P}14
BACKEND_PORT=${P}02
@ -134,16 +151,305 @@ write_files:
stack-local-emulator
- path: /usr/local/bin/wait-for-deps
permissions: '0755'
content: |
#!/bin/bash
set -uo pipefail
# Hard upper bound across the whole dep wait. Under TCG every service
# init is 5-20x slower than native, so we allow a generous budget, but
# if we cross it something is genuinely stuck and we need to surface it.
DEPS_TIMEOUT="${STACK_DEPS_TIMEOUT:-1500}"
DEPS_CONTAINER="${STACK_DEPS_CONTAINER:-stack-build-init}"
start=$SECONDS
log() { /usr/local/bin/log-provision "wait-for-deps: $*"; }
# name|probe pairs — probe runs through `eval` and must exit 0 when ready.
# No --max-time on these: under slow TCG a service may take >3s to
# respond; let curl wait, outer DEPS_TIMEOUT bounds the whole dep wait.
SERVICES=(
'postgres|nc -z 127.0.0.1 5432'
'clickhouse|curl -sf http://127.0.0.1:8123/ping'
'svix|curl -sf http://127.0.0.1:8071/api/v1/health/'
'minio|curl -sf http://127.0.0.1:9090/minio/health/live'
'qstash|[ "$(curl -s -o /dev/null -w "%{http_code}" http://127.0.0.1:8080/ 2>/dev/null || true)" = "401" ]'
)
dump_diagnostics() {
log "dumping diagnostics for stuck dep wait..."
log "--- docker ps -a ---"
docker ps -a 2>&1 | /usr/local/bin/log-provision-stream "wait-for-deps: ps" || true
log "--- docker logs ${DEPS_CONTAINER} (last 300 lines) ---"
docker logs --tail 300 "$DEPS_CONTAINER" 2>&1 | /usr/local/bin/log-provision-stream "wait-for-deps: deps" || true
log "--- per-service probes (3s timeout) ---"
nc -z -w 3 127.0.0.1 5432 >/dev/null 2>&1 && log "postgres:5432 reachable" || log "postgres:5432 NOT reachable"
curl -sf --max-time 3 http://127.0.0.1:8123/ping >/dev/null 2>&1 && log "clickhouse:8123 reachable" || log "clickhouse:8123 NOT reachable"
curl -sf --max-time 3 http://127.0.0.1:8071/api/v1/health/ >/dev/null 2>&1 && log "svix:8071 reachable" || log "svix:8071 NOT reachable"
curl -sf --max-time 3 http://127.0.0.1:9090/minio/health/live >/dev/null 2>&1 && log "minio:9090 reachable" || log "minio:9090 NOT reachable"
code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 3 http://127.0.0.1:8080/ 2>/dev/null || true)
[ "$code" = "401" ] && log "qstash:8080 reachable (401)" || log "qstash:8080 NOT reachable (code=${code:-none})"
}
# Block until one dependency answers its readiness probe.
#
# Arguments:
#   $1  name   label used in log lines
#   $2  probe  shell snippet that exits 0 once the service is ready
#
# Reads the script-level `start` (when the overall wait began) and
# `DEPS_TIMEOUT` (budget in seconds for ALL services together). Returns 0 as
# soon as the probe succeeds; on budget exhaustion it logs, dumps diagnostics
# and exits the whole script with status 1.
wait_for() {
  local name="$1" probe="$2" elapsed remaining
  local svc_start=$SECONDS
  local next_heartbeat=$((svc_start + 30))
  while true; do
    # The probes carry no timeout of their own, and the budget check below
    # only runs between probes. A service that accepts the connection but
    # never answers would therefore hang the probe forever and the "hard cap"
    # would never fire. Bound every probe by the remaining budget, with a 10s
    # floor so a slow-but-healthy service still gets one realistic attempt.
    remaining=$((DEPS_TIMEOUT - (SECONDS - start)))
    if [ "$remaining" -lt 10 ]; then
      remaining=10
    fi
    # The probe is still evaluated with `eval`, just inside a child bash that
    # `timeout` is able to kill.
    if timeout "$remaining" bash -c 'eval "$1"' wait-for-deps-probe "$probe" >/dev/null 2>&1; then
      elapsed=$((SECONDS - svc_start))
      log "${name} ready (${elapsed}s)"
      return 0
    fi
    # Periodic progress line so a long wait does not look like a hang.
    if [ "$SECONDS" -ge "$next_heartbeat" ]; then
      log "still waiting for ${name} ($((SECONDS - svc_start))s elapsed)"
      next_heartbeat=$((SECONDS + 30))
    fi
    # The budget is measured from the global start, not per service.
    if [ "$((SECONDS - start))" -ge "$DEPS_TIMEOUT" ]; then
      elapsed=$((SECONDS - start))
      log "TIMEOUT waiting for ${name} after ${elapsed}s (hard cap ${DEPS_TIMEOUT}s)"
      dump_diagnostics
      exit 1
    fi
    sleep 2
  done
}
log "starting dep wait (timeout=${DEPS_TIMEOUT}s)"
for entry in "${SERVICES[@]}"; do
wait_for "${entry%%|*}" "${entry#*|}"
done
log "all deps ready ($((SECONDS - start))s total)"
- path: /etc/stack-build-computed.env
content: |
USE_INLINE_ENV_VARS=true
NEXT_PUBLIC_STACK_API_URL=http://localhost:8102
NEXT_PUBLIC_STACK_DASHBOARD_URL=http://localhost:8101
NEXT_PUBLIC_BROWSER_STACK_API_URL=http://localhost:8102
NEXT_PUBLIC_BROWSER_STACK_DASHBOARD_URL=http://localhost:8101
NEXT_PUBLIC_SERVER_STACK_API_URL=http://127.0.0.1:8102
NEXT_PUBLIC_SERVER_STACK_DASHBOARD_URL=http://127.0.0.1:8101
NEXT_PUBLIC_STACK_SVIX_SERVER_URL=http://localhost:8071
NEXT_PUBLIC_STACK_PORT_PREFIX=81
STACK_CLICKHOUSE_DATABASE=analytics
BACKEND_PORT=8102
DASHBOARD_PORT=8101
- path: /usr/local/bin/log-provision
permissions: '0755'
content: |
#!/bin/bash
set -euo pipefail
until nc -z 127.0.0.1 5432 >/dev/null 2>&1; do sleep 1; done
until curl -sf http://127.0.0.1:8123/ping >/dev/null 2>&1; do sleep 1; done
until curl -sf http://127.0.0.1:8071/api/v1/health/ >/dev/null 2>&1; do sleep 1; done
until curl -sf http://127.0.0.1:9090/minio/health/live >/dev/null 2>&1; do sleep 1; done
until [ "$(curl -s -o /dev/null -w '%{http_code}' http://127.0.0.1:8080/ 2>/dev/null || true)" = "401" ]; do sleep 1; done
msg="$*"
echo "STACK_PROVISION: $msg"
if [ -n "${STACK_PROVISION_LOG_FILE:-}" ]; then
printf '%s\n' "$msg" >> "$STACK_PROVISION_LOG_FILE"
fi
- path: /usr/local/bin/log-provision-stream
  permissions: '0755'
  content: |
    #!/bin/bash
    # Forward every line read from stdin to log-provision, tagged with a prefix.
    #
    # Usage: some-command 2>&1 | log-provision-stream <prefix>
    #
    # Each input line is logged as "<prefix>: <line>".
    set -uo pipefail
    prefix="${1:-}"
    # `read` returns non-zero on a final line that has no trailing newline but
    # still fills $line; the `|| [ -n "$line" ]` keeps that last line (for
    # example an HTTP response body printed by curl) from being dropped.
    while IFS= read -r line || [ -n "$line" ]; do
      /usr/local/bin/log-provision "${prefix}: ${line}"
    done
- path: /usr/local/bin/run-build-migrations
permissions: '0755'
content: |
#!/bin/bash
set -euo pipefail
log() { /usr/local/bin/log-provision "$*"; }
log "Starting deps container..."
docker run --rm --name stack-build-init \
--network host \
-e STACK_DEPS_ONLY=true \
-v stack-postgres-data:/data/postgres \
-v stack-redis-data:/data/redis \
-v stack-clickhouse-data:/data/clickhouse \
-v stack-minio-data:/data/minio \
-v stack-inbucket-data:/data/inbucket \
-d stack-local-emulator
log "Waiting for deps (postgres, redis, clickhouse, minio, qstash)..."
/usr/local/bin/wait-for-deps
log "Deps ready."
# Wait for init-services.sh (MinIO buckets, ClickHouse DB creation)
log "Waiting for init-services.sh..."
timeout=120
elapsed=0
while [ "$elapsed" -lt "$timeout" ]; do
if docker exec stack-build-init test -f /var/run/stack-local-init-services.done 2>/dev/null; then
break
fi
sleep 1
elapsed=$((elapsed + 1))
done
if [ "$elapsed" -ge "$timeout" ]; then
log "ERROR: init-services.sh did not finish within ${timeout}s"
exit 1
fi
log "init-services done (${elapsed}s)."
log "Running migrations..."
# Cross-arch TCG mistranslates V8's JIT-emitted arm64, and V8's wasm
# tier-up path trips an InnerPointerToCodeCache check deep in the heap
# (Runtime_WasmTriggerTierUp → StackFrameIterator::Advance crashes
# when Wasm code has been freed while a frame still references it).
# --no-opt keeps JS off TurboFan/Maglev
# --no-wasm-tier-up keeps Wasm on Liftoff (no TurboFan)
# --no-wasm-dynamic-tiering suppresses the tier-up decision runtime call
# --no-wasm-code-gc keeps Wasm code alive across stack walks
# All four are no-ops under KVM, and must be passed on node's CLI
# (NODE_OPTIONS rejects them).
migrate_log="$(mktemp)"
set +e
docker exec \
--env-file /etc/stack-build.env \
--env-file /etc/stack-build-computed.env \
stack-build-init \
sh -c 'cd /app/apps/backend && node --no-opt --no-wasm-tier-up --no-wasm-dynamic-tiering --no-wasm-code-gc dist/db-migrations.mjs migrate && node --no-opt --no-wasm-tier-up --no-wasm-dynamic-tiering --no-wasm-code-gc dist/db-migrations.mjs seed' \
> "$migrate_log" 2>&1
migrate_status=$?
set -e
if [ "$migrate_status" -ne 0 ]; then
log "MIGRATIONS FAILED (exit ${migrate_status}) — last 200 lines of migration output:"
tail -200 "$migrate_log" | /usr/local/bin/log-provision-stream "migrate" || true
rm -f "$migrate_log"
exit "$migrate_status"
fi
rm -f "$migrate_log"
log "Migrations + seed complete."
log "Stopping deps container..."
docker stop stack-build-init || true
log "run-build-migrations done."
- path: /usr/local/bin/slim-docker-image
permissions: '0755'
content: |
#!/bin/bash
set -euo pipefail
log() { /usr/local/bin/log-provision "$*"; }
log "Building slim Docker image..."
docker build -t stack-local-emulator-slim - <<'DOCKERFILE'
FROM stack-local-emulator
RUN rm -rf /app/node_modules /app/apps/backend/dist && \
mv /app/node_modules.standalone /app/node_modules && \
for entry in /app/node_modules/.pnpm/node_modules/*; do \
name="$(basename "$entry")"; \
[ "$name" = ".bin" ] && continue; \
ln -sf ".pnpm/node_modules/$name" "/app/node_modules/$name" 2>/dev/null || true; \
done
DOCKERFILE
log "Slim image built."
# Determine build arch to decide whether to run the smoke test. Cross-arch
# (TCG) builds can't reliably run the Next.js backend inside the smoke
# test container: V8 JIT ↔ QEMU TCG mistranslations crash the process,
# and even with --jitless the backend is too slow to respond within any
# sane timeout. amd64 builds run under KVM and are unaffected.
BUILD_ARCH=""
if [ -f /etc/stack-build-arch.env ]; then
# shellcheck disable=SC1091
. /etc/stack-build-arch.env
BUILD_ARCH="${STACK_EMULATOR_BUILD_ARCH:-}"
fi
if [ "$BUILD_ARCH" = "arm64" ]; then
log "Skipping smoke test: build arch is arm64 and cross-arch TCG can't reliably run the backend."
else
log "Running smoke test on slim image..."
docker run --rm --name smoke-test \
--network host \
--env-file /etc/stack-build.env \
--env-file /etc/stack-build-computed.env \
-e STACK_SKIP_MIGRATIONS=true \
-e STACK_SKIP_SEED_SCRIPT=true \
-e STACK_RUNTIME_WORK_DIR=/app \
-v stack-postgres-data:/data/postgres \
-v stack-redis-data:/data/redis \
-v stack-clickhouse-data:/data/clickhouse \
-v stack-minio-data:/data/minio \
-v stack-inbucket-data:/data/inbucket \
-d stack-local-emulator-slim
smoke_timeout=300
smoke_elapsed=0
smoke_passed=false
while [ "$smoke_elapsed" -lt "$smoke_timeout" ]; do
code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 3 http://127.0.0.1:8102/health?db=1 2>/dev/null || true)
if [ "$code" = "200" ]; then
smoke_passed=true
break
fi
sleep 2
smoke_elapsed=$((smoke_elapsed + 2))
done
if [ "$smoke_passed" = "false" ]; then
log "SMOKE TEST FAILED: backend /health?db=1 did not return 200 within ${smoke_timeout}s"
log "--- docker ps -a ---"
docker ps -a 2>&1 | /usr/local/bin/log-provision-stream "ps" || true
log "--- smoke-test container logs (last 200 lines) ---"
docker logs --tail 200 smoke-test 2>&1 | /usr/local/bin/log-provision-stream "smoke-test" || true
log "--- free -m ---"
free -m 2>&1 | /usr/local/bin/log-provision-stream "mem" || true
log "--- curl -v /health?db=1 ---"
curl -v --max-time 5 http://127.0.0.1:8102/health?db=1 2>&1 | /usr/local/bin/log-provision-stream "curl" || true
docker stop smoke-test 2>/dev/null || true
exit 1
fi
docker stop smoke-test 2>/dev/null || true
sleep 2
log "Smoke test passed (${smoke_elapsed}s)."
fi
log "Flattening image (docker export/import)..."
docker create --name flatten stack-local-emulator-slim /bin/true
docker export flatten | docker import \
--change 'WORKDIR /app' \
--change 'ENTRYPOINT ["/entrypoint.sh"]' \
--change 'EXPOSE 5432 6379 2500 9001 1100 8071 8123 9009 9090 8080 8101 8102' \
--change 'ENV DEBIAN_FRONTEND=noninteractive' \
- stack-local-emulator:final
log "Flatten done."
log "Saving final image to /var/tmp..."
docker rm flatten
docker save stack-local-emulator:final -o /var/tmp/final-image.tar
mv /var/lib/docker/volumes /var/tmp/volumes-backup
log "Nuking Docker storage and reloading..."
systemctl stop docker containerd
rm -rf /var/lib/docker /var/lib/containerd
systemctl start docker containerd
until docker info >/dev/null 2>&1; do sleep 1; done
docker load -i /var/tmp/final-image.tar
docker tag stack-local-emulator:final stack-local-emulator
docker rmi stack-local-emulator:final || true
rm -f /var/tmp/final-image.tar
systemctl stop docker
rm -rf /var/lib/docker/volumes
mv /var/tmp/volumes-backup /var/lib/docker/volumes
systemctl start docker
log "Docker storage rebuilt."
log "Zeroing free space for qcow2 compression..."
dd if=/dev/zero of=/zero.fill bs=1M 2>/dev/null || true
rm -f /zero.fill
sync
fstrim -av 2>/dev/null || true
log "slim-docker-image done."
- path: /etc/systemd/system/stack.service
content: |
@ -162,24 +468,77 @@ write_files:
[Install]
WantedBy=multi-user.target
- path: /usr/local/bin/provision-build
permissions: '0755'
content: |
#!/bin/bash
set -euo pipefail
if bash /usr/local/bin/mount-host-fs 2>/dev/null; then
export STACK_PROVISION_LOG_FILE=/host/provision.log
: > "$STACK_PROVISION_LOG_FILE"
else
export STACK_PROVISION_LOG_FILE=""
fi
write_marker_to_consoles() {
local marker="$1"
for dev in /dev/console /dev/ttyAMA0 /dev/ttyS0; do
echo "$marker" > "$dev" 2>/dev/null || true
done
}
cleanup() {
local status=$?
if [ "$status" -ne 0 ]; then
if [ -n "${STACK_PROVISION_LOG_FILE:-}" ]; then
printf 'ERROR: provision-build exited with code %s\n' "$status" >> "$STACK_PROVISION_LOG_FILE"
printf '%s\n' "STACK_CLOUD_INIT_FAILED" >> "$STACK_PROVISION_LOG_FILE"
fi
write_marker_to_consoles "STACK_CLOUD_INIT_FAILED"
sync || true
(sleep 2 && shutdown -P now) &
(sleep 15 && poweroff -f) &
fi
}
trap cleanup EXIT
SERIAL=""
for d in /dev/ttyAMA0 /dev/ttyS0; do
[ -c "$d" ] && SERIAL="$d" && break
done
if [ -n "$SERIAL" ]; then
exec > >(tee -a "$SERIAL") 2>&1
fi
log_provision() {
/usr/local/bin/log-provision "$*"
}
log_provision "runcmd starting"
systemctl disable --now ssh || true
systemctl mask ssh || true
log_provision "installing emulator containers"
bash /usr/local/bin/install-emulator-containers
systemctl daemon-reload
systemctl enable stack.service
log_provision "starting build migrations"
bash /usr/local/bin/run-build-migrations
log_provision "starting slim-docker-image"
bash /usr/local/bin/slim-docker-image
log_provision "build pipeline complete"
if [ -n "${STACK_PROVISION_LOG_FILE:-}" ]; then
printf '%s\n' "STACK_CLOUD_INIT_DONE" >> "$STACK_PROVISION_LOG_FILE"
fi
write_marker_to_consoles "STACK_CLOUD_INIT_DONE"
shutdown -P now
runcmd:
- systemctl disable --now ssh || true
- systemctl mask ssh || true
- bash /usr/local/bin/install-emulator-containers
- systemctl daemon-reload
- systemctl enable stack.service
- docker run --rm --name stack-build-init
--network host
-e STACK_DEPS_ONLY=true
-v stack-postgres-data:/data/postgres
-v stack-redis-data:/data/redis
-v stack-clickhouse-data:/data/clickhouse
-v stack-minio-data:/data/minio
-v stack-inbucket-data:/data/inbucket
-d stack-local-emulator
- bash /usr/local/bin/wait-for-deps
- docker stop stack-build-init || true
- echo "STACK_CLOUD_INIT_DONE" > /dev/console 2>/dev/null || true
- echo "STACK_CLOUD_INIT_DONE" > /dev/ttyAMA0 2>/dev/null || true
- echo "STACK_CLOUD_INIT_DONE" > /dev/ttyS0 2>/dev/null || true
- shutdown -P now
- [bash, /usr/local/bin/provision-build]

View File

@ -85,6 +85,10 @@ prepare_runtime_config_iso() {
mkdir -p "$cfg_dir"
{
printf "STACK_EMULATOR_PORT_PREFIX=%s\n" "$PORT_PREFIX"
printf "STACK_EMULATOR_DASHBOARD_HOST_PORT=%s\n" "$EMULATOR_DASHBOARD_PORT"
printf "STACK_EMULATOR_BACKEND_HOST_PORT=%s\n" "$EMULATOR_BACKEND_PORT"
printf "STACK_EMULATOR_MINIO_HOST_PORT=%s\n" "$EMULATOR_MINIO_PORT"
printf "STACK_EMULATOR_INBUCKET_HOST_PORT=%s\n" "$EMULATOR_INBUCKET_PORT"
} > "$cfg_dir/runtime.env"
cp "$SCRIPT_DIR/../.env.development" "$cfg_dir/base.env"
make_iso_from_dir "$cfg_iso" "STACKCFG" "$cfg_dir"

View File

@ -0,0 +1,124 @@
#!/usr/bin/env bash
# Quick test: boot the base QEMU image with a minimal cloud-init that writes to
# serial via runcmd. Verifies that our logging approach works without running
# the full emulator build (~10s instead of ~10min).
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
source "$SCRIPT_DIR/common.sh"
detect_host
ARCH="${1:-$HOST_ARCH}"
BASE_IMG="$SCRIPT_DIR/images/debian-13-base-${ARCH}.qcow2"
if [ ! -f "$BASE_IMG" ]; then
echo "Base image not found: $BASE_IMG" >&2
exit 1
fi
TMP_DIR="$(mktemp -d /tmp/stack-serial-test-XXXXXX)"
trap 'kill "$(cat "$TMP_DIR/qemu.pid" 2>/dev/null)" 2>/dev/null; rm -rf "$TMP_DIR"' EXIT
# Create a temporary disk
cp "$BASE_IMG" "$TMP_DIR/disk.qcow2"
# Minimal cloud-init user-data that tests serial output from runcmd
cat > "$TMP_DIR/user-data" << 'EOF'
#cloud-config
write_files:
- path: /usr/local/bin/provision-build
permissions: '0755'
content: |
#!/bin/bash
set -euo pipefail
SERIAL=""
for d in /dev/ttyAMA0 /dev/ttyS0; do
[ -c "$d" ] && SERIAL="$d" && break
done
if [ -n "$SERIAL" ]; then
exec > >(tee -a "$SERIAL") 2>&1
fi
echo "STACK_PROVISION: script started"
sleep 1
echo "STACK_PROVISION: step 2"
for dev in /dev/console /dev/ttyAMA0 /dev/ttyS0; do
echo "STACK_CLOUD_INIT_DONE" > "$dev" 2>/dev/null || true
done
shutdown -P now
runcmd:
- [bash, /usr/local/bin/provision-build]
EOF
cat > "$TMP_DIR/meta-data" << 'EOF'
instance-id: serial-test
local-hostname: serial-test
EOF
# Build seed ISO
make_iso_from_dir "$TMP_DIR/seed.iso" "cidata" "$TMP_DIR"
: > "$TMP_DIR/serial.log"
# Choose an accelerator the host can actually provide instead of assuming
# macOS/HVF. Hardware acceleration is only attempted when guest and host
# architectures match; anything else falls back to TCG software emulation.
# NOTE(review): HOST_OS is assumed to be exported by detect_host as
# "darwin"/"linux"; `uname -s` is used as a fallback if it is unset.
host_os="${HOST_OS:-$(uname -s | tr '[:upper:]' '[:lower:]')}"
accel="tcg"
if [ "$ARCH" = "$HOST_ARCH" ]; then
  case "$host_os" in
    darwin)
      accel="hvf"
      ;;
    linux)
      if [ -w /dev/kvm ]; then
        accel="kvm"
      fi
      ;;
  esac
fi

case "$ARCH" in
  arm64)
    firmware="$(find_aarch64_firmware)"
    qemu_base="qemu-system-aarch64 -machine virt -accel $accel -cpu max -bios $firmware"
    ;;
  amd64)
    qemu_base="qemu-system-x86_64 -machine q35 -accel $accel -cpu max"
    ;;
  *)
    # Without this branch qemu_base stays unset and the script dies later with
    # an opaque "unbound variable" error under `set -u`.
    echo "Unsupported architecture: $ARCH (expected arm64 or amd64)" >&2
    exit 1
    ;;
esac
$qemu_base \
-boot order=c \
-m 1024 \
-smp 2 \
-drive "file=$TMP_DIR/disk.qcow2,format=qcow2,if=virtio" \
-drive "file=$TMP_DIR/seed.iso,format=raw,if=virtio,readonly=on" \
-netdev user,id=net0 \
-device virtio-net-pci,netdev=net0 \
-serial "file:$TMP_DIR/serial.log" \
-display none \
-daemonize \
-pidfile "$TMP_DIR/qemu.pid"
echo "QEMU started, waiting for serial output..."
echo "Serial log: $TMP_DIR/serial.log"
# Poll the serial log until the guest reports completion or the timeout hits.
# grep runs with -a throughout: a serial capture can contain non-text bytes,
# and without -a grep would print "Binary file ... matches" instead of the
# matching lines.
elapsed=0
timeout=120
shown=0   # number of STACK_PROVISION lines already echoed
while [ "$elapsed" -lt "$timeout" ]; do
  # Echo only the STACK_PROVISION lines that appeared since the last poll,
  # rather than re-printing the full history every two seconds.
  total="$(grep -ac "STACK_PROVISION" "$TMP_DIR/serial.log" 2>/dev/null || true)"
  total="${total:-0}"
  if [ "$total" -gt "$shown" ]; then
    echo ""
    grep -a "STACK_PROVISION" "$TMP_DIR/serial.log" | sed -n "$((shown + 1)),${total}p" | while IFS= read -r line; do
      echo "  [${elapsed}s] $line"
    done
    shown="$total"
  fi

  if grep -aq "STACK_CLOUD_INIT_DONE" "$TMP_DIR/serial.log" 2>/dev/null; then
    echo ""
    echo "=== SUCCESS: STACK_CLOUD_INIT_DONE received ==="
    echo ""
    echo "=== All STACK_PROVISION lines ==="
    grep -a "STACK_PROVISION" "$TMP_DIR/serial.log" || echo "(none found)"
    exit 0
  fi

  sleep 2
  elapsed=$((elapsed + 2))
  printf "\r  [%ds / %ds] waiting..." "$elapsed" "$timeout"
done

# Timed out: show the tail of the log plus whatever progress lines arrived.
echo ""
echo "=== TIMEOUT: no STACK_CLOUD_INIT_DONE after ${timeout}s ==="
echo ""
echo "=== Last 30 lines of serial log ==="
tail -30 "$TMP_DIR/serial.log"
echo ""
echo "=== STACK_PROVISION lines ==="
grep -a "STACK_PROVISION" "$TMP_DIR/serial.log" || echo "(none found)"
exit 1