From 0e1d98b5d0ccec2a1676d267bd148ba60129c2b7 Mon Sep 17 00:00:00 2001 From: BilalG1 Date: Fri, 19 Jun 2026 13:17:07 -0700 Subject: [PATCH] fix(cli): fall back to installed CLI when npx auto-update fails (#1612) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem `hexclave dev` re-execs itself through `npx @latest` (`maybeReexecToLatest`) so users always get the latest dashboard without reinstalling. But when that npx run **fails**, it exits nonzero and we did `process.exit(result.code)` — killing `hexclave dev` even though a perfectly good CLI was already installed locally. This surfaced on **Replit**, where a user running `hexclave dev` got: ``` npm notice Access denied: Your download has been blocked by the Socket Security Policy. Reason: AI-detected potential malware. npm error code ECOMPROMISED npm error Lock compromised ``` Diagnosis: the user's `pnpm` is aliased to `sfw pnpm` (**Socket Firewall**), and Replit enforces an org-level Socket Security Policy. Socket **blocks the `@hexclave/cli@latest` download** (false-positive "AI-detected malware", almost certainly the bundled minified dashboard). The interrupted download breaks npx's reify while it holds its cache lock, which npm reports as `Lock compromised`. The lock message is a *downstream symptom*; the real failure is the blocked download. The same class of failure (blocked download, npm error, lock contention, offline) all share one shape: **npx exits nonzero before our CLI ever runs**, and today that takes down `hexclave dev`. ## Fix Use a **startup-marker handshake** to distinguish the two cases: - The parent passes a temp marker path to the npx child via `STACK_CLI_REEXEC_MARKER`. - The re-exec'd child touches the marker the instant it starts (`signalReexecStartedIfChild`). - After the child exits (`decidePostReexec`): - exited 0, or nonzero **with** the marker present → our CLI ran; **propagate** the exit code (real command result). - nonzero **without** the marker, or npx not spawnable → our CLI never ran; **fall back** to the installed CLI instead of failing `hexclave dev`. The marker only needs file create/exists (robust on sandboxed/networked filesystems). If the marker can't be created, we preserve the old always-propagate behavior, so there's no spurious-fallback risk. `decidePostReexec` and `signalReexecStartedIfChild` are pure and unit-tested. ## Verification - `pnpm test run src/lib/self-update.test.ts` → 23 passing (added cases for the fallback decision and the marker handshake). - `pnpm typecheck` / `pnpm lint` → clean. - End-to-end: forced a real npx failure (unreachable registry → npx exits nonzero before our CLI runs) and confirmed `hexclave dev` now logs `Auto-update skipped: …; continuing with the installed CLI.` and proceeds into the installed dev path instead of dying. ## Notes / follow-ups (not in this PR) - Workaround for affected users today: `STACK_CLI_NO_AUTO_UPDATE=1` (or `--no-auto-update`) skips the npx download entirely. - Worth reporting the Socket false-positive to socket.dev to allowlist `@hexclave/cli`, and reconsidering shipping a ~165 MB minified dashboard inside the npm tarball (it's what trips AI-malware heuristics and slows cold installs). --- ## Summary by cubic Prevent `hexclave dev` from exiting when `npx @latest` auto-update fails by falling back to the installed CLI. Signal-based aborts now propagate as a nonzero exit (never NaN), so Ctrl-C doesn’t relaunch dev. - **Bug Fixes** - Fall back to the installed CLI when `npx @latest` fails before the CLI starts (firewalls, offline, npm lock errors). Do not fall back when the child is killed by a signal; propagate 128+signum with a safe nonzero fallback if the signum is unknown. - Startup-marker handshake: parent sets `REEXEC_MARKER_ENV` (`HEXCLAVE_CLI_REEXEC_MARKER`); child touches it on start; `decidePostReexec` chooses propagate vs fallback. If the marker can’t be created, keep the old always-propagate behavior. - Scrub the marker env from user commands using the exported constant, covering both Windows and POSIX spawn paths. Written for commit cb2635f2aefeba658afd27e50c3f09b9124703fb. Summary will update on new commits. Review in cubic ## Summary by CodeRabbit ## Release Notes * **Bug Fixes** * Updated the `dev` command to prevent internal re-exec environment details from being inherited by user commands. * Improved self-update re-exec fallback logic to better detect whether the CLI actually started, with more reliable exit-code and signal handling. * **Tests** * Added deterministic test coverage for the post-re-exec decision path and signal/exit propagation scenarios, including early `npx` failures. --- packages/cli/src/commands/dev.ts | 10 +- packages/cli/src/lib/self-update.test.ts | 173 +++++++++++++++++++++++ packages/cli/src/lib/self-update.ts | 128 ++++++++++++++--- 3 files changed, 292 insertions(+), 19 deletions(-) diff --git a/packages/cli/src/commands/dev.ts b/packages/cli/src/commands/dev.ts index 50630b72b..26ac5ee08 100644 --- a/packages/cli/src/commands/dev.ts +++ b/packages/cli/src/commands/dev.ts @@ -9,7 +9,7 @@ import { resolveConfigFilePathOption } from "../lib/config-file-path.js"; import { devEnvStatePath, ensureLocalDashboardSecret, readDevEnvState, recordLocalDashboardProcess } from "../lib/dev-env-state.js"; import { CliError } from "../lib/errors.js"; import { cliVersion } from "../lib/own-package.js"; -import { maybeReexecToLatest } from "../lib/self-update.js"; +import { maybeReexecToLatest, REEXEC_MARKER_ENV } from "../lib/self-update.js"; type ChildCommand = { command: string, @@ -788,14 +788,18 @@ child.on("error", (error) => { `; function runChildProcess(command: ChildCommand, env: NodeJS.ProcessEnv): Promise { + // Scrub the internal re-exec handshake marker so it never leaks into the user's + // command. Done here (not in the wrapper script) to cover both spawn paths. + const childEnv = { ...env }; + delete childEnv[REEXEC_MARKER_ENV]; return new Promise((resolvePromise, reject) => { const child = process.platform === "win32" - ? spawn(command.command, command.args, { stdio: "inherit", env }) + ? spawn(command.command, command.args, { stdio: "inherit", env: childEnv }) : spawn(process.execPath, ["-e", APP_COMMAND_WRAPPER_SCRIPT], { detached: true, stdio: "inherit", env: { - ...env, + ...childEnv, [APP_COMMAND_WRAPPER_PARENT_PID_ENV_VAR]: String(process.pid), [APP_COMMAND_WRAPPER_COMMAND_ENV_VAR]: command.command, [APP_COMMAND_WRAPPER_ARGS_ENV_VAR]: JSON.stringify(command.args), diff --git a/packages/cli/src/lib/self-update.test.ts b/packages/cli/src/lib/self-update.test.ts index 2b35eb5f3..84394a6fc 100644 --- a/packages/cli/src/lib/self-update.test.ts +++ b/packages/cli/src/lib/self-update.test.ts @@ -1,11 +1,28 @@ import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { existsSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from "fs"; +import { tmpdir } from "os"; +import { join } from "path"; +import { EventEmitter } from "events"; +import * as childProcess from "child_process"; +import * as ownPackage from "./own-package.js"; + +// `spawn` is a non-configurable built-in export, so it can't be vi.spyOn'd; +// replace it with a vi.fn. Everything else in child_process stays real. +vi.mock("child_process", async (importOriginal) => { + const actual = await importOriginal(); + return { ...actual, spawn: vi.fn() }; +}); + import { buildNpxInvocation, + decidePostReexec, decideReexec, DISABLE_AUTO_UPDATE_ENV, isEnvFlagEnabled, maybeReexecToLatest, + REEXEC_MARKER_ENV, shouldAutoUpdate, + signalReexecStartedIfChild, SKIP_AUTO_UPDATE_ENV, } from "./self-update.js"; import type { OwnPackage } from "./own-package.js"; @@ -179,3 +196,159 @@ describe("maybeReexecToLatest", () => { await expect(maybeReexecToLatest({ forwardArgs: ["dev"] })).resolves.toBeUndefined(); }); }); + +describe("decidePostReexec", () => { + it("propagates the exit code when the CLI ran to completion (code 0)", () => { + expect(decidePostReexec({ result: { exited: true, code: 0, signal: null }, started: true })) + .toEqual({ kind: "exit", code: 0 }); + }); + + it("propagates a nonzero exit code when the CLI actually started (real command failure)", () => { + // The re-exec'd CLI ran (marker present) and the wrapped command failed — we + // must surface that failure, not silently re-run it. + expect(decidePostReexec({ result: { exited: true, code: 1, signal: null }, started: true })) + .toEqual({ kind: "exit", code: 1 }); + }); + + it("falls back when npx exits nonzero before the CLI starts (e.g. Lock compromised)", () => { + // npm errored during install/lock; our CLI never ran. Don't take down dev — + // run the installed CLI instead. + const action = decidePostReexec({ result: { exited: true, code: 1, signal: null }, started: false }); + expect(action.kind).toBe("fallback"); + }); + + it("propagates (does not fall back) when npx was killed by a signal before the CLI started", () => { + // e.g. the user pressed Ctrl-C during the download. They want to abort, not + // get a fresh dev session launched on the installed CLI. + expect(decidePostReexec({ result: { exited: true, code: 130, signal: "SIGINT" }, started: false })) + .toEqual({ kind: "exit", code: 130 }); + }); + + it("falls back when npx cannot be spawned at all", () => { + const action = decidePostReexec({ result: { exited: false, error: "spawn npx ENOENT" }, started: false }); + expect(action.kind).toBe("fallback"); + }); +}); + +describe("signalReexecStartedIfChild", () => { + let dir: string; + + beforeEach(() => { + dir = mkdtempSync(join(tmpdir(), "hexclave-reexec-test-")); + }); + + afterEach(() => { + rmSync(dir, { recursive: true, force: true }); + }); + + it("writes the marker file when the marker env var is set (we are the child)", () => { + const marker = join(dir, "started"); + signalReexecStartedIfChild({ [REEXEC_MARKER_ENV]: marker }); + expect(existsSync(marker)).toBe(true); + expect(readFileSync(marker, "utf8")).toBe("1"); + }); + + it("does nothing when no marker env var is set (normal top-level run)", () => { + const marker = join(dir, "started"); + signalReexecStartedIfChild({}); + expect(existsSync(marker)).toBe(false); + }); + + it("does not throw when the marker path is unwritable (best-effort)", () => { + const marker = join(dir, "nonexistent-subdir", "started"); + expect(() => signalReexecStartedIfChild({ [REEXEC_MARKER_ENV]: marker })).not.toThrow(); + expect(existsSync(marker)).toBe(false); + }); +}); + +// End-to-end wiring of maybeReexecToLatest (marker -> spawn -> existence check -> +// decidePostReexec). The decision functions are pure-tested above; these guard +// against the glue regressing. +describe("maybeReexecToLatest fallback wiring", () => { + const managedKeys = [SKIP_AUTO_UPDATE_ENV, DISABLE_AUTO_UPDATE_ENV, REEXEC_MARKER_ENV]; + const savedEnv: Record = {}; + + beforeEach(() => { + for (const key of managedKeys) { + savedEnv[key] = process.env[key]; + delete process.env[key]; + } + vi.spyOn(ownPackage, "getOwnPackage").mockReturnValue({ + name: "@hexclave/cli", + version: "1.0.0", + binName: "hexclave", + }); + }); + + afterEach(() => { + for (const key of managedKeys) { + if (savedEnv[key] == null) delete process.env[key]; + else process.env[key] = savedEnv[key]; + } + vi.restoreAllMocks(); + }); + + // Fake npx child. `writeMarker` simulates whether the re-exec'd CLI touched the + // marker (as signalReexecStartedIfChild would) before the process closes. + function mockNpxChild(opts: { writeMarker: boolean }): EventEmitter & { pid: number, kill: () => void } { + const child = Object.assign(new EventEmitter(), { pid: 4242, kill: () => {} }); + vi.mocked(childProcess.spawn).mockImplementation((( + _command: string, + _args: readonly string[], + spawnOpts: { env?: NodeJS.ProcessEnv }, + ) => { + if (opts.writeMarker) { + const markerFile = spawnOpts.env?.[REEXEC_MARKER_ENV]; + if (markerFile != null) writeFileSync(markerFile, "1"); + } + return child; + }) as unknown as typeof childProcess.spawn); + return child; + } + + it("falls back (never calls process.exit) when npx exits nonzero without the CLI starting", async () => { + const exitSpy = vi.spyOn(process, "exit").mockImplementation(((() => { + throw new Error("process.exit should not have been called"); + }) as never)); + const child = mockNpxChild({ writeMarker: false }); + + const promise = maybeReexecToLatest({ forwardArgs: ["dev"] }); + child.emit("close", 1, null); + + await expect(promise).resolves.toBeUndefined(); + expect(exitSpy).not.toHaveBeenCalled(); + }); + + it("propagates the exit code (calls process.exit) when the CLI started then failed", async () => { + const exitSpy = vi.spyOn(process, "exit").mockImplementation(((() => { + // Throw so we don't actually exit the test runner; the SUT's catch swallows it. + throw new Error("__exit__"); + }) as never)); + const child = mockNpxChild({ writeMarker: true }); + + const promise = maybeReexecToLatest({ forwardArgs: ["dev"] }); + child.emit("close", 1, null); + + await promise; + expect(exitSpy).toHaveBeenCalledWith(1); + }); + + it("propagates a nonzero, non-NaN code when killed by a signal missing from os.constants", async () => { + const exitSpy = vi.spyOn(process, "exit").mockImplementation(((() => { + throw new Error("__exit__"); + }) as never)); + const child = mockNpxChild({ writeMarker: false }); + + const promise = maybeReexecToLatest({ forwardArgs: ["dev"] }); + // SIGSTKFLT is absent from os.constants.signals on macOS (present on Linux). + // Either way the abort must surface as a real nonzero code, never NaN. + child.emit("close", null, "SIGSTKFLT"); + + await promise; + expect(exitSpy).toHaveBeenCalledTimes(1); + const code = exitSpy.mock.calls[0][0]; + expect(typeof code).toBe("number"); + expect(Number.isNaN(code as number)).toBe(false); + expect(code as number).toBeGreaterThan(0); + }); +}); diff --git a/packages/cli/src/lib/self-update.ts b/packages/cli/src/lib/self-update.ts index 16655014c..c1ad8fb3e 100644 --- a/packages/cli/src/lib/self-update.ts +++ b/packages/cli/src/lib/self-update.ts @@ -1,4 +1,7 @@ import { spawn } from "child_process"; +import { existsSync, mkdtempSync, rmSync, writeFileSync } from "fs"; +import { constants as osConstants, tmpdir } from "os"; +import { join } from "path"; import { forwardSignals } from "./child-process.js"; import { getOwnPackage, type OwnPackage } from "./own-package.js"; @@ -7,6 +10,10 @@ import { getOwnPackage, type OwnPackage } from "./own-package.js"; export const SKIP_AUTO_UPDATE_ENV = "STACK_CLI_SKIP_AUTO_UPDATE"; // User-facing opt-out. Set to a truthy value to never auto-update. export const DISABLE_AUTO_UPDATE_ENV = "STACK_CLI_NO_AUTO_UPDATE"; +// Marker file the re-exec'd child touches on startup; its presence lets the +// parent tell a real command failure from an npx/install failure where our CLI +// never ran. Set by the parent on the child's env. +export const REEXEC_MARKER_ENV = "HEXCLAVE_CLI_REEXEC_MARKER"; const LOG_PREFIX = "[Hexclave] "; @@ -96,8 +103,10 @@ export function decideReexec(opts: { }; } -type ReexecResult = - | { exited: true, code: number } +export type ReexecResult = + // `signal` is set when the child was killed by one (e.g. a forwarded Ctrl-C), + // distinguishing an abort from an npx failure (a bare exit code can't). + | { exited: true, code: number, signal: NodeJS.Signals | null } | { exited: false, error: string }; // Quote an argument for the single cmd.exe command line that Node builds when @@ -110,19 +119,31 @@ function quoteShellArg(arg: string): string { return `"${arg.replace(/"/g, '\\"')}"`; } -function runReexec(invocation: NpxInvocation): Promise { +function runReexec(invocation: NpxInvocation, markerFile: string | null): Promise { return new Promise((resolvePromise) => { const args = invocation.shell ? invocation.args.map(quoteShellArg) : invocation.args; + const env: NodeJS.ProcessEnv = { ...process.env, [SKIP_AUTO_UPDATE_ENV]: "1" }; + if (markerFile != null) env[REEXEC_MARKER_ENV] = markerFile; const child = spawn(invocation.command, args, { stdio: "inherit", - env: { ...process.env, [SKIP_AUTO_UPDATE_ENV]: "1" }, + env, shell: invocation.shell, }); const cleanup = forwardSignals(child); - child.on("close", (code) => { + child.on("close", (code, signal) => { cleanup(); - resolvePromise({ exited: true, code: code ?? 1 }); + if (signal != null) { + // Report the conventional 128 + signal-number exit code. The lookup can + // be undefined at runtime (not every signal is in os.constants.signals on + // every platform); 128 + undefined is NaN and process.exit(NaN) coerces + // to 0, masking the abort — so fall back to a generic nonzero code. + const signalNumber = osConstants.signals[signal] as number | undefined; + const code = signalNumber != null ? 128 + signalNumber : 1; + resolvePromise({ exited: true, code, signal }); + return; + } + resolvePromise({ exited: true, code: code ?? 1, signal: null }); }); // npx missing / not spawnable: report so the caller can fall back to the // installed CLI instead of failing the whole `hexclave dev`. @@ -133,13 +154,76 @@ function runReexec(invocation: NpxInvocation): Promise { }); } -// Re-runs the requested command through `npx @latest` so the user always -// gets the latest CLI + dashboard without reinstalling, then exits with the -// child's code. The re-exec'd child carries SKIP_AUTO_UPDATE_ENV so it runs the -// freshly downloaded CLI directly instead of recursing. Best-effort: if npx -// can't be spawned (or auto-update is off / opted out) we silently fall through -// to the installed CLI. +// What the parent does once the re-exec'd npx process is done: `exit` propagates +// the child's code (our CLI ran), `fallback` runs the installed CLI inline (the +// update failed before our CLI ran). Pure so it can be unit-tested. +export type PostReexecAction = + | { kind: "exit", code: number } + | { kind: "fallback", detail: string }; + +// `started` = whether the re-exec'd CLI's startup marker appeared. A nonzero +// exit with it started is a real failure (propagate); without it — or npx not +// spawnable — the update failed before our CLI ran, so we fall back. +export function decidePostReexec(opts: { result: ReexecResult, started: boolean }): PostReexecAction { + const { result, started } = opts; + if (!result.exited) { + return { kind: "fallback", detail: `could not run npx (${result.error})` }; + } + // Killed by a forwarded signal (e.g. Ctrl-C): the user wants to abort, not + // relaunch dev on the installed CLI. Propagate instead of falling back. + if (result.signal != null) { + return { kind: "exit", code: result.code }; + } + if (result.code !== 0 && !started) { + return { kind: "fallback", detail: `npx exited with code ${result.code} before the CLI started` }; + } + return { kind: "exit", code: result.code }; +} + +// Create a unique marker dir; the child writes a file inside it on startup. +// Returns null if the temp dir can't be created, in which case the caller treats +// every exit as "started" (preserving the old always-propagate behavior). +function createReexecMarker(): { dir: string, file: string } | null { + try { + const dir = mkdtempSync(join(tmpdir(), "hexclave-reexec-")); + return { dir, file: join(dir, "started") }; + } catch { + return null; + } +} + +function cleanupReexecMarker(marker: { dir: string } | null): void { + if (marker == null) return; + try { + rmSync(marker.dir, { recursive: true, force: true }); + } catch { + // best-effort temp cleanup + } +} + +// When we're the npx-spawned child (the parent set the marker env), touch the +// marker so the parent knows the latest CLI started. No-op at top level. +export function signalReexecStartedIfChild(env: NodeJS.ProcessEnv): void { + const markerFile = env[REEXEC_MARKER_ENV]; + if (markerFile == null || markerFile === "") return; + try { + writeFileSync(markerFile, "1"); + } catch { + // best-effort; if the write fails the parent just propagates as before. + } +} + +// Re-runs the command through `npx @latest` so the user always gets the +// latest CLI + dashboard without reinstalling, then exits with the child's code. +// The child carries SKIP_AUTO_UPDATE_ENV (run directly, don't recurse) and a +// marker path to signal it started. Best-effort: if the update fails before our +// CLI runs we fall back to the installed CLI instead of failing `hexclave dev`. export async function maybeReexecToLatest(opts: { forwardArgs: string[] }): Promise { + // If npx re-exec'd us to the latest CLI, record that we started so the parent + // can tell a real command failure from an npx/install failure. + signalReexecStartedIfChild(process.env); + + let marker: { dir: string, file: string } | null = null; try { const decision = decideReexec({ env: process.env, @@ -148,13 +232,25 @@ export async function maybeReexecToLatest(opts: { forwardArgs: string[] }): Prom }); if (!decision.reexec) return; - const result = await runReexec(decision.invocation); - if (result.exited) { - process.exit(result.code); + marker = createReexecMarker(); + const result = await runReexec(decision.invocation, marker?.file ?? null); + // No marker (couldn't create one): treat as "started" to keep the old + // always-propagate behavior rather than fall back spuriously. + const started = marker == null || existsSync(marker.file); + cleanupReexecMarker(marker); + marker = null; + + const action = decidePostReexec({ result, started }); + if (action.kind === "exit") { + process.exit(action.code); } - logUpdate(`Could not run npx (${result.error}); continuing with the installed CLI.`); + logUpdate(`Auto-update skipped: ${action.detail}; continuing with the installed CLI.`); } catch { // Fail open: any unexpected error must not block the installed CLI from // running. + } finally { + // Covers early-return/throw/opt-out paths; the success path already cleaned + // up before process.exit (which skips finally). + cleanupReexecMarker(marker); } }