Address Greptile review: escape agent prompt, fix result detection, roll back all agent-written files, fix abort masking

Co-Authored-By: mantra <mantra@stack-auth.com>
This commit is contained in:
Devin AI 2026-06-02 20:26:58 +00:00
parent e3c7065c4f
commit 14bd4254fc
3 changed files with 136 additions and 33 deletions

View File

@ -1,15 +1,16 @@
import { mkdtempSync, readFileSync, rmSync, writeFileSync } from "fs";
import { existsSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from "fs";
import { join } from "path";
import { afterEach, describe, expect, it, vi } from "vitest";
// Lets each AI-path test inject what the (otherwise network-backed) agent does
// to the files on disk, so we can exercise the orchestration and validation
// around `updateConfigObject` without calling a real model.
let mockAgentImpl: ((options: { prompt: string, cwd: string }) => void | Promise<void>) | null = null;
type MockAgentOptions = { prompt: string, cwd: string, onFileWillChange?: (filePath: string) => void };
let mockAgentImpl: ((options: MockAgentOptions) => void | Promise<void>) | null = null;
vi.mock("server-only", () => ({}));
vi.mock("./config-update-agent", () => ({
runConfigUpdateAgent: async (options: { prompt: string, cwd: string }) => {
runConfigUpdateAgent: async (options: MockAgentOptions) => {
if (mockAgentImpl == null) {
throw new Error("runConfigUpdateAgent was called but no mock implementation was set for this test.");
}
@ -19,9 +20,20 @@ vi.mock("./config-update-agent", () => ({
let tempDir: string | undefined;
function getTempDir(): string {
if (tempDir == null) {
tempDir = mkdtempSync(join(process.cwd(), ".stack-rde-config-test-"));
// Give the temp dir its own package.json so SDK-package detection (which
// walks up to the nearest package.json) resolves deterministically here
// instead of picking up the dashboard's own dependencies, which would make
// the rendered `StackConfig` import env-dependent.
writeFileSync(join(tempDir, "package.json"), JSON.stringify({ name: "stack-rde-config-test" }), "utf-8");
}
return tempDir;
}
function writeTempFile(name: string, content: string): string {
tempDir ??= mkdtempSync(join(process.cwd(), ".stack-rde-config-test-"));
const filePath = join(tempDir, name);
const filePath = join(getTempDir(), name);
writeFileSync(filePath, content, "utf-8");
return filePath;
}
@ -331,6 +343,32 @@ describe("remote development environment config file", () => {
expect(readFileSync(templatePath, "utf-8")).toBe("export default <div>Old email</div>;\n");
});
it("rolls back a brand-new file the agent creates outside the statically-imported set", async () => {
// A wrapped config takes the agent path (it isn't a plain static literal),
// so the agent mock actually runs.
const configSource = `import { defineStackConfig } from "@hexclave/shared/config";\nexport const config = defineStackConfig({ auth: { allowSignUp: true } });\n`;
const configPath = writeTempConfig(configSource);
const newFilePath = join(getTempDir(), "generated-extra.ts");
const { updateConfigObject } = await import("./config-file");
// The agent creates a file that the config doesn't statically import, so the
// only way it can be rolled back is via the `onFileWillChange` hook firing
// before the write. After creating it, the agent fails so the run must roll
// back — deleting the newly-created file.
mockAgentImpl = (options) => {
options.onFileWillChange?.(newFilePath);
writeFileSync(newFilePath, "export const extra = true;\n", "utf-8");
throw new Error("agent blew up");
};
await expect(updateConfigObject(configPath, { "auth.allowSignUp": false }))
.rejects.toThrow("agent blew up");
expect(existsSync(newFilePath)).toBe(false);
expect(readFileSync(configPath, "utf-8")).toBe(configSource);
});
it("fails a non-evaluable update when the agent leaves every file unchanged", async () => {
const templatePath = writeTempFile("welcome-email.tsx", "export default <div>Old email</div>;\n");
const configSource = `import welcomeEmail from "./welcome-email.tsx" with { type: "text" };\n\nexport const config = {\n emails: { templates: { welcome: welcomeEmail } },\n};\n`;

View File

@ -166,14 +166,18 @@ export async function updateConfigObject(configFilePath: string, configUpdate: C
// full semantic check afterwards), then let the agent apply the change.
const baselineConfig = await tryReadConfigForValidation(configFilePath);
// Snapshot the config file and any externally-referenced files the agent might
// edit, so we can roll back to the original state if the agent fails or its
// result doesn't validate — never leaving a half-applied update behind.
// Snapshot the config file and its statically-referenced imports up front, then
// additionally capture any file the agent is about to write *before* it writes
// (via `onFileWillChange`). Together these let us roll back to the exact
// original state if the agent fails or its result doesn't validate — including
// brand-new files and files the agent edits that weren't statically imported —
// so we never leave a half-applied update behind.
const snapshots = snapshotConfigFiles(configFilePath, content);
try {
await runConfigUpdateAgent({
prompt: buildConfigUpdatePrompt(path.basename(configFilePath), configUpdate),
cwd: path.dirname(configFilePath),
onFileWillChange: (filePath) => captureSnapshotIfAbsent(snapshots, filePath),
});
await validateAgentUpdate(configFilePath, baselineConfig, configUpdate, snapshots);
} catch (error) {
@ -186,26 +190,30 @@ export async function updateConfigObject(configFilePath: string, configUpdate: C
type ConfigFileSnapshot = { path: string, content: string | null };
/**
* Captures the config file plus every file it imports via a relative path (e.g.
* `import x from "./welcome-email.tsx" with { type: "text" }`), which are exactly
* the files an in-place update is expected to touch. Files are read as UTF-8
* text, which matches this feature's text-import use case.
* Captures a file's current on-disk content into `snapshots` (recording `null`
* if it doesn't exist) unless it's already captured. Paths are resolved to
* absolute form so the same file is never snapshotted twice under different
* spellings. Files are read as UTF-8 text, which matches this feature's
* text-import use case.
*/
function captureSnapshotIfAbsent(snapshots: ConfigFileSnapshot[], filePath: string): void {
const resolved = path.resolve(filePath);
if (snapshots.some((snapshot) => snapshot.path === resolved)) return;
snapshots.push({ path: resolved, content: existsSync(resolved) ? readFileSync(resolved, "utf-8") : null });
}
/**
* Seeds the rollback set with the config file plus every file it imports via a
* relative path (e.g. `import x from "./welcome-email.tsx" with { type: "text" }`),
* which are exactly the files an in-place update is expected to touch. Any other
* file the agent ends up writing is captured on the fly via `onFileWillChange`,
* so this static pass is just a best-effort head start, not the full guarantee.
*/
function snapshotConfigFiles(configFilePath: string, configContent: string): ConfigFileSnapshot[] {
const dir = path.dirname(configFilePath);
const snapshots: ConfigFileSnapshot[] = [{ path: configFilePath, content: configContent }];
const snapshots: ConfigFileSnapshot[] = [{ path: path.resolve(configFilePath), content: configContent }];
for (const specifier of getRelativeImportSpecifiers(configContent)) {
const resolved = path.resolve(dir, specifier);
// Skip imports that resolve outside the config directory (e.g. `../shared/x`).
// The agent runs with its cwd set to the config directory, so it can only edit
// files within it; parent-directory imports are valid project layouts but
// simply aren't snapshot/rollback targets here (the agent won't touch them).
const relative = path.relative(dir, resolved);
if (relative.startsWith("..") || path.isAbsolute(relative)) {
continue;
}
if (snapshots.some((snapshot) => snapshot.path === resolved)) continue;
snapshots.push({ path: resolved, content: existsSync(resolved) ? readFileSync(resolved, "utf-8") : null });
captureSnapshotIfAbsent(snapshots, path.resolve(dir, specifier));
}
return snapshots;
}
@ -312,7 +320,11 @@ function flattenConfigUpdate(update: Config): ConfigChange[] {
function buildConfigUpdatePrompt(configFileName: string, configUpdate: Config): string {
const changes = flattenConfigUpdate(configUpdate);
const changeLines = changes.map(({ path: configPath, value }) => {
return `- \`${configPath}\`: set to ${JSON.stringify(value)}`;
// Both the path and value come from the (untrusted) config update, so they're
// JSON-encoded rather than interpolated raw — this escapes backticks, quotes,
// and newlines that could otherwise break out of the prompt's formatting or
// inject extra instructions.
return `- ${JSON.stringify(configPath)}: set to ${JSON.stringify(value)}`;
}).join("\n");
return `You are editing a Hexclave / Stack Auth configuration file in place. Apply a set of configuration changes WITHOUT changing how the file is written.

View File

@ -1,5 +1,6 @@
import "server-only";
import path from "path";
import { query } from "@anthropic-ai/claude-agent-sdk";
/**
@ -33,6 +34,36 @@ const AGENT_TIMEOUT_MS: number = (() => {
return parsed;
})();
/**
* True for the error produced when an `AbortController` aborts an awaited
* operation. The SDK surfaces this as either a real `AbortError` or a
* `DOMException` whose `name` is `"AbortError"`.
*/
function isAbortError(error: unknown): boolean {
return error instanceof Error && error.name === "AbortError";
}
// The file-mutating tools we allow the agent to use, each of which takes the
// target file in a `file_path` field. We watch these so callers can snapshot a
// file for rollback *before* the agent overwrites it.
const FILE_MUTATING_TOOLS = new Set(["Write", "Edit", "MultiEdit", "NotebookEdit"]);
function hasStringFilePath(input: unknown): input is { file_path: string } {
return typeof input === "object" && input !== null && "file_path" in input && typeof input.file_path === "string";
}
/**
* Returns the absolute path a file-mutating tool call is about to write, or
* `null` if the tool doesn't write a file. Relative paths are resolved against
* the agent's `cwd` to match where the tool will actually write.
*/
function getToolWriteTargetPath(toolName: string, toolInput: unknown, cwd: string): string | null {
if (!FILE_MUTATING_TOOLS.has(toolName) || !hasStringFilePath(toolInput)) {
return null;
}
return path.isAbsolute(toolInput.file_path) ? toolInput.file_path : path.resolve(cwd, toolInput.file_path);
}
function stripClaudeCodeEnv(): Record<string, string | undefined> {
const env = { ...process.env };
// Removing CLAUDECODE prevents the SDK from detecting a nested agent. The
@ -51,6 +82,11 @@ function stripClaudeCodeEnv(): Record<string, string | undefined> {
export async function runConfigUpdateAgent(options: {
prompt: string,
cwd: string,
// Called with the absolute path of each file the agent is about to write or
// edit, *before* the change happens, so the caller can capture the original
// content for rollback even when the file isn't statically referenced by the
// config (a new file, or an existing-but-unimported file).
onFileWillChange?: (filePath: string) => void,
}): Promise<void> {
const abortController = new AbortController();
const timeout = setTimeout(() => abortController.abort(), AGENT_TIMEOUT_MS);
@ -66,6 +102,17 @@ export async function runConfigUpdateAgent(options: {
// runs server-side so it must be fully isolated from the host environment.
settingSources: [],
strictMcpConfig: true,
hooks: options.onFileWillChange == null ? undefined : {
PreToolUse: [{
hooks: [async (input) => {
if (input.hook_event_name === "PreToolUse") {
const target = getToolWriteTargetPath(input.tool_name, input.tool_input, options.cwd);
if (target != null) options.onFileWillChange?.(target);
}
return { continue: true };
}],
}],
},
// Bash is intentionally omitted: applying a config delta only needs file
// inspection and editing, and withholding shell access reduces the blast
// radius of running an agent against the user's project.
@ -82,17 +129,23 @@ export async function runConfigUpdateAgent(options: {
stderr: (data: string) => { console.warn(`${LOG_PREFIX} [agent] ${data}`); },
},
})) {
// Detect the terminal message via `"result" in message` to match the CLI
// (`runClaudeAgent`); only a successful result carries a `result` field,
// so any other terminal subtype is reported as a failure.
if ("result" in message) {
sawResult = true;
} else if (message.type === "result") {
throw new Error(`Config update agent failed (${message.subtype}). It was unable to apply the config changes to the file.`);
// Only the terminal `result` message signals completion; a successful one
// carries a `result` field, so any other `result`-type subtype is a
// failure. Gating both branches on `type === "result"` avoids treating an
// intermediate message that happens to carry a `result` property as done.
if (message.type === "result") {
if ("result" in message) {
sawResult = true;
} else {
throw new Error(`Config update agent failed (${message.subtype}). It was unable to apply the config changes to the file.`);
}
}
}
} catch (error) {
if (abortController.signal.aborted) {
// Only translate to a timeout error when the failure is the abort we
// triggered; otherwise a real agent error that races with the timeout would
// be masked by the generic "timed out" message.
if (abortController.signal.aborted && isAbortError(error)) {
throw new Error(`Config update agent timed out after ${AGENT_TIMEOUT_MS}ms. It was unable to apply the config changes to the file.`);
}
throw error;