fix: enable prompt caching for Anthropic agent requests

Add an ephemeral cache_control breakpoint on the system message so that OpenRouter forwards it to the upstream Anthropic providers (Anthropic direct, Amazon Bedrock, Google Vertex). Without the breakpoint, none of these providers cache the prompt — the OpenRouter activity log showed tokens_cached=0 on every Claude request. Verified with live OpenRouter calls: the second identical request now reports a ~99% cache hit on the system prompt, cutting per-call input cost by ~12x.
2026-06-19 21:00:40 +08:00 · 2026-04-13 20:01:30 -07:00 · 2026-04-13 20:01:30 -07:00 · d16e3d84ba
commit d16e3d84ba
parent 5399142db9
1 changed file with 17 additions and 4 deletions
--- a/apps/backend/src/app/api/latest/ai/query/[mode]/route.ts
+++ b/apps/backend/src/app/api/latest/ai/query/[mode]/route.ts
@@ -53,11 +53,25 @@ export const POST = createSmartRouteHandler({
    const isDocsOrSearch = systemPromptId === "docs-ask-ai" || systemPromptId === "command-center-ask-ai";
    const stepLimit = toolsArg == null ? 1 : isDocsOrSearch ? 50 : 5;

+    // Anthropic models require an explicit cache_control breakpoint for prompt caching
+    // to work via OpenRouter (whether routed to Anthropic, Bedrock, or Google Vertex).
+    // Mark the static system prompt as an ephemeral cache breakpoint.
+    const isAnthropic = model.modelId.startsWith("anthropic/");
+    const systemMessage: ModelMessage = {
+      role: "system",
+      content: systemPrompt,
+      ...(isAnthropic && {
+        providerOptions: {
+          openrouter: { cacheControl: { type: "ephemeral" } },
+        },
+      }),
+    };
+    const fullMessages: ModelMessage[] = [systemMessage, ...(messages as ModelMessage[])];
+
    if (mode === "stream") {
      const result = streamText({
        model,
-        system: systemPrompt,
-        messages: messages as ModelMessage[],
+        messages: fullMessages,
        tools: toolsArg,
        stopWhen: stepCountIs(stepLimit),
      });
@@ -71,8 +85,7 @@ export const POST = createSmartRouteHandler({
      const timeoutId = setTimeout(() => controller.abort(), 120_000);
      const result = await generateText({
        model,
-        system: systemPrompt,
-        messages: messages as ModelMessage[],
+        messages: fullMessages,
        tools: toolsArg,
        abortSignal: controller.signal,
        stopWhen: stepCountIs(stepLimit),