From d2a84f5a285d62790ab37519a8a6415840d328ad Mon Sep 17 00:00:00 2001 From: BilalG1 Date: Thu, 25 Jun 2026 14:48:49 -0700 Subject: [PATCH] fix: reduce recurring production Sentry errors (Stripe webhooks, email, session replay) (#1667) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary A cleanup pass over recurring production errors triaged from Sentry (`stackframe-pw` org). The common thread: expected/edge-case conditions were thrown as `HexclaveAssertionError` or reported via `captureError`, so Sentry filed them as errors (and, for the Stripe ones, Stripe redelivered indefinitely). Each is now handled at the source or logged at the correct severity. | Sentry issue | Fix | Risk | |---|---|---| | [STACK-BACKEND-1F5](https://stackframe-pw.sentry.io/issues/STACK-BACKEND-1F5) — `Unknown stripe webhook type` (`invoice_payment.paid`, `payout.paid`) | Add both to `ignoredEvents`. They fell through to the throwing `else` and Stripe redelivered them. (`payout.failed`/`canceled`/`updated` intentionally left unhandled for now.) | Trivial | | [STACK-SERVER-1ZV](https://stackframe-pw.sentry.io/issues/STACK-SERVER-1ZV) — session-replay `413 Request body too large` | Measure event size in UTF-8 bytes (was UTF-16 `.length`, which undercounts multibyte content); drop a single oversized event with a warning instead of shipping a doomed request | Low | | [STACK-BACKEND-140](https://stackframe-pw.sentry.io/issues/STACK-BACKEND-140) + [STACK-BACKEND-1F1](https://stackframe-pw.sentry.io/issues/STACK-BACKEND-1F1) — `Unknown error while sending (test) email` | Classify refused SMTP connections (`ECONNREFUSED`, surfaced by nodemailer as `code: 'ESOCKET'`) as a typed `CONNECTION_REFUSED` error with a real user-facing message, instead of falling through to the `UNKNOWN` catch-all in both the low-level sender and the send-test-email route. Marked `canRetry` so the queued-email path reschedules with backoff. 
| Low | ## Notes - **Session replay (1ZV):** edited the `packages/template` source-of-truth; the generated SDK copies are gitignored and regenerated by CI (`pnpm -w run generate-sdks`). The `TextEncoder` is hoisted out of the rrweb emit hot path to avoid per-event allocation. - **Email classification (140/1F1):** the new `CONNECTION_REFUSED` errorType is additive — other consumers only read `errorType` for logging, and the send-test-email route only special-cases `UNKNOWN`, so the new type cleanly bypasses both assertion captures. `canRetry: true` is safe because the connection is refused before any SMTP exchange (no message handed off → no duplicate-delivery risk); transient refusals recover, and a persistent misconfiguration still fails after `MAX_SEND_ATTEMPTS`. The one-shot send-test-email path ignores `canRetry`, so its immediate feedback is unchanged. ## Investigated but intentionally NOT changed here These were initially included, then reverted so we keep getting Sentry signal while the root causes are still under investigation: - **[STACK-BACKEND-1GM](https://stackframe-pw.sentry.io/issues/STACK-BACKEND-1GM)** — `Stripe webhook bad customer id`. Raised for a subscription-changed event with no customer (the observed case was a Stripe-CLI test `payment_intent.succeeded` against a dev-connected account). Skipping is likely the right long-term fix, but the throw is kept for now so we keep observing. Note: in live mode the same path could fire on legitimate customerless one-time payments / guest checkouts. - **[STACK-BACKEND-1CN](https://stackframe-pw.sentry.io/issues/STACK-BACKEND-1CN)** — `Recovered N stale outgoing request(s)`. This is a self-healing recovery notice (no user impact); the underlying cause is the poller process dying between the claim `UPDATE` and the delete. Kept at `captureError` to keep collecting data on how often / why it happens. 
## Verification - `typecheck` clean: `@hexclave/backend`, `@hexclave/template`, `@hexclave/js`, `@hexclave/react`, `@hexclave/next`, `@hexclave/tanstack-start` - `eslint` clean on all touched files --- .../integrations/stripe/webhooks/route.tsx | 2 ++ apps/backend/src/lib/emails-low-level.tsx | 13 +++++++++++++ .../apps/implementations/session-replay.ts | 17 ++++++++++++++++- 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/apps/backend/src/app/api/latest/integrations/stripe/webhooks/route.tsx b/apps/backend/src/app/api/latest/integrations/stripe/webhooks/route.tsx index 7656826eb..3ff7ac17e 100644 --- a/apps/backend/src/app/api/latest/integrations/stripe/webhooks/route.tsx +++ b/apps/backend/src/app/api/latest/integrations/stripe/webhooks/route.tsx @@ -51,7 +51,9 @@ const ignoredEvents = [ "balance.available", "customer.updated", "customer.created", + "invoice_payment.paid", "payout.created", + "payout.paid", "payout.reconciliation_completed", ] as const satisfies Stripe.Event.Type[]; diff --git a/apps/backend/src/lib/emails-low-level.tsx b/apps/backend/src/lib/emails-low-level.tsx index 424c66537..fca4a7936 100644 --- a/apps/backend/src/lib/emails-low-level.tsx +++ b/apps/backend/src/lib/emails-low-level.tsx @@ -141,6 +141,19 @@ async function _lowLevelSendEmailWithoutRetries(options: LowLevelSendEmailOption } as const); } + // nodemailer surfaces a refused connection as code 'ESOCKET' with 'ECONNREFUSED' in the message. + // Safe to retry: the connection was refused before any SMTP exchange, so the message was never + // handed off — there's no duplicate-delivery risk, and a transient refusal (server restarting / + // overloaded) can recover. A persistent misconfig still fails after MAX_SEND_ATTEMPTS. + if (code === 'ECONNREFUSED' || error.message.includes('ECONNREFUSED')) { + return Result.error({ + rawError: error, + errorType: 'CONNECTION_REFUSED', + canRetry: true, + message: 'The email server refused the connection. 
Please make sure the email host and port configuration are correct.', + } as const); + } + if (responseCode === 535 || code === 'EAUTH') { return Result.error({ rawError: error, diff --git a/packages/template/src/lib/hexclave-app/apps/implementations/session-replay.ts b/packages/template/src/lib/hexclave-app/apps/implementations/session-replay.ts index 80bad1ff1..fc9c6c7b4 100644 --- a/packages/template/src/lib/hexclave-app/apps/implementations/session-replay.ts +++ b/packages/template/src/lib/hexclave-app/apps/implementations/session-replay.ts @@ -110,6 +110,9 @@ const MAX_APPROX_BYTES_PER_BATCH = 512_000; // envelope overhead (browser_session_id, timestamps, wrapper keys, etc.). const MAX_FLUSH_PAYLOAD_BYTES = 900_000; +// Reused across the emit hot path to avoid per-event allocation. +const textEncoder = new TextEncoder(); + export type StoredSession = { session_id: string, created_at_ms: number, @@ -286,6 +289,17 @@ export class SessionRecorder { // When _flushInProgress blocked earlier flushes, events can accumulate // well past MAX_APPROX_BYTES_PER_BATCH; sending them all at once would // exceed the server's 1MB body limit (413). + // A single event over the limit can't be sent (rrweb events aren't splittable); drop it and move on. + const firstSize = allSizes[offset] ?? throwErr("_eventSizes out of sync with _events — this should never happen"); + if (firstSize > MAX_FLUSH_PAYLOAD_BYTES) { + captureWarning( + "SessionRecorder.flush", + new Error(`Dropping oversized session replay event (${firstSize} bytes > ${MAX_FLUSH_PAYLOAD_BYTES} byte limit); it cannot be sent without a 413.`), + ); + offset += 1; + continue; + } + let batchBytes = 0; let batchEnd = offset; for (let i = offset; i < allEvents.length; i++) { @@ -384,7 +398,8 @@ export class SessionRecorder { } } - const eventSize = JSON.stringify(event).length; + // Measure UTF-8 byte length to match the server's byte limit (.length counts UTF-16 units, undercounting multibyte content). 
+ const eventSize = textEncoder.encode(JSON.stringify(event)).byteLength; this._events.push(event); this._eventSizes.push(eventSize); this._approxBytes += eventSize;