From d51c303fb01fee34fa2fd4da46a2ffaddc6df9d6 Mon Sep 17 00:00:00 2001 From: BilalG1 Date: Mon, 23 Mar 2026 10:09:04 -0700 Subject: [PATCH] fix clickhouse surrogate pair bug (#1270) ## Summary by CodeRabbit * **Bug Fixes** * Enhanced analytics event processing to properly handle edge cases when data contains certain truncated special characters or emoji sequences, ensuring data integrity. * **Tests** * Added coverage for analytics data edge case handling. --- .../latest/analytics/events/batch/route.tsx | 24 +++++++++- .../api/v1/analytics-events-batch.test.ts | 45 +++++++++++++++++++ 2 files changed, 68 insertions(+), 1 deletion(-) diff --git a/apps/backend/src/app/api/latest/analytics/events/batch/route.tsx b/apps/backend/src/app/api/latest/analytics/events/batch/route.tsx index 683b1d8bc..d1403c87f 100644 --- a/apps/backend/src/app/api/latest/analytics/events/batch/route.tsx +++ b/apps/backend/src/app/api/latest/analytics/events/batch/route.tsx @@ -10,6 +10,28 @@ const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[1-8][0-9a-f]{3}-[089ab][0-9a-f]{3}-[0 const MAX_EVENTS = 500; +// Lone surrogates (\uD800-\uDFFF not part of a valid pair) are technically +// representable in JS strings but rejected by ClickHouse's JSON parser. +// The client-side event tracker can produce these when .substring() truncates +// text in the middle of a surrogate pair (e.g. emoji characters). +// eslint-disable-next-line no-control-regex +const LONE_SURROGATE_RE = /[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(? [k, stripLoneSurrogates(v)]) + ); + } + return value; +} + export const POST = createSmartRouteHandler({ metadata: { summary: "Upload analytics event batch", @@ -69,7 +91,7 @@ export const POST = createSmartRouteHandler({ const rows = body.events.map((event) => ({ event_type: event.event_type, event_at: new Date(event.event_at_ms), - data: event.data, + data: stripLoneSurrogates(event.data), project_id: projectId, branch_id: branchId, user_id: userId, diff --git a/apps/e2e/tests/backend/endpoints/api/v1/analytics-events-batch.test.ts b/apps/e2e/tests/backend/endpoints/api/v1/analytics-events-batch.test.ts index 5813cd96d..c47f9a6f5 100644 --- a/apps/e2e/tests/backend/endpoints/api/v1/analytics-events-batch.test.ts +++ b/apps/e2e/tests/backend/endpoints/api/v1/analytics-events-batch.test.ts @@ -160,6 +160,51 @@ it("accepts valid $click events", async ({ expect }) => { `); }); +it("handles click event data containing a truncated surrogate pair (lone high surrogate)", async ({ expect }) => { + await Project.createAndSwitch({ config: { magic_link_enabled: true } }); + await Project.updateConfig({ apps: { installed: { analytics: { enabled: true } } } }); + await Auth.Otp.signIn(); + + // Simulate what the client-side event tracker does: .substring(0, 200) can + // cut a string in the middle of a surrogate pair when emoji characters are + // near the boundary. For example, 🍉 is "\uD83C\uDF49" in UTF-16; cutting + // after the high surrogate leaves a lone "\uD83C" that ClickHouse cannot parse. + const paddedText = "a".repeat(199) + "\uD83C"; // lone high surrogate at position 199 + + const now = Date.now(); + const res = await uploadEventBatch({ + sessionReplaySegmentId: randomUUID(), + batchId: randomUUID(), + sentAtMs: now, + events: [ + { + event_type: "$click", + event_at_ms: now - 50, + data: { + tag_name: "div", + text: paddedText, + href: null, + selector: "div.container", + x: 100, + y: 200, + page_x: 100, + page_y: 500, + viewport_width: 375, + viewport_height: 647, + }, + }, + ], + }); + + expect(res).toMatchInlineSnapshot(` + NiceResponse { + "status": 200, + "body": { "inserted": 1 }, + "headers": Headers {