fix clickhouse surrogate pair bug (#1270)

<!--

Make sure you've read the CONTRIBUTING.md guidelines:
https://github.com/stack-auth/stack-auth/blob/dev/CONTRIBUTING.md

-->


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

* **Bug Fixes**
  * Analytics event ingestion now replaces lone UTF-16 surrogates (left
    behind when text is truncated in the middle of an emoji or another
    surrogate pair) with U+FFFD, so ClickHouse no longer rejects such events.

* **Tests**
  * Added a test that uploads a `$click` event whose text ends in a lone high surrogate.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
BilalG1 2026-03-23 10:09:04 -07:00 committed by GitHub
parent 1d00ed2c64
commit d51c303fb0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 68 additions and 1 deletions

View File

@ -10,6 +10,28 @@ const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[1-8][0-9a-f]{3}-[089ab][0-9a-f]{3}-[0
const MAX_EVENTS = 500;
// Lone surrogates (\uD800-\uDFFF not part of a valid pair) are technically
// representable in JS strings but rejected by ClickHouse's JSON parser.
// The client-side event tracker can produce these when .substring() truncates
// text in the middle of a surrogate pair (e.g. emoji characters).
//
// The pattern has no `u` flag on purpose: it must match individual UTF-16 code
// units. First alternative: a high surrogate not followed by a low surrogate.
// Second alternative: a low surrogate not preceded by a high surrogate.
// eslint-disable-next-line no-control-regex
const LONE_SURROGATE_RE = /[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]/g;

/**
 * Recursively replaces every lone surrogate in a JSON-like value with U+FFFD
 * (the Unicode replacement character), returning a new value and leaving the
 * input unmodified.
 *
 * - Strings are sanitized directly; valid surrogate pairs are kept as-is.
 * - Arrays are mapped element by element.
 * - Objects are rebuilt from their own enumerable string-keyed entries, with
 *   both the keys and the values sanitized. Keys are serialized as JSON strings
 *   just like values, so a lone surrogate in a key is as problematic as one in
 *   a value. If two keys become identical after sanitizing, the later entry wins.
 * - Everything else (numbers, booleans, null, undefined, ...) is returned as-is.
 *
 * @param value - Arbitrary JSON-like data, e.g. an analytics event payload.
 * @returns A structurally equivalent value with no lone surrogates in any string or key.
 */
function stripLoneSurrogates(value: unknown): unknown {
  if (typeof value === "string") {
    return value.replace(LONE_SURROGATE_RE, "\uFFFD");
  }
  if (Array.isArray(value)) {
    return value.map((item) => stripLoneSurrogates(item));
  }
  if (value !== null && typeof value === "object") {
    return Object.fromEntries(
      Object.entries(value).map(([key, nested]) => [
        key.replace(LONE_SURROGATE_RE, "\uFFFD"),
        stripLoneSurrogates(nested),
      ]),
    );
  }
  return value;
}
export const POST = createSmartRouteHandler({
metadata: {
summary: "Upload analytics event batch",
@ -69,7 +91,7 @@ export const POST = createSmartRouteHandler({
const rows = body.events.map((event) => ({
event_type: event.event_type,
event_at: new Date(event.event_at_ms),
data: event.data,
data: stripLoneSurrogates(event.data),
project_id: projectId,
branch_id: branchId,
user_id: userId,

View File

@ -160,6 +160,51 @@ it("accepts valid $click events", async ({ expect }) => {
`);
});
it("handles click event data containing a truncated surrogate pair (lone high surrogate)", async ({ expect }) => {
  // Set up a project with the analytics app enabled and a signed-in user.
  await Project.createAndSwitch({ config: { magic_link_enabled: true } });
  await Project.updateConfig({ apps: { installed: { analytics: { enabled: true } } } });
  await Auth.Otp.signIn();

  // Reproduce the client-side event tracker's truncation: .substring(0, 200)
  // may split a surrogate pair when an emoji sits on the boundary. 🍉 is
  // "\uD83C\uDF49" in UTF-16, so cutting after the first code unit leaves a
  // lone "\uD83C" (here at index 199) that ClickHouse cannot parse.
  const truncatedText = `${"a".repeat(199)}\uD83C`;
  const sentAtMs = Date.now();
  const clickedAtMs = sentAtMs - 50;

  const response = await uploadEventBatch({
    sessionReplaySegmentId: randomUUID(),
    batchId: randomUUID(),
    sentAtMs,
    events: [
      {
        event_type: "$click",
        event_at_ms: clickedAtMs,
        data: {
          tag_name: "div",
          text: truncatedText,
          href: null,
          selector: "div.container",
          x: 100,
          y: 200,
          page_x: 100,
          page_y: 500,
          viewport_width: 375,
          viewport_height: 647,
        },
      },
    ],
  });

  // The batch must be accepted and the single event inserted.
  expect(response).toMatchInlineSnapshot(`
NiceResponse {
"status": 200,
"body": { "inserted": 1 },
"headers": Headers { <some fields may have been hidden> },
}
`);
});
it("rejects empty events array", async ({ expect }) => {
await Project.createAndSwitch({ config: { magic_link_enabled: true } });
await Project.updateConfig({ apps: { installed: { analytics: { enabled: true } } } });