mirror of
https://github.com/stack-auth/stack.git
synced 2026-06-04 21:04:37 +08:00
fix(email-queue): recover stuck sending without duplicate retry (#1356)
## Summary Email outbox rows can get stuck in `SENDING` if a worker dies after setting `startedSendingAt` but before finishing or unclaiming. This change adds `recoverEmailsStuckInSending`, which runs each email queue step and marks rows past the stuck timeout as **terminal server errors** with delivery status unknown, **without** scheduling an automatic retry (to avoid duplicate sends if the provider already accepted the message). ## Changes - **`recoverEmailsStuckInSending`**: updates stuck rows with `finishedSendingAt`, `canHaveDeliveryInfo: false`, and server error fields; emits Sentry via `captureError` when any rows are recovered. - **Tests**: `email-queue-step.test.tsx` covers recovery of old `startedSendingAt`, no-op for recent sends, and idempotency (second pass does not re-queue). ## Test plan - [ ] `pnpm` / vitest for `apps/backend/src/lib/email-queue-step.test.tsx` (requires dev DB like other integration tests in this package) Made with [Cursor](https://cursor.com) <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **Bug Fixes** * Email reliability: messages that remained stuck in sending are now automatically marked as terminal failures, assigned standardized error details, cleared from retry scheduling, prevented from receiving delivery info, and recovery emits an alert only when actual work occurs. Recovery is safe to run repeatedly (idempotent). * **Tests** * Added integration tests validating recovery behavior, proper field updates, and idempotency. <!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
parent
37e70ca1ae
commit
7957de4182
119
apps/backend/src/lib/email-queue-step.test.tsx
Normal file
119
apps/backend/src/lib/email-queue-step.test.tsx
Normal file
@ -0,0 +1,119 @@
|
||||
import { EmailOutboxCreatedWith } from "@/generated/prisma/client";
|
||||
import { globalPrismaClient } from "@/prisma-client";
|
||||
import { afterAll, describe, expect, it } from "vitest";
|
||||
import { _forTesting } from "./email-queue-step";
|
||||
import { DEFAULT_BRANCH_ID, getSoleTenancyFromProjectBranch } from "./tenancies";
|
||||
|
||||
const { failEmailsStuckInSending, STUCK_EMAIL_TIMEOUT_MS } = _forTesting;
|
||||
|
||||
// These tests connect to the real dev DB (like payments.test.tsx) and create real EmailOutbox
|
||||
// rows against the seeded `internal` tenancy. Each row is tagged with a unique tsxSource so we
|
||||
// can find and clean up just our test rows.
|
||||
describe.sequential("failEmailsStuckInSending", () => {
|
||||
const testRunTag = `stuck-in-sending-test-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
|
||||
const createdIds: { tenancyId: string, id: string }[] = [];
|
||||
|
||||
const recoveryTestFilter = { tsxSource: `/* ${testRunTag} */` };
|
||||
|
||||
const makeRow = async (params: {
|
||||
startedSendingAt: Date | null,
|
||||
finishedSendingAt?: Date | null,
|
||||
isPaused?: boolean,
|
||||
sendRetries?: number,
|
||||
nextSendRetryAt?: Date | null,
|
||||
}) => {
|
||||
const tenancy = await getSoleTenancyFromProjectBranch("internal", DEFAULT_BRANCH_ID);
|
||||
const created = await globalPrismaClient.emailOutbox.create({
|
||||
data: {
|
||||
tenancyId: tenancy.id,
|
||||
tsxSource: recoveryTestFilter.tsxSource,
|
||||
themeId: null,
|
||||
isHighPriority: false,
|
||||
to: { type: "custom-emails", emails: ["stuck-test@example.com"] },
|
||||
extraRenderVariables: {},
|
||||
shouldSkipDeliverabilityCheck: true,
|
||||
createdWith: EmailOutboxCreatedWith.PROGRAMMATIC_CALL,
|
||||
scheduledAt: new Date(0),
|
||||
isQueued: true,
|
||||
renderedByWorkerId: "00000000-0000-0000-0000-000000000000",
|
||||
startedRenderingAt: new Date(0),
|
||||
finishedRenderingAt: new Date(0),
|
||||
renderedHtml: "<p>stuck</p>",
|
||||
renderedText: "stuck",
|
||||
renderedSubject: "stuck",
|
||||
renderedIsTransactional: false,
|
||||
startedSendingAt: params.startedSendingAt,
|
||||
finishedSendingAt: params.finishedSendingAt ?? null,
|
||||
sendRetries: params.sendRetries ?? 0,
|
||||
nextSendRetryAt: params.nextSendRetryAt ?? null,
|
||||
isPaused: params.isPaused ?? false,
|
||||
},
|
||||
});
|
||||
createdIds.push({ tenancyId: created.tenancyId, id: created.id });
|
||||
return created;
|
||||
};
|
||||
|
||||
afterAll(async () => {
|
||||
for (const { tenancyId, id } of createdIds) {
|
||||
await globalPrismaClient.emailOutbox.deleteMany({ where: { tenancyId, id } });
|
||||
}
|
||||
});
|
||||
|
||||
it("marks a row as failed when startedSendingAt is older than the stuck timeout", async () => {
|
||||
const longAgo = new Date(Date.now() - STUCK_EMAIL_TIMEOUT_MS - 60_000);
|
||||
const row = await makeRow({
|
||||
startedSendingAt: longAgo,
|
||||
sendRetries: 1,
|
||||
nextSendRetryAt: new Date(Date.now() + 60_000),
|
||||
});
|
||||
|
||||
await failEmailsStuckInSending(recoveryTestFilter);
|
||||
|
||||
const after = await globalPrismaClient.emailOutbox.findUniqueOrThrow({
|
||||
where: { tenancyId_id: { tenancyId: row.tenancyId, id: row.id } },
|
||||
});
|
||||
expect(after.finishedSendingAt).not.toBeNull();
|
||||
expect(after.startedSendingAt?.toISOString()).toBe(row.startedSendingAt?.toISOString());
|
||||
expect(after.canHaveDeliveryInfo).toBe(false);
|
||||
expect(after.sendServerErrorExternalMessage).toMatch(/timed out/i);
|
||||
expect(after.sendServerErrorInternalMessage).toMatch(/stuck in sending/i);
|
||||
expect(after.sendServerErrorInternalMessage).toMatch(/terminal server error/i);
|
||||
// Must be a terminal state — no retry scheduled.
|
||||
expect(after.nextSendRetryAt).toBeNull();
|
||||
// sendRetries is not bumped by this path (we never attempted the send again).
|
||||
expect(after.sendRetries).toBe(row.sendRetries);
|
||||
// Status must be SERVER_ERROR, not SENDING.
|
||||
expect(after.status).toBe("SERVER_ERROR");
|
||||
});
|
||||
|
||||
it("does not touch a row that started sending recently", async () => {
|
||||
const recently = new Date(Date.now() - 1000);
|
||||
const row = await makeRow({ startedSendingAt: recently });
|
||||
|
||||
await failEmailsStuckInSending(recoveryTestFilter);
|
||||
|
||||
const after = await globalPrismaClient.emailOutbox.findUniqueOrThrow({
|
||||
where: { tenancyId_id: { tenancyId: row.tenancyId, id: row.id } },
|
||||
});
|
||||
expect(after.finishedSendingAt).toBeNull();
|
||||
expect(after.sendServerErrorExternalMessage).toBeNull();
|
||||
expect(after.status).toBe("SENDING");
|
||||
});
|
||||
|
||||
it("does not re-queue rows already marked failed for another send attempt", async () => {
|
||||
const longAgo = new Date(Date.now() - STUCK_EMAIL_TIMEOUT_MS - 60_000);
|
||||
const row = await makeRow({ startedSendingAt: longAgo });
|
||||
|
||||
await failEmailsStuckInSending(recoveryTestFilter);
|
||||
// A second pass should be a no-op for this row: it's already terminal, so it must not
|
||||
// become a candidate for re-sending (which could duplicate an already-accepted delivery).
|
||||
await failEmailsStuckInSending(recoveryTestFilter);
|
||||
|
||||
const after = await globalPrismaClient.emailOutbox.findUniqueOrThrow({
|
||||
where: { tenancyId_id: { tenancyId: row.tenancyId, id: row.id } },
|
||||
});
|
||||
expect(after.nextSendRetryAt).toBeNull();
|
||||
expect(after.isQueued).toBe(true); // unchanged: we do not unclaim stuck rows
|
||||
expect(after.status).toBe("SERVER_ERROR");
|
||||
});
|
||||
});
|
||||
@ -97,7 +97,7 @@ export const runEmailQueueStep = withTraceSpan("runEmailQueueStep", async () =>
|
||||
|
||||
const sendPlan = await withTraceSpan("runEmailQueueStep-prepareSendPlan", prepareSendPlan)(deltaSeconds);
|
||||
await withTraceSpan("runEmailQueueStep-processSendPlan", processSendPlan)(sendPlan);
|
||||
await withTraceSpan("runEmailQueueStep-logEmailsStuckInSending", logEmailsStuckInSending)();
|
||||
await withTraceSpan("runEmailQueueStep-failEmailsStuckInSending", failEmailsStuckInSending)();
|
||||
const sendEnd = performance.now();
|
||||
|
||||
if (sendPlan.length > 0 || queuedCount > 0 || pendingRender.length > 0) {
|
||||
@ -134,25 +134,73 @@ async function retryEmailsStuckInRendering(): Promise<void> {
|
||||
}
|
||||
}
|
||||
|
||||
async function logEmailsStuckInSending(): Promise<void> {
|
||||
const res = await globalPrismaClient.emailOutbox.findMany({
|
||||
where: {
|
||||
startedSendingAt: {
|
||||
lte: new Date(Date.now() - STUCK_EMAIL_TIMEOUT_MS),
|
||||
},
|
||||
finishedSendingAt: null,
|
||||
skippedReason: null,
|
||||
isPaused: false,
|
||||
/**
|
||||
* Mark rows stuck in SENDING for longer than STUCK_EMAIL_TIMEOUT_MS as failed (terminal server error).
|
||||
*
|
||||
* These rows can happen when sending hangs or fails after `startedSendingAt` was set but before
|
||||
* the send attempt was finalized. We deliberately do NOT retry automatically: the upstream send
|
||||
* may have actually been accepted by the SMTP/provider, and retrying would risk a duplicate
|
||||
* delivery. Instead, we mark the row as a terminal server error with delivery status unknown,
|
||||
* and emit a Sentry signal so a human can investigate if desired.
|
||||
*/
|
||||
async function failEmailsStuckInSending(additionalWhere?: Prisma.EmailOutboxWhereInput): Promise<void> {
|
||||
const stuckCutoff = new Date(Date.now() - STUCK_EMAIL_TIMEOUT_MS);
|
||||
const failedAt = new Date();
|
||||
const externalMessage = "Email sending timed out before we could confirm whether it was delivered.";
|
||||
const internalDetails: Prisma.InputJsonObject = {
|
||||
errorType: "stuck-in-sending-marked-failed",
|
||||
stuckTimeoutMs: STUCK_EMAIL_TIMEOUT_MS,
|
||||
failedAt: failedAt.toISOString(),
|
||||
};
|
||||
const externalDetails: Prisma.InputJsonObject = {
|
||||
errorType: "stuck-in-sending-marked-failed",
|
||||
};
|
||||
|
||||
const baseWhere: Prisma.EmailOutboxWhereInput = {
|
||||
startedSendingAt: { lte: stuckCutoff },
|
||||
finishedSendingAt: null,
|
||||
skippedReason: null,
|
||||
isPaused: false,
|
||||
};
|
||||
|
||||
const failed = await globalPrismaClient.emailOutbox.updateManyAndReturn({
|
||||
where: additionalWhere == null ? baseWhere : { AND: [baseWhere, additionalWhere] },
|
||||
data: {
|
||||
finishedSendingAt: failedAt,
|
||||
// canHaveDeliveryInfo must be non-null when finishedSendingAt is set (see schema).
|
||||
// We set it to false because we have no webhook/provider signal we can correlate: delivery
|
||||
// status is effectively unknown, and treating it as "no delivery info available" is the
|
||||
// honest representation.
|
||||
canHaveDeliveryInfo: false,
|
||||
sendServerErrorExternalMessage: externalMessage,
|
||||
sendServerErrorExternalDetails: externalDetails,
|
||||
sendServerErrorInternalMessage: `Email was stuck in sending (startedSendingAt <= ${stuckCutoff.toISOString()}) and was marked as a terminal server error by the email queue step (delivery status unknown). Not retried to avoid duplicate delivery.`,
|
||||
sendServerErrorInternalDetails: internalDetails,
|
||||
nextSendRetryAt: null,
|
||||
// Do not append to sendAttemptErrors here: this path did not observe a provider response
|
||||
// or application error for this attempt, and the terminal server-error fields already
|
||||
// record the failure without duplicating potentially sensitive details.
|
||||
shouldUpdateSequenceId: true,
|
||||
},
|
||||
select: { id: true, tenancyId: true, startedSendingAt: true, to: true, sentAt: true, sendAttemptErrors: true },
|
||||
select: { id: true, tenancyId: true, startedSendingAt: true },
|
||||
});
|
||||
if (res.length > 0) {
|
||||
captureError("email-queue-step-stuck-in-sending", new StackAssertionError(`${res.length} emails stuck in sending! This should never happen. It was NOT correctly marked as an error! Manual intervention is required.`, {
|
||||
emails: res,
|
||||
}));
|
||||
|
||||
if (failed.length > 0) {
|
||||
captureError(
|
||||
"email-queue-step-stuck-in-sending",
|
||||
new StackAssertionError(
|
||||
`${failed.length} emails were stuck in sending and were marked as failed (terminal server errors; delivery status unknown, not retried to avoid duplicate sends). Manual investigation is recommended.`,
|
||||
{ emails: failed.map(({ id, tenancyId, startedSendingAt }) => ({ id, tenancyId, startedSendingAt })) },
|
||||
),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
export const _forTesting = {
|
||||
failEmailsStuckInSending,
|
||||
STUCK_EMAIL_TIMEOUT_MS,
|
||||
};
|
||||
|
||||
async function updateLastExecutionTime(): Promise<number> {
|
||||
const key = "EMAIL_QUEUE_METADATA_KEY";
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user