chatwoot/enterprise/app/jobs/captain/documents/perform_sync_job.rb
Aakash Bakhle d20950c5b4
feat: scheduler fairness [AI-159] (#14425)
# Pull Request Template

## Description
Better scheduling and queueing mechanics for document auto-sync
- add plan-wise jitter for document sync
- move auto-sync documents to purgeable queue

## Type of change

Please delete options that are not relevant.

- [x] New feature (non-breaking change which adds functionality)

## How Has This Been Tested?

Please describe the tests that you ran to verify your changes. Provide
instructions so we can reproduce. Please also list any relevant details
for your test configuration.
Tested locally and with specs.

## Checklist:

- [x] My code follows the style guidelines of this project
- [x] I have performed a self-review of my code
- [x] I have commented on my code, particularly in hard-to-understand
areas
- [ ] I have made corresponding changes to the documentation
- [x] My changes generate no new warnings
- [x] I have added tests that prove my fix is effective or that my
feature works
- [x] New and existing unit tests pass locally with my changes
- [x] Any dependent changes have been merged and published in downstream
modules

---------

Co-authored-by: Sivin Varghese <64252451+iamsivin@users.noreply.github.com>
Co-authored-by: iamsivin <iamsivin@gmail.com>
Co-authored-by: Muhsin Keloth <muhsinkeramam@gmail.com>
Co-authored-by: Sony Mathew <sony@chatwoot.com>
Co-authored-by: Vishnu Narayanan <iamwishnu@gmail.com>
2026-05-27 16:01:51 +05:30

120 lines
4.7 KiB
Ruby

# Background job that syncs one Captain document through Captain::Documents::SyncService
# while holding a per-document lock, and emits one structured log line per outcome.
#
# Failure policy is declared at class level with retry_on / discard_on; #perform only logs
# (and, for unexpected errors, marks the document failed) before re-raising into that policy.
class Captain::Documents::PerformSyncJob < MutexApplicationJob
  queue_as :low

  # Fetching one page and comparing fingerprints should take seconds, so 10 minutes leaves
  # plenty of slack: a document still "syncing" past that point most likely lost its worker
  # mid-run. ScheduleSyncsJob uses the same value so stale locks get re-enqueued at the
  # same threshold.
  LOCK_TIMEOUT = 10.minutes

  # Catch-all for failures not rescued by name (parser bugs, ActiveRecord blips, infra
  # issues). Three attempts give a genuine hiccup room to recover. The exhaustion block
  # swallows the last exception so Sidekiq does not stack its own retry policy on top, and
  # it is the only place that reports to Sentry: handle_unexpected_failure logs without
  # capturing, so a deterministic bug yields a single Sentry event rather than one per try.
  # Declared first because retry_on handlers are matched bottom-to-top.
  retry_on StandardError, wait: 5.seconds, attempts: 3 do |exhausted_job, exception|
    doc = exhausted_job.arguments.first
    ChatwootExceptionTracker.new(exception, account: doc.account).capture_exception
    exhausted_job.send(
      :log_sync_outcome, doc,
      result: :unexpected_retry_exhausted,
      error_code: 'sync_error',
      exception_class: exception.class.name
    )
  end

  # Permanent failures (404, 403, empty content) cannot succeed on retry, so drop the job.
  # SyncService has already marked the document failed before the error gets here.
  discard_on Captain::Documents::SyncService::PermanentSyncError

  # SyncService raises TransientSyncError when the customer's site cannot be reached
  # (timeouts, TLS errors, 5xx, dropped connections). Four attempts with growing waits give
  # the site time to come back before the document is marked failed.
  #
  # The exhaustion block swallows the exception so it never reaches Sentry: a flaky
  # customer site is not an application bug.
  retry_on(
    Captain::Documents::SyncService::TransientSyncError,
    # Waits are 30s, 2m, 5m by execution count; anything beyond the table falls back to 5m.
    wait: ->(executions) { [30.seconds, 2.minutes, 5.minutes].fetch(executions - 1, 5.minutes) },
    attempts: 4
  ) do |exhausted_job, exception|
    doc = exhausted_job.arguments.first
    exhausted_job.send(:mark_sync_failed, doc, exception.message)
    exhausted_job.send(
      :log_sync_outcome, doc,
      result: :transient_retry_exhausted,
      error_code: exception.message
    )
  end

  discard_on ActiveJob::DeserializationError
  discard_on ActiveRecord::RecordNotFound

  # Syncs +document+ under its per-document lock. PDF documents are skipped.
  #
  # Rescue behaviour:
  # - LockAcquisitionError: logged as :already_syncing and not re-raised.
  # - Permanent / Transient sync errors: logged, then re-raised so the class-level
  #   discard_on / retry_on declarations decide what happens next.
  # - Any other StandardError: document is marked failed, logged, and re-raised.
  #
  # @param document the document record to sync
  def perform(document)
    # Captured before anything else so every failure path below can report a duration.
    started_at = Time.current
    return if document.pdf_document?

    with_lock(lock_key(document), LOCK_TIMEOUT) { perform_sync(document, started_at) }
  rescue LockAcquisitionError
    log_sync_outcome(document, result: :already_syncing)
  rescue Captain::Documents::SyncService::PermanentSyncError => error
    log_failure_and_raise(document, :permanent_failure, error, started_at)
  rescue Captain::Documents::SyncService::TransientSyncError => error
    log_failure_and_raise(document, :transient_failure, error, started_at)
  rescue StandardError => error
    handle_unexpected_failure(document, error, started_at)
  end

  private

  # Flags the document as syncing, runs SyncService against a reloaded copy, and logs the
  # service's return value together with the elapsed time.
  def perform_sync(document, start_time)
    mark_sync_started(document)
    service = Captain::Documents::SyncService.new(document.reload)
    outcome = service.perform
    log_sync_outcome(document, result: outcome, duration_ms: duration_ms_since(start_time))
  end

  # Writes one JSON log line at info level: the document/account/assistant ids followed by
  # whatever keyword +fields+ the caller supplies.
  def log_sync_outcome(document, **fields)
    payload = {
      document_id: document.id,
      account_id: document.account_id,
      assistant_id: document.assistant_id,
      **fields
    }
    Rails.logger.info("[Captain::Documents::PerformSyncJob] #{payload.to_json}")
  end

  # Logs a named sync failure (+result+, with the error message as error_code) and
  # re-raises +error+ unchanged.
  def log_failure_and_raise(document, result, error, start_time)
    log_sync_outcome(
      document,
      result: result,
      error_code: error.message,
      duration_ms: duration_ms_since(start_time)
    )
    raise error
  end

  # Persists a failed sync attempt with the given error code.
  def mark_sync_failed(document, error_code)
    record_sync_attempt(document, status: :failed, error_code: error_code)
  end

  # Persists the start of a sync attempt and clears any previous error code.
  def mark_sync_started(document)
    record_sync_attempt(document, status: :syncing, error_code: nil)
  end

  # Shared writer for the sync bookkeeping columns; always resets sync_step and stamps
  # last_sync_attempted_at with the current time. Raises if the update fails (update!).
  def record_sync_attempt(document, status:, error_code:)
    document.update!(
      sync_status: status,
      sync_step: nil,
      last_sync_error_code: error_code,
      last_sync_attempted_at: Time.current
    )
  end

  # Handles an error that was not rescued by name: marks the document failed with the
  # generic 'sync_error' code, logs it (no Sentry capture here), and re-raises +error+.
  def handle_unexpected_failure(document, error, start_time)
    mark_sync_failed(document, 'sync_error')
    log_sync_outcome(
      document,
      result: :unexpected_failure,
      error_code: 'sync_error',
      exception_class: error.class.name,
      duration_ms: duration_ms_since(start_time)
    )
    raise error
  end

  # Builds the lock key for +document+ from the CAPTAIN_DOCUMENT_SYNC_MUTEX template.
  def lock_key(document)
    format(::Redis::Alfred::CAPTAIN_DOCUMENT_SYNC_MUTEX, document_id: document.id)
  end

  # Whole milliseconds elapsed between +start_time+ and now.
  def duration_ms_since(start_time)
    elapsed_seconds = Time.current - start_time
    (elapsed_seconds * 1000).round
  end
end