mirror of
https://github.com/chatwoot/chatwoot.git
synced 2026-06-04 21:02:35 +08:00
# Pull Request Template ## Description Better scheduling and queueing mechanics for document auto-sync - add jitter plan wise for document sync - move auto-sync documents to purgeable queue ## Type of change Please delete options that are not relevant. - [x] New feature (non-breaking change which adds functionality) ## How Has This Been Tested? Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce. Please also list any relevant details for your test configuration. locally tested and with specs ## Checklist: - [x] My code follows the style guidelines of this project - [x] I have performed a self-review of my code - [x] I have commented on my code, particularly in hard-to-understand areas - [ ] I have made corresponding changes to the documentation - [x] My changes generate no new warnings - [x] I have added tests that prove my fix is effective or that my feature works - [x] New and existing unit tests pass locally with my changes - [x] Any dependent changes have been merged and published in downstream modules --------- Co-authored-by: Sivin Varghese <64252451+iamsivin@users.noreply.github.com> Co-authored-by: iamsivin <iamsivin@gmail.com> Co-authored-by: Muhsin Keloth <muhsinkeramam@gmail.com> Co-authored-by: Sony Mathew <sony@chatwoot.com> Co-authored-by: Vishnu Narayanan <iamwishnu@gmail.com>
120 lines
4.7 KiB
Ruby
120 lines
4.7 KiB
Ruby
# Syncs one Captain document while holding a per-document mutex, and emits a structured
# log line describing how the attempt ended. PDF documents are skipped.
class Captain::Documents::PerformSyncJob < MutexApplicationJob
  queue_as :low

  # Fetching one page and comparing fingerprints should finish in seconds, so 10 minutes
  # leaves plenty of headroom; a document still "syncing" past that point most likely lost
  # its worker mid-run. ScheduleSyncsJob shares this value so stale locks are re-enqueued
  # at the same threshold.
  LOCK_TIMEOUT = 10.minutes

  # Delay before each retry of a transient failure, indexed by (executions - 1).
  TRANSIENT_BACKOFF = [30.seconds, 2.minutes, 5.minutes].freeze

  # Catch-all for failures that are not rescued by name: parser bugs, ActiveRecord blips,
  # random infra issues. Three attempts give a genuine hiccup room to recover. The
  # exhaustion block swallows the last exception so Sidekiq does not stack its own retry
  # policy on top, and it is the only place that reports to Sentry:
  # handle_unexpected_failure logs without capturing, so a deterministic bug produces one
  # Sentry event rather than one per attempt.
  # Declared first because retry_on handlers are dispatched bottom-to-top.
  retry_on StandardError, wait: 5.seconds, attempts: 3 do |exhausted_job, exception|
    failed_document = exhausted_job.arguments.first
    ChatwootExceptionTracker.new(exception, account: failed_document.account).capture_exception
    exhausted_job.send(
      :log_sync_outcome,
      failed_document,
      result: :unexpected_retry_exhausted,
      error_code: 'sync_error',
      exception_class: exception.class.name
    )
  end

  # Permanent errors (404, 403, empty content) can never succeed on retry, so drop the job.
  # SyncService has already marked the document failed by the time the exception gets here.
  discard_on Captain::Documents::SyncService::PermanentSyncError

  # SyncService raises TransientSyncError when the customer's site cannot be reached:
  # timeouts, TLS errors, 5xx responses, dropped connections. Four attempts with backoff
  # give the site time to recover before the document is marked failed.
  #
  # The exhaustion block swallows the exception so it never reaches Sentry; a flaky
  # customer site is not an application bug.
  retry_on(
    Captain::Documents::SyncService::TransientSyncError,
    attempts: 4,
    wait: ->(attempt) { TRANSIENT_BACKOFF[attempt - 1] || TRANSIENT_BACKOFF.last }
  ) do |exhausted_job, exception|
    failed_document = exhausted_job.arguments.first
    exhausted_job.send(:mark_sync_failed, failed_document, exception.message)
    exhausted_job.send(
      :log_sync_outcome,
      failed_document,
      result: :transient_retry_exhausted,
      error_code: exception.message
    )
  end

  discard_on ActiveJob::DeserializationError
  discard_on ActiveRecord::RecordNotFound

  # Runs the sync for +document+ inside the document's lock.
  #
  # A lock that cannot be acquired is logged as :already_syncing and not re-raised.
  # Every other failure is logged and re-raised so the retry_on / discard_on policies
  # declared on the class decide what happens next.
  def perform(document)
    # Captured before the PDF guard: the StandardError rescue below reads it.
    started_at = Time.current
    return if document.pdf_document?

    with_lock(lock_key(document), LOCK_TIMEOUT) { perform_sync(document, started_at) }
  rescue LockAcquisitionError
    log_sync_outcome(document, result: :already_syncing)
  rescue Captain::Documents::SyncService::PermanentSyncError => e
    log_failure_and_raise(document, :permanent_failure, e, started_at)
  rescue Captain::Documents::SyncService::TransientSyncError => e
    log_failure_and_raise(document, :transient_failure, e, started_at)
  rescue StandardError => e
    handle_unexpected_failure(document, e, started_at)
  end

  private

  # Flags the document as syncing, runs SyncService on a reloaded copy, then logs the
  # service's return value together with the elapsed time.
  def perform_sync(document, start_time)
    mark_sync_started(document)
    outcome = Captain::Documents::SyncService.new(document.reload).perform
    log_sync_outcome(document, result: outcome, duration_ms: duration_ms_since(start_time))
  end

  # Writes one info-level JSON log line: the document's identifying ids merged with
  # whatever extra +fields+ the caller supplies.
  def log_sync_outcome(document, **fields)
    identifiers = {
      document_id: document.id,
      account_id: document.account_id,
      assistant_id: document.assistant_id
    }
    Rails.logger.info("[Captain::Documents::PerformSyncJob] #{identifiers.merge(fields).to_json}")
  end

  # Logs a named sync failure (using the error message as the error code) and re-raises
  # the same error.
  def log_failure_and_raise(document, result, error, start_time)
    log_sync_outcome(
      document,
      result: result,
      error_code: error.message,
      duration_ms: duration_ms_since(start_time)
    )
    raise error
  end

  # Persists a failed sync attempt with the given error code.
  def mark_sync_failed(document, error_code)
    record_sync_attempt(document, status: :failed, error_code: error_code)
  end

  # Persists the start of a sync attempt and clears any previous error code.
  def mark_sync_started(document)
    record_sync_attempt(document, status: :syncing, error_code: nil)
  end

  # Shared writer for the sync bookkeeping columns; always resets sync_step and stamps
  # the attempt time.
  def record_sync_attempt(document, status:, error_code:)
    document.update!(
      sync_status: status,
      sync_step: nil,
      last_sync_error_code: error_code,
      last_sync_attempted_at: Time.current
    )
  end

  # Marks the document failed with the generic 'sync_error' code, logs the exception
  # class, and re-raises. Does not report to the exception tracker itself.
  def handle_unexpected_failure(document, error, start_time)
    mark_sync_failed(document, 'sync_error')
    log_sync_outcome(
      document,
      result: :unexpected_failure,
      error_code: 'sync_error',
      exception_class: error.class.name,
      duration_ms: duration_ms_since(start_time)
    )
    raise error
  end

  # Builds the mutex key for this document from the Redis::Alfred key template.
  def lock_key(document)
    key_template = ::Redis::Alfred::CAPTAIN_DOCUMENT_SYNC_MUTEX
    format(key_template, document_id: document.id)
  end

  # Whole milliseconds elapsed between +start_time+ and now.
  def duration_ms_since(start_time)
    elapsed_seconds = Time.current - start_time
    (elapsed_seconds * 1000).round
  end
end
|