chatwoot/enterprise/app/models/captain/document.rb
Aakash Bakhle 37c8e7e699
fix: firecrawl long external link (#14566)
# Pull Request Template

## Description

Fixes urls going past 255 chars, this is because of arabic urls, where
each character balloons to 8-9 characters and goes past the 255 limit

## Type of change

Please delete options that are not relevant.

- [x] Bug fix (non-breaking change which fixes an issue)

## How Has This Been Tested?

Please describe the tests that you ran to verify your changes. Provide
instructions so we can reproduce. Please also list any relevant details
for your test configuration.
specs


## Checklist:

- [x] My code follows the style guidelines of this project
- [x] I have performed a self-review of my code
- [x] I have commented on my code, particularly in hard-to-understand
areas
- [ ] I have made corresponding changes to the documentation
- [x] My changes generate no new warnings
- [x] I have added tests that prove my fix is effective or that my
feature works
- [x] New and existing unit tests pass locally with my changes
- [x] Any dependent changes have been merged and published in downstream
modules
2026-05-26 14:07:07 +05:30

188 lines
6.1 KiB
Ruby

# == Schema Information
#
# Table name: captain_documents
#
# id :bigint not null, primary key
# content :text
# content_fingerprint :string
# external_link :text not null
# last_sync_attempted_at :datetime
# last_sync_error_code :string
# last_synced_at :datetime
# metadata :jsonb
# name :string
# status :integer default("in_progress"), not null
# sync_status :integer
# created_at :datetime not null
# updated_at :datetime not null
# account_id :bigint not null
# assistant_id :bigint not null
#
# Indexes
#
# idx_captain_documents_on_account_assistant_sync_stats (account_id,assistant_id,sync_status,last_synced_at)
# idx_captain_documents_on_assistant_id_and_external_link_md5 (assistant_id, md5(external_link)) UNIQUE
# index_captain_documents_on_account_id (account_id)
# index_captain_documents_on_account_id_and_sync_status (account_id,sync_status)
# index_captain_documents_on_assistant_id (assistant_id)
# index_captain_documents_on_status (status)
#
class Captain::Document < ApplicationRecord
class LimitExceededError < StandardError; end
SYNC_STALE_TIMEOUT = 2.hours
self.table_name = 'captain_documents'
belongs_to :assistant, class_name: 'Captain::Assistant'
has_many :responses, class_name: 'Captain::AssistantResponse', dependent: :destroy, as: :documentable
belongs_to :account
has_one_attached :pdf_file
store_accessor :metadata, :content_fingerprint, :last_sync_error_code, :sync_step, :openai_file_id
validates :external_link, presence: true, unless: -> { pdf_file.attached? }
validates :external_link, uniqueness: { scope: :assistant_id }, allow_blank: true
validates :content, length: { maximum: 200_000 }
validates :pdf_file, presence: true, if: :pdf_document?
validate :validate_pdf_format, if: :pdf_document?
validate :validate_file_attachment, if: -> { pdf_file.attached? }
before_validation :ensure_account_id
before_validation :set_external_link_for_pdf
before_validation :normalize_external_link
enum status: {
in_progress: 0,
available: 1
}
enum :sync_status, { syncing: 0, synced: 1, failed: 2 }, prefix: :sync
before_create :ensure_within_plan_limit
after_create_commit :enqueue_crawl_job
after_create_commit :update_document_usage
after_destroy :update_document_usage
after_commit :enqueue_response_builder_job
scope :ordered, -> { order(created_at: :desc) }
scope :for_account, ->(account_id) { where(account_id: account_id) }
scope :for_assistant, ->(assistant_id) { where(assistant_id: assistant_id) }
scope :syncable, -> { where("external_link NOT LIKE 'PDF:%' AND external_link NOT LIKE '%.pdf'") }
scope :pdf_documents, -> { where("external_link LIKE 'PDF:%' OR external_link LIKE '%.pdf'") }
scope :sync_in_progress, -> { sync_syncing.where(arel_table[:last_sync_attempted_at].gteq(SYNC_STALE_TIMEOUT.ago)) }
scope :stale, lambda { |stale_before|
sync_failed.or(sync_synced.where(arel_table[:last_synced_at].lt(stale_before)))
}
scope :synced_since, lambda { |time|
sync_synced.where(arel_table[:last_synced_at].gteq(time))
}
def pdf_document?
return true if pdf_file.attached? && pdf_file.blob.content_type == 'application/pdf'
return true if external_link&.start_with?('PDF:')
external_link&.ends_with?('.pdf')
end
def content_type
pdf_file.blob.content_type if pdf_file.attached?
end
def file_size
pdf_file.blob.byte_size if pdf_file.attached?
end
def store_openai_file_id(file_id)
update!(openai_file_id: file_id)
end
def display_url
return external_link if external_link.present? && !external_link.start_with?('PDF:')
if pdf_file.attached?
Rails.application.routes.url_helpers.rails_blob_url(pdf_file, only_path: false)
else
external_link
end
end
def to_llm_metadata
{ document_id: id, assistant_id: assistant_id, external_link: external_link }
end
def syncable?
!pdf_document?
end
def sync_stale?
sync_syncing? && (last_sync_attempted_at.blank? || last_sync_attempted_at < SYNC_STALE_TIMEOUT.ago)
end
def sync_in_progress?
sync_syncing? && !sync_stale?
end
private
def enqueue_crawl_job
return if status != 'in_progress'
Captain::Documents::CrawlJob.perform_later(self)
end
def enqueue_response_builder_job
return unless should_enqueue_response_builder?
Captain::Documents::ResponseBuilderJob.perform_later(self)
end
def should_enqueue_response_builder?
return false if destroyed?
return false unless available?
return saved_change_to_status? if pdf_document?
(saved_change_to_status? || saved_change_to_content?) && content.present?
end
def update_document_usage
account.update_document_usage
end
def ensure_account_id
self.account_id = assistant&.account_id
end
def ensure_within_plan_limit
limits = account.usage_limits[:captain][:documents]
raise LimitExceededError, I18n.t('captain.documents.limit_exceeded') unless limits[:current_available].positive?
end
def validate_pdf_format
return unless pdf_file.attached?
errors.add(:pdf_file, I18n.t('captain.documents.pdf_format_error')) unless pdf_file.blob.content_type == 'application/pdf'
end
def validate_file_attachment
return unless pdf_file.attached?
return unless pdf_file.blob.byte_size > 10.megabytes
errors.add(:pdf_file, I18n.t('captain.documents.pdf_size_error'))
end
def set_external_link_for_pdf
return unless pdf_file.attached? && external_link.blank?
# Set a unique external_link for PDF files
# Format: PDF: filename_timestamp (without extension)
timestamp = Time.current.strftime('%Y%m%d%H%M%S')
self.external_link = "PDF: #{pdf_file.filename.base}_#{timestamp}"
end
def normalize_external_link
return if external_link.blank?
return if pdf_document?
self.external_link = external_link.delete_suffix('/')
end
end