fix: firecrawl long external link (#14566)

# Pull Request Template

## Description

Fixes URLs that exceed 255 characters. This happens with Arabic URLs, where
each character balloons to 8-9 characters and pushes the URL past the 255 limit.

## Type of change

Please delete options that are not relevant.

- [x] Bug fix (non-breaking change which fixes an issue)

## How Has This Been Tested?

Please describe the tests that you ran to verify your changes. Provide
instructions so we can reproduce. Please also list any relevant details
for your test configuration.
Covered by specs: a job spec was added that stores an external link longer than 255 characters.


## Checklist:

- [x] My code follows the style guidelines of this project
- [x] I have performed a self-review of my code
- [x] I have commented on my code, particularly in hard-to-understand
areas
- [ ] I have made corresponding changes to the documentation
- [x] My changes generate no new warnings
- [x] I have added tests that prove my fix is effective or that my
feature works
- [x] New and existing unit tests pass locally with my changes
- [x] Any dependent changes have been merged and published in downstream
modules
This commit is contained in:
Aakash Bakhle 2026-05-26 14:07:07 +05:30 committed by GitHub
parent 75c2f91019
commit 37c8e7e699
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 36 additions and 10 deletions

View File

@ -0,0 +1,16 @@
# Converts captain_documents.external_link from a string column to a text
# column and replaces the unique (assistant_id, external_link) index with a
# unique expression index on (assistant_id, md5(external_link)).
#
# NOTE(review): indexing the md5 digest instead of the raw link presumably
# keeps index entries small for very long URLs -- confirm with the author.
class ChangeCaptainDocumentExternalLinkToText < ActiveRecord::Migration[7.0]
  TABLE_NAME = :captain_documents
  OLD_INDEX_NAME = 'index_captain_documents_on_assistant_id_and_external_link'.freeze
  NEW_INDEX_NAME = 'idx_captain_documents_on_assistant_id_and_external_link_md5'.freeze

  # Drops the old index first, widens the column, then adds the md5-based
  # unique index. The if_exists / if_not_exists guards let the index steps be
  # re-run without failing when they were already applied.
  def up
    remove_index TABLE_NAME, name: OLD_INDEX_NAME, if_exists: true
    change_column TABLE_NAME, :external_link, :text, null: false
    add_index TABLE_NAME, 'assistant_id, md5(external_link)',
              name: NEW_INDEX_NAME, unique: true, if_not_exists: true
  end

  # Reverses #up: drops the md5 index, turns the column back into a string,
  # and restores the original unique index on the raw column values.
  def down
    remove_index TABLE_NAME, name: NEW_INDEX_NAME, if_exists: true
    change_column TABLE_NAME, :external_link, :string, null: false
    add_index TABLE_NAME, %i[assistant_id external_link],
              name: OLD_INDEX_NAME, unique: true, if_not_exists: true
  end
end

View File

@ -10,7 +10,7 @@
# #
# It's strongly recommended that you check this file into your version control system. # It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema[7.1].define(version: 2026_05_15_000000) do ActiveRecord::Schema[7.1].define(version: 2026_05_25_093000) do
# These extensions should be enabled to support this database # These extensions should be enabled to support this database
enable_extension "pg_stat_statements" enable_extension "pg_stat_statements"
enable_extension "pg_trgm" enable_extension "pg_trgm"
@ -370,7 +370,7 @@ ActiveRecord::Schema[7.1].define(version: 2026_05_15_000000) do
create_table "captain_documents", force: :cascade do |t| create_table "captain_documents", force: :cascade do |t|
t.string "name" t.string "name"
t.string "external_link", null: false t.text "external_link", null: false
t.text "content" t.text "content"
t.bigint "assistant_id", null: false t.bigint "assistant_id", null: false
t.bigint "account_id", null: false t.bigint "account_id", null: false
@ -381,10 +381,10 @@ ActiveRecord::Schema[7.1].define(version: 2026_05_15_000000) do
t.integer "sync_status" t.integer "sync_status"
t.datetime "last_synced_at" t.datetime "last_synced_at"
t.datetime "last_sync_attempted_at" t.datetime "last_sync_attempted_at"
t.index "assistant_id, md5(external_link)", name: "idx_captain_documents_on_assistant_id_and_external_link_md5", unique: true
t.index ["account_id", "assistant_id", "sync_status", "last_synced_at"], name: "idx_captain_documents_on_account_assistant_sync_stats" t.index ["account_id", "assistant_id", "sync_status", "last_synced_at"], name: "idx_captain_documents_on_account_assistant_sync_stats"
t.index ["account_id", "sync_status"], name: "index_captain_documents_on_account_id_and_sync_status" t.index ["account_id", "sync_status"], name: "index_captain_documents_on_account_id_and_sync_status"
t.index ["account_id"], name: "index_captain_documents_on_account_id" t.index ["account_id"], name: "index_captain_documents_on_account_id"
t.index ["assistant_id", "external_link"], name: "index_captain_documents_on_assistant_id_and_external_link", unique: true
t.index ["assistant_id"], name: "index_captain_documents_on_assistant_id" t.index ["assistant_id"], name: "index_captain_documents_on_assistant_id"
t.index ["status"], name: "index_captain_documents_on_status" t.index ["status"], name: "index_captain_documents_on_status"
end end

View File

@ -5,7 +5,7 @@
# id :bigint not null, primary key # id :bigint not null, primary key
# content :text # content :text
# content_fingerprint :string # content_fingerprint :string
# external_link :string not null # external_link :text not null
# last_sync_attempted_at :datetime # last_sync_attempted_at :datetime
# last_sync_error_code :string # last_sync_error_code :string
# last_synced_at :datetime # last_synced_at :datetime
@ -20,12 +20,12 @@
# #
# Indexes # Indexes
# #
# idx_captain_documents_on_account_assistant_sync_stats (account_id,assistant_id,sync_status,last_synced_at) # idx_captain_documents_on_account_assistant_sync_stats (account_id,assistant_id,sync_status,last_synced_at)
# index_captain_documents_on_account_id (account_id) # idx_captain_documents_on_assistant_id_and_external_link_md5 (assistant_id, md5(external_link)) UNIQUE
# index_captain_documents_on_account_id_and_sync_status (account_id,sync_status) # index_captain_documents_on_account_id (account_id)
# index_captain_documents_on_assistant_id (assistant_id) # index_captain_documents_on_account_id_and_sync_status (account_id,sync_status)
# index_captain_documents_on_assistant_id_and_external_link (assistant_id,external_link) UNIQUE # index_captain_documents_on_assistant_id (assistant_id)
# index_captain_documents_on_status (status) # index_captain_documents_on_status (status)
# #
class Captain::Document < ApplicationRecord class Captain::Document < ApplicationRecord
class LimitExceededError < StandardError; end class LimitExceededError < StandardError; end

View File

@ -61,6 +61,16 @@ RSpec.describe Captain::Tools::FirecrawlParserJob, type: :job do
end end
end end
# Regression example: a URL well beyond 255 characters must be persisted
# exactly as supplied in the payload metadata.
it 'stores external links longer than 255 characters' do
  oversized_url = "https://example.com/#{'arabic-product-slug-' * 300}"
  payload[:metadata]['url'] = oversized_url

  described_class.perform_now(assistant_id: assistant.id, payload: payload)

  stored_link = assistant.documents.last.external_link
  expect(stored_link).to eq(oversized_url)
  expect(stored_link.length).to be > 255
end
context 'when an error occurs' do context 'when an error occurs' do
it 'raises an error with a descriptive message' do it 'raises an error with a descriptive message' do
allow(Captain::Assistant).to receive(:find).and_raise(ActiveRecord::RecordNotFound) allow(Captain::Assistant).to receive(:find).and_raise(ActiveRecord::RecordNotFound)