From 37c8e7e6997f02e3dc76af4c92477bfe83b3fea6 Mon Sep 17 00:00:00 2001 From: Aakash Bakhle <48802744+aakashb95@users.noreply.github.com> Date: Tue, 26 May 2026 14:07:07 +0530 Subject: [PATCH] fix: firecrawl long external link (#14566) # Pull Request Template ## Description Fixes URLs going past 255 characters. This happens with Arabic URLs, where each character balloons to 8-9 characters and pushes the URL past the 255-character limit. ## Type of change Please delete options that are not relevant. - [x] Bug fix (non-breaking change which fixes an issue) ## How Has This Been Tested? Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce. Please also list any relevant details for your test configuration. specs ## Checklist: - [x] My code follows the style guidelines of this project - [x] I have performed a self-review of my code - [x] I have commented on my code, particularly in hard-to-understand areas - [ ] I have made corresponding changes to the documentation - [x] My changes generate no new warnings - [x] I have added tests that prove my fix is effective or that my feature works - [x] New and existing unit tests pass locally with my changes - [x] Any dependent changes have been merged and published in downstream modules --- ...nge_captain_document_external_link_to_text.rb | 16 ++++++++++++++++ db/schema.rb | 6 +++--- enterprise/app/models/captain/document.rb | 14 +++++++------- .../captain/tools/firecrawl_parser_job_spec.rb | 10 ++++++++++ 4 files changed, 36 insertions(+), 10 deletions(-) create mode 100644 db/migrate/20260525093000_change_captain_document_external_link_to_text.rb diff --git a/db/migrate/20260525093000_change_captain_document_external_link_to_text.rb b/db/migrate/20260525093000_change_captain_document_external_link_to_text.rb new file mode 100644 index 00000000000..50cee2b0df9 --- /dev/null +++ b/db/migrate/20260525093000_change_captain_document_external_link_to_text.rb @@ -0,0 +1,16 @@ +class 
ChangeCaptainDocumentExternalLinkToText < ActiveRecord::Migration[7.0] + OLD_INDEX_NAME = 'index_captain_documents_on_assistant_id_and_external_link'.freeze + NEW_INDEX_NAME = 'idx_captain_documents_on_assistant_id_and_external_link_md5'.freeze + + def up + remove_index :captain_documents, name: OLD_INDEX_NAME, if_exists: true + change_column :captain_documents, :external_link, :text, null: false + add_index :captain_documents, 'assistant_id, md5(external_link)', unique: true, name: NEW_INDEX_NAME, if_not_exists: true + end + + def down + remove_index :captain_documents, name: NEW_INDEX_NAME, if_exists: true + change_column :captain_documents, :external_link, :string, null: false + add_index :captain_documents, [:assistant_id, :external_link], unique: true, name: OLD_INDEX_NAME, if_not_exists: true + end +end diff --git a/db/schema.rb b/db/schema.rb index 9d9fe3cbc95..f2c47957137 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. 
-ActiveRecord::Schema[7.1].define(version: 2026_05_15_000000) do +ActiveRecord::Schema[7.1].define(version: 2026_05_25_093000) do # These extensions should be enabled to support this database enable_extension "pg_stat_statements" enable_extension "pg_trgm" @@ -370,7 +370,7 @@ ActiveRecord::Schema[7.1].define(version: 2026_05_15_000000) do create_table "captain_documents", force: :cascade do |t| t.string "name" - t.string "external_link", null: false + t.text "external_link", null: false t.text "content" t.bigint "assistant_id", null: false t.bigint "account_id", null: false @@ -381,10 +381,10 @@ ActiveRecord::Schema[7.1].define(version: 2026_05_15_000000) do t.integer "sync_status" t.datetime "last_synced_at" t.datetime "last_sync_attempted_at" + t.index "assistant_id, md5(external_link)", name: "idx_captain_documents_on_assistant_id_and_external_link_md5", unique: true t.index ["account_id", "assistant_id", "sync_status", "last_synced_at"], name: "idx_captain_documents_on_account_assistant_sync_stats" t.index ["account_id", "sync_status"], name: "index_captain_documents_on_account_id_and_sync_status" t.index ["account_id"], name: "index_captain_documents_on_account_id" - t.index ["assistant_id", "external_link"], name: "index_captain_documents_on_assistant_id_and_external_link", unique: true t.index ["assistant_id"], name: "index_captain_documents_on_assistant_id" t.index ["status"], name: "index_captain_documents_on_status" end diff --git a/enterprise/app/models/captain/document.rb b/enterprise/app/models/captain/document.rb index 0fe14813bd4..6eda0eb5ae2 100644 --- a/enterprise/app/models/captain/document.rb +++ b/enterprise/app/models/captain/document.rb @@ -5,7 +5,7 @@ # id :bigint not null, primary key # content :text # content_fingerprint :string -# external_link :string not null +# external_link :text not null # last_sync_attempted_at :datetime # last_sync_error_code :string # last_synced_at :datetime @@ -20,12 +20,12 @@ # # Indexes # -# 
idx_captain_documents_on_account_assistant_sync_stats (account_id,assistant_id,sync_status,last_synced_at) -# index_captain_documents_on_account_id (account_id) -# index_captain_documents_on_account_id_and_sync_status (account_id,sync_status) -# index_captain_documents_on_assistant_id (assistant_id) -# index_captain_documents_on_assistant_id_and_external_link (assistant_id,external_link) UNIQUE -# index_captain_documents_on_status (status) +# idx_captain_documents_on_account_assistant_sync_stats (account_id,assistant_id,sync_status,last_synced_at) +# idx_captain_documents_on_assistant_id_and_external_link_md5 (assistant_id, md5(external_link)) UNIQUE +# index_captain_documents_on_account_id (account_id) +# index_captain_documents_on_account_id_and_sync_status (account_id,sync_status) +# index_captain_documents_on_assistant_id (assistant_id) +# index_captain_documents_on_status (status) # class Captain::Document < ApplicationRecord class LimitExceededError < StandardError; end diff --git a/spec/enterprise/jobs/captain/tools/firecrawl_parser_job_spec.rb b/spec/enterprise/jobs/captain/tools/firecrawl_parser_job_spec.rb index e12efc54bd0..6aed603855d 100644 --- a/spec/enterprise/jobs/captain/tools/firecrawl_parser_job_spec.rb +++ b/spec/enterprise/jobs/captain/tools/firecrawl_parser_job_spec.rb @@ -61,6 +61,16 @@ RSpec.describe Captain::Tools::FirecrawlParserJob, type: :job do end end + it 'stores external links longer than 255 characters' do + long_url = "https://example.com/#{'arabic-product-slug-' * 300}" + payload[:metadata]['url'] = long_url + + described_class.perform_now(assistant_id: assistant.id, payload: payload) + + expect(assistant.documents.last.external_link).to eq(long_url) + expect(assistant.documents.last.external_link.length).to be > 255 + end + context 'when an error occurs' do it 'raises an error with a descriptive message' do allow(Captain::Assistant).to receive(:find).and_raise(ActiveRecord::RecordNotFound)