chatwoot/spec/lib/integrations/llm_instrumentation_spec.rb
Aakash Bakhle eaffad12e7
feat(langfuse): propagate observation metadata for evals (#14634)
# Pull Request Template

## Description

We need to pass on trace level attributes down to the spans inside them
like tool calls, observations, etc.
This way, we can filter observations based on trace level attributes.


## Type of change

- [x] Bug fix (non-breaking change which fixes an issue)

## How Has This Been Tested?

Please describe the tests that you ran to verify your changes. Provide
instructions so we can reproduce. Please also list any relevant details
for your test configuration.

Attributes added to observation metadata for easy filtering
<img width="1327" height="708" alt="image"
src="https://github.com/user-attachments/assets/8f1d1bf8-cde4-481d-a2c2-7920ad2fc52e"
/>

added a `generation_stage` to differentiate llm_calls that call tools vs
those that generate a `final_response`
<img width="1806" height="968" alt="CleanShot 2026-06-03 at 15 11 09@2x"
src="https://github.com/user-attachments/assets/db1fa8e0-7f2d-404b-a719-27a16d400442"
/>


propagated attributes to tool calls for future use
<img width="903" height="517" alt="image"
src="https://github.com/user-attachments/assets/edc61ce8-93db-465c-a66e-043138e2dc15"
/>



## Checklist:

- [x] My code follows the style guidelines of this project
- [x] I have performed a self-review of my code
- [x] I have commented on my code, particularly in hard-to-understand
areas
- [ ] I have made corresponding changes to the documentation
- [x] My changes generate no new warnings
- [x] I have added tests that prove my fix is effective or that my
feature works
- [x] New and existing unit tests pass locally with my changes
- [x] Any dependent changes have been merged and published in downstream
modules
2026-06-03 16:45:19 +05:30

343 lines
16 KiB
Ruby

require 'rails_helper'
RSpec.describe Integrations::LlmInstrumentation do
let(:test_class) do
Class.new do
include Integrations::LlmInstrumentation
end
end
let(:instance) { test_class.new }
let!(:otel_config) do
InstallationConfig.find_or_create_by(name: 'OTEL_PROVIDER') do |config|
config.value = 'langfuse'
end
end
let(:params) do
{
span_name: 'llm.test',
account_id: 123,
conversation_id: 456,
feature_name: 'reply_suggestion',
model: 'gpt-4o-mini',
messages: [{ 'role' => 'user', 'content' => 'Hello' }],
temperature: 0.7
}
end
before do
InstallationConfig.find_or_create_by(name: 'LANGFUSE_SECRET_KEY') do |config|
config.value = 'test-secret-key'
end
end
describe '#instrument_llm_call' do
context 'when OTEL provider is not configured' do
before { otel_config.update(value: '') }
it 'executes the block without tracing' do
result = instance.instrument_llm_call(params) { 'my_result' }
expect(result).to eq('my_result')
end
end
context 'when OTEL provider is configured' do
it 'executes the block and returns the result' do
mock_span = instance_double(OpenTelemetry::Trace::Span)
allow(mock_span).to receive(:set_attribute)
allow(mock_span).to receive(:status=)
mock_tracer = instance_double(OpenTelemetry::Trace::Tracer)
allow(instance).to receive(:tracer).and_return(mock_tracer)
allow(mock_tracer).to receive(:in_span).and_yield(mock_span)
result = instance.instrument_llm_call(params) { 'my_result' }
expect(result).to eq('my_result')
end
it 'creates a tracing span with the provided span name' do
mock_span = instance_double(OpenTelemetry::Trace::Span)
allow(mock_span).to receive(:set_attribute)
allow(mock_span).to receive(:status=)
mock_tracer = instance_double(OpenTelemetry::Trace::Tracer)
allow(instance).to receive(:tracer).and_return(mock_tracer)
allow(mock_tracer).to receive(:in_span).and_yield(mock_span)
instance.instrument_llm_call(params) { 'result' }
expect(mock_tracer).to have_received(:in_span).with('llm.test')
end
it 'returns the block result even if instrumentation has errors' do
mock_tracer = instance_double(OpenTelemetry::Trace::Tracer)
allow(instance).to receive(:tracer).and_return(mock_tracer)
allow(mock_tracer).to receive(:in_span).and_raise(StandardError.new('Instrumentation failed'))
result = instance.instrument_llm_call(params) { 'my_result' }
expect(result).to eq('my_result')
end
it 'handles errors gracefully and captures exceptions' do
mock_span = instance_double(OpenTelemetry::Trace::Span)
allow(mock_span).to receive(:status=)
mock_tracer = instance_double(OpenTelemetry::Trace::Tracer)
allow(instance).to receive(:tracer).and_return(mock_tracer)
allow(mock_tracer).to receive(:in_span).and_yield(mock_span)
allow(mock_span).to receive(:set_attribute).and_raise(StandardError.new('Span error'))
allow(ChatwootExceptionTracker).to receive(:new).and_call_original
result = instance.instrument_llm_call(params) { 'my_result' }
expect(result).to eq('my_result')
expect(ChatwootExceptionTracker).to have_received(:new)
end
it 'sets correct request attributes on the span' do
mock_span = instance_double(OpenTelemetry::Trace::Span)
allow(mock_span).to receive(:set_attribute)
allow(mock_span).to receive(:status=)
mock_tracer = instance_double(OpenTelemetry::Trace::Tracer)
allow(instance).to receive(:tracer).and_return(mock_tracer)
allow(mock_tracer).to receive(:in_span).and_yield(mock_span)
instance.instrument_llm_call(params) { 'result' }
expect(mock_span).to have_received(:set_attribute).with('gen_ai.provider.name', 'openai')
expect(mock_span).to have_received(:set_attribute).with('gen_ai.request.model', 'gpt-4o-mini')
expect(mock_span).to have_received(:set_attribute).with('gen_ai.request.temperature', 0.7)
end
it 'sets correct prompt message attributes' do
mock_span = instance_double(OpenTelemetry::Trace::Span)
allow(mock_span).to receive(:set_attribute)
allow(mock_span).to receive(:status=)
mock_tracer = instance_double(OpenTelemetry::Trace::Tracer)
allow(instance).to receive(:tracer).and_return(mock_tracer)
allow(mock_tracer).to receive(:in_span).and_yield(mock_span)
custom_params = params.merge(
messages: [
{ 'role' => 'system', 'content' => 'You are a helpful assistant' },
{ 'role' => 'user', 'content' => 'Hello' }
]
)
instance.instrument_llm_call(custom_params) { 'result' }
expect(mock_span).to have_received(:set_attribute).with('gen_ai.prompt.0.role', 'system')
expect(mock_span).to have_received(:set_attribute).with('gen_ai.prompt.0.content', 'You are a helpful assistant')
expect(mock_span).to have_received(:set_attribute).with('gen_ai.prompt.1.role', 'user')
expect(mock_span).to have_received(:set_attribute).with('gen_ai.prompt.1.content', 'Hello')
end
it 'sets correct metadata attributes' do
mock_span = instance_double(OpenTelemetry::Trace::Span)
allow(mock_span).to receive(:set_attribute)
allow(mock_span).to receive(:status=)
mock_tracer = instance_double(OpenTelemetry::Trace::Tracer)
allow(instance).to receive(:tracer).and_return(mock_tracer)
allow(mock_tracer).to receive(:in_span).and_yield(mock_span)
instance.instrument_llm_call(params) { 'result' }
expect(mock_span).to have_received(:set_attribute).with('langfuse.user.id', '123')
expect(mock_span).to have_received(:set_attribute).with('langfuse.session.id', '123_456')
expect(mock_span).to have_received(:set_attribute).with('langfuse.trace.tags', ['reply_suggestion'])
expect(mock_span).to have_received(:set_attribute).with('langfuse.observation.metadata.user_id', '123')
expect(mock_span).to have_received(:set_attribute).with('langfuse.observation.metadata.session_id', '123_456')
expect(mock_span).to have_received(:set_attribute).with('langfuse.observation.metadata.feature_name', 'reply_suggestion')
end
it 'sets completion message attributes when result contains message' do
mock_span = instance_double(OpenTelemetry::Trace::Span)
allow(mock_span).to receive(:set_attribute)
allow(mock_span).to receive(:status=)
mock_tracer = instance_double(OpenTelemetry::Trace::Tracer)
allow(instance).to receive(:tracer).and_return(mock_tracer)
allow(mock_tracer).to receive(:in_span).and_yield(mock_span)
result = instance.instrument_llm_call(params) do
{ message: 'AI response here' }
end
expect(result).to eq({ message: 'AI response here' })
expect(mock_span).to have_received(:set_attribute).with('gen_ai.completion.0.role', 'assistant')
expect(mock_span).to have_received(:set_attribute).with('gen_ai.completion.0.content', 'AI response here')
end
it 'sets usage metrics when result contains usage data' do
mock_span = instance_double(OpenTelemetry::Trace::Span)
allow(mock_span).to receive(:set_attribute)
allow(mock_span).to receive(:status=)
mock_tracer = instance_double(OpenTelemetry::Trace::Tracer)
allow(instance).to receive(:tracer).and_return(mock_tracer)
allow(mock_tracer).to receive(:in_span).and_yield(mock_span)
result = instance.instrument_llm_call(params) do
{
usage: {
'prompt_tokens' => 150,
'completion_tokens' => 200,
'total_tokens' => 350
}
}
end
expect(result[:usage]['prompt_tokens']).to eq(150)
expect(mock_span).to have_received(:set_attribute).with('gen_ai.usage.input_tokens', 150)
expect(mock_span).to have_received(:set_attribute).with('gen_ai.usage.output_tokens', 200)
expect(mock_span).to have_received(:set_attribute).with('gen_ai.usage.total_tokens', 350)
end
it 'sets error attributes when result contains error' do
mock_span = instance_double(OpenTelemetry::Trace::Span)
mock_status = instance_double(OpenTelemetry::Trace::Status)
allow(mock_span).to receive(:set_attribute)
allow(mock_span).to receive(:status=)
allow(OpenTelemetry::Trace::Status).to receive(:error).and_return(mock_status)
mock_tracer = instance_double(OpenTelemetry::Trace::Tracer)
allow(instance).to receive(:tracer).and_return(mock_tracer)
allow(mock_tracer).to receive(:in_span).and_yield(mock_span)
result = instance.instrument_llm_call(params) do
{
error: 'API rate limit exceeded'
}
end
expect(result[:error]).to eq('API rate limit exceeded')
expect(mock_span).to have_received(:set_attribute)
.with('gen_ai.response.error', '"API rate limit exceeded"')
expect(mock_span).to have_received(:status=).with(mock_status)
expect(OpenTelemetry::Trace::Status).to have_received(:error).with('API rate limit exceeded')
end
end
describe '#instrument_agent_session' do
context 'when OTEL provider is not configured' do
before { otel_config.update(value: '') }
it 'executes the block without tracing' do
result = instance.instrument_agent_session(params) { 'my_result' }
expect(result).to eq('my_result')
end
end
context 'when OTEL provider is configured' do
let(:mock_span) { instance_double(OpenTelemetry::Trace::Span) }
let(:mock_tracer) { instance_double(OpenTelemetry::Trace::Tracer) }
before do
allow(mock_span).to receive(:set_attribute)
allow(instance).to receive(:tracer).and_return(mock_tracer)
allow(mock_tracer).to receive(:in_span).and_yield(mock_span)
end
it 'executes the block and returns the result' do
result = instance.instrument_agent_session(params) { 'my_result' }
expect(result).to eq('my_result')
end
it 'returns the block result even if instrumentation has errors' do
allow(mock_tracer).to receive(:in_span).and_raise(StandardError.new('Instrumentation failed'))
result = instance.instrument_agent_session(params) { 'my_result' }
expect(result).to eq('my_result')
end
it 'sets trace input and output attributes' do
result_data = { content: 'AI response' }
instance.instrument_agent_session(params) { result_data }
expect(mock_span).to have_received(:set_attribute).with('langfuse.observation.input', params[:messages].to_json)
expect(mock_span).to have_received(:set_attribute).with('langfuse.observation.output', result_data.to_json)
end
it 'propagates trace attributes as observation metadata to child tool spans' do
root_span = instance_double(OpenTelemetry::Trace::Span)
tool_span = instance_double(OpenTelemetry::Trace::Span)
tool_instance = test_class.new
allow(root_span).to receive(:set_attribute)
allow(tool_span).to receive(:set_attribute)
allow(instance).to receive(:tracer).and_return(mock_tracer)
allow(tool_instance).to receive(:tracer).and_return(mock_tracer)
allow(mock_tracer).to receive(:in_span).with('llm.test').and_yield(root_span)
allow(mock_tracer).to receive(:in_span).with('tool.search').and_yield(tool_span)
instance.instrument_agent_session(params) do
tool_instance.instrument_tool_call('search', { query: 'test' }) { 'tool result' }
end
expect(tool_span).to have_received(:set_attribute).with('langfuse.observation.metadata.user_id', '123')
expect(tool_span).to have_received(:set_attribute).with('langfuse.observation.metadata.session_id', '123_456')
expect(tool_span).to have_received(:set_attribute).with('langfuse.observation.metadata.feature_name', 'reply_suggestion')
end
it 'keeps inherited session metadata for nested service spans with their own feature tag' do
root_span = instance_double(OpenTelemetry::Trace::Span)
nested_span = instance_double(OpenTelemetry::Trace::Span)
nested_instance = test_class.new
nested_params = params.merge(span_name: 'llm.translate_query', conversation_id: nil, feature_name: 'translate_query')
allow(root_span).to receive(:set_attribute)
allow(nested_span).to receive(:set_attribute)
allow(instance).to receive(:tracer).and_return(mock_tracer)
allow(nested_instance).to receive(:tracer).and_return(mock_tracer)
allow(mock_tracer).to receive(:in_span).with('llm.test').and_yield(root_span)
allow(mock_tracer).to receive(:in_span).with('llm.translate_query').and_yield(nested_span)
instance.instrument_agent_session(params) do
nested_instance.instrument_llm_call(nested_params) { 'translated query' }
end
expect(nested_span).to have_received(:set_attribute).with('langfuse.session.id', '123_456')
expect(nested_span).to have_received(:set_attribute).with('langfuse.trace.tags', ['translate_query'])
expect(nested_span).to have_received(:set_attribute).with('langfuse.observation.metadata.session_id', '123_456')
expect(nested_span).to have_received(:set_attribute).with('langfuse.observation.metadata.feature_name', 'translate_query')
end
it 'propagates session metadata to nested embedding spans' do
root_span = instance_double(OpenTelemetry::Trace::Span)
embedding_span = instance_double(OpenTelemetry::Trace::Span)
embedding_instance = test_class.new
embedding_params = {
span_name: 'llm.captain.embedding',
account_id: 123,
feature_name: 'embedding',
model: 'text-embedding-3-small',
input: 'search result'
}
allow(root_span).to receive(:set_attribute)
allow(embedding_span).to receive(:set_attribute)
allow(instance).to receive(:tracer).and_return(mock_tracer)
allow(embedding_instance).to receive(:tracer).and_return(mock_tracer)
allow(mock_tracer).to receive(:in_span).with('llm.test').and_yield(root_span)
allow(mock_tracer).to receive(:in_span).with('llm.captain.embedding').and_yield(embedding_span)
instance.instrument_agent_session(params) do
embedding_instance.instrument_embedding_call(embedding_params) { [0.1, 0.2, 0.3] }
end
expect(embedding_span).to have_received(:set_attribute).with('langfuse.session.id', '123_456')
expect(embedding_span).to have_received(:set_attribute).with('langfuse.trace.tags', ['embedding'])
expect(embedding_span).to have_received(:set_attribute).with('langfuse.observation.metadata.session_id', '123_456')
expect(embedding_span).to have_received(:set_attribute).with('langfuse.observation.metadata.feature_name', 'embedding')
end
# Regression test for Langfuse double-counting bug.
# Setting gen_ai.request.model on parent spans causes Langfuse to classify them as
# GENERATIONs instead of SPANs, resulting in cost being counted multiple times
# (once for the parent, once for each child GENERATION).
# See: https://github.com/langfuse/langfuse/issues/7549
it 'does NOT set gen_ai.request.model to avoid being classified as a GENERATION' do
instance.instrument_agent_session(params) { 'result' }
expect(mock_span).not_to have_received(:set_attribute).with('gen_ai.request.model', anything)
end
end
end
end
end