Class: ContentEmbedding

Inherits:
ApplicationRecord show all
Defined in:
app/models/content_embedding.rb,
app/models/content_embedding/item_embedding.rb,
app/models/content_embedding/post_embedding.rb,
app/models/content_embedding/image_embedding.rb,
app/models/content_embedding/video_embedding.rb,
app/models/content_embedding/article_embedding.rb,
app/models/content_embedding/activity_embedding.rb,
app/models/content_embedding/showcase_embedding.rb,
app/models/content_embedding/site_map_embedding.rb,
app/models/content_embedding/reviews_io_embedding.rb,
app/models/content_embedding/call_record_embedding.rb,
app/models/content_embedding/product_line_embedding.rb

Overview

== Schema Information

Table name: content_embeddings_product_lines
Database name: primary

id :bigint not null, primary key
content_hash :string(32)
content_type :string default("primary"), not null
embeddable_type :string not null
embedding :vector(1536)
embedding_dimensions :integer
embedding_model :string default("text-embedding-3-small")
locale :string default("en")
model :string
token_count :integer
unified_embedding :vector
created_at :datetime not null
updated_at :datetime not null
embeddable_id :bigint not null

Indexes

idx_content_embeddings_product_lines_embedding_hnsw (embedding) USING hnsw
idx_content_embeddings_product_lines_embedding_model (embedding_model)
idx_content_embeddings_product_lines_unique (embeddable_id,content_type,locale) UNIQUE

Foreign Keys

fk_content_embeddings_product_lines_embeddable (embeddable_id => product_lines.id) ON DELETE => cascade

Defined Under Namespace

Modules: TextSearchable Classes: ActivityEmbedding, ArticleEmbedding, CallRecordEmbedding, ImageEmbedding, ItemEmbedding, PostEmbedding, ProductLineEmbedding, ReviewsIoEmbedding, ShowcaseEmbedding, SiteMapEmbedding, VideoEmbedding

Constant Summary collapse

EMBEDDING_MODELS =

Known embedding models and their dimensions
NOTE: HNSW indexes have a 2000 dimension limit in pgvector

{
  'text-embedding-3-small' => { dimensions: 1536, type: :text },
  'gemini-embedding-2-preview' => { dimensions: 1536, type: :multimodal },
  'jina-embeddings-v4' => { dimensions: 1536, type: :multimodal }  # Legacy, migrating away
}.freeze
DEFAULT_TEXT_MODEL =
'text-embedding-3-small'
UNIFIED_MODEL =
'gemini-embedding-2-preview'
LEGACY_UNIFIED_MODEL =
'jina-embeddings-v4'
SENSITIVE_TYPES =

Types that contain sensitive data and should NOT be exposed via MCP

%w[CallRecord Activity Communication].freeze
SEMANTIC_SIMILARITY_THRESHOLD =

Semantic search across all content types.

DEPRECATED: Prefer using partition-specific search methods:

  • ContentEmbedding::PostEmbedding.semantic_search("query")
  • ContentEmbedding::ImageEmbedding.semantic_search("query")
  • Or via the model: Post.semantic_search("query")

This method uses OpenAI embeddings by default, which won't work correctly
for Image search (which uses Gemini Embedding 2 multimodal embeddings).

param query [String] Natural language search query
param limit [Integer] Maximum number of results (default: 10)
param types [Array, nil] Filter by embeddable types (e.g., ['Showcase', 'Post'])
param locale [String] Locale for content filtering (default: 'en')
param published_only [Boolean] Only return published/active content (default: true)
SEMANTIC_SIMILARITY_THRESHOLD itself (the text above documents the semantic_search method):
Minimum similarity threshold for semantic search (0.0-1.0).
Results below this similarity are excluded as noise.
0 = no filtering (default), 0.1 = very permissive, 0.3 = moderate, 0.5 = strict

Returns:

  • (ActiveRecord::Relation)

    Embeddings ordered by similarity

0.0

Instance Attribute Summary collapse

Belongs to collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from ApplicationRecord

ransackable_associations, ransackable_attributes, ransackable_scopes, ransortable_attributes, #to_relation

Methods included from Models::EventPublishable

#publish_event

Instance Attribute Details

#content_hashObject (readonly)

content_hash is only required for text embeddings, not unified embeddings
Unified rows (content_type='unified') don't need content_hash

Validations (unless => -> { content_type == 'unified' } ):



141
# File 'app/models/content_embedding.rb', line 141

# content_hash must be present on every row except unified rows
# (content_type == 'unified'), which are exempt from this check.
validates :content_hash,
          presence: true,
          unless: lambda { content_type == 'unified' }

#content_typeObject (readonly)

Validations

Validations:



138
# File 'app/models/content_embedding.rb', line 138

# Every embedding row must declare its content_type (e.g. 'primary' or 'unified').
validates :content_type, presence: true

#embeddable_typeObject (readonly)



142
143
144
145
# File 'app/models/content_embedding.rb', line 142

# Only the listed polymorphic owner types may have embeddings; any other
# value (including nil) is rejected with the message below.
validates :embeddable_type,
          inclusion: {
            in: %w[
              Post Article Showcase Video Image Item ProductLine SiteMap
              ReviewsIo CallRecord AssistantBrainEntry Activity Communication
            ],
            message: '%<value>s is not a supported embeddable type'
          }

Class Method Details

.active_imagesActiveRecord::Relation<ContentEmbedding>

A relation of ContentEmbeddings that are active images. Active Record Scope

Returns:

See Also:



231
232
233
234
235
# File 'app/models/content_embedding.rb', line 231

# Image embeddings whose backing digital_assets row is not flagged inactive.
scope :active_images, lambda {
  asset_join = 'INNER JOIN digital_assets ON digital_assets.id = content_embeddings.embeddable_id'

  joins(asset_join)
    .where(embeddable_type: 'Image')
    .where(digital_assets: { inactive: false })
}

.active_publicationsActiveRecord::Relation<ContentEmbedding>

A relation of ContentEmbeddings that are active publications. Active Record Scope

Returns:

See Also:



267
268
269
270
271
# File 'app/models/content_embedding.rb', line 267

# Item embeddings whose backing items row is not discontinued.
scope :active_publications, lambda {
  item_join = 'INNER JOIN items ON items.id = content_embeddings.embeddable_id'

  joins(item_join)
    .where(embeddable_type: 'Item')
    .where(items: { is_discontinued: false })
}

.active_reviewsActiveRecord::Relation<ContentEmbedding>

A relation of ContentEmbeddings that are active reviews. Active Record Scope

Returns:

See Also:



260
261
262
263
264
# File 'app/models/content_embedding.rb', line 260

# ReviewsIo embeddings whose backing reviews_io row has status 'active'.
scope :active_reviews, lambda {
  review_join = 'INNER JOIN reviews_io ON reviews_io.id = content_embeddings.embeddable_id'

  joins(review_join)
    .where(embeddable_type: 'ReviewsIo')
    .where(reviews_io: { status: 'active' })
}

.active_videosActiveRecord::Relation<ContentEmbedding>

A relation of ContentEmbeddings that are active videos. Active Record Scope

Returns:

See Also:



224
225
226
227
228
# File 'app/models/content_embedding.rb', line 224

# Video embeddings whose backing digital_assets row is not flagged inactive.
scope :active_videos, lambda {
  asset_join = 'INNER JOIN digital_assets ON digital_assets.id = content_embeddings.embeddable_id'

  joins(asset_join)
    .where(embeddable_type: 'Video')
    .where(digital_assets: { inactive: false })
}

.by_dimensionsActiveRecord::Relation<ContentEmbedding>

A relation of ContentEmbeddings filtered by embedding dimensions. Active Record Scope

Returns:

See Also:



188
# File 'app/models/content_embedding.rb', line 188

# Keeps only rows whose embedding_dimensions column matches +dims+.
scope :by_dimensions, lambda { |dims|
  where(embedding_dimensions: dims)
}

.by_modelActiveRecord::Relation<ContentEmbedding>

A relation of ContentEmbeddings filtered by embedding model. Active Record Scope

Returns:

See Also:



182
# File 'app/models/content_embedding.rb', line 182

# Keeps only rows whose embedding_model column matches +model+.
scope :by_model, lambda { |model|
  where(embedding_model: model)
}

.by_typeActiveRecord::Relation<ContentEmbedding>

A relation of ContentEmbeddings filtered by embeddable type. Active Record Scope

Returns:

See Also:



154
155
156
157
# File 'app/models/content_embedding.rb', line 154

# Filters by one or more embeddable types. Accepts a single value, an array,
# or nested arrays; nils are dropped. With nothing left to filter on, the
# relation is returned unfiltered.
scope :by_type, lambda { |types|
  wanted = Array(types).flatten.compact
  wanted.blank? ? all : where(embeddable_type: wanted)
}

.calculate_rrf_scores(vector_results, keyword_results, k) ⇒ Hash{Integer => Float}

Calculate Reciprocal Rank Fusion scores
RRF Score = sum of 1/(k + rank) for each result list

Parameters:

  • vector_results (Array<ContentEmbedding>)

    Results from vector search

  • keyword_results (Array<ContentEmbedding>)

    Results from keyword search

  • k (Integer)

    RRF constant (typically 60)

Returns:

  • (Hash{Integer => Float})

    Map of embedding ID to RRF score



463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
# File 'app/models/content_embedding.rb', line 463

# Compute Reciprocal Rank Fusion scores for two ranked result lists.
#
# Each list contributes 1 / (k + position) to a result's total, where
# position is the result's 1-based place in that list. A result present in
# both lists receives the sum of both contributions.
#
# @param vector_results [Array<#id>] ranked results from vector search
# @param keyword_results [Array<#id>] ranked results from keyword search
# @param k [Integer] RRF damping constant
# @return [Hash{Integer => Float}] result id => accumulated RRF score
#   (the hash defaults to 0.0 for unknown ids)
def self.calculate_rrf_scores(vector_results, keyword_results, k)
  [vector_results, keyword_results].each_with_object(Hash.new(0.0)) do |ranked_list, totals|
    # each_with_index is 0-based, hence the +1 to obtain the 1-based rank
    ranked_list.each_with_index do |entry, rank|
      totals[entry.id] += 1.0 / (k + rank + 1)
    end
  end
end

.faqs_onlyActiveRecord::Relation<ContentEmbedding>

A relation of ContentEmbeddings that are faqs only. Active Record Scope

Returns:

See Also:



207
208
209
210
211
# File 'app/models/content_embedding.rb', line 207

# Article embeddings whose backing articles row has type 'ArticleFaq'.
scope :faqs_only, lambda {
  article_join = 'INNER JOIN articles ON articles.id = content_embeddings.embeddable_id'

  joins(article_join)
    .where(embeddable_type: 'Article')
    .where(articles: { type: 'ArticleFaq' })
}

.find_similar(record, limit: 5, same_type_only: false) ⇒ ActiveRecord::Relation

Find content similar to a given record

Examples:

Find similar showcases

ContentEmbedding.find_similar(showcase, same_type_only: true)

Parameters:

  • record (ApplicationRecord)

    The record to find similar content for

  • limit (Integer) (defaults to: 5)

    Maximum number of results (default: 5)

  • same_type_only (Boolean) (defaults to: false)

    Only return same type of content (default: false)

Returns:

  • (ActiveRecord::Relation)

    Similar embeddings ordered by similarity



352
353
354
355
356
357
358
359
# File 'app/models/content_embedding.rb', line 352

# Find embeddings closest (by cosine distance) to a record's primary embedding.
#
# @param record [ApplicationRecord] record exposing +content_embeddings+
# @param limit [Integer] maximum number of results
# @param same_type_only [Boolean] restrict results to the record's own embeddable type
# @return [ActiveRecord::Relation] nearest embeddings, or an empty relation
#   when the record has no primary embedding vector
def self.find_similar(record, limit: 5, same_type_only: false)
  source = record.content_embeddings.primary_content.first
  return none if source.nil? || !source.embedding

  # Leave out the source record's own rows (same type AND same id).
  candidates = where.not(embeddable_type: source.embeddable_type, embeddable_id: record.id)
  candidates = same_type_only ? candidates.by_type(source.embeddable_type) : candidates

  candidates
    .nearest_neighbors(:embedding, source.embedding, distance: :cosine)
    .limit(limit)
end

.for_localeActiveRecord::Relation<ContentEmbedding>

A relation of ContentEmbeddings filtered to a locale (a base locale such as "en" also matches its regional variants). Active Record Scope

Returns:

See Also:



164
165
166
167
168
169
170
171
172
173
174
# File 'app/models/content_embedding.rb', line 164

# Filters rows by locale.
#
# A regional locale (anything containing '-', e.g. en-US, fr-CA) must match
# exactly. A base locale (e.g. en) matches itself plus every regional
# variant that starts with "<base>-".
scope :for_locale, lambda { |locale|
  code = locale.to_s
  next where(locale: code) if code.include?('-')

  # Column is table-qualified so the condition stays unambiguous under joins.
  where('content_embeddings.locale = ? OR content_embeddings.locale LIKE ?', code, "#{code}-%")
}

.gemini_embeddingActiveRecord::Relation<ContentEmbedding>

A relation of ContentEmbeddings that are gemini embedding. Active Record Scope

Returns:

See Also:



191
# File 'app/models/content_embedding.rb', line 191

# Rows produced with the current unified model (UNIFIED_MODEL).
scope :gemini_embedding, lambda {
  by_model(UNIFIED_MODEL)
}

.generate_query_embedding(query, model: DEFAULT_TEXT_MODEL) ⇒ Array<Float>?

Generate embedding for a query string with caching
Uses OpenAI text-embedding-3-small by default (matches Posts, Showcases, Videos, etc.)
For visual/image search, use unified_visual_search which uses Gemini Embedding 2

Parameters:

  • query (String)

    Text to embed

  • model (String) (defaults to: DEFAULT_TEXT_MODEL)

    Embedding model (default: text-embedding-3-small)

Returns:

  • (Array<Float>, nil)

    Embedding vector or nil on error



486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
# File 'app/models/content_embedding.rb', line 486

# Embed a query string, caching the resulting vector for 24 hours.
#
# The cache key is built from the model name and a truncated SHA-256 of the
# lower-cased, stripped query. Any error while embedding is logged and
# reported to the caller as nil.
#
# @param query [String] text to embed
# @param model [String] embedding model name
# @return [Array<Float>, nil] embedding vector, or nil on any error
def self.generate_query_embedding(query, model: DEFAULT_TEXT_MODEL)
  query_digest = Digest::SHA256.hexdigest(query.downcase.strip)[0..15]
  cache_key = "query_embedding:#{model}:#{query_digest}"

  cached_vector = Rails.cache.read(cache_key)
  return cached_vector if cached_vector.present?

  vector =
    case model
    when 'text-embedding-3-small'
      RubyLLM.embed(query, model: model, provider: :openai, assume_model_exists: true).vectors
    when /^gemini-embedding/, /^jina-embeddings/
      # Jina-family model names are served by the Gemini embedder as well.
      Embedding::Gemini.embed_query(query, dimensions: 1536)
    else
      raise ArgumentError, "No query embedding implementation for model: #{model}"
    end

  # Only cache real vectors so a blank result is retried next time.
  Rails.cache.write(cache_key, vector, expires_in: 24.hours) if vector.present?
  vector
rescue RubyLLM::RateLimitError => e
  Rails.logger.warn "Rate limited generating query embedding (#{model}): #{e.message}"
  nil
rescue RubyLLM::Error => e
  Rails.logger.error "RubyLLM error generating query embedding (#{model}): #{e.message}"
  nil
rescue StandardError => e
  Rails.logger.error "Failed to generate query embedding (#{model}): #{e.message}"
  nil
end

.generate_unified_query_embedding(query, model:, dimensions:) ⇒ Array<Float>?

Generate query embedding using the appropriate service for the model

Parameters:

  • query (String)

    Text to embed

  • model (String)

    Target embedding model

  • dimensions (Integer)

    Vector dimensions for the model

Returns:

  • (Array<Float>, nil)

    Embedding vector or nil on error



574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
# File 'app/models/content_embedding.rb', line 574

# Embed a query string for unified search with the service matching +model+,
# caching the resulting vector for 24 hours.
#
# @param query [String] text to embed
# @param model [String] target embedding model name
# @param dimensions [Integer] vector size requested from the Gemini embedder
#   (not passed to the OpenAI branch)
# @return [Array<Float>, nil] embedding vector, or nil on any error
def self.generate_unified_query_embedding(query, model:, dimensions:)
  fingerprint = Digest::SHA256.hexdigest(query.downcase.strip)[0..15]
  cache_key = "unified_query_embedding:#{model}:#{fingerprint}"

  hit = Rails.cache.read(cache_key)
  return hit if hit.present?

  vector =
    case model
    when 'text-embedding-3-small'
      openai_result = RubyLLM.embed(query, model: model, provider: :openai, assume_model_exists: true)
      openai_result.vectors
    when /^gemini-embedding/, /^jina-embeddings/
      # Legacy Jina model names also go through Gemini for queries during migration.
      Embedding::Gemini.embed_query(query, dimensions: dimensions)
    else
      raise ArgumentError, "No query embedding implementation for model: #{model}"
    end

  Rails.cache.write(cache_key, vector, expires_in: 24.hours) if vector.present?
  vector
rescue RubyLLM::RateLimitError => e
  Rails.logger.warn "Rate limited generating unified query embedding (#{model}): #{e.message}"
  nil
rescue RubyLLM::Error => e
  Rails.logger.error "RubyLLM error generating unified query embedding (#{model}): #{e.message}"
  nil
rescue StandardError => e
  Rails.logger.error "Failed to generate unified query embedding (#{model}): #{e.message}"
  nil
end

.hybrid_search(query, limit: 10, types: nil, locale: 'en', published_only: true, k: 60, min_similarity: SEMANTIC_SIMILARITY_THRESHOLD, exclude_sensitive: true) ⇒ Array<ContentEmbedding>

Hybrid search using Reciprocal Rank Fusion (RRF)
Combines vector similarity with keyword/trigram search using rank-based scoring

RRF Score = 1/(k + rank_vector) + 1/(k + rank_keyword)
This properly weights results that appear in both searches higher

Examples:

Hybrid search with RRF

ContentEmbedding.hybrid_search("TempZone Flex Roll installation")

Parameters:

  • query (String)

    Natural language search query

  • limit (Integer) (defaults to: 10)

    Maximum number of results (default: 10)

  • types (Array<String>, nil) (defaults to: nil)

    Filter by embeddable types

  • locale (String) (defaults to: 'en')

    Locale for content filtering (default: 'en')

  • published_only (Boolean) (defaults to: true)

    Only return published/active content (default: true)

  • k (Integer) (defaults to: 60)

    RRF constant (default: 60, standard value)

  • min_similarity (Object) (defaults to: SEMANTIC_SIMILARITY_THRESHOLD)

Returns:



379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
# File 'app/models/content_embedding.rb', line 379

# Hybrid search: fuse vector-similarity and keyword results with
# Reciprocal Rank Fusion (RRF).
#
# @param query [String] natural language search query
# @param limit [Integer] maximum number of results
# @param types [Array<String>, nil] embeddable types to search
# @param locale [String] locale filter
# @param published_only [Boolean] only return published/active content
# @param k [Integer] RRF constant
# @param min_similarity [Float] similarity floor applied to the vector leg
# @param exclude_sensitive [Boolean] drop SENSITIVE_TYPES from both legs
# @return [Array<ContentEmbedding>] records ordered by descending RRF score;
#   each record answers +neighbor_distance+ with (1.0 - its RRF score)
def self.hybrid_search(query, limit: 10, types: nil, locale: 'en', published_only: true, k: 60, min_similarity: SEMANTIC_SIMILARITY_THRESHOLD, exclude_sensitive: true)
  return [] if query.blank?

  # Over-fetch from each leg so the fusion has enough candidates to rank.
  candidate_limit = [limit * 3, 30].max

  # Leg 1: semantic (vector) matches, subject to the similarity floor.
  semantic_hits = semantic_search(
    query,
    limit: candidate_limit,
    types: types,
    locale: locale,
    published_only: published_only,
    min_similarity: min_similarity,
    exclude_sensitive: exclude_sensitive
  ).to_a

  # Leg 2: keyword matches (ILIKE-based, see keyword_search_for_rrf).
  keyword_hits = keyword_search_for_rrf(
    query, candidate_limit, types, locale, published_only,
    exclude_sensitive: exclude_sensitive
  )

  # Fuse both rankings and keep the best +limit+ entries.
  fused = calculate_rrf_scores(semantic_hits, keyword_hits, k)
  top_entries = fused.sort_by { |_id, score| -score }.first(limit)
  return [] if top_entries.empty?

  rrf_by_id = top_entries.to_h
  ranked_ids = top_entries.map(&:first)

  # Reload the winners in one query, then emit them in fused-rank order.
  loaded = where(id: ranked_ids).includes(:embeddable).index_by(&:id)
  ranked_ids.filter_map do |embedding_id|
    found = loaded[embedding_id]
    next if found.nil?

    # Expose (1 - RRF score) as neighbor_distance so callers can treat these
    # records like semantic_search results.
    found.define_singleton_method(:neighbor_distance) { 1.0 - rrf_by_id[embedding_id] }
    found
  end
end

.images_onlyActiveRecord::Relation<ContentEmbedding>

A relation of ContentEmbeddings that are images only. Active Record Scope

Returns:

See Also:



649
# File 'app/models/content_embedding.rb', line 649

# Rows whose embeddable_type is 'Image'.
scope :images_only, lambda {
  where(embeddable_type: 'Image')
}

.jina_v4ActiveRecord::Relation<ContentEmbedding>

A relation of ContentEmbeddings that are jina v4. Active Record Scope

Returns:

See Also:



194
# File 'app/models/content_embedding.rb', line 194

# Rows produced with the legacy unified model (LEGACY_UNIFIED_MODEL).
scope :jina_v4, lambda {
  by_model(LEGACY_UNIFIED_MODEL)
}

.keyword_search_for_rrf(query, limit, types, locale, published_only, exclude_sensitive: true) ⇒ Object

Keyword search component for RRF
Uses ILIKE across multiple content types with proper joins



425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
# File 'app/models/content_embedding.rb', line 425

# Keyword leg of the hybrid (RRF) search.
#
# Runs a case-insensitive substring match (ILIKE) of +query+ against the
# title/description-style columns of each joined content table.
#
# The query is escaped with sanitize_sql_like before being wrapped in '%',
# so LIKE metacharacters typed by the user ('%', '_', '\') are matched
# literally instead of acting as wildcards.
#
# @param query [String] raw search text
# @param limit [Integer] maximum number of rows to return
# @param types [Array<String>, nil] embeddable types to search
# @param locale [String] locale filter
# @param published_only [Boolean] only return published/active content
# @param exclude_sensitive [Boolean] drop SENSITIVE_TYPES
# @return [Array<ContentEmbedding>] matching rows; no explicit ordering is applied
def self.keyword_search_for_rrf(query, limit, types, locale, published_only, exclude_sensitive: true)
  scope = all
  scope = scope.mcp_safe if exclude_sensitive
  scope = scope.by_type(types) if types.present?
  scope = scope.for_locale(locale)
  scope = scope.published_only if published_only

  match_sql = <<~SQL.squish
    articles.subject ILIKE :q OR articles.description ILIKE :q OR
    showcases.name ILIKE :q OR showcases.description ILIKE :q OR
    digital_assets.title ILIKE :q OR digital_assets.meta_description ILIKE :q OR
    items.name ILIKE :q OR items.sku ILIKE :q OR items.search_text ILIKE :q OR
    product_lines.name ILIKE :q OR product_lines.tag_line ILIKE :q OR
    site_maps.extracted_title ILIKE :q OR site_maps.extracted_content ILIKE :q
  SQL

  # Escape LIKE wildcards in user input before adding our own surrounding '%'.
  pattern = "%#{sanitize_sql_like(query.to_s)}%"

  # Each LEFT JOIN is gated on embeddable_type so a row only picks up columns
  # from the table that actually backs it.
  scope
    .joins("LEFT JOIN articles ON embeddable_type IN ('Article', 'Post') AND articles.id = embeddable_id")
    .joins("LEFT JOIN showcases ON embeddable_type = 'Showcase' AND showcases.id = embeddable_id")
    .joins("LEFT JOIN digital_assets ON embeddable_type IN ('Video', 'Image') AND digital_assets.id = embeddable_id")
    .joins("LEFT JOIN items ON embeddable_type = 'Item' AND items.id = embeddable_id")
    .joins("LEFT JOIN product_lines ON embeddable_type = 'ProductLine' AND product_lines.id = embeddable_id")
    .joins("LEFT JOIN site_maps ON embeddable_type = 'SiteMap' AND site_maps.id = embeddable_id")
    .where(match_sql, q: pattern)
    .limit(limit)
    .to_a
end

.mcp_safeActiveRecord::Relation<ContentEmbedding>

A relation of ContentEmbeddings that excludes the sensitive embeddable types (SENSITIVE_TYPES). Active Record Scope

Returns:

See Also:



151
# File 'app/models/content_embedding.rb', line 151

# Drops every row whose embeddable_type is listed in SENSITIVE_TYPES.
scope :mcp_safe, lambda {
  where.not(embeddable_type: SENSITIVE_TYPES)
}

.openai_embeddingsActiveRecord::Relation<ContentEmbedding>

A relation of ContentEmbeddings that are openai embeddings. Active Record Scope

Returns:

See Also:



197
# File 'app/models/content_embedding.rb', line 197

# Rows produced with the default text model (DEFAULT_TEXT_MODEL).
scope :openai_embeddings, lambda {
  by_model(DEFAULT_TEXT_MODEL)
}

.posts_onlyActiveRecord::Relation<ContentEmbedding>

A relation of ContentEmbeddings that are posts only. Active Record Scope

Returns:

See Also:



214
# File 'app/models/content_embedding.rb', line 214

# Rows whose embeddable_type is 'Post'.
scope :posts_only, lambda {
  where(embeddable_type: 'Post')
}

.primary_contentActiveRecord::Relation<ContentEmbedding>

A relation of ContentEmbeddings that are primary content. Active Record Scope

Returns:

See Also:



158
# File 'app/models/content_embedding.rb', line 158

# Rows whose content_type is 'primary'.
scope :primary_content, lambda {
  where(content_type: 'primary')
}

.published_articlesActiveRecord::Relation<ContentEmbedding>

A relation of ContentEmbeddings that are published articles. Active Record Scope

Returns:

See Also:



201
202
203
204
205
# File 'app/models/content_embedding.rb', line 201

# Article and Post embeddings whose backing articles row is in state 'published'.
scope :published_articles, lambda {
  article_join = 'INNER JOIN articles ON articles.id = content_embeddings.embeddable_id'

  joins(article_join)
    .where(embeddable_type: %w[Article Post])
    .where(articles: { state: 'published' })
}

.published_onlyActiveRecord::Relation<ContentEmbedding>

A relation of ContentEmbeddings that are published only. Active Record Scope

Returns:

See Also:



238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
# File 'app/models/content_embedding.rb', line 238

# Keeps rows whose backing record is published/active, using one EXISTS
# check per content family:
#   Article/Post -> articles.state = 'published'
#   Showcase     -> showcases.state = 'published'
#   Video/Image  -> digital_assets.inactive = false
#   ReviewsIo    -> reviews_io.status = 'active'
#   Item         -> items.is_discontinued = false
# Any embeddable_type outside those families passes through unchecked.
scope :published_only, lambda {
  visibility_sql = <<~SQL.squish
    (embeddable_type IN ('Article', 'Post') AND EXISTS (
      SELECT 1 FROM articles WHERE articles.id = content_embeddings.embeddable_id AND articles.state = 'published'
    ))
    OR (embeddable_type = 'Showcase' AND EXISTS (
      SELECT 1 FROM showcases WHERE showcases.id = content_embeddings.embeddable_id AND showcases.state = 'published'
    ))
    OR (embeddable_type IN ('Video', 'Image') AND EXISTS (
      SELECT 1 FROM digital_assets WHERE digital_assets.id = content_embeddings.embeddable_id AND digital_assets.inactive = false
    ))
    OR (embeddable_type = 'ReviewsIo' AND EXISTS (
      SELECT 1 FROM reviews_io WHERE reviews_io.id = content_embeddings.embeddable_id AND reviews_io.status = 'active'
    ))
    OR (embeddable_type = 'Item' AND EXISTS (
      SELECT 1 FROM items WHERE items.id = content_embeddings.embeddable_id AND items.is_discontinued = false
    ))
    OR embeddable_type NOT IN ('Article', 'Post', 'Showcase', 'Video', 'Image', 'ReviewsIo', 'Item')
  SQL

  where(visibility_sql)
}

.published_showcasesActiveRecord::Relation<ContentEmbedding>

A relation of ContentEmbeddings that are published showcases. Active Record Scope

Returns:

See Also:



217
218
219
220
221
# File 'app/models/content_embedding.rb', line 217

# Showcase embeddings whose backing showcases row is in state 'published'.
scope :published_showcases, lambda {
  showcase_join = 'INNER JOIN showcases ON showcases.id = content_embeddings.embeddable_id'

  joins(showcase_join)
    .where(embeddable_type: 'Showcase')
    .where(showcases: { state: 'published' })
}

.recent_firstActiveRecord::Relation<ContentEmbedding>

A relation of ContentEmbeddings that are recent first. Active Record Scope

Returns:

See Also:



175
# File 'app/models/content_embedding.rb', line 175

# Orders rows newest-first by created_at.
scope :recent_first, lambda {
  order(created_at: :desc)
}

.semantic_search(query, limit: 10, types: nil, locale: 'en', published_only: true, min_similarity: SEMANTIC_SIMILARITY_THRESHOLD, exclude_sensitive: true) ⇒ Object



295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
# File 'app/models/content_embedding.rb', line 295

# Semantic (vector) search across content types.
#
# A search restricted to exactly one type is delegated to that type's
# partition class (ContentEmbedding::<Type>Embedding) when one exists, so the
# partition can use its own embedding model. Every other search embeds the
# query with the default text model, which does not suit Image rows.
#
# @param query [String] natural language search query
# @param limit [Integer] maximum number of results
# @param types [Array<String>, String, nil] embeddable types to search
# @param locale [String] locale filter
# @param published_only [Boolean] only return published/active content
# @param min_similarity [Float] minimum similarity (0.0-1.0); 0 disables the filter
# @param exclude_sensitive [Boolean] drop SENSITIVE_TYPES from the results
# @return [ActiveRecord::Relation] embeddings ordered by cosine distance
def self.semantic_search(query, limit: 10, types: nil, locale: 'en', published_only: true, min_similarity: SEMANTIC_SIMILARITY_THRESHOLD, exclude_sensitive: true)
  return none if query.blank?

  types_array = Array(types).flatten.compact

  # If searching a single type, delegate to the partition class for correct embedding model
  if types_array.size == 1
    only_type = types_array.first.to_s

    # The delegation below returns before mcp_safe is applied, so enforce the
    # sensitive-type exclusion here. This matches the non-delegated path,
    # where mcp_safe combined with by_type(<sensitive type>) matches nothing.
    return none if exclude_sensitive && SENSITIVE_TYPES.include?(only_type)

    partition_class = "ContentEmbedding::#{only_type}Embedding".safe_constantize
    if partition_class&.respond_to?(:semantic_search)
      # NOTE(review): exclude_sensitive is not forwarded because the partition
      # classes' signature is not visible from here - confirm whether an
      # explicit exclude_sensitive: false should be passed through.
      return partition_class.semantic_search(query, limit: limit, locale: locale, published_only: published_only, min_similarity: min_similarity)
    end
  end

  # Multi-type or no-type search: use OpenAI embeddings (won't work for Image)
  if types_array.include?('Image')
    Rails.logger.warn '[ContentEmbedding] Multi-type search including Image will not work correctly. Use ImageEmbedding.semantic_search for images.'
  end

  query_embedding = generate_query_embedding(query)
  return none unless query_embedding

  # Cosine distance: 0 = identical, 1 = orthogonal, 2 = opposite
  # Convert similarity threshold to max distance: distance = 1 - similarity
  max_distance = 1.0 - min_similarity

  # nearest_neighbors adds the ORDER BY distance clause
  scope = nearest_neighbors(:embedding, query_embedding, distance: :cosine)

  # Filter by max distance using pgvector's cosine distance operator <=>.
  # sanitize_sql_array quotes the vector's text form and the threshold.
  if min_similarity.positive?
    vector_literal = "[#{query_embedding.join(',')}]"
    scope = scope.where(
      sanitize_sql_array(['embedding <=> ?::vector <= ?', vector_literal, max_distance])
    )
  end

  # Exclude sensitive types (e.g., CallRecord) unless explicitly opted out.
  # MCP and public-facing searches should always exclude sensitive data.
  scope = scope.mcp_safe if exclude_sensitive

  scope = scope.by_type(types) if types.present?
  scope = scope.for_locale(locale)
  scope = scope.published_only if published_only
  scope.limit(limit).includes(:embeddable)
end

.unified_search(query, model: UNIFIED_MODEL, limit: 10, types: nil, locale: 'en', published_only: true) ⇒ ActiveRecord::Relation

Semantic search using unified embeddings with model-specific partial indexes.
This method supports progressive migration between embedding models.

Examples:

Search using Gemini Embedding 2

ContentEmbedding.unified_search("bathroom floor heating", model: 'gemini-embedding-2-preview')

Search using OpenAI embeddings

ContentEmbedding.unified_search("heated driveway", model: 'text-embedding-3-small')

Parameters:

  • query (String)

    Natural language search query

  • model (String) (defaults to: UNIFIED_MODEL)

    Embedding model to search (determines which partial index to use)

  • limit (Integer) (defaults to: 10)

    Maximum number of results (default: 10)

  • types (Array<String>, nil) (defaults to: nil)

    Filter by embeddable types

  • locale (String) (defaults to: 'en')

    Locale for content filtering (default: 'en')

  • published_only (Boolean) (defaults to: true)

    Only return published/active content (default: true)

Returns:

  • (ActiveRecord::Relation)

    Embeddings ordered by similarity

Raises:

  • (ArgumentError)


539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
# File 'app/models/content_embedding.rb', line 539

# Semantic search over the unified_embedding column for a specific model.
#
# The query vector is quoted through sanitize_sql_array rather than spliced
# into the SQL inside hand-written quotes, and the distance expression is
# built once and reused for both SELECT and ORDER BY.
#
# @param query [String] natural language search query
# @param model [String] embedding model to search; must be a key of EMBEDDING_MODELS
# @param limit [Integer] maximum number of results
# @param types [Array<String>, nil] embeddable types to search
# @param locale [String] locale filter
# @param published_only [Boolean] only return published/active content
# @return [ActiveRecord::Relation] embeddings ordered by cosine distance, each
#   carrying the distance as +neighbor_distance+
# @raise [ArgumentError] when +model+ is not listed in EMBEDDING_MODELS
def self.unified_search(query, model: UNIFIED_MODEL, limit: 10, types: nil, locale: 'en', published_only: true)
  return none if query.blank?

  model_config = EMBEDDING_MODELS[model]
  raise ArgumentError, "Unknown embedding model: #{model}" unless model_config

  dimensions = model_config[:dimensions]

  # Generate query embedding using appropriate service
  query_embedding = generate_unified_query_embedding(query, model: model, dimensions: dimensions)
  return none unless query_embedding

  # Explicit cast to vector(N): per the pgvector docs, queries must cast to
  # vector(N) to use expression indexes.
  # https://github.com/pgvector/pgvector#can-i-store-vectors-with-different-dimensions-in-the-same-column
  # dimensions comes from the frozen EMBEDDING_MODELS constant; the vector
  # literal is bound and quoted by sanitize_sql_array.
  distance_sql = sanitize_sql_array(
    ["unified_embedding::vector(#{dimensions}) <=> ?", "[#{query_embedding.join(',')}]"]
  )

  scope = by_model(model)
          .with_unified_embedding
          .select("#{table_name}.*, #{distance_sql} AS neighbor_distance")
          .order(Arel.sql(distance_sql))

  scope = scope.by_type(types) if types.present?
  scope = scope.for_locale(locale)
  scope = scope.published_only if published_only
  scope.limit(limit).includes(:embeddable)
end

.unified_visual_search(query, model: UNIFIED_MODEL, limit: 10) ⇒ ActiveRecord::Relation

Visual search using unified embeddings (cross-modal: text → image)
Uses Gemini Embedding 2 which embeds text and images in the same semantic space

Parameters:

  • query (String)

    Text description of desired images

  • model (String) (defaults to: UNIFIED_MODEL)

    Embedding model (should be multimodal)

  • limit (Integer) (defaults to: 10)

    Maximum results

Returns:

  • (ActiveRecord::Relation)

    Image embeddings ordered by similarity

Raises:

  • (ArgumentError)


614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
# File 'app/models/content_embedding.rb', line 614

# Text-to-image search over the unified_embedding column.
#
# The query vector is quoted through sanitize_sql_array rather than spliced
# into the SQL inside hand-written quotes, and the distance expression is
# built once and reused for both SELECT and ORDER BY.
#
# @param query [String] text description of the desired images
# @param model [String] embedding model; must be a key of EMBEDDING_MODELS
# @param limit [Integer] maximum number of results
# @return [ActiveRecord::Relation] Image embeddings ordered by cosine
#   distance, each carrying the distance as +neighbor_distance+
# @raise [ArgumentError] when +model+ is not listed in EMBEDDING_MODELS
def self.unified_visual_search(query, model: UNIFIED_MODEL, limit: 10)
  return none if query.blank?

  model_config = EMBEDDING_MODELS[model]
  raise ArgumentError, "Unknown embedding model: #{model}" unless model_config

  dimensions = model_config[:dimensions]

  # Generate query embedding
  query_embedding = generate_unified_query_embedding(query, model: model, dimensions: dimensions)
  return none unless query_embedding

  # Explicit cast to vector(N): per the pgvector docs, queries must cast to
  # vector(N) to use expression indexes. dimensions comes from the frozen
  # EMBEDDING_MODELS constant; the vector literal is bound and quoted.
  distance_sql = sanitize_sql_array(
    ["unified_embedding::vector(#{dimensions}) <=> ?", "[#{query_embedding.join(',')}]"]
  )

  # Search only images with this model's embeddings
  by_model(model)
    .where(embeddable_type: 'Image')
    .with_unified_embedding
    .select("#{table_name}.*, #{distance_sql} AS neighbor_distance")
    .order(Arel.sql(distance_sql))
    .limit(limit)
    .includes(:embeddable)
end

.with_unified_embeddingActiveRecord::Relation<ContentEmbedding>

A relation of ContentEmbeddings that have a unified embedding (unified_embedding is not NULL). Active Record Scope

Returns:

See Also:



185
# File 'app/models/content_embedding.rb', line 185

# Rows whose unified_embedding column is not NULL.
scope :with_unified_embedding, lambda {
  where.not(unified_embedding: nil)
}

Instance Method Details

#embeddableEmbeddable

Returns:

  • (Embeddable)

See Also:



135
# File 'app/models/content_embedding.rb', line 135

# Polymorphic owner of this embedding, resolved through the
# embeddable_type / embeddable_id columns.
belongs_to :embeddable, polymorphic: true

#similarity_scoreObject

Calculate similarity score (0-1, higher is more similar)



641
642
643
644
645
646
# File 'app/models/content_embedding.rb', line 641

# Convert this record's cosine distance into a similarity score.
#
# @return [Float, nil] 1.0 - distance / 2.0 (cosine distance 0..2 mapped onto
#   similarity 1..0); nil when the record does not respond to
#   neighbor_distance or when that distance is nil
def similarity_score
  return nil unless respond_to?(:neighbor_distance)

  distance = neighbor_distance
  # A missing distance has no meaningful similarity; avoid nil / 2.0.
  return nil if distance.nil?

  # Cosine distance is 0-2, convert to similarity 0-1
  1.0 - (distance / 2.0)
end