Class: ImageDuplicatePair

Inherits:
ApplicationRecord show all
Defined in:
app/models/image_duplicate_pair.rb

Overview

== Schema Information

Table name: image_duplicate_pairs
Database name: primary

id :bigint not null, primary key
hamming_distance :integer not null
status :string default("pending"), not null
created_at :datetime not null
updated_at :datetime not null
image_a_id :bigint not null
image_b_id :bigint not null

Indexes

index_image_duplicate_pairs_on_hamming_distance (hamming_distance)
index_image_duplicate_pairs_on_image_a_id_and_image_b_id (image_a_id,image_b_id) UNIQUE
index_image_duplicate_pairs_on_image_b_id (image_b_id)
index_image_duplicate_pairs_on_status (status)

Constant Summary collapse

STATUSES =

Status values

%w[pending reviewed false_positive merged].freeze

Instance Attribute Summary collapse

Belongs to collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from ApplicationRecord

ransackable_associations, ransackable_attributes, ransackable_scopes, ransortable_attributes, #to_relation

Methods included from Models::EventPublishable

#publish_event

Instance Attribute Details

#hamming_distanceObject (readonly)

Validations

Validations:



28
# File 'app/models/image_duplicate_pair.rb', line 28

# Both image IDs and the hamming distance must be present for a pair to be valid.
validates :image_a_id, :image_b_id, :hamming_distance, presence: true

#image_a_idObject (readonly)

Validations

Validations:



28
# File 'app/models/image_duplicate_pair.rb', line 28

# Both image IDs and the hamming distance must be present for a pair to be valid.
validates :image_a_id, :image_b_id, :hamming_distance, presence: true

#image_b_idObject (readonly)

Validations

Validations:



28
# File 'app/models/image_duplicate_pair.rb', line 28

# Both image IDs and the hamming distance must be present for a pair to be valid.
validates :image_a_id, :image_b_id, :hamming_distance, presence: true

#statusObject (readonly)



34
# File 'app/models/image_duplicate_pair.rb', line 34

# status must be one of the values listed in STATUSES.
validates :status, inclusion: { in: STATUSES }

Class Method Details

.build_clusters(threshold: 10) ⇒ Array<Set<Integer>>

Group pairs into clusters for display
Returns groups of image IDs that are linked, directly or transitively, by pending pairs within the threshold

Parameters:

  • threshold (Integer) (defaults to: 10)

    Maximum hamming distance

Returns:

  • (Array<Set<Integer>>)

    Array of image ID sets



110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# File 'app/models/image_duplicate_pair.rb', line 110

# Group pending duplicate pairs into clusters for display.
#
# Clusters are the connected components of the graph whose edges are the
# pending pairs with hamming_distance <= threshold. Two images in the same
# cluster may therefore be linked only transitively (A~B and B~C puts A, B
# and C in one cluster even when no A~C pair exists).
#
# @param threshold [Integer] maximum hamming distance (inclusive)
# @return [Array<Set<Integer>>] sets of image IDs, each with more than one member;
#   empty array when no pending pair is within the threshold
def self.build_clusters(threshold: 10)
  pairs = within_threshold(threshold).pending.pluck(:image_a_id, :image_b_id)
  return [] if pairs.empty?

  # Union-find over image IDs: parent maps each ID to its current representative.
  parent = {}

  # Iterative find with path compression. A recursive find overflows the stack
  # on long chains (pairs 1-2, 2-3, 3-4, ... build a parent chain as long as
  # the number of images), so the chain is walked with loops instead.
  find_root = lambda do |id|
    parent[id] ||= id

    root = parent[id]
    root = parent[root] until parent[root] == root

    # Point every node on the walked path straight at the root.
    while parent[id] != root
      next_id = parent[id]
      parent[id] = root
      id = next_id
    end

    root
  end

  pairs.each do |id1, id2|
    root1 = find_root.call(id1)
    root2 = find_root.call(id2)
    parent[root1] = root2 if root1 != root2
  end

  # Group every seen ID under its root; keys are snapshotted because find_root
  # rewrites parent entries while compressing paths.
  clusters = parent.keys
                   .group_by { |id| find_root.call(id) }
                   .values
                   .map { |ids| Set.new(ids) }

  clusters.select { |cluster| cluster.size > 1 }
end

.bulk_upsert_or_update(pairs_data) ⇒ Integer

Bulk upsert pairs - inserts new pairs or updates existing ones in a single query.
Uses PostgreSQL's ON CONFLICT to efficiently handle duplicates.

Parameters:

  • pairs_data (Array<Hash>)

    Array of hashes with :id1, :id2, :distance keys

Returns:

  • (Integer)

    Number of pairs processed



74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'app/models/image_duplicate_pair.rb', line 74

# Bulk upsert pairs: inserts new pairs or updates existing ones in one query.
#
# Each pair is stored in canonical order (smaller ID in image_a_id). Entries
# that canonicalise to the same pair, such as (5, 2) and (2, 5), are collapsed
# before the upsert, with the last entry winning; PostgreSQL rejects an
# ON CONFLICT DO UPDATE statement that would touch the same row twice.
#
# @param pairs_data [Array<Hash>] hashes with :id1, :id2 and :distance keys
# @return [Integer] number of unique pairs sent to the database (0 for blank input)
def self.bulk_upsert_or_update(pairs_data)
  return 0 if pairs_data.blank?

  now = Time.current

  # Key by canonical [image_a_id, image_b_id] so repeated pairs collapse into one record.
  records_by_pair = pairs_data.each_with_object({}) do |pair, acc|
    image_a_id, image_b_id = [pair[:id1], pair[:id2]].sort
    acc[[image_a_id, image_b_id]] = {
      image_a_id: image_a_id,
      image_b_id: image_b_id,
      hamming_distance: pair[:distance],
      status: 'pending',
      created_at: now,
      updated_at: now
    }
  end
  records = records_by_pair.values

  # On conflict only hamming_distance is updated (it may change when
  # fingerprints are recalculated); an existing row keeps its status.
  # Note: Rails automatically handles updated_at, so we only specify hamming_distance
  upsert_all(
    records,
    unique_by: %i[image_a_id image_b_id],
    update_only: %i[hamming_distance]
  )

  records.size
end

.exact_matchesActiveRecord::Relation<ImageDuplicatePair>

A relation of ImageDuplicatePairs that are exact matches. Active Record Scope

Returns:

See Also:



42
# File 'app/models/image_duplicate_pair.rb', line 42

# Pairs whose hamming_distance is exactly 0.
scope :exact_matches, -> { where(hamming_distance: 0) }

.false_positivesActiveRecord::Relation<ImageDuplicatePair>

A relation of ImageDuplicatePairs that are false positives. Active Record Scope

Returns:

See Also:



39
# File 'app/models/image_duplicate_pair.rb', line 39

# Pairs whose status is 'false_positive'.
scope :false_positives, -> { where(status: 'false_positive') }

.find_or_create_pair(image_1, image_2, distance:) ⇒ ImageDuplicatePair

Find or create a pair, ensuring consistent ordering (smaller id first)

Parameters:

  • image_1 (Image, Integer)

    First image or ID

  • image_2 (Image, Integer)

    Second image or ID

  • distance (Integer)

    Hamming distance between fingerprints

Returns:



53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'app/models/image_duplicate_pair.rb', line 53

# Look up the pair for two images, creating it when it does not exist yet.
#
# The two IDs are sorted so the smaller one is always image_a_id, which makes
# the lookup independent of argument order. The distance is only assigned when
# a new record is built; an existing pair is returned as it is.
#
# @param image_1 [Image, Integer] first image or its ID
# @param image_2 [Image, Integer] second image or its ID
# @param distance [Integer] hamming distance between the fingerprints
# @return [ImageDuplicatePair] the existing or newly created pair
def self.find_or_create_pair(image_1, image_2, distance:)
  ids = [image_1, image_2].map do |image|
    image.is_a?(Image) ? image.id : image
  end

  # Canonical ordering: smaller ID first
  image_a_id, image_b_id = ids.sort
  lookup = { image_a_id: image_a_id, image_b_id: image_b_id }

  begin
    find_or_create_by!(lookup) { |pair| pair.hamming_distance = distance }
  rescue ActiveRecord::RecordNotUnique
    # Lost a race: another process inserted the pair first, so load that row
    find_by!(lookup)
  end
end

.mergedActiveRecord::Relation<ImageDuplicatePair>

A relation of ImageDuplicatePairs that are merged. Active Record Scope

Returns:

See Also:



40
# File 'app/models/image_duplicate_pair.rb', line 40

# Pairs whose status is 'merged'.
scope :merged, -> { where(status: 'merged') }

.pendingActiveRecord::Relation<ImageDuplicatePair>

A relation of ImageDuplicatePairs that are pending. Active Record Scope

Returns:

See Also:



37
# File 'app/models/image_duplicate_pair.rb', line 37

# Pairs whose status is 'pending' (the column default).
scope :pending, -> { where(status: 'pending') }

.recentActiveRecord::Relation<ImageDuplicatePair>

A relation of ImageDuplicatePairs that are recent. Active Record Scope

Returns:

See Also:



44
# File 'app/models/image_duplicate_pair.rb', line 44

# Orders pairs by created_at, newest first.
scope :recent, -> { order(created_at: :desc) }

.reviewedActiveRecord::Relation<ImageDuplicatePair>

A relation of ImageDuplicatePairs that are reviewed. Active Record Scope

Returns:

See Also:



38
# File 'app/models/image_duplicate_pair.rb', line 38

# Pairs whose status is 'reviewed'.
scope :reviewed, -> { where(status: 'reviewed') }

.within_thresholdActiveRecord::Relation<ImageDuplicatePair>

A relation of ImageDuplicatePairs that are within threshold. Active Record Scope

Returns:

See Also:



43
# File 'app/models/image_duplicate_pair.rb', line 43

# Pairs whose hamming_distance is less than or equal to the given threshold (inclusive).
scope :within_threshold, ->(threshold) { where('hamming_distance <= ?', threshold) }

Instance Method Details

#image_aImage

Associations

Returns:

See Also:



24
# File 'app/models/image_duplicate_pair.rb', line 24

# First image of the pair, an Image record (image_a_id column in the schema).
belongs_to :image_a, class_name: 'Image'

#image_bImage

Returns:

See Also:



25
# File 'app/models/image_duplicate_pair.rb', line 25

# Second image of the pair, an Image record (image_b_id column in the schema).
belongs_to :image_b, class_name: 'Image'

#mark_false_positive!Object

Mark this pair as a false positive (not actually duplicates)



147
148
149
# File 'app/models/image_duplicate_pair.rb', line 147

# Mark this pair as a false positive (the images are not actually duplicates).
#
# Sets status to 'false_positive' and saves immediately through update!,
# which raises instead of returning false when the record cannot be saved.
def mark_false_positive!
  update!(status: 'false_positive')
end

#mark_merged!Object

Mark this pair as merged (one image was merged into the other)



152
153
154
# File 'app/models/image_duplicate_pair.rb', line 152

# Mark this pair as merged (one image was merged into the other).
#
# Sets status to 'merged' and saves immediately through update!,
# which raises instead of returning false when the record cannot be saved.
def mark_merged!
  update!(status: 'merged')
end

#mark_reviewed!Object

Mark this pair as reviewed (not false positive, just seen)



142
143
144
# File 'app/models/image_duplicate_pair.rb', line 142

# Mark this pair as reviewed (seen by a reviewer, not flagged as a false positive).
#
# Sets status to 'reviewed' and saves immediately through update!,
# which raises instead of returning false when the record cannot be saved.
def mark_reviewed!
  update!(status: 'reviewed')
end