Class: CatalogItemUrlWorker

Inherits:
Object
  • Object
show all
Includes:
Sidekiq::Job
Defined in:
app/workers/catalog_item_url_worker.rb

Constant Summary collapse

HEADERS =

The following headers were taken from a request captured in the Chrome inspector and replayed as a curl request.

{
  # Each entry is copied onto the GET request built by http_get.
  # The User-Agent value is the Googlebot crawler identification string.
  'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
}

Instance Method Summary collapse

Instance Method Details

#http_get(url) ⇒ Object

Using net/http instead of curb for better WebMock compatibility



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# File 'app/workers/catalog_item_url_worker.rb', line 14

# Issues a single GET request to +url+ with every HEADERS entry applied and
# returns the raw response. Redirects are not followed here.
#
# net/http is used (rather than curb) for better WebMock compatibility.
#
# @param url [String] absolute http or https URL to fetch
# @return [Net::HTTPResponse] the response as returned by Net::HTTP#request
def http_get(url)
  require 'net/http'
  require 'uri'

  target = URI(url)

  # Configure the connection in one place: TLS only for https targets, and a
  # 10 second limit for both opening the socket and reading the response.
  client = Net::HTTP.new(target.host, target.port).tap do |conn|
    conn.use_ssl = target.scheme == 'https'
    conn.open_timeout = 10
    conn.read_timeout = 10
  end

  get = Net::HTTP::Get.new(target)
  HEADERS.each_pair do |name, value|
    get[name] = value
  end

  client.request(get)
end

#perform(catalog_item_id) ⇒ Object



39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'app/workers/catalog_item_url_worker.rb', line 39

# Checks whether a catalog item's URL responds with HTTP 200 and stores the
# outcome in the item's url_valid / url_last_checked columns.
#
# The check is skipped (nothing is written) when the item is flagged with
# skip_url_checks, has no URL, or has a successful retailer probe created
# within the last 7 days.
#
# @param catalog_item_id id of the CatalogItem to check
# @return [Boolean, nil] true when the response code is 200, false for any
#   other response or when the request raised; nil when the check was skipped
# @raise [ActiveRecord::RecordNotFound] when no CatalogItem has the given id
def perform(catalog_item_id)
  catalog_item = CatalogItem.find(catalog_item_id)

  # Opted-out items are skipped quietly. Filtering them out in the query
  # (where.not(...).find) made the lookup raise RecordNotFound for them, which
  # fails the job and triggers retries; the SQL "!=" comparison also dropped
  # rows whose flag is NULL.
  return if catalog_item.skip_url_checks?
  return unless catalog_item.url.present?

  # Skip items that have a recent successful Oxylabs probe (within 7 days).
  # Probes use JS rendering and geo-location, making their url_valid result
  # authoritative. A naive HTTP GET overwrites it with false negatives on
  # React/SPA sites (e.g. Costco) that block simple HTTP requests.
  if catalog_item.retailer_probes.where(status: 'success').where('created_at > ?', 7.days.ago).exists?
    logger.info "Skipping URL check for #{catalog_item.id} — recent successful probe exists"
    return
  end

  res = nil
  begin
    res = http_get(catalog_item.url)
    logger.info "#{catalog_item.url} result was #{res.code}"
  rescue StandardError => e
    # Best effort: a failed request is logged and reported, then recorded as
    # an invalid URL below rather than failing the job.
    logger.warn "Exception while retrieving url #{catalog_item.url}. #{e}"
    ErrorReporting.warning(e, source: :background, catalog_item_id: catalog_item.id, url: catalog_item.url)
  end

  # res is nil when the request raised; the response code is a String.
  valid = res&.code&.start_with?('200') || false
  # update_columns writes directly, bypassing validations and callbacks.
  catalog_item.update_columns(url_valid: valid, url_last_checked: Time.current)
  valid
end

#test_costco ⇒ Object

If your http_get works against costco it will work against most anyone



31
32
33
# File 'app/workers/catalog_item_url_worker.rb', line 31

# Manual smoke check: fetches a fixed Costco product page through http_get.
#
# @return [String] the HTTP status code of the response
def test_costco
  response = http_get('https://www.costco.ca/warmlyyours-riviera-towel-warmer.product.100802733.html')
  response.code
end

#test_walmart ⇒ Object



35
36
37
# File 'app/workers/catalog_item_url_worker.rb', line 35

# Manual smoke check: fetches a fixed Walmart product page through http_get.
#
# @return [String] the HTTP status code of the response
def test_walmart
  response = http_get('https://www.walmart.com/ip/Grande-10-Towel-Warmer-Black-Hardwired-10-Bars/595189227')
  response.code
end