Class: Cache::SiteCrawler

Inherits:
BaseService
  • Object
show all
Defined in:
app/services/cache/site_crawler.rb

Overview

Crawls site pages to warm the cache and extract content for semantic search.
Renamed from CacheWarmer to reflect its expanded responsibilities.

Constant Summary collapse

USER_AGENT =
'Heatwave Caching Agent'
SKIP_CONTENT_EXTRACTION_CATEGORIES =

Low-value page types excluded from content extraction. These aren't
embedded (only static_page is) and are skipped in batch SEO analysis.
Crawling still runs for cache warming, schema, and link graph.

%w[product showcase support post_tag author towel_warmer_filter floor_plan form].freeze
REMOVE_SELECTORS =

Content extraction selectors to remove (navigation, footers, etc.)

%w[
  script style nav footer header noscript iframe
  .navbar .footer .nav .sidebar .menu .breadcrumb
  .cookie-banner .modal .popup [role="navigation"]
].freeze
CONTENT_SELECTORS =

Selectors to try for main content (in order of preference)

%w[main article .content .main-content #content #main].freeze

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ SiteCrawler

Returns a new instance of SiteCrawler.



23
24
25
26
27
# File 'app/services/cache/site_crawler.rb', line 23

def initialize(options = {})
  # Build the SSL context once and reuse it for every crawl request.
  context = OpenSSL::SSL::SSLContext.new
  # NOTE(review): verification is disabled in development only — presumably to
  # tolerate self-signed local certificates; confirm that is the intent.
  context.verify_mode = OpenSSL::SSL::VERIFY_NONE if Rails.env.development?
  @ssl_context = context
  super
end

Instance Method Details

#extract_static_pages(locale: nil) ⇒ Object

Convenience method to crawl and extract content for static pages only



118
119
120
# File 'app/services/cache/site_crawler.rb', line 118

# Convenience wrapper: crawl only static pages and store their extracted
# content (optionally narrowed to a single locale).
def extract_static_pages(locale: nil)
  process(extract_content: true, category: 'static_page', locale: locale)
end

#process(category: nil, url: nil, locale: nil, pages: nil, last_status_update_older_than: nil, extract_content: false) ⇒ Hash

Crawl pages to warm cache and optionally extract content

Parameters:

  • category (String) (defaults to: nil)

    Filter by sitemap category

  • url (String) (defaults to: nil)

    URL pattern to match (supports SQL LIKE wildcards)

  • locale (String) (defaults to: nil)

    Filter by locale

  • pages (ActiveRecord::Relation) (defaults to: nil)

    Specific pages to process

  • last_status_update_older_than (Time) (defaults to: nil)

    Only process pages not updated since

  • extract_content (Boolean) (defaults to: false)

    Whether to extract and store page content

Returns:

  • (Hash)

    Results keyed by URL



38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# File 'app/services/cache/site_crawler.rb', line 38

# Crawl pages to warm the cache and optionally extract content.
#
# @param category [String, nil] filter by sitemap category
# @param url [String, nil] URL pattern to match (supports SQL LIKE wildcards)
# @param locale [String, nil] filter by locale
# @param pages [ActiveRecord::Relation, nil] explicit pages to process;
#   when given, all other filters are ignored
# @param last_status_update_older_than [Time, nil] only process pages whose
#   status has never been recorded or is stale relative to this time
# @param extract_content [Boolean] whether to extract and store page content
# @return [Hash] HTTP status code (or error string) keyed by URL
def process(category: nil, url: nil, locale: nil, pages: nil,
            last_status_update_older_than: nil, extract_content: false)
  results = {}

  if pages.nil?
    pages = SiteMap.cacheable
    pages = pages.where(locale: locale) if locale
    pages = pages.where(category: category) if category
    pages = pages.where(SiteMap[:path].matches(url)) if url
    if last_status_update_older_than
      pages = pages.where(
        'site_maps.last_status_datetime IS NULL OR ' \
        '(site_maps.last_status_datetime IS NOT NULL AND site_maps.last_status_datetime <= ?)',
        last_status_update_older_than
      )
    end
  end

  begin
    # BUGFIX: the previous version assigned a persistent client here but then
    # issued every request via HTTP.headers(...).get(...), opening a brand-new
    # connection per page and never using `http`. Route all requests through
    # the keep-alive client so the connection is actually reused.
    http = HTTP.persistent(WEB_URL).headers('User-Agent' => USER_AGENT)

    pages.find_each do |page|
      logger.info "HTTP GET -> #{page.url}"
      begin
        http_res = http.get(page.url, ssl_context: @ssl_context)
        logger.info "Result: #{http_res.code}"
        results[page.url] = http_res.code

        updates = { last_status: http_res.code, last_status_datetime: Time.current }

        # Always drain the body: http.rb cannot reuse a persistent socket
        # while a previous response body is left unread.
        html = http_res.body.to_s

        if http_res.code == 200
          # Extract content for static pages when enabled
          if extract_content && should_extract_content?(page)
            extracted = extract_page_content(html)
            updates.merge!(
              extracted_title: extracted[:title]&.truncate(255),
              extracted_content: extracted[:content],
              extracted_at: Time.current
            )
            logger.info "  Extracted #{extracted[:content]&.length || 0} chars of content"
          end

          # Always extract rendered JSON-LD schema from every page
          schemas = extract_json_ld_schemas(html)
          if schemas.present?
            updates[:rendered_schema] = schemas
            updates[:rendered_schema_at] = Time.current
            schema_types = schemas.flat_map { |s| Array(s['@type']) }.compact.uniq
            logger.info "  Schema: #{schema_types.join(', ')} (#{schemas.size} block#{'s' if schemas.size > 1})"
          end

          # Always extract and persist internal link graph; a failure here
          # must not abort the crawl of the remaining pages.
          begin
            links = extract_internal_links_with_type(html)
            if links.any?
              SiteMapLink.upsert_for_page!(page, links)
              editorial_count = links.count { |l| l[:link_type] == 'editorial' }
              logger.info "  Links: #{links.size} total (#{editorial_count} editorial)"
            end
          rescue StandardError => e
            logger.warn "  Link extraction failed: #{e.message}"
          end
        end

        # update_columns skips validations/callbacks on purpose: this is a
        # high-volume status write, not a domain change.
        page.update_columns(updates)
      rescue StandardError => e
        # Record the failure against the page and keep crawling the rest.
        results[page.url] = e.to_s
        page.update_columns(last_status: e.to_s, last_status_datetime: Time.current)
      end
    end
  ensure
    http&.close
  end

  results
end