Class: Cache::SiteCrawler

Inherits:
BaseService
  • Object
show all
Defined in:
app/services/cache/site_crawler.rb

Overview

Crawls site pages to warm the cache and extract content for semantic search.
Renamed from CacheWarmer to reflect its expanded responsibilities.

Constant Summary collapse

USER_AGENT =
'Heatwave Caching Agent'
SKIP_CONTENT_EXTRACTION_CATEGORIES =

Low-value page types excluded from content extraction. These aren't
embedded (only static_page is) and are skipped in batch SEO analysis.
Crawling still runs for cache warming, schema, and link graph.

%w[product showcase support post_tag author towel_warmer_filter floor_plan form].freeze
REMOVE_SELECTORS =

Content extraction selectors to remove (navigation, footers, etc.)

%w[
  script style nav footer header noscript iframe
  .navbar .footer .nav .sidebar .menu .breadcrumb
  .cookie-banner .modal .popup [role="navigation"]
].freeze
CONTENT_SELECTORS =

Selectors to try for main content (in order of preference)

%w[main article .content .main-content #content #main].freeze

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ SiteCrawler

Returns a new instance of SiteCrawler.



23
24
25
26
27
# File 'app/services/cache/site_crawler.rb', line 23

def initialize(options = {})
  # Build the SSL context once and reuse it for every crawl request.
  context = OpenSSL::SSL::SSLContext.new
  # NOTE(review): verification is disabled in development only — presumably to
  # tolerate self-signed local certificates; confirm that is the intent.
  context.verify_mode = OpenSSL::SSL::VERIFY_NONE if Rails.env.development?
  @ssl_context = context
  super
end

Instance Method Details

#extract_static_pages(locale: nil) ⇒ Object

Convenience method to crawl and extract content for static pages only



118
119
120
# File 'app/services/cache/site_crawler.rb', line 118

# Convenience wrapper: crawl only static pages and store their extracted
# content (optionally narrowed to a single locale).
def extract_static_pages(locale: nil)
  process(extract_content: true, category: 'static_page', locale: locale)
end

#process(category: nil, url: nil, locale: nil, pages: nil, last_status_update_older_than: nil, extract_content: false) ⇒ Hash

Crawl pages to warm cache and optionally extract content

Parameters:

  • category (String) (defaults to: nil)

    Filter by sitemap category

  • url (String) (defaults to: nil)

    URL pattern to match (supports SQL LIKE wildcards)

  • locale (String) (defaults to: nil)

    Filter by locale

  • pages (ActiveRecord::Relation) (defaults to: nil)

    Specific pages to process

  • last_status_update_older_than (Time) (defaults to: nil)

    Only process pages not updated since

  • extract_content (Boolean) (defaults to: false)

    Whether to extract and store page content

Returns:

  • (Hash)

    Results keyed by URL



38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# File 'app/services/cache/site_crawler.rb', line 38

# Crawl pages to warm the cache and optionally extract content.
#
# @param category [String, nil] filter by sitemap category
# @param url [String, nil] URL pattern to match (supports SQL LIKE wildcards)
# @param locale [String, nil] filter by locale
# @param pages [ActiveRecord::Relation, nil] explicit pages to process;
#   when given, all other filters are ignored
# @param last_status_update_older_than [Time, nil] only process pages whose
#   status has never been recorded or is stale relative to this time
# @param extract_content [Boolean] whether to extract and store page content
# @return [Hash] HTTP status code (or error string) keyed by URL
def process(category: nil, url: nil, locale: nil, pages: nil,
            last_status_update_older_than: nil, extract_content: false)
  results = {}

  if pages.nil?
    pages = SiteMap.cacheable
    pages = pages.where(locale: locale) if locale
    pages = pages.where(category: category) if category
    pages = pages.where(SiteMap[:path].matches(url)) if url
    if last_status_update_older_than
      pages = pages.where(
        'site_maps.last_status_datetime IS NULL OR ' \
        '(site_maps.last_status_datetime IS NOT NULL AND site_maps.last_status_datetime <= ?)',
        last_status_update_older_than
      )
    end
  end

  begin
    # BUGFIX: the previous version assigned a persistent client here but then
    # issued every request via HTTP.headers(...).get(...), opening a brand-new
    # connection per page and never using `http`. Route all requests through
    # the keep-alive client so the connection is actually reused.
    http = HTTP.persistent(WEB_URL).headers('User-Agent' => USER_AGENT)

    pages.find_each do |page|
      logger.info "HTTP GET -> #{page.url}"
      begin
        http_res = http.get(page.url, ssl_context: @ssl_context)
        logger.info "Result: #{http_res.code}"
        results[page.url] = http_res.code

        updates = { last_status: http_res.code, last_status_datetime: Time.current }

        # Always drain the body: http.rb cannot reuse a persistent socket
        # while a previous response body is left unread.
        html = http_res.body.to_s

        if http_res.code == 200
          # Extract content for static pages when enabled
          if extract_content && should_extract_content?(page)
            extracted = extract_page_content(html)
            updates.merge!(
              extracted_title: extracted[:title]&.truncate(255),
              extracted_content: extracted[:content],
              extracted_at: Time.current
            )
            logger.info "  Extracted #{extracted[:content]&.length || 0} chars of content"
          end

          # Always extract rendered JSON-LD schema from every page
          schemas = extract_json_ld_schemas(html)
          if schemas.present?
            updates[:rendered_schema] = schemas
            updates[:rendered_schema_at] = Time.current
            schema_types = schemas.flat_map { |s| Array(s['@type']) }.compact.uniq
            logger.info "  Schema: #{schema_types.join(', ')} (#{schemas.size} block#{'s' if schemas.size > 1})"
          end

          # Always extract and persist internal link graph; a failure here
          # must not abort the crawl of the remaining pages.
          begin
            links = extract_internal_links_with_type(html)
            if links.any?
              SiteMapLink.upsert_for_page!(page, links)
              editorial_count = links.count { |l| l[:link_type] == 'editorial' }
              logger.info "  Links: #{links.size} total (#{editorial_count} editorial)"
            end
          rescue StandardError => e
            logger.warn "  Link extraction failed: #{e.message}"
          end
        end

        # update_columns skips validations/callbacks on purpose: this is a
        # high-volume status write, not a domain change.
        page.update_columns(updates)
      rescue StandardError => e
        # Record the failure against the page and keep crawling the rest.
        results[page.url] = e.to_s
        page.update_columns(last_status: e.to_s, last_status_datetime: Time.current)
      end
    end
  ensure
    http&.close
  end

  results
end