Class: Seo::LinkAnalyzer

Inherits:
BaseService show all
Defined in:
app/services/seo/link_analyzer.rb

Overview

Service object: link analyzer.

Defined Under Namespace

Classes: Result

Instance Attribute Summary collapse

Attributes inherited from BaseService

#options

Instance Method Summary collapse

Methods inherited from BaseService

#initialize, #log_debug, #log_error, #log_info, #log_warning, #logger, #tagged_logger

Constructor Details

This class inherits a constructor from BaseService

Instance Attribute Details

#html_doc ⇒ Object (readonly)

Returns the value of attribute html_doc.



4
5
6
# File 'app/services/seo/link_analyzer.rb', line 4

# Reader for the @html_doc instance variable.
#
# @return [Object, nil] the current value of @html_doc; nil if it has not
#   been assigned on this instance
def html_doc
  @html_doc
end

#html_raw ⇒ Object (readonly)

Returns the value of attribute html_raw.



4
5
6
# File 'app/services/seo/link_analyzer.rb', line 4

# Reader for the @html_raw instance variable.
#
# @return [Object, nil] the current value of @html_raw; nil if it has not
#   been assigned on this instance
def html_raw
  @html_raw
end

#link_analysis ⇒ Object (readonly)

Returns the value of attribute link_analysis.



4
5
6
# File 'app/services/seo/link_analyzer.rb', line 4

# Reader for the @link_analysis instance variable.
#
# @return [Object, nil] the current value of @link_analysis; nil if it has
#   not been assigned on this instance
def link_analysis
  @link_analysis
end

#links ⇒ Object (readonly)

Returns the value of attribute links.



4
5
6
# File 'app/services/seo/link_analyzer.rb', line 4

# Reader for the @links instance variable.
#
# @return [Object, nil] the current value of @links; nil if it has not been
#   assigned on this instance
def links
  @links
end

#locale ⇒ Object (readonly)

Returns the value of attribute locale.



4
5
6
# File 'app/services/seo/link_analyzer.rb', line 4

# Reader for the @locale instance variable.
#
# @return [Object, nil] the current value of @locale; nil if it has not been
#   assigned on this instance
def locale
  @locale
end

Instance Method Details

#build_client ⇒ Faraday::Connection

Build a fresh Faraday client for a single HEAD probe.

Each thread spawned in check_links gets its own client because the
underlying http.rb connection state isn't safe to share across
threads. Short-lived — discarded after one request.

Returns:

  • (Faraday::Connection)

    a Faraday client with 5s open timeout
    and 15s read timeout, configured against the project's default
    adapter (http.rb, set in config/initializers/faraday.rb).



46
47
48
49
50
# File 'app/services/seo/link_analyzer.rb', line 46

# Construct a new Faraday connection on every call.
#
# The connection is created with a 5 second open timeout and a 15 second
# overall request timeout, and uses whatever Faraday.default_adapter
# currently returns as its adapter.
#
# @return [Faraday::Connection] a freshly built connection
def build_client
  timeouts = { open_timeout: 5, timeout: 15 }
  Faraday.new(request: timeouts) { |connection| connection.adapter(Faraday.default_adapter) }
end


52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# File 'app/services/seo/link_analyzer.rb', line 52

# Probe every link with an HTTP HEAD request, 15 links (threads) at a time.
#
# Each probe runs in its own thread and hands its outcome back through
# Thread#value, so the result hash is only ever written from the calling
# thread and its keys follow the order of +links+.
#
# @param links [Enumerable<String>] hrefs to probe
# @return [Hash{String => Hash}] one entry per href:
#   - +{ result: <status> }+, plus +:location+ when the response carried a
#     non-blank Location header
#   - +{ result: 408 }+ on Faraday::TimeoutError
#   - +{ result: 503, exception: e }+ on Faraday::ConnectionFailed / SocketError
#   - +{ result: 500, exception: e }+ on any other StandardError
#   - +{ result: :invalid_url }+ when the href cannot be parsed
def check_links(links)
  results = {}
  links.each_slice(15) do |group| # 15 threads at a time
    probes = group.map do |href|
      uri = parse_link(href)
      if uri
        logger.info "Checking #{uri}"
        [href, Thread.new { probe_link(uri) }]
      else
        logger.warn "Bad url: #{href}"
        [href, nil]
      end
    end

    # Thread#value joins the thread, so the whole slice has finished before
    # the next slice is started.
    probes.each do |href, thread|
      results[href] = thread ? thread.value : { result: :invalid_url }
    end
  end
  results
end

# Parse +href+ with Addressable, returning nil instead of raising.
#
# @param href [String]
# @return [Addressable::URI, nil]
def parse_link(href)
  Addressable::URI.parse(href)
rescue StandardError
  nil
end

# Issue a single HEAD request for +uri+ and translate the outcome into a
# result hash. Never raises StandardError; failures are mapped to statuses.
#
# @param uri [Addressable::URI]
# @return [Hash] see #check_links for the possible shapes
def probe_link(uri)
  response = build_client.head(uri.to_s)
  outcome = { result: response.status }
  location = response.headers['Location'].presence
  outcome[:location] = resolve_redirect_location(uri, location) if location
  outcome
rescue Faraday::TimeoutError
  # 408 Request Timeout — the upstream took too long to respond.
  { result: 408 }
rescue Faraday::ConnectionFailed, SocketError => e
  # Don't conflate network errors with 404. Callers
  # (BlogContentValidator, LinkCheck) treat 404 as "page broken /
  # missing"; a connection failure / DNS / TLS hiccup is
  # transient and shouldn't trigger the same remediation. Use
  # 503 (Service Unavailable) and surface the exception so the
  # caller can decide.
  { result: 503, exception: e }
rescue StandardError => e
  { result: 500, exception: e }
end

# Turn a Location header into an absolute URL string.
#
# Absolute locations are returned as parsed. Anything else is resolved
# against +base+ as a URI reference, so network-path references
# ("//other.example/x") keep their own host and path-relative references
# ("next/page") are resolved against the base path instead of being glued
# onto the host.
#
# @param base [Addressable::URI] the URL that was requested
# @param location [String] raw Location header value
# @return [String] the resolved location, or the raw header value when it
#   cannot be parsed as a URI
def resolve_redirect_location(base, location)
  target = Addressable::URI.parse(location)
  return target.to_s if target.absolute?

  inherits_authority = target.host.nil?
  resolved = base.join(target)
  # When the authority comes from the requested URL, an explicit 80/443 port
  # is not echoed back in the resolved location.
  resolved.port = nil if inherits_authority && base.port.in?([80, 443])
  resolved.to_s
rescue Addressable::URI::InvalidURIError
  # A malformed Location must not mask the real response status; report the
  # header value untouched.
  location.to_s
end


28
29
30
31
32
33
34
35
# File 'app/services/seo/link_analyzer.rb', line 28

# Collect the distinct absolute http(s) hrefs found in <a> tags.
#
# The markup is first passed through #localize_links so "{{ locale }}"
# placeholders are substituted before parsing.
#
# @param _html_raw [String, nil] markup to scan; when nil, falls back to
#   #html_raw (the same fallback #localize_links applies), so a no-argument
#   call works on the markup stored on the instance
# @return [Array<String>] unique hrefs starting with "http", sorted; empty
#   when there is no markup to scan
def extract_links(_html_raw = nil)
  source = _html_raw || html_raw
  return [] if source.blank?

  doc = Nokogiri::HTML(localize_links(source))
  doc.css('a')
     .map { |anchor| anchor['href'].to_s }
     .select { |href| href.start_with?('http') } # also drops blank hrefs
     .uniq
     .sort
end


24
25
26
# File 'app/services/seo/link_analyzer.rb', line 24

# Substitute every "{{ locale }}" placeholder (inner whitespace optional)
# with the current locale.
#
# @param _html_raw [String, nil] markup to process; falls back to #html_raw
#   when nil
# @return [String] a copy of the markup with placeholders replaced by
#   #locale, or by 'en-US' when #locale is nil
def localize_links(_html_raw = nil)
  markup = _html_raw || html_raw
  replacement = (locale || 'en-US').to_s
  markup.gsub(/\{\{\s*locale\s*\}\}/, replacement)
end

#process(html_raw, locale: I18n.locale) ⇒ Object

Feed me raw HTML containing links and I'll tell you everything that's wrong with them.



12
13
14
15
16
17
18
19
20
21
22
# File 'app/services/seo/link_analyzer.rb', line 12

# Extract the links from +html_raw+ and probe each of them.
#
# Stores the inputs and intermediate results on the instance (@locale,
# @html_raw, @links, @link_analysis) while running inside a
# 'Seo::LinkAnalyzer' log tag.
#
# @param html_raw [String] markup whose <a> links should be checked
# @param locale [Symbol, String] locale substituted for "{{ locale }}"
#   placeholders; defaults to I18n.locale
# @return [Result] status :skipped with an empty link_analysis when
#   +html_raw+ is blank; otherwise status :ok with the value returned by
#   #check_links
def process(html_raw, locale: I18n.locale)
  return Result.new(link_analysis: [], status: :skipped) if html_raw.blank?

  logger.tagged('Seo::LinkAnalyzer') do
    @locale = locale
    @html_raw = html_raw
    # Hand the markup over explicitly rather than relying on the default
    # argument of extract_links.
    @links = extract_links(html_raw)
    @link_analysis = check_links(@links)
  end
  Result.new(link_analysis: @link_analysis, status: :ok)
end