Class: Assistant::BlogContentValidator

Inherits:
Object
  • Object
show all
Defined in:
app/services/assistant/blog_content_validator.rb

Constant Summary collapse

HEAD_CHECK_PRIVATE_RANGES =

Hosts/addresses we must never HEAD from assistant write validation (SSRF guard).

[
  IPAddr.new('127.0.0.0/8'),    # IPv4 loopback
  IPAddr.new('10.0.0.0/8'),     # RFC 1918 private network
  IPAddr.new('172.16.0.0/12'),  # RFC 1918 private network
  IPAddr.new('192.168.0.0/16'), # RFC 1918 private network
  IPAddr.new('169.254.0.0/16'), # IPv4 link-local
  IPAddr.new('0.0.0.0/32'),     # IPv4 unspecified address
  IPAddr.new('::1/128'),        # IPv6 loopback
  IPAddr.new('fe80::/10'),      # IPv6 link-local
  IPAddr.new('fc00::/7')        # IPv6 unique local addresses
].freeze

Class Method Summary collapse

Class Method Details

.detect_dropped_embeds(old_html, new_html) ⇒ Object

Compare old and new HTML for dropped embedded assets.
Returns an array of dropped marker hashes (empty = all preserved).



45
46
47
48
49
50
51
52
53
# File 'app/services/assistant/blog_content_validator.rb', line 45

# Lists embed markers found in +old_html+ whose identifier is absent from
# +new_html+.
#
# @param old_html [String, nil] HTML currently stored
# @param new_html [String, nil] HTML proposed as the replacement
# @return [Array<Hash>] markers from extract_embed_markers(old_html) that have
#   no identifier match in new_html; empty when nothing was dropped
def self.detect_dropped_embeds(old_html, new_html)
  previous = extract_embed_markers(old_html)
  # Nothing embedded before means nothing can have been dropped.
  return [] if previous.empty?

  surviving_ids = extract_embed_markers(new_html).to_set { |entry| entry[:identifier] }
  previous.select { |entry| !surviving_ids.include?(entry[:identifier]) }
end

.extract_embed_markers(html) ⇒ Object

Extract embed identifiers (data-wy-oembed UUIDs or wy-*-embed class markers)
from HTML content. Returns an array of { type:, identifier: } hashes.
Used to compare old vs new content and detect dropped assets.



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'app/services/assistant/blog_content_validator.rb', line 23

# Collects one marker per embed element found in +html+.
#
# The identifier is the first non-blank value among data-embedded-asset-uuid,
# data-image-id and data-video-id; failing those, the last path segment of the
# first nested <img src>. Elements with no identifier are skipped.
#
# @param html [String, nil] HTML fragment to scan
# @return [Array<Hash>] hashes of the form { type:, identifier: }, where type is
#   the data-wy-oembed value or, when that is absent, the class attribute
def self.extract_embed_markers(html)
  return [] if html.blank?

  selector = 'figure[data-wy-oembed], figure.wy-image-embed, figure.wy-faq-embed, ' \
             'figure.wy-product-embed, div.wy-video-embed, figure.wy-video-embed'

  Nokogiri::HTML.fragment(html).css(selector).filter_map do |node|
    identifier = node['data-embedded-asset-uuid'].presence ||
                 node['data-image-id'].presence ||
                 node['data-video-id'].presence ||
                 node.css('img[src]').first&.[]('src')&.split('/')&.last
    next unless identifier

    { type: node['data-wy-oembed'] || node['class'], identifier: identifier }
  end
end

.extract_http_anchor_hrefs_from_html(html, locale: I18n.locale) ⇒ Object

Absolute http(s) hrefs from <a> tags (same locale substitution as LinkAnalyzer).



117
118
119
120
121
122
123
# File 'app/services/assistant/blog_content_validator.rb', line 117

# Returns the distinct href values of <a> elements that begin with "http",
# after replacing every {{ locale }} placeholder in the markup with +locale+.
#
# @param html [String, nil] HTML to scan
# @param locale [Symbol, String] value substituted for the placeholder
# @return [Array<String>] unique hrefs in first-seen order; [] for blank input
def self.extract_http_anchor_hrefs_from_html(html, locale: I18n.locale)
  return [] if html.blank?

  substituted = html.to_s.gsub(/\{\{[\s]*locale[\s]*\}\}/, locale.to_s)
  anchors = Nokogiri::HTML(substituted).css('a')
  hrefs = anchors.filter_map { |node| node['href'].to_s.presence }
  hrefs.uniq.select { |candidate| candidate.start_with?('http') }
end

.guard_solution_body!(old_html:, new_html:, post_id: nil, skip_embed_check: false) ⇒ nil, String

Single guardian pipeline for blog body HTML (links, embeds, coherence).

Parameters:

  • old_html (String)

    previous solution; empty for create_blog_post

  • new_html (String)

    proposed solution

Returns:

  • (nil, String)

    nil if OK, else JSON error string for the tool response



280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
# File 'app/services/assistant/blog_content_validator.rb', line 280

# Runs every body-level safety check in order and stops at the first failure.
#
# Order: blank/thin body, catastrophic content loss, dropped embeds (unless
# skipped), internal links, Blog::ContentRules (when defined), external links.
#
# @param old_html [String, nil] previously saved body; blank when creating
# @param new_html [String, nil] proposed body
# @param post_id [Integer, nil] interpolated into recovery hints when present
# @param skip_embed_check [Boolean] bypasses the dropped-embed comparison
# @return [nil, String] nil when every check passes, otherwise a JSON error string
def self.guard_solution_body!(old_html:, new_html:, post_id: nil, skip_embed_check: false)
  previous = old_html.to_s
  proposed = new_html.to_s

  if proposed.blank?
    return reject_thin_new_body(proposed) if previous.blank?

    refresh_hint = if post_id.present?
                     "Call get_blog_post(#{post_id}) first to load the full current HTML."
                   else
                     'Fetch the full current content before editing.'
                   end
    return {
      error: 'Content safety check failed: new solution is blank — save rejected.',
      hint: refresh_hint
    }.to_json
  end

  # Brand-new bodies must clear the minimum-content bar.
  if previous.blank?
    thin_error = reject_thin_new_body(proposed)
    return thin_error if thin_error
  end

  loss_error = reject_catastrophic_content_loss(previous, proposed, post_id: post_id)
  return loss_error if loss_error

  unless skip_embed_check
    missing = detect_dropped_embeds(previous, proposed)
    if missing.any?
      response = {
        error: "Embedded asset safety check failed: #{missing.size} embedded " \
               'asset(s) from the current content are missing in the new HTML — save rejected.',
        dropped_assets: missing
      }
      if post_id.present?
        response[:hint] = "Call get_blog_post(#{post_id}) to refresh the current content, " \
                          'then preserve all <figure> blocks exactly as-is.'
      end
      return response.to_json
    end
  end

  internal_error = validate_internal_links(proposed)
  return internal_error if internal_error

  # Content rules are evaluated before the external link check so a body that
  # already violates a rule is rejected without making any HEAD requests.
  if defined?(Blog::ContentRules)
    rule_violations = Blog::ContentRules.validate(scope: :post_body, html: proposed)
    if rule_violations.any?
      return {
        error: Blog::ContentRules.format_error(rule_violations, scope: :post_body),
        content_rule_violations: rule_violations,
        hint: 'See Blog::ContentRules for the canonical list of server-enforced rules.',
        recovery: 'content_only_validation_failure'
      }.to_json
    end
  end

  external_error = validate_external_citation_links(proposed)
  return external_error if external_error

  nil
end

.head_check_blocked_uri?(uri) ⇒ Boolean

True when URI must not be probed with HEAD (SSRF / internal targets).

Returns:

  • (Boolean)


126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# File 'app/services/assistant/blog_content_validator.rb', line 126

# Decides whether +uri+ must be excluded from HEAD probing (SSRF guard).
#
# Blocked: a blank host, "localhost", "0.0.0.0", names ending in ".local" or
# ".internal", and IP literals inside HEAD_CHECK_PRIVATE_RANGES. A single
# trailing root dot is ignored, and IPv4-mapped IPv6 literals are judged by
# their embedded IPv4 address.
#
# @param uri [#host] parsed URI
# @return [Boolean] true when the target must not be probed
def self.head_check_blocked_uri?(uri)
  # "localhost." and "127.0.0.1." name the same targets as their dot-less
  # forms, so drop the root dot before any comparison.
  host = uri.host.to_s.downcase.chomp('.')
  return true if host.blank?

  return true if host == 'localhost' || host == '0.0.0.0'
  return true if host.end_with?('.local') || host.end_with?('.internal')

  begin
    addr = IPAddr.new(host)
    # "::ffff:10.0.0.1" is an IPv6 literal carrying an IPv4 address; compare
    # the IPv4 form so the IPv4 ranges apply regardless of ipaddr version.
    addr = addr.native if addr.ipv4_mapped?
    return true if HEAD_CHECK_PRIVATE_RANGES.any? { |range| range.include?(addr) }
  rescue IPAddr::Error
    # Non-literal hostname — OK to probe (DNS resolution happens in HTTP client).
  end

  false
end

.reject_catastrophic_content_loss(old_html, new_html, post_id:) ⇒ Object

Reject updates that wipe most of the article (truncation / stale compacted context).



231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
# File 'app/services/assistant/blog_content_validator.rb', line 231

# Rejects an update that shrinks the body drastically.
#
# Two triggers:
# 1. the old HTML exceeds 2,000 characters and the new HTML is under 25% of it;
# 2. visible text falls from more than 400 characters to fewer than 40 while
#    the new HTML is under 35% of the old HTML length.
#
# @param old_html [String, nil] current body
# @param new_html [String, nil] proposed body
# @param post_id [Integer, nil] interpolated into the recovery hint when present
# @return [nil, String] nil when neither trigger fires, else a JSON error string
def self.reject_catastrophic_content_loss(old_html, new_html, post_id:)
  previous = old_html.to_s
  proposed = new_html.to_s
  previous_size = previous.length
  proposed_size = proposed.length
  refresh_hint = if post_id.present?
                   "Call get_blog_post(#{post_id}) first to load the full current HTML."
                 else
                   'Fetch the full current content before editing.'
                 end

  if previous_size > 2_000 && proposed_size < (previous_size * 0.25)
    return {
      error: "Content safety check failed: new solution (#{proposed_size} chars) " \
             "is less than 25% of current content (#{previous_size} chars). " \
             'This usually means stale or summarized context — the save was rejected.',
      hint: refresh_hint
    }.to_json
  end

  previous_visible = visible_text_from_html(previous)
  proposed_visible = visible_text_from_html(proposed)
  text_wiped = previous_visible > 400 && proposed_visible < 40
  return nil unless text_wiped && proposed_size < (previous_size * 0.35)

  {
    error: 'Content coherence check failed: the update removes almost all readable text ' \
           'while shrinking the HTML substantially — save rejected.',
    old_visible_text_chars: previous_visible,
    new_visible_text_chars: proposed_visible,
    hint: refresh_hint
  }.to_json
end

.reject_thin_new_body(new_html) ⇒ Object

New drafts must have either enough visible copy or enough HTML (e.g. embed-heavy stubs).



263
264
265
266
267
268
269
270
271
272
273
274
# File 'app/services/assistant/blog_content_validator.rb', line 263

# Rejects a new draft that has neither enough visible text nor enough markup.
#
# A body passes when it has at least 25 visible characters, or when it is
# non-blank and at least 200 characters long (e.g. embed-heavy stubs).
#
# @param new_html [String, nil] proposed body
# @return [nil, String] nil when the body is acceptable, else a JSON error string
def self.reject_thin_new_body(new_html)
  new_html = new_html.to_s
  txt = visible_text_from_html(new_html)
  return nil if txt >= 25
  # The length escape hatch exists for markup-heavy bodies. A body made only
  # of whitespace is still blank however long it is, so it must not qualify.
  return nil if new_html.present? && new_html.length >= 200

  {
    error: 'Content too thin to save: add at least a short paragraph of body copy, ' \
           'or more structured HTML/embeds — draft was rejected.',
    visible_text_chars: txt
  }.to_json
end

.unescape_solution(html) ⇒ Object

LLMs (especially Gemini) sometimes paste oEmbed HTML from tool results
with residual JSON escaping: \" instead of " and \u003e instead of >.
This creates broken HTML when saved to the database. Strip these artifacts.



58
59
60
61
62
63
64
65
66
67
# File 'app/services/assistant/blog_content_validator.rb', line 58

# Replaces leftover JSON escape sequences in +html+ with the characters they
# stand for. Blank input is returned unchanged.
#
# @param html [String, nil] markup that may contain literal escape sequences
# @return [String, nil] the markup with the listed sequences substituted
def self.unescape_solution(html)
  return html if html.blank?

  # Applied one after another, top to bottom. The left-hand side is the
  # literal backslash sequence as it appears in the text.
  substitutions = [
    ['\\"', '"'],
    ['\\u003c', '<'],
    ['\\u003e', '>'],
    ['\\u0026', '&'],
    ['\\u003d', '='],
    ['\\u0027', "'"],
    ['\n', "\n"],
    ['\t', "\t"]
  ]

  substitutions.reduce(html) { |text, (escaped, literal)| text.gsub(escaped, literal) }
end

Populate the editorial link graph for an article after a successful save.
Non-fatal: logs errors but never blocks the save response.



383
384
385
386
387
# File 'app/services/assistant/blog_content_validator.rb', line 383

# Hands +article+ to Seo::InternalLinkValidator.upsert_editorial_links! and
# logs a warning instead of raising when that call fails.
#
# @param article [Object, nil] record whose id is included in the warning
# @return [Object] the upsert result, or the logger's return value on failure
def self.upsert_link_graph(article)
  Seo::InternalLinkValidator.upsert_editorial_links!(article)
rescue StandardError => e
  # Safe navigation keeps the handler itself from raising when article is nil,
  # which would otherwise turn a logged failure into an exception for the caller.
  Rails.logger.warn "[BlogContentValidator] Link graph upsert failed for Article##{article&.id}: #{e.message}"
end

HEAD-check absolute http(s) links to third-party hosts (not warmlyyours.com).
WarmlyYours URLs are already covered by validate_internal_links.
Filters disallowed targets before any network I/O to avoid SSRF on the write path.

Returns:

  • (nil, String)

    nil if OK, or JSON error string (same contract as validate_internal_links)



148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
# File 'app/services/assistant/blog_content_validator.rb', line 148

# Validates absolute http(s) anchors that point outside warmlyyours.com.
#
# Each href is classified before any network call. Unparseable URLs and
# targets rejected by head_check_blocked_uri? fail the save immediately; the
# remaining hrefs are passed to Seo::LinkAnalyzer#check_links and any result
# outside 200..399 is reported as broken.
#
# @param html [String, nil] body HTML
# @return [nil, String] nil when every link is acceptable, else a JSON error string
def self.validate_external_citation_links(html)
  return nil if html.blank?

  hrefs = extract_http_anchor_hrefs_from_html(html)
  return nil if hrefs.empty?

  disallowed = []
  to_check = []

  hrefs.each do |href|
    uri = begin
      Addressable::URI.parse(href)
    rescue StandardError
      nil
    end

    unless uri
      disallowed << { href: href, reason: 'unparseable URL' }
      next
    end

    next unless %w[http https].include?(uri.scheme.to_s.downcase)

    # First-party URLs are skipped here and never probed.
    next if uri.host.to_s.match?(/\A(.+\.)?warmlyyours\.com\z/i)

    if head_check_blocked_uri?(uri)
      disallowed << {
        href: href,
        reason: 'URL targets localhost, private, or link-local address — not allowed in assistant content'
      }
      next
    end

    to_check << href
  end

  if disallowed.any?
    return {
      error: 'Disallowed link target(s) — save rejected. Only public third-party http(s) URLs may be linked.',
      disallowed_hrefs: disallowed,
      recovery: 'content_only_validation_failure'
    }.to_json
  end

  return nil if to_check.empty?

  analyzer = Seo::LinkAnalyzer.new
  link_analysis = analyzer.check_links(to_check.uniq)

  broken = []
  link_analysis.each do |href, check|
    outcome = check[:result]
    code = case outcome
           when Integer then outcome
           # Covers :invalid_url and any other symbolic result. Symbol has no
           # #to_i, so it must not reach the else branch.
           when Symbol then 0
           else outcome.to_i
           end
    next if code.in?(200..399)

    entry = { href: href, http_status: code }
    entry[:redirect] = check[:location] if check[:location].present?
    broken << entry
  end

  return nil if broken.empty?

  {
    error: 'Broken or unreachable external link(s) detected — the content was NOT saved. ' \
           "Fix or remove #{broken.size} third-party link(s) before saving:",
    broken_external_links: broken,
    hint: 'Use fetch_url to verify a URL before linking. If the server blocks automated requests ' \
          '(403) but the page works in a browser, use a different canonical URL, link to an ' \
          'index/search page, or cite the source in plain text without an anchor.',
    recovery: 'content_only_validation_failure'
  }.to_json
end

Validate internal links in HTML content. Returns nil if all links are valid,
or a JSON error string if any broken internal links are found.
Used by create_blog_post, update_blog_post, and CreateFaqTool.



79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'app/services/assistant/blog_content_validator.rb', line 79

# Runs +html+ through Seo::InternalLinkValidator and turns any broken links
# into a JSON error payload.
#
# Each broken link is reported with its href and path, plus suggested_path and
# did_you_mean when the validator supplied them. The hint text depends on
# whether any entry carries did_you_mean candidates.
#
# @param html [String, nil] body HTML
# @return [nil, String] nil for blank or valid input, else a JSON error string
def self.validate_internal_links(html)
  return nil if html.blank?

  outcome = Seo::InternalLinkValidator.new.process(html)
  return nil if outcome.valid?

  broken = outcome.broken_links.map do |link|
    { href: link.href, path: link.path }.tap do |row|
      row[:suggested_path] = link.suggestion if link.suggestion
      row[:did_you_mean] = link.did_you_mean if link.did_you_mean&.any?
    end
  end

  guidance =
    if broken.any? { |row| row[:did_you_mean].present? }
      "Each broken link includes a `did_you_mean` array of candidate paths resolved by " \
        "legacy redirect map / post-slug fuzz / pg_trgm / semantic vector search. " \
        "Pick the matching candidate and retry with the {{locale}} prefix " \
        "(e.g. \"/{{locale}}<candidate>\"). " \
        "If none of the candidates are right, run semantic_search(query: <topic>, types: [\"SiteMap\"]) " \
        "or find_link_opportunities(topic:) before retrying — do NOT guess slug variants."
    else
      "No high-confidence matches were found by the validator. Run " \
        "semantic_search(query: <topic from the surrounding paragraph>, types: [\"SiteMap\"]) " \
        "or find_link_opportunities(topic:) to discover the canonical URL, then retry. " \
        "Do NOT retry with a guessed slug variant — the validator already exhausted trigram + embedding lookups."
    end

  {
    error: "Broken internal links detected — the content was NOT saved. " \
           "Fix or remove these #{outcome.broken_links.size} link(s) before saving:",
    broken_links: broken,
    hint: guidance,
    recovery: 'content_only_validation_failure'
  }.to_json
end

.validate_post_asset_references(preview_image_id: nil, related_article_1_id: :_unset, related_article_2_id: :_unset, related_article_3_id: :_unset, related_article_4_id: :_unset) ⇒ Object

Ensures preview and related-article IDs point at real records (assistant cannot invent IDs).



348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
# File 'app/services/assistant/blog_content_validator.rb', line 348

# Checks that the supplied preview image and related-article ids exist.
#
# A related-article slot left at :_unset or passed as nil is not checked. Any
# other value must convert to a positive integer that matches an existing Post.
#
# @param preview_image_id [Integer, String, nil] checked against Image when present
# @param related_article_1_id [Integer, String, nil, Symbol] slot 1
# @param related_article_2_id [Integer, String, nil, Symbol] slot 2
# @param related_article_3_id [Integer, String, nil, Symbol] slot 3
# @param related_article_4_id [Integer, String, nil, Symbol] slot 4
# @return [nil, String] nil when all references resolve, else a JSON error string
def self.validate_post_asset_references(preview_image_id: nil,
                                        related_article_1_id: :_unset,
                                        related_article_2_id: :_unset,
                                        related_article_3_id: :_unset,
                                        related_article_4_id: :_unset)
  if preview_image_id.present?
    image_found = Image.exists?(id: preview_image_id.to_i)
    unless image_found
      return { error: "Invalid preview_image_id: Image ##{preview_image_id} does not exist." }.to_json
    end
  end

  slots = {
    related_article_1_id: related_article_1_id,
    related_article_2_id: related_article_2_id,
    related_article_3_id: related_article_3_id,
    related_article_4_id: related_article_4_id
  }

  slots.each do |slot, value|
    # nil and the :_unset sentinel are skipped without a lookup.
    next if value.nil? || value == :_unset

    candidate = value.to_i
    unless candidate.positive?
      return {
        error: "Invalid #{slot}: pass null to clear the slot or a positive Post id. " \
               'Use list_blog_posts or get_blog_post to obtain valid IDs.'
      }.to_json
    end

    next if Post.exists?(id: candidate)

    return {
      error: "Invalid #{slot}: Post ##{candidate} does not exist. " \
             'Use list_blog_posts or get_blog_post to obtain valid IDs.'
    }.to_json
  end

  nil
end

.visible_text_from_html(html) ⇒ Object

Approximate visible (non-HTML) character count for coherence checks.



222
223
224
225
226
227
228
# File 'app/services/assistant/blog_content_validator.rb', line 222

# Counts the characters of text in +html+ after whitespace is squished.
#
# @param html [String, nil] HTML fragment
# @return [Integer] character count; 0 for blank input
def self.visible_text_from_html(html)
  return 0 if html.blank?

  fragment = Nokogiri::HTML::DocumentFragment.parse(html.to_s)
  fragment.text.squish.length
rescue StandardError
  # Fallback when the parse path raises: swap anything tag-shaped for a space
  # and count what remains.
  without_tags = html.to_s.gsub(/<[^>]+>/, ' ')
  without_tags.squish.length
end