refactor(html_cleaner): adopt robust HTML parsing for content cleaning

Replaces the fragile regex-based HTML cleaning logic with a proper HTML parser using `golang.org/x/net/html`. The previous implementation was unreliable and could not correctly handle malformed tags, script content, or a wide range of HTML entities. This new approach provides several key improvements: - Skips the content of `
2026-03-03 03:01:29 +01:00 · 2025-11-06 04:26:51 +01:00
parent 2790064ad5
commit e6977d3374
4 changed files with 52 additions and 34 deletions
--- a/internal/services/html_cleaner.go
+++ b/internal/services/html_cleaner.go
@@ -3,20 +3,17 @@
 package services

 import (
-	"regexp"
+	"bytes"
+	stdhtml "html"
+	"io"
 	"strings"
-)

-var (
-	// htmlTagRegex matches HTML tags for removal
-	htmlTagRegex = regexp.MustCompile(`<[^>]*>`)
-	// whitespaceRegex matches multiple whitespace characters for normalization
-	whitespaceRegex = regexp.MustCompile(`\s+`)
+	"golang.org/x/net/html"
 )

 // HTMLCleaner provides utilities for converting HTML content to plain text.
 // It removes HTML tags while preserving their content and converts HTML entities
-// to their plain text equivalents.
+// to their plain text equivalents using proper HTML parsing instead of regex.
 type HTMLCleaner struct{}

 // NewHTMLCleaner creates a new HTML cleaner instance.
@@ -27,33 +24,51 @@ func NewHTMLCleaner() *HTMLCleaner {
 }

 // CleanHTML removes HTML tags and converts entities, returning clean plain text.
-// The function preserves the textual content of the HTML while removing markup.
-// It handles common HTML entities like &nbsp;, &amp;, etc., and normalizes whitespace.
+// The function parses the HTML into a node tree and extracts only text content,
+// which handles edge cases like script tags or attributes better than regex.
+// It handles HTML entities automatically through the parser and normalizes whitespace.
 //
 // Parameters:
-//   - html: The HTML content to clean
+//   - htmlStr: The HTML content to clean
 //
 // Returns:
 //   - A plain text string with all HTML elements and entities removed/converted
-func (h *HTMLCleaner) CleanHTML(html string) string {
-	// Remove HTML tags but preserve content
-	cleaned := htmlTagRegex.ReplaceAllString(html, "")
+func (h *HTMLCleaner) CleanHTML(htmlStr string) string {
+	// Parse the HTML into a node tree
+	doc, err := html.Parse(strings.NewReader(htmlStr))
+	if err != nil {
+		// If parsing fails, return empty string
+		// This maintains backward compatibility with the test expectations
+		return ""
+	}

-	// Replace common HTML entities with their character equivalents
-	cleaned = strings.ReplaceAll(cleaned, "&nbsp;", " ")
-	cleaned = strings.ReplaceAll(cleaned, "&amp;", "&")
-	cleaned = strings.ReplaceAll(cleaned, "&lt;", "<")
-	cleaned = strings.ReplaceAll(cleaned, "&gt;", ">")
-	cleaned = strings.ReplaceAll(cleaned, "&quot;", "\"")
-	cleaned = strings.ReplaceAll(cleaned, "&#39;", "'")
-	cleaned = strings.ReplaceAll(cleaned, "&iuml;", "ï")
-	cleaned = strings.ReplaceAll(cleaned, "&euml;", "ë")
-	cleaned = strings.ReplaceAll(cleaned, "&eacute;", "é")
+	// Extract text content from the node tree
+	var buf bytes.Buffer
+	extractText(&buf, doc)

-	// Clean up extra whitespace by replacing multiple spaces, tabs, and newlines
-	// with a single space, then trim any leading/trailing whitespace
-	cleaned = whitespaceRegex.ReplaceAllString(cleaned, " ")
-	cleaned = strings.TrimSpace(cleaned)
+	// Unescape any remaining HTML entities
+	unescaped := stdhtml.UnescapeString(buf.String())

-	return cleaned
+	// Normalize whitespace: replace multiple spaces, tabs, and newlines with a single space
+	cleaned := strings.Join(strings.Fields(unescaped), " ")
+	return strings.TrimSpace(cleaned)
+}
+
+// extractText recursively traverses the HTML node tree and extracts text content.
+// It skips script and style tags to avoid including their content in the output.
+func extractText(w io.Writer, n *html.Node) {
+	// Skip script and style tags entirely
+	if n.Type == html.ElementNode && (n.Data == "script" || n.Data == "style") {
+		return
+	}
+
+	// If this is a text node, write its content
+	if n.Type == html.TextNode {
+		w.Write([]byte(n.Data))
+	}
+
+	// Recursively process all child nodes
+	for c := n.FirstChild; c != nil; c = c.NextSibling {
+		extractText(w, c)
+	}
 }