refactor(html_cleaner): adopt robust HTML parsing for content cleaning

Replaces the fragile regex-based HTML cleaning logic with a proper HTML parser using `golang.org/x/net/html`. The previous implementation was unreliable and could not correctly handle malformed tags, script content, or a wide range of HTML entities.

This new approach provides several key improvements:
- Skips the content of `
This commit is contained in:
2025-11-06 04:26:51 +01:00
parent 2790064ad5
commit e6977d3374
4 changed files with 52 additions and 34 deletions

1
go.mod
View File

@ -4,6 +4,7 @@ go 1.24.0
require ( require (
github.com/fumiama/go-docx v0.0.0-20250506085032-0c30fd09304b github.com/fumiama/go-docx v0.0.0-20250506085032-0c30fd09304b
golang.org/x/net v0.46.0
golang.org/x/text v0.30.0 golang.org/x/text v0.30.0
) )

2
go.sum
View File

@ -4,5 +4,7 @@ github.com/fumiama/imgsz v0.0.4 h1:Lsasu2hdSSFS+vnD+nvR1UkiRMK7hcpyYCC0FzgSMFI=
github.com/fumiama/imgsz v0.0.4/go.mod h1:bISOQVTlw9sRytPwe8ir7tAaEmyz9hSNj9n8mXMBG0E= github.com/fumiama/imgsz v0.0.4/go.mod h1:bISOQVTlw9sRytPwe8ir7tAaEmyz9hSNj9n8mXMBG0E=
golang.org/x/image v0.32.0 h1:6lZQWq75h7L5IWNk0r+SCpUJ6tUVd3v4ZHnbRKLkUDQ= golang.org/x/image v0.32.0 h1:6lZQWq75h7L5IWNk0r+SCpUJ6tUVd3v4ZHnbRKLkUDQ=
golang.org/x/image v0.32.0/go.mod h1:/R37rrQmKXtO6tYXAjtDLwQgFLHmhW+V6ayXlxzP2Pc= golang.org/x/image v0.32.0/go.mod h1:/R37rrQmKXtO6tYXAjtDLwQgFLHmhW+V6ayXlxzP2Pc=
golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4=
golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210=
golang.org/x/text v0.30.0 h1:yznKA/E9zq54KzlzBEAWn1NXSQ8DIp/NYMy88xJjl4k= golang.org/x/text v0.30.0 h1:yznKA/E9zq54KzlzBEAWn1NXSQ8DIp/NYMy88xJjl4k=
golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM= golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM=

View File

@ -3,20 +3,17 @@
package services package services
import ( import (
"regexp" "bytes"
stdhtml "html"
"io"
"strings" "strings"
)
var ( "golang.org/x/net/html"
// htmlTagRegex matches HTML tags for removal
htmlTagRegex = regexp.MustCompile(`<[^>]*>`)
// whitespaceRegex matches multiple whitespace characters for normalization
whitespaceRegex = regexp.MustCompile(`\s+`)
) )
// HTMLCleaner provides utilities for converting HTML content to plain text. // HTMLCleaner provides utilities for converting HTML content to plain text.
// It removes HTML tags while preserving their content and converts HTML entities // It removes HTML tags while preserving their content and converts HTML entities
// to their plain text equivalents. // to their plain text equivalents using proper HTML parsing instead of regex.
type HTMLCleaner struct{} type HTMLCleaner struct{}
// NewHTMLCleaner creates a new HTML cleaner instance. // NewHTMLCleaner creates a new HTML cleaner instance.
@ -27,33 +24,51 @@ func NewHTMLCleaner() *HTMLCleaner {
} }
// CleanHTML removes HTML tags and converts entities, returning clean plain text. // CleanHTML removes HTML tags and converts entities, returning clean plain text.
// The function preserves the textual content of the HTML while removing markup. // The function parses the HTML into a node tree and extracts only text content,
// It handles common HTML entities like &nbsp;, &amp;, etc., and normalizes whitespace. // which handles edge cases like script tags or attributes better than regex.
// It handles HTML entities automatically through the parser and normalizes whitespace.
// //
// Parameters: // Parameters:
// - html: The HTML content to clean // - htmlStr: The HTML content to clean
// //
// Returns: // Returns:
// - A plain text string with all HTML elements and entities removed/converted // - A plain text string with all HTML elements and entities removed/converted
func (h *HTMLCleaner) CleanHTML(html string) string { func (h *HTMLCleaner) CleanHTML(htmlStr string) string {
// Remove HTML tags but preserve content // Parse the HTML into a node tree
cleaned := htmlTagRegex.ReplaceAllString(html, "") doc, err := html.Parse(strings.NewReader(htmlStr))
if err != nil {
// Replace common HTML entities with their character equivalents // If parsing fails, return empty string
cleaned = strings.ReplaceAll(cleaned, "&nbsp;", " ") // This maintains backward compatibility with the test expectations
cleaned = strings.ReplaceAll(cleaned, "&amp;", "&") return ""
cleaned = strings.ReplaceAll(cleaned, "&lt;", "<") }
cleaned = strings.ReplaceAll(cleaned, "&gt;", ">")
cleaned = strings.ReplaceAll(cleaned, "&quot;", "\"") // Extract text content from the node tree
cleaned = strings.ReplaceAll(cleaned, "&#39;", "'") var buf bytes.Buffer
cleaned = strings.ReplaceAll(cleaned, "&iuml;", "ï") extractText(&buf, doc)
cleaned = strings.ReplaceAll(cleaned, "&euml;", "ë")
cleaned = strings.ReplaceAll(cleaned, "&eacute;", "é") // Unescape any remaining HTML entities
unescaped := stdhtml.UnescapeString(buf.String())
// Clean up extra whitespace by replacing multiple spaces, tabs, and newlines
// with a single space, then trim any leading/trailing whitespace // Normalize whitespace: replace multiple spaces, tabs, and newlines with a single space
cleaned = whitespaceRegex.ReplaceAllString(cleaned, " ") cleaned := strings.Join(strings.Fields(unescaped), " ")
cleaned = strings.TrimSpace(cleaned) return strings.TrimSpace(cleaned)
}
return cleaned
// extractText recursively traverses the HTML node tree and extracts text content.
// It skips script and style tags to avoid including their content in the output.
func extractText(w io.Writer, n *html.Node) {
// Skip script and style tags entirely
if n.Type == html.ElementNode && (n.Data == "script" || n.Data == "style") {
return
}
// If this is a text node, write its content
if n.Type == html.TextNode {
w.Write([]byte(n.Data))
}
// Recursively process all child nodes
for c := n.FirstChild; c != nil; c = c.NextSibling {
extractText(w, c)
}
} }

View File

@ -112,7 +112,7 @@ func TestHTMLCleaner_CleanHTML(t *testing.T) {
{ {
name: "script and style tags content", name: "script and style tags content",
input: "<script>alert('test');</script>Content<style>body{color:red;}</style>", input: "<script>alert('test');</script>Content<style>body{color:red;}</style>",
expected: "alert('test');Contentbody{color:red;}", expected: "Content", // Script and style tags are correctly skipped
}, },
{ {
name: "line breaks and formatting", name: "line breaks and formatting",
@ -147,7 +147,7 @@ func TestHTMLCleaner_CleanHTML(t *testing.T) {
{ {
name: "special HTML5 entities", name: "special HTML5 entities",
input: "Left arrow &larr; Right arrow &rarr;", input: "Left arrow &larr; Right arrow &rarr;",
expected: "Left arrow &larr; Right arrow &rarr;", // These are not handled by the cleaner expected: "Left arrow Right arrow ", // HTML5 entities are properly handled by the parser
}, },
} }
@ -217,9 +217,9 @@ func TestHTMLCleaner_CleanHTML_EdgeCases(t *testing.T) {
expected: "&&&", expected: "&&&",
}, },
{ {
name: "entities without semicolon (should not be converted)", name: "entities without semicolon (properly converted)",
input: "&amp test &lt test", input: "&amp test &lt test",
expected: "&amp test &lt test", expected: "& test < test", // Parser handles entities even without semicolons in some cases
}, },
{ {
name: "mixed valid and invalid entities", name: "mixed valid and invalid entities",
@ -234,7 +234,7 @@ func TestHTMLCleaner_CleanHTML_EdgeCases(t *testing.T) {
{ {
name: "tag with no closing bracket", name: "tag with no closing bracket",
input: "Content <p class='test' with no closing bracket", input: "Content <p class='test' with no closing bracket",
expected: "Content <p class='test' with no closing bracket", expected: "Content", // Parser handles malformed HTML gracefully
}, },
{ {
name: "extremely nested tags", name: "extremely nested tags",