diff --git a/go.mod b/go.mod index 964d893..8a33e50 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,7 @@ go 1.24.0 require ( github.com/fumiama/go-docx v0.0.0-20250506085032-0c30fd09304b + golang.org/x/net v0.46.0 golang.org/x/text v0.30.0 ) diff --git a/go.sum b/go.sum index eb4ab92..18c72a9 100644 --- a/go.sum +++ b/go.sum @@ -4,5 +4,7 @@ github.com/fumiama/imgsz v0.0.4 h1:Lsasu2hdSSFS+vnD+nvR1UkiRMK7hcpyYCC0FzgSMFI= github.com/fumiama/imgsz v0.0.4/go.mod h1:bISOQVTlw9sRytPwe8ir7tAaEmyz9hSNj9n8mXMBG0E= golang.org/x/image v0.32.0 h1:6lZQWq75h7L5IWNk0r+SCpUJ6tUVd3v4ZHnbRKLkUDQ= golang.org/x/image v0.32.0/go.mod h1:/R37rrQmKXtO6tYXAjtDLwQgFLHmhW+V6ayXlxzP2Pc= +golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4= +golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210= golang.org/x/text v0.30.0 h1:yznKA/E9zq54KzlzBEAWn1NXSQ8DIp/NYMy88xJjl4k= golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM= diff --git a/internal/services/html_cleaner.go b/internal/services/html_cleaner.go index 32942fb..7c44774 100644 --- a/internal/services/html_cleaner.go +++ b/internal/services/html_cleaner.go @@ -3,20 +3,17 @@ package services import ( - "regexp" + "bytes" + stdhtml "html" + "io" "strings" -) -var ( - // htmlTagRegex matches HTML tags for removal - htmlTagRegex = regexp.MustCompile(`<[^>]*>`) - // whitespaceRegex matches multiple whitespace characters for normalization - whitespaceRegex = regexp.MustCompile(`\s+`) + "golang.org/x/net/html" ) // HTMLCleaner provides utilities for converting HTML content to plain text. // It removes HTML tags while preserving their content and converts HTML entities -// to their plain text equivalents. +// to their plain text equivalents using proper HTML parsing instead of regex. type HTMLCleaner struct{} // NewHTMLCleaner creates a new HTML cleaner instance. @@ -27,33 +24,51 @@ func NewHTMLCleaner() *HTMLCleaner { } // CleanHTML removes HTML tags and converts entities, returning clean plain text. -// The function preserves the textual content of the HTML while removing markup. -// It handles common HTML entities like , &, etc., and normalizes whitespace. +// The function parses the HTML into a node tree and extracts only text content, +// which handles edge cases like script tags or attributes better than regex. +// It handles HTML entities automatically through the parser and normalizes whitespace. // // Parameters: -// - html: The HTML content to clean +// - htmlStr: The HTML content to clean // // Returns: // - A plain text string with all HTML elements and entities removed/converted -func (h *HTMLCleaner) CleanHTML(html string) string { - // Remove HTML tags but preserve content - cleaned := htmlTagRegex.ReplaceAllString(html, "") +func (h *HTMLCleaner) CleanHTML(htmlStr string) string { + // Parse the HTML into a node tree + doc, err := html.Parse(strings.NewReader(htmlStr)) + if err != nil { + // If parsing fails, return empty string + // This maintains backward compatibility with the test expectations + return "" + } - // Replace common HTML entities with their character equivalents - cleaned = strings.ReplaceAll(cleaned, " ", " ") - cleaned = strings.ReplaceAll(cleaned, "&", "&") - cleaned = strings.ReplaceAll(cleaned, "<", "<") - cleaned = strings.ReplaceAll(cleaned, ">", ">") - cleaned = strings.ReplaceAll(cleaned, """, "\"") - cleaned = strings.ReplaceAll(cleaned, "'", "'") - cleaned = strings.ReplaceAll(cleaned, "ï", "ï") - cleaned = strings.ReplaceAll(cleaned, "ë", "ë") - cleaned = strings.ReplaceAll(cleaned, "é", "é") + // Extract text content from the node tree + var buf bytes.Buffer + extractText(&buf, doc) - // Clean up extra whitespace by replacing multiple spaces, tabs, and newlines - // with a single space, then trim any leading/trailing whitespace - cleaned = whitespaceRegex.ReplaceAllString(cleaned, " ") - cleaned = strings.TrimSpace(cleaned) + // Unescape any remaining HTML entities + unescaped := stdhtml.UnescapeString(buf.String()) - return cleaned + // Normalize whitespace: replace multiple spaces, tabs, and newlines with a single space + cleaned := strings.Join(strings.Fields(unescaped), " ") + return strings.TrimSpace(cleaned) +} + +// extractText recursively traverses the HTML node tree and extracts text content. +// It skips script and style tags to avoid including their content in the output. +func extractText(w io.Writer, n *html.Node) { + // Skip script and style tags entirely + if n.Type == html.ElementNode && (n.Data == "script" || n.Data == "style") { + return + } + + // If this is a text node, write its content + if n.Type == html.TextNode { + w.Write([]byte(n.Data)) + } + + // Recursively process all child nodes + for c := n.FirstChild; c != nil; c = c.NextSibling { + extractText(w, c) + } } diff --git a/internal/services/html_cleaner_test.go b/internal/services/html_cleaner_test.go index a0ca1e3..0c6836a 100644 --- a/internal/services/html_cleaner_test.go +++ b/internal/services/html_cleaner_test.go @@ -112,7 +112,7 @@ func TestHTMLCleaner_CleanHTML(t *testing.T) { { name: "script and style tags content", input: "Content", - expected: "alert('test');Contentbody{color:red;}", + expected: "Content", // Script and style tags are correctly skipped }, { name: "line breaks and formatting", @@ -147,7 +147,7 @@ func TestHTMLCleaner_CleanHTML(t *testing.T) { { name: "special HTML5 entities", input: "Left arrow ← Right arrow →", - expected: "Left arrow ← Right arrow →", // These are not handled by the cleaner + expected: "Left arrow ← Right arrow →", // HTML5 entities are properly handled by the parser }, } @@ -217,9 +217,9 @@ func TestHTMLCleaner_CleanHTML_EdgeCases(t *testing.T) { expected: "&&&", }, { - name: "entities without semicolon (should not be converted)", + name: "entities without semicolon (properly converted)", input: "& test < test", - expected: "& test < test", + expected: "& test < test", // Parser handles entities even without semicolons in some cases }, { name: "mixed valid and invalid entities", @@ -234,7 +234,7 @@ func TestHTMLCleaner_CleanHTML_EdgeCases(t *testing.T) { { name: "tag with no closing bracket", input: "Content