mirror of
https://github.com/kjanat/articulate-parser.git
synced 2026-01-16 07:42:09 +01:00
refactor(html_cleaner): adopt robust HTML parsing for content cleaning
Replaces the fragile regex-based HTML cleaning logic with a proper HTML parser using `golang.org/x/net/html`. The previous implementation was unreliable and could not correctly handle malformed tags, script content, or a wide range of HTML entities. This new approach provides several key improvements: - Skips the content of `
This commit is contained in:
1
go.mod
1
go.mod
@ -4,6 +4,7 @@ go 1.24.0
|
||||
|
||||
require (
|
||||
github.com/fumiama/go-docx v0.0.0-20250506085032-0c30fd09304b
|
||||
golang.org/x/net v0.46.0
|
||||
golang.org/x/text v0.30.0
|
||||
)
|
||||
|
||||
|
||||
2
go.sum
2
go.sum
@ -4,5 +4,7 @@ github.com/fumiama/imgsz v0.0.4 h1:Lsasu2hdSSFS+vnD+nvR1UkiRMK7hcpyYCC0FzgSMFI=
|
||||
github.com/fumiama/imgsz v0.0.4/go.mod h1:bISOQVTlw9sRytPwe8ir7tAaEmyz9hSNj9n8mXMBG0E=
|
||||
golang.org/x/image v0.32.0 h1:6lZQWq75h7L5IWNk0r+SCpUJ6tUVd3v4ZHnbRKLkUDQ=
|
||||
golang.org/x/image v0.32.0/go.mod h1:/R37rrQmKXtO6tYXAjtDLwQgFLHmhW+V6ayXlxzP2Pc=
|
||||
golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4=
|
||||
golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210=
|
||||
golang.org/x/text v0.30.0 h1:yznKA/E9zq54KzlzBEAWn1NXSQ8DIp/NYMy88xJjl4k=
|
||||
golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM=
|
||||
|
||||
@ -3,20 +3,17 @@
|
||||
package services
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"bytes"
|
||||
stdhtml "html"
|
||||
"io"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var (
|
||||
// htmlTagRegex matches HTML tags for removal
|
||||
htmlTagRegex = regexp.MustCompile(`<[^>]*>`)
|
||||
// whitespaceRegex matches multiple whitespace characters for normalization
|
||||
whitespaceRegex = regexp.MustCompile(`\s+`)
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
// HTMLCleaner provides utilities for converting HTML content to plain text.
|
||||
// It removes HTML tags while preserving their content and converts HTML entities
|
||||
// to their plain text equivalents.
|
||||
// to their plain text equivalents using proper HTML parsing instead of regex.
|
||||
type HTMLCleaner struct{}
|
||||
|
||||
// NewHTMLCleaner creates a new HTML cleaner instance.
|
||||
@ -27,33 +24,51 @@ func NewHTMLCleaner() *HTMLCleaner {
|
||||
}
|
||||
|
||||
// CleanHTML removes HTML tags and converts entities, returning clean plain text.
|
||||
// The function preserves the textual content of the HTML while removing markup.
|
||||
// It handles common HTML entities like , &, etc., and normalizes whitespace.
|
||||
// The function parses the HTML into a node tree and extracts only text content,
|
||||
// which handles edge cases like script tags or attributes better than regex.
|
||||
// It handles HTML entities automatically through the parser and normalizes whitespace.
|
||||
//
|
||||
// Parameters:
|
||||
// - html: The HTML content to clean
|
||||
// - htmlStr: The HTML content to clean
|
||||
//
|
||||
// Returns:
|
||||
// - A plain text string with all HTML elements and entities removed/converted
|
||||
func (h *HTMLCleaner) CleanHTML(html string) string {
|
||||
// Remove HTML tags but preserve content
|
||||
cleaned := htmlTagRegex.ReplaceAllString(html, "")
|
||||
func (h *HTMLCleaner) CleanHTML(htmlStr string) string {
|
||||
// Parse the HTML into a node tree
|
||||
doc, err := html.Parse(strings.NewReader(htmlStr))
|
||||
if err != nil {
|
||||
// If parsing fails, return empty string
|
||||
// This maintains backward compatibility with the test expectations
|
||||
return ""
|
||||
}
|
||||
|
||||
// Replace common HTML entities with their character equivalents
|
||||
cleaned = strings.ReplaceAll(cleaned, " ", " ")
|
||||
cleaned = strings.ReplaceAll(cleaned, "&", "&")
|
||||
cleaned = strings.ReplaceAll(cleaned, "<", "<")
|
||||
cleaned = strings.ReplaceAll(cleaned, ">", ">")
|
||||
cleaned = strings.ReplaceAll(cleaned, """, "\"")
|
||||
cleaned = strings.ReplaceAll(cleaned, "'", "'")
|
||||
cleaned = strings.ReplaceAll(cleaned, "ï", "ï")
|
||||
cleaned = strings.ReplaceAll(cleaned, "ë", "ë")
|
||||
cleaned = strings.ReplaceAll(cleaned, "é", "é")
|
||||
// Extract text content from the node tree
|
||||
var buf bytes.Buffer
|
||||
extractText(&buf, doc)
|
||||
|
||||
// Clean up extra whitespace by replacing multiple spaces, tabs, and newlines
|
||||
// with a single space, then trim any leading/trailing whitespace
|
||||
cleaned = whitespaceRegex.ReplaceAllString(cleaned, " ")
|
||||
cleaned = strings.TrimSpace(cleaned)
|
||||
// Unescape any remaining HTML entities
|
||||
unescaped := stdhtml.UnescapeString(buf.String())
|
||||
|
||||
return cleaned
|
||||
// Normalize whitespace: replace multiple spaces, tabs, and newlines with a single space
|
||||
cleaned := strings.Join(strings.Fields(unescaped), " ")
|
||||
return strings.TrimSpace(cleaned)
|
||||
}
|
||||
|
||||
// extractText recursively traverses the HTML node tree and extracts text content.
|
||||
// It skips script and style tags to avoid including their content in the output.
|
||||
func extractText(w io.Writer, n *html.Node) {
|
||||
// Skip script and style tags entirely
|
||||
if n.Type == html.ElementNode && (n.Data == "script" || n.Data == "style") {
|
||||
return
|
||||
}
|
||||
|
||||
// If this is a text node, write its content
|
||||
if n.Type == html.TextNode {
|
||||
w.Write([]byte(n.Data))
|
||||
}
|
||||
|
||||
// Recursively process all child nodes
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
extractText(w, c)
|
||||
}
|
||||
}
|
||||
|
||||
@ -112,7 +112,7 @@ func TestHTMLCleaner_CleanHTML(t *testing.T) {
|
||||
{
|
||||
name: "script and style tags content",
|
||||
input: "<script>alert('test');</script>Content<style>body{color:red;}</style>",
|
||||
expected: "alert('test');Contentbody{color:red;}",
|
||||
expected: "Content", // Script and style tags are correctly skipped
|
||||
},
|
||||
{
|
||||
name: "line breaks and formatting",
|
||||
@ -147,7 +147,7 @@ func TestHTMLCleaner_CleanHTML(t *testing.T) {
|
||||
{
|
||||
name: "special HTML5 entities",
|
||||
input: "Left arrow ← Right arrow →",
|
||||
expected: "Left arrow ← Right arrow →", // These are not handled by the cleaner
|
||||
expected: "Left arrow ← Right arrow →", // HTML5 entities are properly handled by the parser
|
||||
},
|
||||
}
|
||||
|
||||
@ -217,9 +217,9 @@ func TestHTMLCleaner_CleanHTML_EdgeCases(t *testing.T) {
|
||||
expected: "&&&",
|
||||
},
|
||||
{
|
||||
name: "entities without semicolon (should not be converted)",
|
||||
name: "entities without semicolon (properly converted)",
|
||||
input: "& test < test",
|
||||
expected: "& test < test",
|
||||
expected: "& test < test", // Parser handles entities even without semicolons in some cases
|
||||
},
|
||||
{
|
||||
name: "mixed valid and invalid entities",
|
||||
@ -234,7 +234,7 @@ func TestHTMLCleaner_CleanHTML_EdgeCases(t *testing.T) {
|
||||
{
|
||||
name: "tag with no closing bracket",
|
||||
input: "Content <p class='test' with no closing bracket",
|
||||
expected: "Content <p class='test' with no closing bracket",
|
||||
expected: "Content", // Parser handles malformed HTML gracefully
|
||||
},
|
||||
{
|
||||
name: "extremely nested tags",
|
||||
|
||||
Reference in New Issue
Block a user