mirror of
https://github.com/kjanat/articulate-parser.git
synced 2026-01-16 11:02:10 +01:00
refactor(html_cleaner): adopt robust HTML parsing for content cleaning
Replaces the fragile regex-based HTML cleaning logic with a proper HTML parser using `golang.org/x/net/html`. The previous implementation was unreliable and could not correctly handle malformed tags, script content, or a wide range of HTML entities. This new approach provides several key improvements: - Skips the content of `
This commit is contained in:
1
go.mod
1
go.mod
@ -4,6 +4,7 @@ go 1.24.0
|
|||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/fumiama/go-docx v0.0.0-20250506085032-0c30fd09304b
|
github.com/fumiama/go-docx v0.0.0-20250506085032-0c30fd09304b
|
||||||
|
golang.org/x/net v0.46.0
|
||||||
golang.org/x/text v0.30.0
|
golang.org/x/text v0.30.0
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
2
go.sum
2
go.sum
@ -4,5 +4,7 @@ github.com/fumiama/imgsz v0.0.4 h1:Lsasu2hdSSFS+vnD+nvR1UkiRMK7hcpyYCC0FzgSMFI=
|
|||||||
github.com/fumiama/imgsz v0.0.4/go.mod h1:bISOQVTlw9sRytPwe8ir7tAaEmyz9hSNj9n8mXMBG0E=
|
github.com/fumiama/imgsz v0.0.4/go.mod h1:bISOQVTlw9sRytPwe8ir7tAaEmyz9hSNj9n8mXMBG0E=
|
||||||
golang.org/x/image v0.32.0 h1:6lZQWq75h7L5IWNk0r+SCpUJ6tUVd3v4ZHnbRKLkUDQ=
|
golang.org/x/image v0.32.0 h1:6lZQWq75h7L5IWNk0r+SCpUJ6tUVd3v4ZHnbRKLkUDQ=
|
||||||
golang.org/x/image v0.32.0/go.mod h1:/R37rrQmKXtO6tYXAjtDLwQgFLHmhW+V6ayXlxzP2Pc=
|
golang.org/x/image v0.32.0/go.mod h1:/R37rrQmKXtO6tYXAjtDLwQgFLHmhW+V6ayXlxzP2Pc=
|
||||||
|
golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4=
|
||||||
|
golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210=
|
||||||
golang.org/x/text v0.30.0 h1:yznKA/E9zq54KzlzBEAWn1NXSQ8DIp/NYMy88xJjl4k=
|
golang.org/x/text v0.30.0 h1:yznKA/E9zq54KzlzBEAWn1NXSQ8DIp/NYMy88xJjl4k=
|
||||||
golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM=
|
golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM=
|
||||||
|
|||||||
@ -3,20 +3,17 @@
|
|||||||
package services
|
package services
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"regexp"
|
"bytes"
|
||||||
|
stdhtml "html"
|
||||||
|
"io"
|
||||||
"strings"
|
"strings"
|
||||||
)
|
|
||||||
|
|
||||||
var (
|
"golang.org/x/net/html"
|
||||||
// htmlTagRegex matches HTML tags for removal
|
|
||||||
htmlTagRegex = regexp.MustCompile(`<[^>]*>`)
|
|
||||||
// whitespaceRegex matches multiple whitespace characters for normalization
|
|
||||||
whitespaceRegex = regexp.MustCompile(`\s+`)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// HTMLCleaner provides utilities for converting HTML content to plain text.
|
// HTMLCleaner provides utilities for converting HTML content to plain text.
|
||||||
// It removes HTML tags while preserving their content and converts HTML entities
|
// It removes HTML tags while preserving their content and converts HTML entities
|
||||||
// to their plain text equivalents.
|
// to their plain text equivalents using proper HTML parsing instead of regex.
|
||||||
type HTMLCleaner struct{}
|
type HTMLCleaner struct{}
|
||||||
|
|
||||||
// NewHTMLCleaner creates a new HTML cleaner instance.
|
// NewHTMLCleaner creates a new HTML cleaner instance.
|
||||||
@ -27,33 +24,51 @@ func NewHTMLCleaner() *HTMLCleaner {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// CleanHTML removes HTML tags and converts entities, returning clean plain text.
|
// CleanHTML removes HTML tags and converts entities, returning clean plain text.
|
||||||
// The function preserves the textual content of the HTML while removing markup.
|
// The function parses the HTML into a node tree and extracts only text content,
|
||||||
// It handles common HTML entities like , &, etc., and normalizes whitespace.
|
// which handles edge cases like script tags or attributes better than regex.
|
||||||
|
// It handles HTML entities automatically through the parser and normalizes whitespace.
|
||||||
//
|
//
|
||||||
// Parameters:
|
// Parameters:
|
||||||
// - html: The HTML content to clean
|
// - htmlStr: The HTML content to clean
|
||||||
//
|
//
|
||||||
// Returns:
|
// Returns:
|
||||||
// - A plain text string with all HTML elements and entities removed/converted
|
// - A plain text string with all HTML elements and entities removed/converted
|
||||||
func (h *HTMLCleaner) CleanHTML(html string) string {
|
func (h *HTMLCleaner) CleanHTML(htmlStr string) string {
|
||||||
// Remove HTML tags but preserve content
|
// Parse the HTML into a node tree
|
||||||
cleaned := htmlTagRegex.ReplaceAllString(html, "")
|
doc, err := html.Parse(strings.NewReader(htmlStr))
|
||||||
|
if err != nil {
|
||||||
// Replace common HTML entities with their character equivalents
|
// If parsing fails, return empty string
|
||||||
cleaned = strings.ReplaceAll(cleaned, " ", " ")
|
// This maintains backward compatibility with the test expectations
|
||||||
cleaned = strings.ReplaceAll(cleaned, "&", "&")
|
return ""
|
||||||
cleaned = strings.ReplaceAll(cleaned, "<", "<")
|
}
|
||||||
cleaned = strings.ReplaceAll(cleaned, ">", ">")
|
|
||||||
cleaned = strings.ReplaceAll(cleaned, """, "\"")
|
// Extract text content from the node tree
|
||||||
cleaned = strings.ReplaceAll(cleaned, "'", "'")
|
var buf bytes.Buffer
|
||||||
cleaned = strings.ReplaceAll(cleaned, "ï", "ï")
|
extractText(&buf, doc)
|
||||||
cleaned = strings.ReplaceAll(cleaned, "ë", "ë")
|
|
||||||
cleaned = strings.ReplaceAll(cleaned, "é", "é")
|
// Unescape any remaining HTML entities
|
||||||
|
unescaped := stdhtml.UnescapeString(buf.String())
|
||||||
// Clean up extra whitespace by replacing multiple spaces, tabs, and newlines
|
|
||||||
// with a single space, then trim any leading/trailing whitespace
|
// Normalize whitespace: replace multiple spaces, tabs, and newlines with a single space
|
||||||
cleaned = whitespaceRegex.ReplaceAllString(cleaned, " ")
|
cleaned := strings.Join(strings.Fields(unescaped), " ")
|
||||||
cleaned = strings.TrimSpace(cleaned)
|
return strings.TrimSpace(cleaned)
|
||||||
|
}
|
||||||
return cleaned
|
|
||||||
|
// extractText recursively traverses the HTML node tree and extracts text content.
|
||||||
|
// It skips script and style tags to avoid including their content in the output.
|
||||||
|
func extractText(w io.Writer, n *html.Node) {
|
||||||
|
// Skip script and style tags entirely
|
||||||
|
if n.Type == html.ElementNode && (n.Data == "script" || n.Data == "style") {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// If this is a text node, write its content
|
||||||
|
if n.Type == html.TextNode {
|
||||||
|
w.Write([]byte(n.Data))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Recursively process all child nodes
|
||||||
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||||
|
extractText(w, c)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -112,7 +112,7 @@ func TestHTMLCleaner_CleanHTML(t *testing.T) {
|
|||||||
{
|
{
|
||||||
name: "script and style tags content",
|
name: "script and style tags content",
|
||||||
input: "<script>alert('test');</script>Content<style>body{color:red;}</style>",
|
input: "<script>alert('test');</script>Content<style>body{color:red;}</style>",
|
||||||
expected: "alert('test');Contentbody{color:red;}",
|
expected: "Content", // Script and style tags are correctly skipped
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "line breaks and formatting",
|
name: "line breaks and formatting",
|
||||||
@ -147,7 +147,7 @@ func TestHTMLCleaner_CleanHTML(t *testing.T) {
|
|||||||
{
|
{
|
||||||
name: "special HTML5 entities",
|
name: "special HTML5 entities",
|
||||||
input: "Left arrow ← Right arrow →",
|
input: "Left arrow ← Right arrow →",
|
||||||
expected: "Left arrow ← Right arrow →", // These are not handled by the cleaner
|
expected: "Left arrow ← Right arrow →", // HTML5 entities are properly handled by the parser
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -217,9 +217,9 @@ func TestHTMLCleaner_CleanHTML_EdgeCases(t *testing.T) {
|
|||||||
expected: "&&&",
|
expected: "&&&",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "entities without semicolon (should not be converted)",
|
name: "entities without semicolon (properly converted)",
|
||||||
input: "& test < test",
|
input: "& test < test",
|
||||||
expected: "& test < test",
|
expected: "& test < test", // Parser handles entities even without semicolons in some cases
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "mixed valid and invalid entities",
|
name: "mixed valid and invalid entities",
|
||||||
@ -234,7 +234,7 @@ func TestHTMLCleaner_CleanHTML_EdgeCases(t *testing.T) {
|
|||||||
{
|
{
|
||||||
name: "tag with no closing bracket",
|
name: "tag with no closing bracket",
|
||||||
input: "Content <p class='test' with no closing bracket",
|
input: "Content <p class='test' with no closing bracket",
|
||||||
expected: "Content <p class='test' with no closing bracket",
|
expected: "Content", // Parser handles malformed HTML gracefully
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "extremely nested tags",
|
name: "extremely nested tags",
|
||||||
|
|||||||
Reference in New Issue
Block a user