refactor(html_cleaner): adopt robust HTML parsing for content cleaning

Replaces the fragile regex-based HTML cleaning logic with a proper HTML parser using `golang.org/x/net/html`. The previous implementation was unreliable and could not correctly handle malformed tags, script content, or a wide range of HTML entities.

This new approach provides several key improvements:
- Skips the content of `<script>` and `<style>` tags instead of emitting it as text.
This commit is contained in:
2025-11-06 04:26:51 +01:00
parent 2790064ad5
commit e6977d3374
4 changed files with 52 additions and 34 deletions

View File

@@ -112,7 +112,7 @@ func TestHTMLCleaner_CleanHTML(t *testing.T) {
{
name: "script and style tags content",
input: "<script>alert('test');</script>Content<style>body{color:red;}</style>",
expected: "alert('test');Contentbody{color:red;}",
expected: "Content", // Script and style tags are correctly skipped
},
{
name: "line breaks and formatting",
@@ -147,7 +147,7 @@ func TestHTMLCleaner_CleanHTML(t *testing.T) {
{
name: "special HTML5 entities",
input: "Left arrow &larr; Right arrow &rarr;",
expected: "Left arrow &larr; Right arrow &rarr;", // These are not handled by the cleaner
expected: "Left arrow ← Right arrow →", // HTML5 entities are properly handled by the parser
},
}
@@ -217,9 +217,9 @@ func TestHTMLCleaner_CleanHTML_EdgeCases(t *testing.T) {
expected: "&&&",
},
{
name: "entities without semicolon (should not be converted)",
name: "entities without semicolon (properly converted)",
input: "&amp test &lt test",
expected: "&amp test &lt test",
expected: "& test < test", // Parser handles entities even without semicolons in some cases
},
{
name: "mixed valid and invalid entities",
@@ -234,7 +234,7 @@ func TestHTMLCleaner_CleanHTML_EdgeCases(t *testing.T) {
{
name: "tag with no closing bracket",
input: "Content <p class='test' with no closing bracket",
expected: "Content <p class='test' with no closing bracket",
expected: "Content", // Parser handles malformed HTML gracefully
},
{
name: "extremely nested tags",