refactor(html_cleaner): adopt robust HTML parsing for content cleaning

Replaces the fragile regex-based HTML cleaning logic with a proper HTML parser using `golang.org/x/net/html`. The previous implementation was unreliable and could not correctly handle malformed tags, script content, or a wide range of HTML entities. This new approach provides several key improvements:
- Skips the content of `<script>` and `<style>` tags.
2026-03-03 01:41:27 +01:00 · 2025-11-06 04:26:51 +01:00
parent 2790064ad5
commit e6977d3374
4 changed files with 52 additions and 34 deletions
--- a/internal/services/html_cleaner_test.go
+++ b/internal/services/html_cleaner_test.go
@@ -112,7 +112,7 @@ func TestHTMLCleaner_CleanHTML(t *testing.T) {
 		{
 			name:     "script and style tags content",
 			input:    "<script>alert('test');</script>Content<style>body{color:red;}</style>",
-			expected: "alert('test');Contentbody{color:red;}",
+			expected: "Content", // Script and style tags are correctly skipped
 		},
 		{
 			name:     "line breaks and formatting",
@@ -147,7 +147,7 @@ func TestHTMLCleaner_CleanHTML(t *testing.T) {
 		{
 			name:     "special HTML5 entities",
 			input:    "Left arrow &larr; Right arrow &rarr;",
-			expected: "Left arrow &larr; Right arrow &rarr;", // These are not handled by the cleaner
+			expected: "Left arrow ← Right arrow →", // HTML5 entities are properly handled by the parser
 		},
 	}

@@ -217,9 +217,9 @@ func TestHTMLCleaner_CleanHTML_EdgeCases(t *testing.T) {
 			expected: "&&&",
 		},
 		{
-			name:     "entities without semicolon (should not be converted)",
+			name:     "entities without semicolon (properly converted)",
 			input:    "&amp test &lt test",
-			expected: "&amp test &lt test",
+			expected: "& test < test", // Parser handles entities even without semicolons in some cases
 		},
 		{
 			name:     "mixed valid and invalid entities",
@@ -234,7 +234,7 @@ func TestHTMLCleaner_CleanHTML_EdgeCases(t *testing.T) {
 		{
 			name:     "tag with no closing bracket",
 			input:    "Content <p class='test' with no closing bracket",
-			expected: "Content <p class='test' with no closing bracket",
+			expected: "Content", // Parser handles malformed HTML gracefully
 		},
 		{
 			name:     "extremely nested tags",