mirror of
https://github.com/kjanat/articulate-parser.git
synced 2026-01-16 09:02:10 +01:00
refactor(html_cleaner): adopt robust HTML parsing for content cleaning
Replaces the fragile regex-based HTML cleaning logic with a proper HTML parser using `golang.org/x/net/html`. The previous implementation was unreliable and could not correctly handle malformed tags, script content, or a wide range of HTML entities. This new approach provides several key improvements: - Skips the content of `<script>` and `<style>` tags.
This commit is contained in:
@@ -112,7 +112,7 @@ func TestHTMLCleaner_CleanHTML(t *testing.T) {
|
||||
{
|
||||
name: "script and style tags content",
|
||||
input: "<script>alert('test');</script>Content<style>body{color:red;}</style>",
|
||||
expected: "alert('test');Contentbody{color:red;}",
|
||||
expected: "Content", // Script and style tags are correctly skipped
|
||||
},
|
||||
{
|
||||
name: "line breaks and formatting",
|
||||
@@ -147,7 +147,7 @@ func TestHTMLCleaner_CleanHTML(t *testing.T) {
|
||||
{
|
||||
name: "special HTML5 entities",
|
||||
input: "Left arrow &larr; Right arrow &rarr;",
|
||||
expected: "Left arrow &larr; Right arrow &rarr;", // These are not handled by the cleaner
|
||||
expected: "Left arrow ← Right arrow →", // HTML5 entities are properly handled by the parser
|
||||
},
|
||||
}
|
||||
|
||||
@@ -217,9 +217,9 @@ func TestHTMLCleaner_CleanHTML_EdgeCases(t *testing.T) {
|
||||
expected: "&&&",
|
||||
},
|
||||
{
|
||||
name: "entities without semicolon (should not be converted)",
|
||||
name: "entities without semicolon (properly converted)",
|
||||
input: "&amp test &lt test",
|
||||
expected: "&amp test &lt test",
|
||||
expected: "& test < test", // Parser handles entities even without semicolons in some cases
|
||||
},
|
||||
{
|
||||
name: "mixed valid and invalid entities",
|
||||
@@ -234,7 +234,7 @@ func TestHTMLCleaner_CleanHTML_EdgeCases(t *testing.T) {
|
||||
{
|
||||
name: "tag with no closing bracket",
|
||||
input: "Content <p class='test' with no closing bracket",
|
||||
expected: "Content <p class='test' with no closing bracket",
|
||||
expected: "Content", // Parser handles malformed HTML gracefully
|
||||
},
|
||||
{
|
||||
name: "extremely nested tags",
|
||||
|
||||
Reference in New Issue
Block a user