Title
First paragraph with link.
- Item 1
- Item 2
// Package services_test provides tests for the HTML cleaner service. package services import ( "strings" "testing" ) // TestNewHTMLCleaner tests the NewHTMLCleaner constructor. func TestNewHTMLCleaner(t *testing.T) { cleaner := NewHTMLCleaner() if cleaner == nil { t.Fatal("NewHTMLCleaner() returned nil") } } // TestHTMLCleaner_CleanHTML tests the CleanHTML method with various HTML inputs. func TestHTMLCleaner_CleanHTML(t *testing.T) { cleaner := NewHTMLCleaner() tests := []struct { name string input string expected string }{ { name: "plain text (no HTML)", input: "This is plain text", expected: "This is plain text", }, { name: "empty string", input: "", expected: "", }, { name: "simple HTML tag", input: "
Hello world
", expected: "Hello world", }, { name: "multiple HTML tags", input: "Paragraph text
", expected: "TitleParagraph text", }, { name: "nested HTML tags", input: "Paragraph with bold text
Text with attributes
", expected: "Text with attributes", }, { name: "self-closing tags", input: "Line 1Hello & welcome to our site!
", expected: "Hello & welcome to our site!", }, { name: "multiple whitespace", input: "Text with\t\tmultiple\n\nspaces", expected: "Text with multiple spaces", }, { name: "whitespace with HTML", input: "Text with
This is a great course about & HTML entities like and "quotes".
Unclosed paragraph
Line 1
\nLine 2
\nLine 3
", expected: "Line 1 Line 2 Line 3", }, { name: "only whitespace", input: " \t\n ", expected: "", }, { name: "only HTML tags", input: "\n Paragraph with\n line breaks\n
", expected: "Paragraph with line breaks", }, { name: "complex nested structure", input: "First paragraph with link.
Content
", expected: "Content", }, { name: "special HTML5 entities", input: "Left arrow ← Right arrow →", expected: "Left arrow ← Right arrow →", // These are not handled by the cleaner }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { result := cleaner.CleanHTML(tt.input) if result != tt.expected { t.Errorf("CleanHTML(%q) = %q, want %q", tt.input, result, tt.expected) } }) } } // TestHTMLCleaner_CleanHTML_LargeContent tests the CleanHTML method with large content. func TestHTMLCleaner_CleanHTML_LargeContent(t *testing.T) { cleaner := NewHTMLCleaner() // Create a large HTML string var builder strings.Builder builder.WriteString("") for i := 0; i < 1000; i++ { builder.WriteString("Paragraph ") builder.WriteString(string(rune('0' + i%10))) builder.WriteString(" with some content & entities.
") } builder.WriteString("") input := builder.String() result := cleaner.CleanHTML(input) // Check that HTML tags are removed if strings.Contains(result, "<") || strings.Contains(result, ">") { t.Error("Result should not contain HTML tags") } // Check that content is preserved if !strings.Contains(result, "Paragraph") { t.Error("Result should contain paragraph content") } // Check that entities are converted if strings.Contains(result, "&") { t.Error("Result should not contain unconverted HTML entities") } if !strings.Contains(result, "&") { t.Error("Result should contain converted ampersand") } } // TestHTMLCleaner_CleanHTML_EdgeCases tests edge cases for the CleanHTML method. func TestHTMLCleaner_CleanHTML_EdgeCases(t *testing.T) { cleaner := NewHTMLCleaner() tests := []struct { name string input string expected string }{ { name: "only entities", input: "&<>"' ", expected: "&<>\"'", }, { name: "repeated entities", input: "&&&", expected: "&&&", }, { name: "entities without semicolon (should not be converted)", input: "& test < test", expected: "& test < test", }, { name: "mixed valid and invalid entities", input: "& &invalid; < &fake;", expected: "& &invalid; < &fake;", }, { name: "unclosed tag at end", input: "Contentwith unclosed", expected: "Content with unclosed", }, { name: "tag with no closing bracket", input: "Content
Hello 世界! Café naïve résumé
", expected: "Hello 世界! Café naïve résumé", }, { name: "unicode with entities", input: "Unicode: 你好 & emoji: 🌍
", expected: "Unicode: 你好 & emoji: 🌍", }, { name: "mixed scripts", input: "This is a great course about & HTML entities like and "quotes".
Paragraph ") builder.WriteString(string(rune('0' + i%10))) builder.WriteString(" with some content & entities <test>.
") } input := builder.String() b.ResetTimer() for i := 0; i < b.N; i++ { cleaner.CleanHTML(input) } }