// Package services_test provides tests for the HTML cleaner service. package services import ( "strings" "testing" ) // TestNewHTMLCleaner tests the NewHTMLCleaner constructor. func TestNewHTMLCleaner(t *testing.T) { cleaner := NewHTMLCleaner() if cleaner == nil { t.Fatal("NewHTMLCleaner() returned nil") } } // TestHTMLCleaner_CleanHTML tests the CleanHTML method with various HTML inputs. func TestHTMLCleaner_CleanHTML(t *testing.T) { cleaner := NewHTMLCleaner() tests := []struct { name string input string expected string }{ { name: "plain text (no HTML)", input: "This is plain text", expected: "This is plain text", }, { name: "empty string", input: "", expected: "", }, { name: "simple HTML tag", input: "

Hello world

", expected: "Hello world", }, { name: "multiple HTML tags", input: "

Title

Paragraph text

", expected: "TitleParagraph text", }, { name: "nested HTML tags", input: "

Title

Paragraph with bold text

", expected: "TitleParagraph with bold text", }, { name: "HTML with attributes", input: "

Text with attributes

", expected: "Text with attributes", }, { name: "self-closing tags", input: "Line 1
Line 2
End", expected: "Line 1Line 2End", }, { name: "HTML entities - basic", input: "AT&T <company> "quoted"   text", expected: "AT&T \"quoted\" text", }, { name: "HTML entities - apostrophe", input: "It's a test", expected: "It's a test", }, { name: "HTML entities - special characters", input: "ïber ëlite écarté", expected: "ïber ëlite écarté", }, { name: "HTML entities - nbsp", input: "Word1   Word2", expected: "Word1 Word2", }, { name: "mixed HTML and entities", input: "

Hello & welcome to our site!

", expected: "Hello & welcome to our site!", }, { name: "multiple whitespace", input: "Text with\t\tmultiple\n\nspaces", expected: "Text with multiple spaces", }, { name: "whitespace with HTML", input: "

Text with

spaces
", expected: "Text with spaces", }, { name: "complex content", input: "

Course Title

This is a great course about & HTML entities like   and "quotes".

", expected: "Course TitleThis is a great course about & HTML entities like and \"quotes\".", }, { name: "malformed HTML", input: "

Unclosed paragraph

Another tag

", expected: "Unclosed paragraphAnother tag", }, { name: "HTML comments (should be removed)", input: "Text beforeText after", expected: "Text beforeText after", }, { name: "script and style tags content", input: "Content", expected: "alert('test');Contentbody{color:red;}", }, { name: "line breaks and formatting", input: "

Line 1

\n

Line 2

\n

Line 3

", expected: "Line 1 Line 2 Line 3", }, { name: "only whitespace", input: " \t\n ", expected: "", }, { name: "only HTML tags", input: "

", expected: "", }, { name: "HTML with newlines", input: "

\n Paragraph with\n line breaks\n

", expected: "Paragraph with line breaks", }, { name: "complex nested structure", input: "

Title

First paragraph with link.

  • Item 1
  • Item 2
", expected: "TitleFirst paragraph with link.Item 1Item 2", }, { name: "entities in attributes (should still be processed)", input: "

Content

", expected: "Content", }, { name: "special HTML5 entities", input: "Left arrow ← Right arrow →", expected: "Left arrow ← Right arrow →", // These are not handled by the cleaner }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { result := cleaner.CleanHTML(tt.input) if result != tt.expected { t.Errorf("CleanHTML(%q) = %q, want %q", tt.input, result, tt.expected) } }) } } // TestHTMLCleaner_CleanHTML_LargeContent tests the CleanHTML method with large content. func TestHTMLCleaner_CleanHTML_LargeContent(t *testing.T) { cleaner := NewHTMLCleaner() // Create a large HTML string var builder strings.Builder builder.WriteString("") for i := range 1000 { builder.WriteString("

Paragraph ") builder.WriteString(string(rune('0' + i%10))) builder.WriteString(" with some content & entities.

") } builder.WriteString("") input := builder.String() result := cleaner.CleanHTML(input) // Check that HTML tags are removed if strings.Contains(result, "<") || strings.Contains(result, ">") { t.Error("Result should not contain HTML tags") } // Check that content is preserved if !strings.Contains(result, "Paragraph") { t.Error("Result should contain paragraph content") } // Check that entities are converted if strings.Contains(result, "&") { t.Error("Result should not contain unconverted HTML entities") } if !strings.Contains(result, "&") { t.Error("Result should contain converted ampersand") } } // TestHTMLCleaner_CleanHTML_EdgeCases tests edge cases for the CleanHTML method. func TestHTMLCleaner_CleanHTML_EdgeCases(t *testing.T) { cleaner := NewHTMLCleaner() tests := []struct { name string input string expected string }{ { name: "only entities", input: "&<>"' ", expected: "&<>\"'", }, { name: "repeated entities", input: "&&&", expected: "&&&", }, { name: "entities without semicolon (should not be converted)", input: "& test < test", expected: "& test < test", }, { name: "mixed valid and invalid entities", input: "& &invalid; < &fake;", expected: "& &invalid; < &fake;", }, { name: "unclosed tag at end", input: "Content

with unclosed", expected: "Content with unclosed", }, { name: "tag with no closing bracket", input: "Content

Deep content
", expected: "Deep content", }, { name: "empty tags with whitespace", input: "

\t\n
", expected: "", }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { result := cleaner.CleanHTML(tt.input) if result != tt.expected { t.Errorf("CleanHTML(%q) = %q, want %q", tt.input, result, tt.expected) } }) } } // TestHTMLCleaner_CleanHTML_Unicode tests Unicode content handling. func TestHTMLCleaner_CleanHTML_Unicode(t *testing.T) { cleaner := NewHTMLCleaner() tests := []struct { name string input string expected string }{ { name: "unicode characters", input: "

Hello 世界! Café naïve résumé

", expected: "Hello 世界! Café naïve résumé", }, { name: "unicode with entities", input: "

Unicode: 你好 & emoji: 🌍

", expected: "Unicode: 你好 & emoji: 🌍", }, { name: "mixed scripts", input: "
English العربية русский 日本語
", expected: "English العربية русский 日本語", }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { result := cleaner.CleanHTML(tt.input) if result != tt.expected { t.Errorf("CleanHTML(%q) = %q, want %q", tt.input, result, tt.expected) } }) } } // BenchmarkHTMLCleaner_CleanHTML benchmarks the CleanHTML method. func BenchmarkHTMLCleaner_CleanHTML(b *testing.B) { cleaner := NewHTMLCleaner() input := "

Course Title

This is a great course about & HTML entities like   and "quotes".

" for b.Loop() { cleaner.CleanHTML(input) } } // BenchmarkHTMLCleaner_CleanHTML_Large benchmarks the CleanHTML method with large content. func BenchmarkHTMLCleaner_CleanHTML_Large(b *testing.B) { cleaner := NewHTMLCleaner() // Create a large HTML string var builder strings.Builder for i := range 100 { builder.WriteString("

Paragraph ") builder.WriteString(string(rune('0' + i%10))) builder.WriteString(" with some content & entities <test>.

") } input := builder.String() for b.Loop() { cleaner.CleanHTML(input) } }