Add comprehensive unit tests for services and main package

- Implement tests for the app service, including course processing from file and URI. - Create mock implementations for CourseParser and Exporter to facilitate testing. - Add tests for HTML cleaner service to validate HTML content cleaning functionality. - Develop tests for the parser service, covering course fetching and loading from files. - Introduce tests for utility functions in the main package, ensuring URI validation and string joining. - Include benchmarks for performance evaluation of key functions.
2026-03-03 01:41:27 +01:00 · 2025-05-25 15:23:48 +02:00
parent 9de7222ec3
commit b01260e765
17 changed files with 4431 additions and 191 deletions
--- a/internal/services/html_cleaner_test.go
+++ b/internal/services/html_cleaner_test.go
@@ -0,0 +1,325 @@
+// Package services holds the HTML cleaner service; this file contains its tests.
+package services
+
+import (
+	"strings"
+	"testing"
+)
+
+// TestNewHTMLCleaner verifies that NewHTMLCleaner returns a non-nil value.
+func TestNewHTMLCleaner(t *testing.T) {
+	cleaner := NewHTMLCleaner()
+
+	if cleaner == nil {
+		t.Fatal("NewHTMLCleaner() returned nil") // Fatal: nothing else is worth checking without a cleaner
+	}
+}
+
+// TestHTMLCleaner_CleanHTML is a table-driven test pinning the exact string CleanHTML must return for each input.
+func TestHTMLCleaner_CleanHTML(t *testing.T) {
+	cleaner := NewHTMLCleaner()
+
+	tests := []struct {
+		name     string
+		input    string
+		expected string
+	}{
+		{
+			name:     "plain text (no HTML)",
+			input:    "This is plain text",
+			expected: "This is plain text",
+		},
+		{
+			name:     "empty string",
+			input:    "",
+			expected: "",
+		},
+		{
+			name:     "simple HTML tag",
+			input:    "<p>Hello world</p>",
+			expected: "Hello world",
+		},
+		{
+			name:     "multiple HTML tags",
+			input:    "<h1>Title</h1><p>Paragraph text</p>",
+			expected: "TitleParagraph text", // no separator is expected between adjacent elements
+		},
+		{
+			name:     "nested HTML tags",
+			input:    "<div><h1>Title</h1><p>Paragraph with <strong>bold</strong> text</p></div>",
+			expected: "TitleParagraph with bold text",
+		},
+		{
+			name:     "HTML with attributes",
+			input:    "<p class=\"test\" id=\"para1\">Text with attributes</p>",
+			expected: "Text with attributes",
+		},
+		{
+			name:     "self-closing tags",
+			input:    "Line 1<br/>Line 2<hr/>End",
+			expected: "Line 1Line 2End", // <br/> and <hr/> are expected to vanish without leaving whitespace
+		},
+		{
+			name:     "HTML entities - basic",
+			input:    "AT&amp;T &lt;company&gt; &quot;quoted&quot; &nbsp; text",
+			expected: "AT&T <company> \"quoted\" text",
+		},
+		{
+			name:     "HTML entities - apostrophe",
+			input:    "It&#39;s a test",
+			expected: "It's a test",
+		},
+		{
+			name:     "HTML entities - special characters",
+			input:    "&iuml;ber &euml;lite &eacute;cart&eacute;",
+			expected: "ïber ëlite écarté",
+		},
+		{
+			name:     "HTML entities - nbsp",
+			input:    "Word1&nbsp;&nbsp;&nbsp;Word2",
+			expected: "Word1 Word2", // three &nbsp; entities are expected to yield a single space
+		},
+		{
+			name:     "mixed HTML and entities",
+			input:    "<p>Hello &amp; welcome to <strong>our</strong> site!</p>",
+			expected: "Hello & welcome to our site!",
+		},
+		{
+			name:     "multiple whitespace",
+			input:    "Text   with\t\tmultiple\n\nspaces",
+			expected: "Text with multiple spaces",
+		},
+		{
+			name:     "whitespace with HTML",
+			input:    "<p>  Text  with  </p>  <div>  spaces  </div>  ",
+			expected: "Text with spaces",
+		},
+		{
+			name:     "complex content",
+			input:    "<div class=\"content\"><h1>Course Title</h1><p>This is a <em>great</em> course about &amp; HTML entities like &nbsp; and &quot;quotes&quot;.</p></div>",
+			expected: "Course TitleThis is a great course about & HTML entities like and \"quotes\".",
+		},
+		{
+			name:     "malformed HTML",
+			input:    "<p>Unclosed paragraph<div>Another <span>tag</p></div>",
+			expected: "Unclosed paragraphAnother tag",
+		},
+		{
+			name:     "HTML comments (should be removed)",
+			input:    "Text before<!-- This is a comment -->Text after",
+			expected: "Text beforeText after",
+		},
+		{
+			name:     "script and style tags content",
+			input:    "<script>alert('test');</script>Content<style>body{color:red;}</style>",
+			expected: "alert('test');Contentbody{color:red;}", // only the tags go; script/style bodies are expected to remain
+		},
+		{
+			name:     "line breaks and formatting",
+			input:    "<p>Line 1</p>\n<p>Line 2</p>\n<p>Line 3</p>",
+			expected: "Line 1 Line 2 Line 3",
+		},
+		{
+			name:     "only whitespace",
+			input:    "   \t\n   ",
+			expected: "",
+		},
+		{
+			name:     "only HTML tags",
+			input:    "<div><p></p></div>",
+			expected: "",
+		},
+		{
+			name:     "HTML with newlines",
+			input:    "<p>\n  Paragraph with\n  line breaks\n</p>",
+			expected: "Paragraph with line breaks",
+		},
+		{
+			name:     "complex nested structure",
+			input:    "<article><header><h1>Title</h1></header><section><p>First paragraph with <a href=\"#\">link</a>.</p><ul><li>Item 1</li><li>Item 2</li></ul></section></article>",
+			expected: "TitleFirst paragraph with link.Item 1Item 2",
+		},
+		{
+			name:     "entities in attributes (should still be processed)",
+			input:    "<p title=\"AT&amp;T\">Content</p>",
+			expected: "Content", // the attribute is dropped with its tag, so only the element text is expected
+		},
+		{
+			name:     "special HTML5 entities",
+			input:    "Left arrow &larr; Right arrow &rarr;",
+			expected: "Left arrow &larr; Right arrow &rarr;", // &larr; and &rarr; are expected to pass through undecoded
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := cleaner.CleanHTML(tt.input)
+			if result != tt.expected {
+				t.Errorf("CleanHTML(%q) = %q, want %q", tt.input, result, tt.expected)
+			}
+		})
+	}
+}
+
+// TestHTMLCleaner_CleanHTML_LargeContent runs CleanHTML on a generated 1000-paragraph document and checks properties of the output.
+func TestHTMLCleaner_CleanHTML_LargeContent(t *testing.T) {
+	cleaner := NewHTMLCleaner()
+
+	// Build <html><body> wrapping 1000 <p> elements, each containing an &amp; entity.
+	var builder strings.Builder
+	builder.WriteString("<html><body>")
+	for i := 0; i < 1000; i++ {
+		builder.WriteString("<p>Paragraph ")
+		builder.WriteString(string(rune('0' + i%10))) // one ASCII digit, cycling 0-9
+		builder.WriteString(" with some content &amp; entities.</p>")
+	}
+	builder.WriteString("</body></html>")
+
+	input := builder.String()
+	result := cleaner.CleanHTML(input)
+
+	// No angle brackets may survive; the input has none outside of tags.
+	if strings.Contains(result, "<") || strings.Contains(result, ">") {
+		t.Error("Result should not contain HTML tags")
+	}
+
+	// The paragraph text itself must still be present.
+	if !strings.Contains(result, "Paragraph") {
+		t.Error("Result should contain paragraph content")
+	}
+
+	// &amp; must be gone and a bare ampersand must be present in its place.
+	if strings.Contains(result, "&amp;") {
+		t.Error("Result should not contain unconverted HTML entities")
+	}
+	if !strings.Contains(result, "&") {
+		t.Error("Result should contain converted ampersand")
+	}
+}
+
+// TestHTMLCleaner_CleanHTML_EdgeCases pins the exact CleanHTML output for entity-only, unknown-entity and broken-tag inputs.
+func TestHTMLCleaner_CleanHTML_EdgeCases(t *testing.T) {
+	cleaner := NewHTMLCleaner()
+
+	tests := []struct {
+		name     string
+		input    string
+		expected string
+	}{
+		{
+			name:     "only entities",
+			input:    "&amp;&lt;&gt;&quot;&#39;&nbsp;",
+			expected: "&<>\"'", // the trailing &nbsp; is expected to leave nothing behind
+		},
+		{
+			name:     "repeated entities",
+			input:    "&amp;&amp;&amp;",
+			expected: "&&&",
+		},
+		{
+			name:     "entities without semicolon (should not be converted)",
+			input:    "&amp test &lt test",
+			expected: "&amp test &lt test",
+		},
+		{
+			name:     "mixed valid and invalid entities",
+			input:    "&amp; &invalid; &lt; &fake;",
+			expected: "& &invalid; < &fake;", // unknown entity names are expected to be left verbatim
+		},
+		{
+			name:     "unclosed tag at end",
+			input:    "Content <p>with unclosed",
+			expected: "Content with unclosed",
+		},
+		{
+			name:     "tag with no closing bracket",
+			input:    "Content <p class='test' with no closing bracket",
+			expected: "Content <p class='test' with no closing bracket", // without a '>' the text is expected unchanged
+		},
+		{
+			name:     "extremely nested tags",
+			input:    "<div><div><div><div><div>Deep content</div></div></div></div></div>",
+			expected: "Deep content",
+		},
+		{
+			name:     "empty tags with whitespace",
+			input:    "<p>   </p><div>\t\n</div>",
+			expected: "",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := cleaner.CleanHTML(tt.input)
+			if result != tt.expected {
+				t.Errorf("CleanHTML(%q) = %q, want %q", tt.input, result, tt.expected)
+			}
+		})
+	}
+}
+
+// TestHTMLCleaner_CleanHTML_Unicode checks that non-ASCII text is returned unchanged once the surrounding tags are removed.
+func TestHTMLCleaner_CleanHTML_Unicode(t *testing.T) {
+	cleaner := NewHTMLCleaner()
+
+	tests := []struct {
+		name     string
+		input    string
+		expected string
+	}{
+		{
+			name:     "unicode characters",
+			input:    "<p>Hello 世界! Café naïve résumé</p>",
+			expected: "Hello 世界! Café naïve résumé",
+		},
+		{
+			name:     "unicode with entities",
+			input:    "<p>Unicode: 你好 &amp; emoji: 🌍</p>",
+			expected: "Unicode: 你好 & emoji: 🌍", // &amp; is decoded; the multi-byte runes around it stay as-is
+		},
+		{
+			name:     "mixed scripts",
+			input:    "<div>English العربية русский 日本語</div>",
+			expected: "English العربية русский 日本語",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := cleaner.CleanHTML(tt.input)
+			if result != tt.expected {
+				t.Errorf("CleanHTML(%q) = %q, want %q", tt.input, result, tt.expected)
+			}
+		})
+	}
+}
+
+// BenchmarkHTMLCleaner_CleanHTML times CleanHTML on one short fixed snippet mixing nested tags and entities.
+func BenchmarkHTMLCleaner_CleanHTML(b *testing.B) {
+	cleaner := NewHTMLCleaner()
+	input := "<div class=\"content\"><h1>Course Title</h1><p>This is a <em>great</em> course about &amp; HTML entities like &nbsp; and &quot;quotes&quot;.</p><ul><li>Item 1</li><li>Item 2</li></ul></div>"
+
+	b.ResetTimer() // keep the setup above out of the measured time
+	for i := 0; i < b.N; i++ {
+		cleaner.CleanHTML(input) // result is discarded; only the call is timed
+	}
+}
+
+// BenchmarkHTMLCleaner_CleanHTML_Large times CleanHTML on a generated input of 100 paragraphs with entities.
+func BenchmarkHTMLCleaner_CleanHTML_Large(b *testing.B) {
+	cleaner := NewHTMLCleaner()
+
+	// Build 100 <p> elements, each containing &amp;, &lt; and &gt; entities.
+	var builder strings.Builder
+	for i := 0; i < 100; i++ {
+		builder.WriteString("<p>Paragraph ")
+		builder.WriteString(string(rune('0' + i%10))) // one ASCII digit, cycling 0-9
+		builder.WriteString(" with some content &amp; entities &lt;test&gt;.</p>")
+	}
+	input := builder.String()
+
+	b.ResetTimer() // keep input construction out of the measured time
+	for i := 0; i < b.N; i++ {
+		cleaner.CleanHTML(input) // result is discarded; only the call is timed
+	}
+}