mirror of
https://github.com/kjanat/articulate-parser.git
synced 2026-01-16 09:02:10 +01:00
Add comprehensive unit tests for services and main package
- Implement tests for the app service, including course processing from file and URI. - Create mock implementations for CourseParser and Exporter to facilitate testing. - Add tests for HTML cleaner service to validate HTML content cleaning functionality. - Develop tests for the parser service, covering course fetching and loading from files. - Introduce tests for utility functions in the main package, ensuring URI validation and string joining. - Include benchmarks for performance evaluation of key functions.
This commit is contained in:
325
internal/services/html_cleaner_test.go
Normal file
325
internal/services/html_cleaner_test.go
Normal file
@ -0,0 +1,325 @@
|
||||
// Package services contains the HTML cleaner service; this file holds its tests.
|
||||
package services
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestNewHTMLCleaner tests the NewHTMLCleaner constructor.
|
||||
func TestNewHTMLCleaner(t *testing.T) {
|
||||
cleaner := NewHTMLCleaner()
|
||||
|
||||
if cleaner == nil {
|
||||
t.Fatal("NewHTMLCleaner() returned nil")
|
||||
}
|
||||
}
|
||||
|
||||
// TestHTMLCleaner_CleanHTML tests the CleanHTML method with various HTML inputs.
|
||||
func TestHTMLCleaner_CleanHTML(t *testing.T) {
|
||||
cleaner := NewHTMLCleaner()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
expected string
|
||||
}{
|
||||
{
|
||||
name: "plain text (no HTML)",
|
||||
input: "This is plain text",
|
||||
expected: "This is plain text",
|
||||
},
|
||||
{
|
||||
name: "empty string",
|
||||
input: "",
|
||||
expected: "",
|
||||
},
|
||||
{
|
||||
name: "simple HTML tag",
|
||||
input: "<p>Hello world</p>",
|
||||
expected: "Hello world",
|
||||
},
|
||||
{
|
||||
name: "multiple HTML tags",
|
||||
input: "<h1>Title</h1><p>Paragraph text</p>",
|
||||
expected: "TitleParagraph text",
|
||||
},
|
||||
{
|
||||
name: "nested HTML tags",
|
||||
input: "<div><h1>Title</h1><p>Paragraph with <strong>bold</strong> text</p></div>",
|
||||
expected: "TitleParagraph with bold text",
|
||||
},
|
||||
{
|
||||
name: "HTML with attributes",
|
||||
input: "<p class=\"test\" id=\"para1\">Text with attributes</p>",
|
||||
expected: "Text with attributes",
|
||||
},
|
||||
{
|
||||
name: "self-closing tags",
|
||||
input: "Line 1<br/>Line 2<hr/>End",
|
||||
expected: "Line 1Line 2End",
|
||||
},
|
||||
{
|
||||
name: "HTML entities - basic",
|
||||
input: "AT&T <company> "quoted" text",
|
||||
expected: "AT&T <company> \"quoted\" text",
|
||||
},
|
||||
{
|
||||
name: "HTML entities - apostrophe",
|
||||
input: "It's a test",
|
||||
expected: "It's a test",
|
||||
},
|
||||
{
|
||||
name: "HTML entities - special characters",
|
||||
input: "ïber ëlite écarté",
|
||||
expected: "ïber ëlite écarté",
|
||||
},
|
||||
{
|
||||
name: "HTML entities - nbsp",
|
||||
input: "Word1 Word2",
|
||||
expected: "Word1 Word2",
|
||||
},
|
||||
{
|
||||
name: "mixed HTML and entities",
|
||||
input: "<p>Hello & welcome to <strong>our</strong> site!</p>",
|
||||
expected: "Hello & welcome to our site!",
|
||||
},
|
||||
{
|
||||
name: "multiple whitespace",
|
||||
input: "Text with\t\tmultiple\n\nspaces",
|
||||
expected: "Text with multiple spaces",
|
||||
},
|
||||
{
|
||||
name: "whitespace with HTML",
|
||||
input: "<p> Text with </p> <div> spaces </div> ",
|
||||
expected: "Text with spaces",
|
||||
},
|
||||
{
|
||||
name: "complex content",
|
||||
input: "<div class=\"content\"><h1>Course Title</h1><p>This is a <em>great</em> course about & HTML entities like and "quotes".</p></div>",
|
||||
expected: "Course TitleThis is a great course about & HTML entities like and \"quotes\".",
|
||||
},
|
||||
{
|
||||
name: "malformed HTML",
|
||||
input: "<p>Unclosed paragraph<div>Another <span>tag</p></div>",
|
||||
expected: "Unclosed paragraphAnother tag",
|
||||
},
|
||||
{
|
||||
name: "HTML comments (should be removed)",
|
||||
input: "Text before<!-- This is a comment -->Text after",
|
||||
expected: "Text beforeText after",
|
||||
},
|
||||
{
|
||||
name: "script and style tags content",
|
||||
input: "<script>alert('test');</script>Content<style>body{color:red;}</style>",
|
||||
expected: "alert('test');Contentbody{color:red;}",
|
||||
},
|
||||
{
|
||||
name: "line breaks and formatting",
|
||||
input: "<p>Line 1</p>\n<p>Line 2</p>\n<p>Line 3</p>",
|
||||
expected: "Line 1 Line 2 Line 3",
|
||||
},
|
||||
{
|
||||
name: "only whitespace",
|
||||
input: " \t\n ",
|
||||
expected: "",
|
||||
},
|
||||
{
|
||||
name: "only HTML tags",
|
||||
input: "<div><p></p></div>",
|
||||
expected: "",
|
||||
},
|
||||
{
|
||||
name: "HTML with newlines",
|
||||
input: "<p>\n Paragraph with\n line breaks\n</p>",
|
||||
expected: "Paragraph with line breaks",
|
||||
},
|
||||
{
|
||||
name: "complex nested structure",
|
||||
input: "<article><header><h1>Title</h1></header><section><p>First paragraph with <a href=\"#\">link</a>.</p><ul><li>Item 1</li><li>Item 2</li></ul></section></article>",
|
||||
expected: "TitleFirst paragraph with link.Item 1Item 2",
|
||||
},
|
||||
{
|
||||
name: "entities in attributes (should still be processed)",
|
||||
input: "<p title=\"AT&T\">Content</p>",
|
||||
expected: "Content",
|
||||
},
|
||||
{
|
||||
name: "special HTML5 entities",
|
||||
input: "Left arrow ← Right arrow →",
|
||||
expected: "Left arrow ← Right arrow →", // These are not handled by the cleaner
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := cleaner.CleanHTML(tt.input)
|
||||
if result != tt.expected {
|
||||
t.Errorf("CleanHTML(%q) = %q, want %q", tt.input, result, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestHTMLCleaner_CleanHTML_LargeContent tests the CleanHTML method with large content.
|
||||
func TestHTMLCleaner_CleanHTML_LargeContent(t *testing.T) {
|
||||
cleaner := NewHTMLCleaner()
|
||||
|
||||
// Create a large HTML string
|
||||
var builder strings.Builder
|
||||
builder.WriteString("<html><body>")
|
||||
for i := 0; i < 1000; i++ {
|
||||
builder.WriteString("<p>Paragraph ")
|
||||
builder.WriteString(string(rune('0' + i%10)))
|
||||
builder.WriteString(" with some content & entities.</p>")
|
||||
}
|
||||
builder.WriteString("</body></html>")
|
||||
|
||||
input := builder.String()
|
||||
result := cleaner.CleanHTML(input)
|
||||
|
||||
// Check that HTML tags are removed
|
||||
if strings.Contains(result, "<") || strings.Contains(result, ">") {
|
||||
t.Error("Result should not contain HTML tags")
|
||||
}
|
||||
|
||||
// Check that content is preserved
|
||||
if !strings.Contains(result, "Paragraph") {
|
||||
t.Error("Result should contain paragraph content")
|
||||
}
|
||||
|
||||
// Check that entities are converted
|
||||
if strings.Contains(result, "&") {
|
||||
t.Error("Result should not contain unconverted HTML entities")
|
||||
}
|
||||
if !strings.Contains(result, "&") {
|
||||
t.Error("Result should contain converted ampersand")
|
||||
}
|
||||
}
|
||||
|
||||
// TestHTMLCleaner_CleanHTML_EdgeCases tests edge cases for the CleanHTML method.
|
||||
func TestHTMLCleaner_CleanHTML_EdgeCases(t *testing.T) {
|
||||
cleaner := NewHTMLCleaner()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
expected string
|
||||
}{
|
||||
{
|
||||
name: "only entities",
|
||||
input: "&<>"' ",
|
||||
expected: "&<>\"'",
|
||||
},
|
||||
{
|
||||
name: "repeated entities",
|
||||
input: "&&&",
|
||||
expected: "&&&",
|
||||
},
|
||||
{
|
||||
name: "entities without semicolon (should not be converted)",
|
||||
input: "& test < test",
|
||||
expected: "& test < test",
|
||||
},
|
||||
{
|
||||
name: "mixed valid and invalid entities",
|
||||
input: "& &invalid; < &fake;",
|
||||
expected: "& &invalid; < &fake;",
|
||||
},
|
||||
{
|
||||
name: "unclosed tag at end",
|
||||
input: "Content <p>with unclosed",
|
||||
expected: "Content with unclosed",
|
||||
},
|
||||
{
|
||||
name: "tag with no closing bracket",
|
||||
input: "Content <p class='test' with no closing bracket",
|
||||
expected: "Content <p class='test' with no closing bracket",
|
||||
},
|
||||
{
|
||||
name: "extremely nested tags",
|
||||
input: "<div><div><div><div><div>Deep content</div></div></div></div></div>",
|
||||
expected: "Deep content",
|
||||
},
|
||||
{
|
||||
name: "empty tags with whitespace",
|
||||
input: "<p> </p><div>\t\n</div>",
|
||||
expected: "",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := cleaner.CleanHTML(tt.input)
|
||||
if result != tt.expected {
|
||||
t.Errorf("CleanHTML(%q) = %q, want %q", tt.input, result, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestHTMLCleaner_CleanHTML_Unicode tests Unicode content handling.
|
||||
func TestHTMLCleaner_CleanHTML_Unicode(t *testing.T) {
|
||||
cleaner := NewHTMLCleaner()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
expected string
|
||||
}{
|
||||
{
|
||||
name: "unicode characters",
|
||||
input: "<p>Hello 世界! Café naïve résumé</p>",
|
||||
expected: "Hello 世界! Café naïve résumé",
|
||||
},
|
||||
{
|
||||
name: "unicode with entities",
|
||||
input: "<p>Unicode: 你好 & emoji: 🌍</p>",
|
||||
expected: "Unicode: 你好 & emoji: 🌍",
|
||||
},
|
||||
{
|
||||
name: "mixed scripts",
|
||||
input: "<div>English العربية русский 日本語</div>",
|
||||
expected: "English العربية русский 日本語",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := cleaner.CleanHTML(tt.input)
|
||||
if result != tt.expected {
|
||||
t.Errorf("CleanHTML(%q) = %q, want %q", tt.input, result, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// BenchmarkHTMLCleaner_CleanHTML benchmarks the CleanHTML method.
|
||||
func BenchmarkHTMLCleaner_CleanHTML(b *testing.B) {
|
||||
cleaner := NewHTMLCleaner()
|
||||
input := "<div class=\"content\"><h1>Course Title</h1><p>This is a <em>great</em> course about & HTML entities like and "quotes".</p><ul><li>Item 1</li><li>Item 2</li></ul></div>"
|
||||
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
cleaner.CleanHTML(input)
|
||||
}
|
||||
}
|
||||
|
||||
// BenchmarkHTMLCleaner_CleanHTML_Large benchmarks the CleanHTML method with large content.
|
||||
func BenchmarkHTMLCleaner_CleanHTML_Large(b *testing.B) {
|
||||
cleaner := NewHTMLCleaner()
|
||||
|
||||
// Create a large HTML string
|
||||
var builder strings.Builder
|
||||
for i := 0; i < 100; i++ {
|
||||
builder.WriteString("<p>Paragraph ")
|
||||
builder.WriteString(string(rune('0' + i%10)))
|
||||
builder.WriteString(" with some content & entities <test>.</p>")
|
||||
}
|
||||
input := builder.String()
|
||||
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
cleaner.CleanHTML(input)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user