Files
articulate-parser/internal/services/html_cleaner.go
Kaj Kowalski 37927a36b6 refactor(core)!: Add context, config, and structured logging
Introduces `context.Context` to the `FetchCourse` method and its call chain, allowing for cancellable network requests and timeouts. This improves application robustness when fetching remote course data.

A new configuration package centralizes application settings, loading them from environment variables with sensible defaults for base URL, request timeout, and logging.

Standard `log` and `fmt` calls are replaced with a structured logging system built on `slog`, supporting both JSON and human-readable text formats.

This change also includes:
- Extensive benchmarks and example tests.
- Simplified Go doc comments across several packages.

BREAKING CHANGE: The `NewArticulateParser` constructor signature has been updated to accept a logger, base URL, and timeout, which are now supplied via the new configuration system.
2025-11-06 05:14:14 +01:00

69 lines
2.2 KiB
Go

// Package services provides the core functionality for the articulate-parser application.
// It implements the interfaces defined in the interfaces package.
package services
import (
"bytes"
stdhtml "html"
"io"
"strings"
"golang.org/x/net/html"
)
// HTMLCleaner converts HTML fragments or documents into plain text.
//
// The type carries no state: it is an empty struct whose CleanHTML method
// parses the input with a real HTML parser (golang.org/x/net/html) rather
// than stripping tags with regular expressions, keeps only text content,
// and decodes HTML entities.
type HTMLCleaner struct{}
// NewHTMLCleaner returns a pointer to a fresh, ready-to-use HTMLCleaner.
//
// HTMLCleaner has no fields, so no configuration is required; the returned
// value can be handed to any component that needs to turn HTML content into
// plain text.
func NewHTMLCleaner() *HTMLCleaner {
	return new(HTMLCleaner)
}
// CleanHTML converts htmlStr to plain text.
//
// The input is parsed into an HTML node tree, the text nodes are collected
// via extractText (which leaves out the contents of script and style
// elements), the collected text is passed through html.UnescapeString, and
// finally every run of whitespace is collapsed to a single space with
// leading and trailing whitespace removed.
//
// It returns the empty string when the parser reports an error.
func (h *HTMLCleaner) CleanHTML(htmlStr string) string {
	root, parseErr := html.Parse(strings.NewReader(htmlStr))
	if parseErr != nil {
		// Callers get an empty result rather than an error on parse failure.
		return ""
	}

	// Gather the text of every node that extractText does not skip.
	var text bytes.Buffer
	extractText(&text, root)

	// NOTE(review): the parser is expected to have decoded entities in text
	// nodes already, so this second pass also decodes doubly-escaped input
	// (e.g. "&amp;lt;" ends up as "<") — confirm this is intended.
	decoded := stdhtml.UnescapeString(text.String())

	// Splitting on whitespace and re-joining with one space collapses tabs,
	// newlines, and repeated spaces into single separators.
	words := strings.Fields(decoded)
	return strings.TrimSpace(strings.Join(words, " "))
}
// extractText walks the subtree rooted at n depth-first, in document order,
// and writes the data of every text node it encounters to w.
//
// Subtrees rooted at a script or style element are skipped entirely, so
// neither those elements nor any of their descendants contribute output.
// A nil n is treated as an empty tree and writes nothing.
//
// Write errors are discarded because the function has no error return; the
// caller visible in this file passes a *bytes.Buffer.
func extractText(w io.Writer, n *html.Node) {
	if n == nil {
		return
	}

	switch n.Type {
	case html.ElementNode:
		// Script and style bodies are code, not readable content.
		if n.Data == "script" || n.Data == "style" {
			return
		}
	case html.TextNode:
		// io.WriteString uses the writer's WriteString method when it has
		// one, avoiding a []byte copy of every text node. The error is
		// intentionally ignored; see the function comment.
		_, _ = io.WriteString(w, n.Data)
	}

	// Descend into the children, left to right.
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		extractText(w, c)
	}
}