// articulate-parser/internal/services/html_cleaner.go

// Package services provides the core functionality for the articulate-parser application.
// It implements the interfaces defined in the interfaces package.
package services

import (
	"bytes"
	stdhtml "html"
	"io"
	"strings"

	"golang.org/x/net/html"
)

// HTMLCleaner converts HTML content to plain text.
// It strips tags while keeping the text they enclose and turns HTML entities
// into their plain text equivalents, relying on a real HTML parser rather
// than regular expressions. The type has no fields, so it carries no state.
type HTMLCleaner struct{}

// NewHTMLCleaner returns a pointer to a fresh HTMLCleaner.
// HTMLCleaner has no fields, so the returned value needs no further setup
// before CleanHTML can be called on it.
func NewHTMLCleaner() *HTMLCleaner {
	cleaner := new(HTMLCleaner)
	return cleaner
}

// CleanHTML turns an HTML string into whitespace-normalized plain text.
// The input is parsed into a node tree, the text nodes are collected via
// extractText (which leaves out script and style content), entities are
// decoded, and every run of whitespace is collapsed to a single space.
//
// Parameters:
//   - htmlStr: The HTML content to clean
//
// Returns:
//   - The extracted plain text with no leading or trailing whitespace, or an
//     empty string if the parser reports an error
func (h *HTMLCleaner) CleanHTML(htmlStr string) string {
	root, parseErr := html.Parse(strings.NewReader(htmlStr))
	if parseErr != nil {
		// Callers (and existing tests) expect an empty result on parse failure.
		return ""
	}

	// Gather the raw text of every text node in document order.
	var text bytes.Buffer
	extractText(&text, root)

	// NOTE(review): the golang.org/x/net/html docs state that text node data
	// is already unescaped, so this is a second decoding pass; input such as
	// "&amp;lt;" ends up as "<" — confirm this is intended.
	decoded := stdhtml.UnescapeString(text.String())

	// Fields splits on any run of whitespace and never yields empty or padded
	// words, so the joined result needs no additional trimming.
	words := strings.Fields(decoded)
	return strings.Join(words, " ")
}

// extractText walks the subtree rooted at n depth-first and writes the data
// of every text node to w in document order. Subtrees rooted at a script or
// style element are not visited, so their content never reaches w.
func extractText(w io.Writer, n *html.Node) {
	switch n.Type {
	case html.ElementNode:
		// Script and style bodies are not user-visible text; drop the subtree.
		if n.Data == "script" || n.Data == "style" {
			return
		}
	case html.TextNode:
		// The function has no error return, so a failed write is discarded.
		_, _ = io.WriteString(w, n.Data)
	}

	// Descend into each child in sibling order.
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		extractText(w, child)
	}
}