// Package services provides the core functionality for the articulate-parser application. // It implements the interfaces defined in the interfaces package. package services import ( "regexp" "strings" ) // HTMLCleaner provides utilities for converting HTML content to plain text. // It removes HTML tags while preserving their content and converts HTML entities // to their plain text equivalents. type HTMLCleaner struct{} // NewHTMLCleaner creates a new HTML cleaner instance. // This service is typically injected into exporters that need to handle // HTML content from Articulate Rise courses. func NewHTMLCleaner() *HTMLCleaner { return &HTMLCleaner{} } // CleanHTML removes HTML tags and converts entities, returning clean plain text. // The function preserves the textual content of the HTML while removing markup. // It handles common HTML entities like  , &, etc., and normalizes whitespace. // // Parameters: // - html: The HTML content to clean // // Returns: // - A plain text string with all HTML elements and entities removed/converted func (h *HTMLCleaner) CleanHTML(html string) string { // Remove HTML tags but preserve content re := regexp.MustCompile(`<[^>]*>`) cleaned := re.ReplaceAllString(html, "") // Replace common HTML entities with their character equivalents cleaned = strings.ReplaceAll(cleaned, " ", " ") cleaned = strings.ReplaceAll(cleaned, "&", "&") cleaned = strings.ReplaceAll(cleaned, "<", "<") cleaned = strings.ReplaceAll(cleaned, ">", ">") cleaned = strings.ReplaceAll(cleaned, """, "\"") cleaned = strings.ReplaceAll(cleaned, "'", "'") cleaned = strings.ReplaceAll(cleaned, "ï", "ï") cleaned = strings.ReplaceAll(cleaned, "ë", "ë") cleaned = strings.ReplaceAll(cleaned, "é", "é") // Clean up extra whitespace by replacing multiple spaces, tabs, and newlines // with a single space, then trim any leading/trailing whitespace cleaned = regexp.MustCompile(`\s+`).ReplaceAllString(cleaned, " ") cleaned = strings.TrimSpace(cleaned) return cleaned }