mirror of
https://github.com/kjanat/articulate-parser.git
synced 2026-01-16 09:42:09 +01:00
Introduces a modular exporter pattern supporting DOCX and Markdown formats by implementing Exporter interfaces and restructuring application logic. Enhances CI to install UPX for binary compression, excluding recent macOS binaries due to compatibility issues. Enables CGO when building binaries for all platforms, addressing potential cross-platform compatibility concerns. Bumps version to 0.1.1.
54 lines
2.0 KiB
Go
54 lines
2.0 KiB
Go
// Package services provides the core functionality for the articulate-parser application.
|
|
// It implements the interfaces defined in the interfaces package.
|
|
package services
|
|
|
|
import (
|
|
"regexp"
|
|
"strings"
|
|
)
|
|
|
|
// HTMLCleaner provides utilities for converting HTML content to plain text.
// It removes HTML tags while preserving the text between them and converts a
// fixed set of HTML entities to their plain text equivalents.
type HTMLCleaner struct{}

// Matchers used by CleanHTML. They are built once at package initialization
// instead of on every call; both regexp.Regexp and strings.Replacer are safe
// for concurrent use.
var (
	// cleanHTMLTagPattern matches anything that looks like a tag: a '<',
	// any run of characters other than '>', and the closing '>'.
	cleanHTMLTagPattern = regexp.MustCompile(`<[^>]*>`)

	// cleanHTMLSpacePattern matches one or more whitespace characters.
	cleanHTMLSpacePattern = regexp.MustCompile(`\s+`)

	// cleanHTMLEntities decodes the supported entities in a single pass.
	cleanHTMLEntities = strings.NewReplacer(
		"&nbsp;", " ",
		"&amp;", "&",
		"&lt;", "<",
		"&gt;", ">",
		"&quot;", "\"",
		"&#39;", "'",
		"&iuml;", "ï",
		"&euml;", "ë",
		"&eacute;", "é",
	)
)

// NewHTMLCleaner creates a new HTML cleaner instance.
// HTMLCleaner holds no state, so the returned value needs no further setup.
func NewHTMLCleaner() *HTMLCleaner {
	return &HTMLCleaner{}
}

// CleanHTML removes HTML tags and converts entities, returning clean plain text.
// The textual content between tags is kept; tags are removed without inserting
// a separator, so "<p>a</p><p>b</p>" becomes "ab". The entities &nbsp;, &amp;,
// &lt;, &gt;, &quot;, &#39;, &iuml;, &euml; and &eacute; are decoded; any other
// entity is left untouched. Runs of whitespace are collapsed to a single space
// and leading/trailing whitespace is trimmed.
//
// Tags are stripped with a regular expression rather than an HTML parser, so
// a literal '<' followed later by a '>' in the input is treated as a tag.
//
// Parameters:
//   - html: The HTML content to clean
//
// Returns:
//   - A plain text string with tags removed and the supported entities decoded
func (h *HTMLCleaner) CleanHTML(html string) string {
	// Remove HTML tags but preserve the content between them.
	cleaned := cleanHTMLTagPattern.ReplaceAllString(html, "")

	// Decode entities in one pass. Output of a replacement is never
	// re-scanned, so an escaped entity such as "&amp;lt;" decodes exactly
	// once (to "&lt;") instead of being unescaped twice.
	cleaned = cleanHTMLEntities.Replace(cleaned)

	// Collapse spaces, tabs and newlines into single spaces, then trim.
	cleaned = cleanHTMLSpacePattern.ReplaceAllString(cleaned, " ")
	return strings.TrimSpace(cleaned)
}
|