mirror of
https://github.com/kjanat/articulate-parser.git
synced 2026-01-16 09:02:10 +01:00
refactor: Standardize method names and introduce context propagation
Removes the `Get` prefix from exporter methods (e.g., GetSupportedFormat -> SupportedFormat) to better align with Go conventions for simple accessors. Introduces `context.Context` propagation through the application, starting from `ProcessCourseFromURI` down to the HTTP request in the parser. This makes network operations cancellable and allows for setting deadlines, improving application robustness. Additionally, optimizes the HTML cleaner by pre-compiling regular expressions for a minor performance gain.
This commit is contained in:
@ -7,6 +7,13 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
var (
|
||||
// htmlTagRegex matches HTML tags for removal
|
||||
htmlTagRegex = regexp.MustCompile(`<[^>]*>`)
|
||||
// whitespaceRegex matches multiple whitespace characters for normalization
|
||||
whitespaceRegex = regexp.MustCompile(`\s+`)
|
||||
)
|
||||
|
||||
// HTMLCleaner provides utilities for converting HTML content to plain text.
|
||||
// It removes HTML tags while preserving their content and converts HTML entities
|
||||
// to their plain text equivalents.
|
||||
@ -30,8 +37,7 @@ func NewHTMLCleaner() *HTMLCleaner {
|
||||
// - A plain text string with all HTML elements and entities removed/converted
|
||||
func (h *HTMLCleaner) CleanHTML(html string) string {
|
||||
// Remove HTML tags but preserve content
|
||||
re := regexp.MustCompile(`<[^>]*>`)
|
||||
cleaned := re.ReplaceAllString(html, "")
|
||||
cleaned := htmlTagRegex.ReplaceAllString(html, "")
|
||||
|
||||
// Replace common HTML entities with their character equivalents
|
||||
cleaned = strings.ReplaceAll(cleaned, " ", " ")
|
||||
@ -46,7 +52,7 @@ func (h *HTMLCleaner) CleanHTML(html string) string {
|
||||
|
||||
// Clean up extra whitespace by replacing multiple spaces, tabs, and newlines
|
||||
// with a single space, then trim any leading/trailing whitespace
|
||||
cleaned = regexp.MustCompile(`\s+`).ReplaceAllString(cleaned, " ")
|
||||
cleaned = whitespaceRegex.ReplaceAllString(cleaned, " ")
|
||||
cleaned = strings.TrimSpace(cleaned)
|
||||
|
||||
return cleaned
|
||||
|
||||
Reference in New Issue
Block a user