From 9fa7475da7aad10fa8dc6623c5576a6a9ce00f0b Mon Sep 17 00:00:00 2001
From: Kaj Kowalski
Date: Thu, 22 May 2025 01:08:57 +0200
Subject: [PATCH] Normalizes language and category values

Adds functions to normalize language and category values from the CSV
data, mapping variations to standard names and groups. This improves
data consistency and enables more accurate analysis and reporting.
---
 lib/csvFetcher.ts | 199 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 197 insertions(+), 2 deletions(-)

diff --git a/lib/csvFetcher.ts b/lib/csvFetcher.ts
index b05f768..bafc3d6 100644
--- a/lib/csvFetcher.ts
+++ b/lib/csvFetcher.ts
@@ -43,6 +43,201 @@ interface SessionData {
   initialMsg?: string;
 }
 
+/**
+ * Normalizes language values to a standard set
+ * @param languageStr The raw language string from CSV
+ * @returns The standard language name, "Other" for an unrecognised value,
+ *   or null when the value is missing or blank
+ */
+function normalizeLanguage(languageStr?: string): string | null {
+  if (!languageStr) return null;
+
+  const normalized = languageStr.toLowerCase().trim();
+  if (!normalized) return null;
+
+  // Map of language variations to standard names
+  const languageMap: Record<string, string> = {
+    // English variations
+    english: "English",
+    en: "English",
+    eng: "English",
+
+    // Dutch variations
+    dutch: "Dutch",
+    nederlands: "Dutch",
+    nl: "Dutch",
+    nederland: "Dutch",
+    netherland: "Dutch",
+    netherlands: "Dutch",
+    hollands: "Dutch",
+    niederländisch: "Dutch",
+    nizozemski: "Dutch",
+
+    // Other languages that might appear
+    bosnian: "Bosnian",
+    bs: "Bosnian",
+    turkish: "Turkish",
+    tr: "Turkish",
+    turks: "Turkish",
+    german: "German",
+    de: "German",
+    duits: "German",
+    french: "French",
+    fr: "French",
+    frans: "French",
+    spanish: "Spanish",
+    es: "Spanish",
+    spaans: "Spanish",
+  };
+
+  // Read own keys only: a plain property read also finds inherited members,
+  // so "constructor" would return a function instead of "Other".
+  const known = Object.prototype.hasOwnProperty.call(languageMap, normalized);
+  return (known && languageMap[normalized]) || "Other";
+}
+
+/**
+ * Normalizes category values to standard groups
+ * @param categoryStr The raw category string from CSV
+ * @returns The first group with a matching keyword, "Other" when no keyword
+ *   matches, or null when the value is missing or blank
+ */
+function normalizeCategory(categoryStr?: string): string | null {
+  if (!categoryStr) return null;
+
+  const normalized = categoryStr.toLowerCase().trim();
+  if (!normalized) return null;
+
+  // Define category groups using keywords
+  const categoryMapping: Record<string, string[]> = {
+    "Onboarding": [
+      "onboarding",
+      "start",
+      "begin",
+      "new",
+      "orientation",
+      "welcome",
+      "intro",
+      "getting started",
+      "documents",
+      "documenten",
+      "first day",
+      "eerste dag",
+    ],
+    "General Information": [
+      "general",
+      "algemeen",
+      "info",
+      "information",
+      "informatie",
+      "question",
+      "vraag",
+      "inquiry",
+      "chat",
+      "conversation",
+      "gesprek",
+      "talk",
+    ],
+    "Greeting": [
+      "greeting",
+      "greet",
+      "hello",
+      "hi",
+      "hey",
+      "welcome",
+      "hallo",
+      "hoi",
+      "greetings",
+    ],
+    "HR & Payroll": [
+      "salary",
+      "salaris",
+      "pay",
+      "payroll",
+      "loon",
+      "loonstrook",
+      "hr",
+      "human resources",
+      "benefits",
+      "vacation",
+      "leave",
+      "verlof",
+      "maaltijdvergoeding",
+      "vergoeding",
+    ],
+    "Schedules & Hours": [
+      "schedule",
+      "hours",
+      "tijd",
+      "time",
+      "roster",
+      "rooster",
+      "planning",
+      "shift",
+      "dienst",
+      "working hours",
+      "werktijden",
+      "openingstijden",
+    ],
+    "Role & Responsibilities": [
+      "role",
+      "job",
+      "function",
+      "functie",
+      "task",
+      "taak",
+      "responsibilities",
+      "leidinggevende",
+      "manager",
+      "teamleider",
+      "supervisor",
+      "team",
+      "lead",
+    ],
+    "Technical Support": [
+      "technical",
+      "tech",
+      "support",
+      "laptop",
+      "computer",
+      "system",
+      "systeem",
+      "it",
+      "software",
+      "hardware",
+    ],
+    "Offboarding": [
+      "offboarding",
+      "leave",
+      "exit",
+      "quit",
+      "resign",
+      "resignation",
+      "ontslag",
+      "vertrek",
+      "afsluiting",
+    ],
+  };
+
+  // One- and two-letter keywords ("hi", "hr", "it") must match a whole word:
+  // as substrings they also hit "shift", "three", "exit" and "quit". Longer
+  // keywords keep substring matching so that plurals and compound words
+  // ("schedules", "werkrooster") are still recognised.
+  const words = new Set(normalized.split(/[^a-z0-9\u00c0-\u024f]+/));
+  const matches = (keyword: string): boolean =>
+    keyword.length <= 2 ? words.has(keyword) : normalized.includes(keyword);
+
+  // Try to match the category using keywords
+  for (const [category, keywords] of Object.entries(categoryMapping)) {
+    if (keywords.some(matches)) {
+      return category;
+    }
+  }
+
+  // If no match, return "Other"
+  return "Other";
+}
+
 /**
  * Converts sentiment string values to numeric scores
  * @param sentimentStr The sentiment string from the CSV
@@ -164,7 +359,7 @@ export async function fetchAndParseCsv(
     endTime: safeParseDate(r.end_time),
     ipAddress: r.ip_address,
     country: r.country,
-    language: r.language,
+    language: normalizeLanguage(r.language),
     messagesSent: Number(r.messages_sent) || 0,
     sentiment: mapSentimentToScore(r.sentiment),
     escalated: isTruthyValue(r.escalated),
@@ -175,7 +370,7 @@ export async function fetchAndParseCsv(
       : null,
     tokens: Number(r.tokens) || 0,
     tokensEur: r.tokens_eur ? parseFloat(r.tokens_eur) : 0,
-    category: r.category,
+    category: normalizeCategory(r.category),
     initialMsg: r.initial_msg,
   }));
 }