Refactor transcript fetching and processing scripts

- Introduced a new function `fetchTranscriptContent` to handle fetching transcripts with optional authentication.
- Enhanced error handling and logging for transcript fetching.
- Updated the `parseTranscriptToMessages` function to improve message parsing logic.
- Replaced the old session processing logic with a new approach that utilizes `SessionImport` records.
- Removed obsolete scripts related to manual triggers and whitespace fixing.
- Updated the server initialization to remove direct server handling, transitioning to a more modular approach.
- Improved overall code structure and readability across various scripts.
2026-06-14 14:25:50 +02:00 · 2025-06-27 16:38:16 +02:00
parent d7ac0ba208
commit 1dd618b666
35 changed files with 6536 additions and 12797 deletions
@@ -1,440 +1,41 @@
-// Fetches, parses, and returns chat session data for a company from a CSV URL
+// Simplified CSV fetcher - fetches and parses CSV data without any processing
+// Maps directly to SessionImport table fields
 import fetch from "node-fetch";
 import { parse } from "csv-parse/sync";
-import ISO6391 from "iso-639-1";
-import countries from "i18n-iso-countries";

-// Register locales for i18n-iso-countries
-import enLocale from "i18n-iso-countries/langs/en.json" with { type: "json" };
-countries.registerLocale(enLocale);
-
-// This type is used internally for parsing the CSV records
-interface CSVRecord {
-  session_id: string;
-  start_time: string;
-  end_time?: string;
-  ip_address?: string;
-  country?: string;
-  language?: string;
-  messages_sent?: string;
-  sentiment?: string;
-  escalated?: string;
-  forwarded_hr?: string;
-  full_transcript_url?: string;
-  avg_response_time?: string;
-  tokens?: string;
-  tokens_eur?: string;
-  category?: string;
-  initial_msg?: string;
-  [key: string]: string | undefined;
-}
-
-interface SessionData {
-  id: string;
-  sessionId: string;
-  startTime: Date;
-  endTime: Date | null;
-  ipAddress?: string;
-  country?: string | null; // Will store ISO 3166-1 alpha-2 country code or null/undefined
-  language?: string | null; // Will store ISO 639-1 language code or null/undefined
-  messagesSent: number;
-  sentiment: number | null;
-  escalated: boolean;
-  forwardedHr: boolean;
-  fullTranscriptUrl?: string | null;
-  avgResponseTime: number | null;
-  tokens: number;
-  tokensEur: number;
-  category?: string | null;
-  initialMsg?: string;
+// Raw CSV data interface matching SessionImport schema
+interface RawSessionImport {
+  externalSessionId: string;
+  startTimeRaw: string;
+  endTimeRaw: string;
+  ipAddress: string | null;
+  countryCode: string | null;
+  language: string | null;
+  messagesSent: number | null;
+  sentimentRaw: string | null;
+  escalatedRaw: string | null;
+  forwardedHrRaw: string | null;
+  fullTranscriptUrl: string | null;
+  avgResponseTimeSeconds: number | null;
+  tokens: number | null;
+  tokensEur: number | null;
+  category: string | null;
+  initialMessage: string | null;
 }

 /**
- * Converts country names to ISO 3166-1 alpha-2 codes
- * @param countryStr Raw country string from CSV
- * @returns ISO 3166-1 alpha-2 country code or null if not found
+ * Fetches and parses CSV data from a URL without any processing
+ * Maps CSV columns by position to SessionImport fields
+ * @param url The CSV URL
+ * @param username Optional username for authentication
+ * @param password Optional password for authentication
+ * @returns Array of raw session import data
 */
-function getCountryCode(countryStr?: string): string | null | undefined {
-  if (countryStr === undefined) return undefined;
-  if (countryStr === null || countryStr === "") return null;
-
-  // Clean the input
-  const normalized = countryStr.trim();
-  if (!normalized) return null;
-
-  // Direct ISO code check (if already a 2-letter code)
-  if (normalized.length === 2 && normalized === normalized.toUpperCase()) {
-    return countries.isValid(normalized) ? normalized : null;
-  }
-
-  // Special case for country codes used in the dataset
-  const countryMapping: Record<string, string> = {
-    BA: "BA", // Bosnia and Herzegovina
-    NL: "NL", // Netherlands
-    USA: "US", // United States
-    UK: "GB", // United Kingdom
-    GB: "GB", // Great Britain
-    Nederland: "NL",
-    Netherlands: "NL",
-    Netherland: "NL",
-    Holland: "NL",
-    Germany: "DE",
-    Deutschland: "DE",
-    Belgium: "BE",
-    België: "BE",
-    Belgique: "BE",
-    France: "FR",
-    Frankreich: "FR",
-    "United States": "US",
-    "United States of America": "US",
-    Bosnia: "BA",
-    "Bosnia and Herzegovina": "BA",
-    "Bosnia & Herzegovina": "BA",
-  };
-
-  // Check mapping
-  if (normalized in countryMapping) {
-    return countryMapping[normalized];
-  }
-
-  // Try to get the code from the country name (in English)
-  try {
-    const code = countries.getAlpha2Code(normalized, "en");
-    if (code) return code;
-  } catch (error) {
-    process.stderr.write(
-      `[CSV] Error converting country name to code: ${normalized} - ${error}\n`
-    );
-  }
-
-  // If all else fails, return null
-  return null;
-}
-
-/**
- * Converts language names to ISO 639-1 codes
- * @param languageStr Raw language string from CSV
- * @returns ISO 639-1 language code or null if not found
- */
-function getLanguageCode(languageStr?: string): string | null | undefined {
-  if (languageStr === undefined) return undefined;
-  if (languageStr === null || languageStr === "") return null;
-
-  // Clean the input
-  const normalized = languageStr.trim();
-  if (!normalized) return null;
-
-  // Direct ISO code check (if already a 2-letter code)
-  if (normalized.length === 2 && normalized === normalized.toLowerCase()) {
-    return ISO6391.validate(normalized) ? normalized : null;
-  }
-
-  // Special case mappings
-  const languageMapping: Record<string, string> = {
-    english: "en",
-    English: "en",
-    dutch: "nl",
-    Dutch: "nl",
-    nederlands: "nl",
-    Nederlands: "nl",
-    nl: "nl",
-    bosnian: "bs",
-    Bosnian: "bs",
-    turkish: "tr",
-    Turkish: "tr",
-    german: "de",
-    German: "de",
-    deutsch: "de",
-    Deutsch: "de",
-    french: "fr",
-    French: "fr",
-    français: "fr",
-    Français: "fr",
-    spanish: "es",
-    Spanish: "es",
-    español: "es",
-    Español: "es",
-    italian: "it",
-    Italian: "it",
-    italiano: "it",
-    Italiano: "it",
-    nizozemski: "nl", // "Dutch" in some Slavic languages
-  };
-
-  // Check mapping
-  if (normalized in languageMapping) {
-    return languageMapping[normalized];
-  }
-
-  // Try to get code using the ISO6391 library
-  try {
-    const code = ISO6391.getCode(normalized);
-    if (code) return code;
-  } catch (error) {
-    process.stderr.write(
-      `[CSV] Error converting language name to code: ${normalized} - ${error}\n`
-    );
-  }
-  // If all else fails, return null
-  return null;
-}
-
-/**
- * Normalizes category values to standard groups
- * @param categoryStr The raw category string from CSV
- * @returns A normalized category string
- */
-function normalizeCategory(categoryStr?: string): string | null {
-  if (!categoryStr) return null;
-
-  const normalized = categoryStr.toLowerCase().trim();
-
-  // Define category groups using keywords
-  const categoryMapping: Record<string, string[]> = {
-    Onboarding: [
-      "onboarding",
-      "start",
-      "begin",
-      "new",
-      "orientation",
-      "welcome",
-      "intro",
-      "getting started",
-      "documents",
-      "documenten",
-      "first day",
-      "eerste dag",
-    ],
-    "General Information": [
-      "general",
-      "algemeen",
-      "info",
-      "information",
-      "informatie",
-      "question",
-      "vraag",
-      "inquiry",
-      "chat",
-      "conversation",
-      "gesprek",
-      "talk",
-    ],
-    Greeting: [
-      "greeting",
-      "greet",
-      "hello",
-      "hi",
-      "hey",
-      "welcome",
-      "hallo",
-      "hoi",
-      "greetings",
-    ],
-    "HR & Payroll": [
-      "salary",
-      "salaris",
-      "pay",
-      "payroll",
-      "loon",
-      "loonstrook",
-      "hr",
-      "human resources",
-      "benefits",
-      "vacation",
-      "leave",
-      "verlof",
-      "maaltijdvergoeding",
-      "vergoeding",
-    ],
-    "Schedules & Hours": [
-      "schedule",
-      "hours",
-      "tijd",
-      "time",
-      "roster",
-      "rooster",
-      "planning",
-      "shift",
-      "dienst",
-      "working hours",
-      "werktijden",
-      "openingstijden",
-    ],
-    "Role & Responsibilities": [
-      "role",
-      "job",
-      "function",
-      "functie",
-      "task",
-      "taak",
-      "responsibilities",
-      "leidinggevende",
-      "manager",
-      "teamleider",
-      "supervisor",
-      "team",
-      "lead",
-    ],
-    "Technical Support": [
-      "technical",
-      "tech",
-      "support",
-      "laptop",
-      "computer",
-      "system",
-      "systeem",
-      "it",
-      "software",
-      "hardware",
-    ],
-    Offboarding: [
-      "offboarding",
-      "leave",
-      "exit",
-      "quit",
-      "resign",
-      "resignation",
-      "ontslag",
-      "vertrek",
-      "afsluiting",
-    ],
-  };
-
-  // Try to match the category using keywords
-  for (const [category, keywords] of Object.entries(categoryMapping)) {
-    if (keywords.some((keyword) => normalized.includes(keyword))) {
-      return category;
-    }
-  }
-
-  // If no match, return "Other"
-  return "Other";
-}
-
-/**
- * Converts sentiment string values to numeric scores
- * @param sentimentStr The sentiment string from the CSV
- * @returns A numeric score representing the sentiment
- */
-function mapSentimentToScore(sentimentStr?: string): number | null {
-  if (!sentimentStr) return null;
-
-  // Convert to lowercase for case-insensitive matching
-  const sentiment = sentimentStr.toLowerCase();
-
-  // Map sentiment strings to numeric values on a scale from -1 to 2
-  const sentimentMap: Record<string, number> = {
-    happy: 1.0,
-    excited: 1.5,
-    positive: 0.8,
-    neutral: 0.0,
-    playful: 0.7,
-    negative: -0.8,
-    angry: -1.0,
-    sad: -0.7,
-    frustrated: -0.9,
-    positief: 0.8, // Dutch
-    neutraal: 0.0, // Dutch
-    negatief: -0.8, // Dutch
-    positivo: 0.8, // Spanish/Italian
-    neutro: 0.0, // Spanish/Italian
-    negativo: -0.8, // Spanish/Italian
-    yes: 0.5, // For any "yes" sentiment
-    no: -0.5, // For any "no" sentiment
-  };
-
-  return sentimentMap[sentiment] !== undefined
-    ? sentimentMap[sentiment]
-    : isNaN(parseFloat(sentiment))
-      ? null
-      : parseFloat(sentiment);
-}
-
-/**
- * Checks if a string value should be considered as boolean true
- * @param value The string value to check
- * @returns True if the string indicates a positive/true value
- */
-function isTruthyValue(value?: string): boolean {
-  if (!value) return false;
-
-  const truthyValues = [
-    "1",
-    "true",
-    "yes",
-    "y",
-    "ja",
-    "si",
-    "oui",
-    "да",
-    "да",
-    "はい",
-  ];
-
-  return truthyValues.includes(value.toLowerCase());
-}
-
-/**
- * Safely parses a date string into a Date object.
- * Handles potential errors and various formats, prioritizing D-M-YYYY HH:MM:SS.
- * @param dateStr The date string to parse.
- * @returns A Date object or null if parsing fails.
- */
-function safeParseDate(dateStr?: string): Date | null {
-  if (!dateStr) return null;
-
-  // Try to parse D-M-YYYY HH:MM:SS format (with hyphens or dots)
-  const dateTimeRegex =
-    /^(\d{1,2})[.-](\d{1,2})[.-](\d{4}) (\d{1,2}):(\d{1,2}):(\d{1,2})$/;
-  const match = dateStr.match(dateTimeRegex);
-
-  if (match) {
-    const day = match[1];
-    const month = match[2];
-    const year = match[3];
-    const hour = match[4];
-    const minute = match[5];
-    const second = match[6];
-
-    // Reformat to YYYY-MM-DDTHH:MM:SS (ISO-like, but local time)
-    // Ensure month and day are two digits
-    const formattedDateStr = `${year}-${month.padStart(2, "0")}-${day.padStart(2, "0")}T${hour.padStart(2, "0")}:${minute.padStart(2, "0")}:${second.padStart(2, "0")}`;
-
-    try {
-      const date = new Date(formattedDateStr);
-      // Basic validation: check if the constructed date is valid
-      if (!isNaN(date.getTime())) {
-        // console.log(`[safeParseDate] Parsed from D-M-YYYY: ${dateStr} -> ${formattedDateStr} -> ${date.toISOString()}`);
-        return date;
-      }
-    } catch (e) {
-      console.warn(
-        `[safeParseDate] Error parsing reformatted string ${formattedDateStr} from ${dateStr}:`,
-        e
-      );
-    }
-  }
-
-  // Fallback for other potential formats (e.g., direct ISO 8601) or if the primary parse failed
-  try {
-    const parsedDate = new Date(dateStr);
-    if (!isNaN(parsedDate.getTime())) {
-      // console.log(`[safeParseDate] Parsed with fallback: ${dateStr} -> ${parsedDate.toISOString()}`);
-      return parsedDate;
-    }
-  } catch (e) {
-    console.warn(`[safeParseDate] Error parsing with fallback ${dateStr}:`, e);
-  }
-
-  console.warn(`Failed to parse date string: ${dateStr}`);
-  return null;
-}
-
 export async function fetchAndParseCsv(
  url: string,
  username?: string,
  password?: string
-): Promise<Partial<SessionData>[]> {
+): Promise<RawSessionImport[]> {
  const authHeader =
    username && password
      ? "Basic " + Buffer.from(`${username}:${password}`).toString("base64")
@@ -443,56 +44,39 @@ export async function fetchAndParseCsv(
  const res = await fetch(url, {
    headers: authHeader ? { Authorization: authHeader } : {},
  });
-  if (!res.ok) throw new Error("Failed to fetch CSV: " + res.statusText);
+  
+  if (!res.ok) {
+    throw new Error(`Failed to fetch CSV: ${res.status} ${res.statusText}`);
+  }

  const text = await res.text();

-  // Parse without expecting headers, using known order
-  const records: CSVRecord[] = parse(text, {
+  // Parse CSV without headers, using positional column mapping
+  const records: string[][] = parse(text, {
    delimiter: ",",
-    columns: [
-      "session_id",
-      "start_time",
-      "end_time",
-      "ip_address",
-      "country",
-      "language",
-      "messages_sent",
-      "sentiment",
-      "escalated",
-      "forwarded_hr",
-      "full_transcript_url",
-      "avg_response_time",
-      "tokens",
-      "tokens_eur",
-      "category",
-      "initial_msg",
-    ],
-    from_line: 1,
+    from_line: 1, // Start from first line (no headers)
    relax_column_count: true,
    skip_empty_lines: true,
    trim: true,
  });

-  // Coerce types for relevant columns
-  return records.map((r) => ({
-    id: r.session_id,
-    startTime: safeParseDate(r.start_time) || new Date(), // Fallback to current date if invalid
-    endTime: safeParseDate(r.end_time),
-    ipAddress: r.ip_address,
-    country: getCountryCode(r.country),
-    language: getLanguageCode(r.language),
-    messagesSent: Number(r.messages_sent) || 0,
-    sentiment: mapSentimentToScore(r.sentiment),
-    escalated: isTruthyValue(r.escalated),
-    forwardedHr: isTruthyValue(r.forwarded_hr),
-    fullTranscriptUrl: r.full_transcript_url,
-    avgResponseTime: r.avg_response_time
-      ? parseFloat(r.avg_response_time)
-      : null,
-    tokens: Number(r.tokens) || 0,
-    tokensEur: r.tokens_eur ? parseFloat(r.tokens_eur) : 0,
-    category: normalizeCategory(r.category),
-    initialMsg: r.initial_msg,
+  // Map CSV columns by position to SessionImport fields
+  return records.map((row) => ({
+    externalSessionId: row[0] || "",
+    startTimeRaw: row[1] || "",
+    endTimeRaw: row[2] || "",
+    ipAddress: row[3] || null,
+    countryCode: row[4] || null,
+    language: row[5] || null,
+    messagesSent: row[6] ? parseInt(row[6], 10) || null : null,
+    sentimentRaw: row[7] || null,
+    escalatedRaw: row[8] || null,
+    forwardedHrRaw: row[9] || null,
+    fullTranscriptUrl: row[10] || null,
+    avgResponseTimeSeconds: row[11] ? parseFloat(row[11]) || null : null,
+    tokens: row[12] ? parseInt(row[12], 10) || null : null,
+    tokensEur: row[13] ? parseFloat(row[13]) || null : null,
+    category: row[14] || null,
+    initialMessage: row[15] || null,
  }));
 }