Refactor transcript fetching and processing scripts

- Introduced a new function `fetchTranscriptContent` to handle fetching transcripts with optional authentication.
- Enhanced error handling and logging for transcript fetching.
- Updated the `parseTranscriptToMessages` function to improve message parsing logic.
- Replaced the old session processing logic with a new approach that utilizes `SessionImport` records.
- Removed obsolete scripts related to manual triggers and whitespace fixing.
- Updated the server initialization to remove direct server handling, transitioning to a more modular approach.
- Improved overall code structure and readability across various scripts.
2026-03-02 23:41:29 +01:00 · 2025-06-27 16:38:16 +02:00
parent d7ac0ba208
commit 1dd618b666
35 changed files with 6536 additions and 12797 deletions
--- a/scripts/fetch_transcripts.ts
+++ b/scripts/fetch_transcripts.ts
@@ -1,83 +1,182 @@
 import { PrismaClient } from "@prisma/client";
+import fetch from "node-fetch";

 const prisma = new PrismaClient();

-async function main() {
-  console.log("Starting to fetch missing transcripts...");
+/**
+ * Fetches transcript content from a URL with optional authentication
+ */
+async function fetchTranscriptContent(
+  url: string,
+  username?: string,
+  password?: string
+): Promise<string | null> {
+  try {
+    const authHeader =
+      username && password
+        ? "Basic " + Buffer.from(`${username}:${password}`).toString("base64")
+        : undefined;

-  const sessionsToUpdate = await prisma.session.findMany({
+    const response = await fetch(url, {
+      headers: authHeader ? { Authorization: authHeader } : {},
+    });
+
+    if (!response.ok) {
+      console.warn(`Failed to fetch transcript from ${url}: ${response.statusText}`);
+      return null;
+    }
+
+    return await response.text();
+  } catch (error) {
+    console.warn(`Error fetching transcript from ${url}:`, error);
+    return null;
+  }
+}
+
+/**
+ * Parse transcript content into individual messages
+ */
+function parseTranscriptToMessages(transcriptContent: string): Array<{
+  timestamp: Date | null;
+  role: string;
+  content: string;
+  order: number;
+}> {
+  const lines = transcriptContent.split('\n').filter(line => line.trim());
+  const messages: Array<{
+    timestamp: Date | null;
+    role: string;
+    content: string;
+    order: number;
+  }> = [];
+
+  let order = 0;
+
+  for (const line of lines) {
+    // Try to parse lines in format: [timestamp] role: content
+    const match = line.match(/^\[([^\]]+)\]\s*([^:]+):\s*(.+)$/);
+    
+    if (match) {
+      const [, timestampStr, role, content] = match;
+      
+      // Try to parse the timestamp
+      let timestamp: Date | null = null;
+      try {
+        timestamp = new Date(timestampStr);
+        if (isNaN(timestamp.getTime())) {
+          timestamp = null;
+        }
+      } catch {
+        timestamp = null;
+      }
+
+      messages.push({
+        timestamp,
+        role: role.trim(),
+        content: content.trim(),
+        order: order++,
+      });
+    } else {
+      // If line doesn't match expected format, treat as content continuation
+      if (messages.length > 0) {
+        messages[messages.length - 1].content += '\n' + line;
+      } else {
+        // First line doesn't match format, create a generic message
+        messages.push({
+          timestamp: null,
+          role: 'unknown',
+          content: line,
+          order: order++,
+        });
+      }
+    }
+  }
+
+  return messages;
+}
+
+/**
+ * Main function to fetch transcripts for sessions that don't have messages yet
+ */
+async function fetchTranscriptsForSessions() {
+  console.log("Starting to fetch transcripts for sessions without messages...");
+
+  // Find sessions that have transcript URLs but no messages
+  const sessionsNeedingTranscripts = await prisma.session.findMany({
    where: {
      AND: [
        { fullTranscriptUrl: { not: null } },
-        { fullTranscriptUrl: { not: "" } }, // Ensure URL is not an empty string
-        { transcriptContent: null },
+        { messages: { none: {} } }, // No messages yet
      ],
    },
-    select: {
-      id: true,
-      fullTranscriptUrl: true,
+    include: {
+      company: true,
+      messages: true,
    },
  });

-  if (sessionsToUpdate.length === 0) {
-    console.log("No sessions found requiring transcript fetching.");
+  if (sessionsNeedingTranscripts.length === 0) {
+    console.log("No sessions found that need transcript fetching.");
    return;
  }

-  console.log(`Found ${sessionsToUpdate.length} sessions to update.`);
+  console.log(`Found ${sessionsNeedingTranscripts.length} sessions that need transcript fetching.`);
  let successCount = 0;
  let errorCount = 0;

-  for (const session of sessionsToUpdate) {
+  for (const session of sessionsNeedingTranscripts) {
    if (!session.fullTranscriptUrl) {
-      // Should not happen due to query, but good for type safety
-      console.warn(`Session ${session.id} has no fullTranscriptUrl, skipping.`);
+      console.warn(`Session ${session.id} has no transcript URL, skipping.`);
      continue;
    }

-    console.log(
-      `Fetching transcript for session ${session.id} from ${session.fullTranscriptUrl}...`
-    );
+    console.log(`Fetching transcript for session ${session.id}...`);
+    
    try {
-      const response = await fetch(session.fullTranscriptUrl);
-      if (!response.ok) {
-        console.error(
-          `Failed to fetch transcript for session ${session.id}: ${response.status} ${response.statusText}`
-        );
-        const errorBody = await response.text();
-        console.error(`Error details: ${errorBody.substring(0, 500)}`); // Log first 500 chars of error
-        errorCount++;
-        continue;
-      }
-
-      const transcriptText = await response.text();
-
-      if (transcriptText.trim() === "") {
-        console.warn(
-          `Fetched empty transcript for session ${session.id}. Storing as empty string.`
-        );
-      }
-
-      await prisma.session.update({
-        where: { id: session.id },
-        data: { transcriptContent: transcriptText },
-      });
-      console.log(
-        `Successfully fetched and stored transcript for session ${session.id}.`
+      // Fetch transcript content
+      const transcriptContent = await fetchTranscriptContent(
+        session.fullTranscriptUrl,
+        session.company.csvUsername || undefined,
+        session.company.csvPassword || undefined
      );
+
+      if (!transcriptContent) {
+        throw new Error("Failed to fetch transcript content");
+      }
+
+      // Parse transcript into messages
+      const messages = parseTranscriptToMessages(transcriptContent);
+
+      if (messages.length === 0) {
+        throw new Error("No messages found in transcript");
+      }
+
+      // Create messages in database
+      await prisma.message.createMany({
+        data: messages.map(msg => ({
+          sessionId: session.id,
+          timestamp: msg.timestamp,
+          role: msg.role,
+          content: msg.content,
+          order: msg.order,
+        })),
+      });
+
+      console.log(`Successfully fetched transcript for session ${session.id} (${messages.length} messages)`);
      successCount++;
    } catch (error) {
-      console.error(`Error processing session ${session.id}:`, error);
+      console.error(`Error fetching transcript for session ${session.id}:`, error);
      errorCount++;
    }
  }

  console.log("Transcript fetching complete.");
-  console.log(`Successfully updated: ${successCount} sessions.`);
-  console.log(`Failed to update: ${errorCount} sessions.`);
+  console.log(`Successfully fetched: ${successCount} transcripts.`);
+  console.log(`Failed to fetch: ${errorCount} transcripts.`);
 }

-main()
+// Run the main function
+fetchTranscriptsForSessions()
  .catch((e) => {
    console.error("An error occurred during the script execution:", e);
    process.exitCode = 1;