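WIP (known broken): session-processing rework (cron scheduler disabled, batched processing in lib/processingScheduler.ts)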

2026-01-16 18:52:08 +01:00 · 2025-06-26 21:00:19 +02:00
parent ab2c75b736
commit 653d70022b
49 changed files with 2826 additions and 2102 deletions
--- a/lib/processingScheduler.ts
+++ b/lib/processingScheduler.ts
@ -1,24 +1,28 @@
 // Session processing scheduler - TypeScript version
-import cron from "node-cron";
+// Note: node-cron is disabled due to Next.js compatibility issues; processing is triggered manually instead
+// import cron from "node-cron";
 import { PrismaClient } from "@prisma/client";
 import fetch from "node-fetch";
 import { readFileSync } from "fs";
 import { fileURLToPath } from "url";
 import { dirname, join } from "path";
+import { VALID_CATEGORIES, ValidCategory, SentimentCategory } from "./types";

 // Load environment variables from .env.local
 const __filename = fileURLToPath(import.meta.url);
 const __dirname = dirname(__filename);
-const envPath = join(__dirname, '..', '.env.local');
+const envPath = join(__dirname, "..", ".env.local");

 try {
-  const envFile = readFileSync(envPath, 'utf8');
-  const envVars = envFile.split('\n').filter(line => line.trim() && !line.startsWith('#'));
+  const envFile = readFileSync(envPath, "utf8");
+  const envVars = envFile
+    .split("\n")
+    .filter((line) => line.trim() && !line.startsWith("#"));

-  envVars.forEach(line => {
-    const [key, ...valueParts] = line.split('=');
+  envVars.forEach((line) => {
+    const [key, ...valueParts] = line.split("=");
    if (key && valueParts.length > 0) {
-      const value = valueParts.join('=').trim();
+      const value = valueParts.join("=").trim();
      if (!process.env[key.trim()]) {
        process.env[key.trim()] = value;
      }
@ -35,10 +39,10 @@ const OPENAI_API_URL = "https://api.openai.com/v1/chat/completions";
 interface ProcessedData {
  language: string;
  messages_sent: number;
-  sentiment: "positive" | "neutral" | "negative";
+  sentiment: SentimentCategory;
  escalated: boolean;
  forwarded_hr: boolean;
-  category: string;
+  category: ValidCategory;
  questions: string[];
  summary: string;
  session_id: string;
@ -53,49 +57,49 @@ interface ProcessingResult {
 /**
 * Processes a session transcript using OpenAI API
 */
-async function processTranscriptWithOpenAI(sessionId: string, transcript: string): Promise<ProcessedData> {
+async function processTranscriptWithOpenAI(
+  sessionId: string,
+  transcript: string
+): Promise<ProcessedData> {
  if (!OPENAI_API_KEY) {
    throw new Error("OPENAI_API_KEY environment variable is not set");
  }

  // Create a system message with instructions
  const systemMessage = `
-    You are an AI assistant tasked with analyzing chat transcripts.
-    Extract the following information from the transcript:
-    1. The primary language used by the user (ISO 639-1 code)
-    2. Number of messages sent by the user
-    3. Overall sentiment (positive, neutral, or negative)
-    4. Whether the conversation was escalated
-    5. Whether HR contact was mentioned or provided
-    6. The best-fitting category for the conversation from this list:
-       - Schedule & Hours
-       - Leave & Vacation
-       - Sick Leave & Recovery
-       - Salary & Compensation
-       - Contract & Hours
-       - Onboarding
-       - Offboarding
-       - Workwear & Staff Pass
-       - Team & Contacts
-       - Personal Questions
-       - Access & Login
-       - Social questions
-       - Unrecognized / Other
-    7. Up to 5 paraphrased questions asked by the user (in English)
-    8. A brief summary of the conversation (10-300 characters)
-    
-    Return the data in JSON format matching this schema:
-    {
-      "language": "ISO 639-1 code",
-      "messages_sent": number,
-      "sentiment": "positive|neutral|negative",
-      "escalated": boolean,
-      "forwarded_hr": boolean,
-      "category": "one of the categories listed above",
-      "questions": ["question 1", "question 2", ...],
-      "summary": "brief summary",
-      "session_id": "${sessionId}"
-    }
+System: You are a JSON-generating assistant. Your task is to analyze raw chat transcripts between a user and an assistant and return structured data.
+
+⚠️ IMPORTANT:
+- You must return a **single, valid JSON object**.
+- Do **not** include markdown formatting, code fences, explanations, or comments.
+- The JSON must match the exact structure and constraints described below.
+
+Here is the schema you must follow:
+
+{
+"language": "ISO 639-1 code, e.g., 'en', 'nl'",
+"messages_sent": "integer, number of messages from the user",
+"sentiment": "'positive', 'neutral', or 'negative'",
+"escalated": "bool: true if the assistant connected or referred to a human agent, otherwise false",
+"forwarded_hr": "bool: true if HR contact info was given, otherwise false",
+"category": "one of: 'Schedule & Hours', 'Leave & Vacation', 'Sick Leave & Recovery', 'Salary & Compensation', 'Contract & Hours', 'Onboarding', 'Offboarding', 'Workwear & Staff Pass', 'Team & Contacts', 'Personal Questions', 'Access & Login', 'Social questions', 'Unrecognized / Other'",
+"questions": array of simplified questions asked by the user formulated in English, try to make a question out of messages,
+"summary": "Brief summary (1–2 sentences) of the conversation",
+}
+You must format your output as a JSON value that adheres to a given "JSON Schema" instance.
+
+"JSON Schema" is a declarative language that allows you to annotate and validate JSON documents.
+
+For example, the example "JSON Schema" instance {{"properties": {{"foo": {{"description": "a list of test words", "type": "array", "items": {{"type": "string"}}}}}}, "required": ["foo"]}}}}
+would match an object with one required property, "foo". The "type" property specifies "foo" must be an "array", and the "description" property semantically describes it as "a list of test words". The items within "foo" must be strings.
+Thus, the object {{"foo": ["bar", "baz"]}} is a well-formatted instance of this example "JSON Schema". The object {{"properties": {{"foo": ["bar", "baz"]}}}} is not well-formatted.
+
+Your output will be parsed and type-checked according to the provided schema instance, so make sure all fields in your output match the schema exactly and there are no trailing commas!
+
+Here is the JSON Schema instance your output must adhere to. Include the enclosing markdown codeblock:
+\`\`\`json
+{"type":"object","properties":{"language":{"type":"string","pattern":"^[a-z]{2}$","description":"ISO 639-1 code for the user's primary language"},"messages_sent":{"type":"integer","minimum":0,"description":"Number of messages sent by the user"},"sentiment":{"type":"string","enum":["positive","neutral","negative"],"description":"Overall tone of the user during the conversation"},"escalated":{"type":"boolean","description":"Whether the assistant indicated it could not help"},"forwarded_hr":{"type":"boolean","description":"Whether HR contact was mentioned or provided"},"category":{"type":"string","enum":["Schedule & Hours","Leave & Vacation","Sick Leave & Recovery","Salary & Compensation","Contract & Hours","Onboarding","Offboarding","Workwear & Staff Pass","Team & Contacts","Personal Questions","Access & Login","Social questions","Unrecognized / Other"],"description":"Best-fitting topic category for the conversation"},"questions":{"type":"array","items":{"type":"string","minLength":5},"minItems":0,"maxItems":5,"description":"List of paraphrased questions asked by the user in English"},"summary":{"type":"string","minLength":10,"maxLength":300,"description":"Brief summary of the conversation"},"session_id":{"type":"string","pattern":"^[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$","minLength":36,"maxLength":36,"description":"Unique identifier for the conversation session"}},"required":["language","messages_sent","sentiment","escalated","forwarded_hr","category","questions","summary","session_id"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}
+\`\`\`
  `;

  try {
@ -154,7 +158,6 @@ function validateOpenAIResponse(data: any): void {
    "category",
    "questions",
    "summary",
-    "session_id",
  ];

  for (const field of requiredFields) {
@ -188,25 +191,9 @@ function validateOpenAIResponse(data: any): void {
    throw new Error("Invalid forwarded_hr. Expected boolean");
  }

-  const validCategories = [
-    "Schedule & Hours",
-    "Leave & Vacation",
-    "Sick Leave & Recovery",
-    "Salary & Compensation",
-    "Contract & Hours",
-    "Onboarding",
-    "Offboarding",
-    "Workwear & Staff Pass",
-    "Team & Contacts",
-    "Personal Questions",
-    "Access & Login",
-    "Social questions",
-    "Unrecognized / Other",
-  ];
-
-  if (!validCategories.includes(data.category)) {
+  if (!VALID_CATEGORIES.includes(data.category)) {
    throw new Error(
-      `Invalid category. Expected one of: ${validCategories.join(", ")}`
+      `Invalid category. Expected one of: ${VALID_CATEGORIES.join(", ")}`
    );
  }

@ -224,7 +211,8 @@ function validateOpenAIResponse(data: any): void {
    );
  }

-  if (typeof data.session_id !== "string") {
+  // session_id is optional in the response (we'll use the one we passed in); if present, it must be a string
+  if (data.session_id && typeof data.session_id !== "string") {
    throw new Error("Invalid session_id. Expected string");
  }
 }
@ -241,6 +229,28 @@ async function processSingleSession(session: any): Promise<ProcessingResult> {
    };
  }

+  // Check for minimum data quality requirements
+  const userMessages = session.messages.filter((msg: any) =>
+    msg.role.toLowerCase() === 'user' || msg.role.toLowerCase() === 'human'
+  );
+
+  if (userMessages.length === 0) {
+    // No user messages: mark the session as processed with a placeholder summary (treated as invalid data)
+    await prisma.session.update({
+      where: { id: session.id },
+      data: {
+        processed: true,
+        summary: "No user messages found - marked as invalid data",
+      },
+    });
+
+    return {
+      sessionId: session.id,
+      success: true,
+      error: "No user messages - marked as invalid data",
+    };
+  }
+
  try {
    // Convert messages back to transcript format for OpenAI processing
    const transcript = session.messages
@ -264,12 +274,10 @@ async function processSingleSession(session: any): Promise<ProcessingResult> {
      transcript
    );

-    // Map sentiment string to float value for compatibility with existing data
-    const sentimentMap = {
-      positive: 0.8,
-      neutral: 0.0,
-      negative: -0.8,
-    };
+    // Flag low-quality results: no extracted questions, or a summary shorter than 10 characters
+    const hasValidQuestions = processedData.questions && processedData.questions.length > 0;
+    const hasValidSummary = processedData.summary && processedData.summary.length >= 10;
+    const isValidData = hasValidQuestions && hasValidSummary;

    // Update the session with processed data
    await prisma.session.update({
@ -277,7 +285,7 @@ async function processSingleSession(session: any): Promise<ProcessingResult> {
      data: {
        language: processedData.language,
        messagesSent: processedData.messages_sent,
-        sentiment: sentimentMap[processedData.sentiment] || 0,
+        sentiment: null, // Remove numeric sentiment, use only sentimentCategory
        sentimentCategory: processedData.sentiment,
        escalated: processedData.escalated,
        forwardedHr: processedData.forwarded_hr,
@ -288,6 +296,12 @@ async function processSingleSession(session: any): Promise<ProcessingResult> {
      },
    });

+    if (!isValidData) {
+      process.stdout.write(
+        `[ProcessingScheduler] ⚠️ Session ${session.id} marked as invalid data (empty questions or short summary)\n`
+      );
+    }
+
    return {
      sessionId: session.id,
      success: true,
@ -304,7 +318,10 @@ async function processSingleSession(session: any): Promise<ProcessingResult> {
 /**
 * Process sessions in parallel with concurrency limit
 */
-async function processSessionsInParallel(sessions: any[], maxConcurrency: number = 5): Promise<ProcessingResult[]> {
+async function processSessionsInParallel(
+  sessions: any[],
+  maxConcurrency: number = 5
+): Promise<ProcessingResult[]> {
  const results: Promise<ProcessingResult>[] = [];
  const executing: Promise<ProcessingResult>[] = [];

@ -323,7 +340,7 @@ async function processSessionsInParallel(sessions: any[], maxConcurrency: number

    if (executing.length >= maxConcurrency) {
      await Promise.race(executing);
-      const completedIndex = executing.findIndex(p => p === promise);
+      const completedIndex = executing.findIndex((p) => p === promise);
      if (completedIndex !== -1) {
        executing.splice(completedIndex, 1);
      }
@ -334,75 +351,104 @@ async function processSessionsInParallel(sessions: any[], maxConcurrency: number
 }

 /**
- * Process unprocessed sessions
+ * Process unprocessed sessions in batches until none remain; returns success/failure totals and elapsed time in seconds
 */
-export async function processUnprocessedSessions(batchSize: number | null = null, maxConcurrency: number = 5): Promise<void> {
+export async function processUnprocessedSessions(
+  batchSize: number = 10,
+  maxConcurrency: number = 5
+): Promise<{ totalProcessed: number; totalFailed: number; totalTime: number }> {
  process.stdout.write(
-    "[ProcessingScheduler] Starting to process unprocessed sessions...\n"
+    "[ProcessingScheduler] Starting complete processing of all unprocessed sessions...\n"
  );

-  // Find sessions that have messages but haven't been processed
-  const queryOptions: any = {
-    where: {
-      AND: [
-        { messages: { some: {} } }, // Must have messages
-        { processed: false }, // Only unprocessed sessions
-      ],
-    },
-    include: {
-      messages: {
-        orderBy: { order: "asc" },
+  let totalProcessed = 0;
+  let totalFailed = 0;
+  const overallStartTime = Date.now();
+  let batchNumber = 1;
+
+  while (true) {
+    // Find sessions that have messages but haven't been processed
+    const sessionsToProcess = await prisma.session.findMany({
+      where: {
+        AND: [
+          { messages: { some: {} } }, // Must have messages
+          { processed: false }, // Only unprocessed sessions
+        ],
      },
-    },
-  };
+      include: {
+        messages: {
+          orderBy: { order: "asc" },
+        },
+      },
+      take: batchSize,
+    });

-  // Add batch size limit if specified
-  if (batchSize && batchSize > 0) {
-    queryOptions.take = batchSize;
-  }
-
-  const sessionsToProcess = await prisma.session.findMany(queryOptions);
-
-  // Filter to only sessions that have messages
-  const sessionsWithMessages = sessionsToProcess.filter(
-    (session: any) => session.messages && session.messages.length > 0
-  );
-
-  if (sessionsWithMessages.length === 0) {
-    process.stdout.write(
-      "[ProcessingScheduler] No sessions found requiring processing.\n"
+    // Filter to only sessions that have messages
+    const sessionsWithMessages = sessionsToProcess.filter(
+      (session: any) => session.messages && session.messages.length > 0
    );
-    return;
+
+    if (sessionsWithMessages.length === 0) {
+      process.stdout.write(
+        "[ProcessingScheduler] ✅ All sessions with messages have been processed!\n"
+      );
+      break;
+    }
+
+    process.stdout.write(
+      `[ProcessingScheduler] 📦 Batch ${batchNumber}: Processing ${sessionsWithMessages.length} sessions (max concurrency: ${maxConcurrency})...\n`
+    );
+
+    const batchStartTime = Date.now();
+    const results = await processSessionsInParallel(
+      sessionsWithMessages,
+      maxConcurrency
+    );
+    const batchEndTime = Date.now();
+
+    const batchSuccessCount = results.filter((r) => r.success).length;
+    const batchErrorCount = results.filter((r) => !r.success).length;
+
+    totalProcessed += batchSuccessCount;
+    totalFailed += batchErrorCount;
+
+    process.stdout.write(
+      `[ProcessingScheduler] 📦 Batch ${batchNumber} complete: ${batchSuccessCount} success, ${batchErrorCount} failed (${((batchEndTime - batchStartTime) / 1000).toFixed(2)}s)\n`
+    );
+
+    batchNumber++;
+
+    // Pause 1s before the next batch (only after a full batch) to prevent overwhelming the system
+    if (sessionsWithMessages.length === batchSize) {
+      await new Promise(resolve => setTimeout(resolve, 1000));
+    }
  }

+  const overallEndTime = Date.now();
+  const totalTime = (overallEndTime - overallStartTime) / 1000;
+
+  process.stdout.write("[ProcessingScheduler] 🎉 Complete processing finished!\n");
  process.stdout.write(
-    `[ProcessingScheduler] Found ${sessionsWithMessages.length} sessions to process (max concurrency: ${maxConcurrency}).\n`
+    `[ProcessingScheduler] 📊 Total results: ${totalProcessed} processed, ${totalFailed} failed\n`
+  );
+  process.stdout.write(
+    `[ProcessingScheduler] ⏱️ Total processing time: ${totalTime.toFixed(2)}s\n`
  );

-  const startTime = Date.now();
-  const results = await processSessionsInParallel(sessionsWithMessages, maxConcurrency);
-  const endTime = Date.now();
-
-  const successCount = results.filter((r) => r.success).length;
-  const errorCount = results.filter((r) => !r.success).length;
-
-  process.stdout.write("[ProcessingScheduler] Session processing complete.\n");
-  process.stdout.write(
-    `[ProcessingScheduler] Successfully processed: ${successCount} sessions.\n`
-  );
-  process.stdout.write(
-    `[ProcessingScheduler] Failed to process: ${errorCount} sessions.\n`
-  );
-  process.stdout.write(
-    `[ProcessingScheduler] Total processing time: ${((endTime - startTime) / 1000).toFixed(2)}s\n`
-  );
+  return { totalProcessed, totalFailed, totalTime };
 }

 /**
 * Start the processing scheduler
 */
 export function startProcessingScheduler(): void {
-  // Process unprocessed sessions every hour
+  // Note: Scheduler disabled due to Next.js compatibility issues
+  // Use manual triggers via API endpoints instead
+  console.log("Processing scheduler disabled - using manual triggers via API endpoints");
+
+  // Original cron-based implementation commented out due to Next.js compatibility issues
+  // The functionality is now available via the /api/admin/trigger-processing endpoint
+  /*
  cron.schedule("0 * * * *", async () => {
    try {
      await processUnprocessedSessions();
@ -416,4 +462,5 @@ export function startProcessingScheduler(): void {
  process.stdout.write(
    "[ProcessingScheduler] Started processing scheduler (runs hourly).\n"
  );
+  */
 }