Refactor transcript fetching and processing scripts

- Introduced a new helper, `fetchTranscriptContent`, that fetches transcripts with optional HTTP Basic authentication (see the usage sketch below).
- Improved error handling and logging around transcript fetching.
- Updated `parseTranscriptToMessages` to parse `[timestamp] role: content` lines and to treat non-matching lines as continuations of the previous message.
- Replaced the old session processing logic with a flow driven by `SessionImport` records: the scheduler picks up QUEUED imports, fetches and analyzes their transcripts, and upserts the corresponding `Session` rows.
- Removed obsolete scripts for manual triggers and trailing-whitespace cleanup.
- Updated server initialization to drop direct server handling in favor of a more modular setup.
- Improved code structure and readability across the remaining scripts.
Max Kowalski committed on 2025-06-27 16:38:16 +02:00
commit 1dd618b666 (parent d7ac0ba208)
35 changed files with 6536 additions and 12797 deletions
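
For orientation, here is a minimal sketch (not part of the commit) of how the refactored helpers fit together: an optionally authenticated transcript fetch mirroring `fetchTranscriptContent`, followed by the `[timestamp] role: content` line match that `parseTranscriptToMessages` applies first. The URL, credentials, and sample transcript line are illustrative placeholders, and the sketch assumes the global `fetch` available in Node 18+ (the repository scripts import `node-fetch` instead).

// Sketch only: mirrors the helpers introduced in this commit; the URL,
// credentials, and sample line below are placeholders, not repository values.
async function fetchTranscript(
  url: string,
  username?: string,
  password?: string
): Promise<string | null> {
  // Optional HTTP Basic auth, as in fetchTranscriptContent
  const authHeader =
    username && password
      ? "Basic " + Buffer.from(`${username}:${password}`).toString("base64")
      : undefined;
  const response = await fetch(url, {
    headers: authHeader ? { Authorization: authHeader } : {},
  });
  return response.ok ? await response.text() : null;
}

// The primary line format the parser matches: [timestamp] role: content
const sampleLine = "[27-06-2025 14:03:12] User: How many vacation days do I have left?";
const match = sampleLine.match(/^\[([^\]]+)\]\s*([^:]+):\s*(.+)$/);
if (match) {
  const [, timestamp, role, content] = match;
  console.log({ timestamp, role: role.trim(), content: content.trim() });
}

// Example call with a placeholder URL and credentials:
fetchTranscript("https://example.com/transcripts/abc.txt", "user", "pass")
  .then((text) => console.log(text ? `fetched ${text.length} chars` : "no transcript"));

The processing scheduler then consumes `SessionImport` rows in status QUEUED, marks them PROCESSING, and records DONE or ERROR after upserting the matching `Session`, as shown in the diffs below.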


@ -1,73 +0,0 @@
// Script to check what's in the transcript files
// Usage: node scripts/check-transcript-content.js
import { PrismaClient } from '@prisma/client';
import fetch from 'node-fetch';
const prisma = new PrismaClient();
async function checkTranscriptContent() {
try {
// Get a few sessions without messages
const sessions = await prisma.session.findMany({
where: {
AND: [
{ fullTranscriptUrl: { not: null } },
{ messages: { none: {} } },
]
},
include: { company: true },
take: 3,
});
for (const session of sessions) {
console.log(`\n📄 Checking session ${session.id}:`);
console.log(` URL: ${session.fullTranscriptUrl}`);
try {
const authHeader = session.company.csvUsername && session.company.csvPassword
? "Basic " + Buffer.from(`${session.company.csvUsername}:${session.company.csvPassword}`).toString("base64")
: undefined;
const response = await fetch(session.fullTranscriptUrl, {
headers: authHeader ? { Authorization: authHeader } : {},
timeout: 10000,
});
if (!response.ok) {
console.log(` ❌ HTTP ${response.status}: ${response.statusText}`);
continue;
}
const content = await response.text();
console.log(` 📏 Content length: ${content.length} characters`);
if (content.length === 0) {
console.log(` ⚠️ Empty file`);
} else if (content.length < 100) {
console.log(` 📝 Full content: "${content}"`);
} else {
console.log(` 📝 First 200 chars: "${content.substring(0, 200)}..."`);
}
// Check if it matches our expected format
const lines = content.split('\n').filter(line => line.trim());
const formatMatches = lines.filter(line =>
line.match(/^\[([^\]]+)\]\s*([^:]+):\s*(.+)$/)
);
console.log(` 🔍 Lines total: ${lines.length}, Format matches: ${formatMatches.length}`);
} catch (error) {
console.log(` ❌ Error: ${error.message}`);
}
}
} catch (error) {
console.error('❌ Error:', error);
} finally {
await prisma.$disconnect();
}
}
checkTranscriptContent();


@ -1,185 +0,0 @@
// Script to fetch transcripts and parse them into messages
// Usage: node scripts/fetch-and-parse-transcripts.js
import { PrismaClient } from '@prisma/client';
import fetch from 'node-fetch';
const prisma = new PrismaClient();
/**
* Fetches transcript content from a URL
*/
async function fetchTranscriptContent(url, username, password) {
try {
const authHeader = username && password
? "Basic " + Buffer.from(`${username}:${password}`).toString("base64")
: undefined;
const response = await fetch(url, {
headers: authHeader ? { Authorization: authHeader } : {},
timeout: 10000,
});
if (!response.ok) {
console.log(`❌ Failed to fetch ${url}: ${response.status} ${response.statusText}`);
return null;
}
return await response.text();
} catch (error) {
console.log(`❌ Error fetching ${url}: ${error.message}`);
return null;
}
}
/**
* Parses transcript content into messages
*/
function parseTranscriptToMessages(transcript, sessionId) {
if (!transcript || transcript.trim() === '') {
return [];
}
const lines = transcript.split('\n').filter(line => line.trim());
const messages = [];
let messageOrder = 0;
let currentTimestamp = new Date();
for (const line of lines) {
// Try format 1: [DD-MM-YYYY HH:MM:SS] Role: Content
const timestampMatch = line.match(/^\[([^\]]+)\]\s*([^:]+):\s*(.+)$/);
if (timestampMatch) {
const [, timestamp, role, content] = timestampMatch;
// Parse timestamp (DD-MM-YYYY HH:MM:SS)
const dateMatch = timestamp.match(/^(\d{1,2})-(\d{1,2})-(\d{4}) (\d{1,2}):(\d{1,2}):(\d{1,2})$/);
let parsedTimestamp = new Date();
if (dateMatch) {
const [, day, month, year, hour, minute, second] = dateMatch;
parsedTimestamp = new Date(
parseInt(year),
parseInt(month) - 1, // Month is 0-indexed
parseInt(day),
parseInt(hour),
parseInt(minute),
parseInt(second)
);
}
messages.push({
sessionId,
role: role.trim().toLowerCase(),
content: content.trim(),
timestamp: parsedTimestamp,
order: messageOrder++,
});
continue;
}
// Try format 2: Role: Content (simple format)
const simpleMatch = line.match(/^([^:]+):\s*(.+)$/);
if (simpleMatch) {
const [, role, content] = simpleMatch;
// Use incremental timestamps (add 1 minute per message)
currentTimestamp = new Date(currentTimestamp.getTime() + 60000);
messages.push({
sessionId,
role: role.trim().toLowerCase(),
content: content.trim(),
timestamp: new Date(currentTimestamp),
order: messageOrder++,
});
}
}
return messages;
}
/**
* Process sessions without messages
*/
async function fetchAndParseTranscripts() {
try {
console.log('🔍 Finding sessions without messages...\n');
// Get sessions that have fullTranscriptUrl but no messages
const sessionsWithoutMessages = await prisma.session.findMany({
where: {
AND: [
{ fullTranscriptUrl: { not: null } },
{ messages: { none: {} } }, // No messages
]
},
include: {
company: true,
},
take: 20, // Process 20 at a time to avoid overwhelming
});
if (sessionsWithoutMessages.length === 0) {
console.log('✅ All sessions with transcript URLs already have messages!');
return;
}
console.log(`📥 Found ${sessionsWithoutMessages.length} sessions to process\n`);
let successCount = 0;
let errorCount = 0;
for (const session of sessionsWithoutMessages) {
console.log(`📄 Processing session ${session.id.substring(0, 8)}...`);
try {
// Fetch transcript content
const transcriptContent = await fetchTranscriptContent(
session.fullTranscriptUrl,
session.company.csvUsername,
session.company.csvPassword
);
if (!transcriptContent) {
console.log(` ⚠️ No transcript content available`);
errorCount++;
continue;
}
// Parse transcript into messages
const messages = parseTranscriptToMessages(transcriptContent, session.id);
if (messages.length === 0) {
console.log(` ⚠️ No messages found in transcript`);
errorCount++;
continue;
}
// Save messages to database
await prisma.message.createMany({
data: messages,
});
console.log(` ✅ Added ${messages.length} messages`);
successCount++;
} catch (error) {
console.log(` ❌ Error: ${error.message}`);
errorCount++;
}
}
console.log(`\n📊 Results:`);
console.log(` ✅ Successfully processed: ${successCount} sessions`);
console.log(` ❌ Failed to process: ${errorCount} sessions`);
console.log(`\n💡 Now you can run the processing scheduler to analyze these sessions!`);
} catch (error) {
console.error('❌ Error:', error);
} finally {
await prisma.$disconnect();
}
}
fetchAndParseTranscripts();


@ -1,83 +1,182 @@
import { PrismaClient } from "@prisma/client";
import fetch from "node-fetch";
const prisma = new PrismaClient();
async function main() {
console.log("Starting to fetch missing transcripts...");
/**
* Fetches transcript content from a URL with optional authentication
*/
async function fetchTranscriptContent(
url: string,
username?: string,
password?: string
): Promise<string | null> {
try {
const authHeader =
username && password
? "Basic " + Buffer.from(`${username}:${password}`).toString("base64")
: undefined;
const sessionsToUpdate = await prisma.session.findMany({
const response = await fetch(url, {
headers: authHeader ? { Authorization: authHeader } : {},
});
if (!response.ok) {
console.warn(`Failed to fetch transcript from ${url}: ${response.statusText}`);
return null;
}
return await response.text();
} catch (error) {
console.warn(`Error fetching transcript from ${url}:`, error);
return null;
}
}
/**
* Parse transcript content into individual messages
*/
function parseTranscriptToMessages(transcriptContent: string): Array<{
timestamp: Date | null;
role: string;
content: string;
order: number;
}> {
const lines = transcriptContent.split('\n').filter(line => line.trim());
const messages: Array<{
timestamp: Date | null;
role: string;
content: string;
order: number;
}> = [];
let order = 0;
for (const line of lines) {
// Try to parse lines in format: [timestamp] role: content
const match = line.match(/^\[([^\]]+)\]\s*([^:]+):\s*(.+)$/);
if (match) {
const [, timestampStr, role, content] = match;
// Try to parse the timestamp
let timestamp: Date | null = null;
try {
timestamp = new Date(timestampStr);
if (isNaN(timestamp.getTime())) {
timestamp = null;
}
} catch {
timestamp = null;
}
messages.push({
timestamp,
role: role.trim(),
content: content.trim(),
order: order++,
});
} else {
// If line doesn't match expected format, treat as content continuation
if (messages.length > 0) {
messages[messages.length - 1].content += '\n' + line;
} else {
// First line doesn't match format, create a generic message
messages.push({
timestamp: null,
role: 'unknown',
content: line,
order: order++,
});
}
}
}
return messages;
}
/**
* Main function to fetch transcripts for sessions that don't have messages yet
*/
async function fetchTranscriptsForSessions() {
console.log("Starting to fetch transcripts for sessions without messages...");
// Find sessions that have transcript URLs but no messages
const sessionsNeedingTranscripts = await prisma.session.findMany({
where: {
AND: [
{ fullTranscriptUrl: { not: null } },
{ fullTranscriptUrl: { not: "" } }, // Ensure URL is not an empty string
{ transcriptContent: null },
{ messages: { none: {} } }, // No messages yet
],
},
select: {
id: true,
fullTranscriptUrl: true,
include: {
company: true,
messages: true,
},
});
if (sessionsToUpdate.length === 0) {
console.log("No sessions found requiring transcript fetching.");
if (sessionsNeedingTranscripts.length === 0) {
console.log("No sessions found that need transcript fetching.");
return;
}
console.log(`Found ${sessionsToUpdate.length} sessions to update.`);
console.log(`Found ${sessionsNeedingTranscripts.length} sessions that need transcript fetching.`);
let successCount = 0;
let errorCount = 0;
for (const session of sessionsToUpdate) {
for (const session of sessionsNeedingTranscripts) {
if (!session.fullTranscriptUrl) {
// Should not happen due to query, but good for type safety
console.warn(`Session ${session.id} has no fullTranscriptUrl, skipping.`);
console.warn(`Session ${session.id} has no transcript URL, skipping.`);
continue;
}
console.log(
`Fetching transcript for session ${session.id} from ${session.fullTranscriptUrl}...`
);
console.log(`Fetching transcript for session ${session.id}...`);
try {
const response = await fetch(session.fullTranscriptUrl);
if (!response.ok) {
console.error(
`Failed to fetch transcript for session ${session.id}: ${response.status} ${response.statusText}`
);
const errorBody = await response.text();
console.error(`Error details: ${errorBody.substring(0, 500)}`); // Log first 500 chars of error
errorCount++;
continue;
}
const transcriptText = await response.text();
if (transcriptText.trim() === "") {
console.warn(
`Fetched empty transcript for session ${session.id}. Storing as empty string.`
);
}
await prisma.session.update({
where: { id: session.id },
data: { transcriptContent: transcriptText },
});
console.log(
`Successfully fetched and stored transcript for session ${session.id}.`
// Fetch transcript content
const transcriptContent = await fetchTranscriptContent(
session.fullTranscriptUrl,
session.company.csvUsername || undefined,
session.company.csvPassword || undefined
);
if (!transcriptContent) {
throw new Error("Failed to fetch transcript content");
}
// Parse transcript into messages
const messages = parseTranscriptToMessages(transcriptContent);
if (messages.length === 0) {
throw new Error("No messages found in transcript");
}
// Create messages in database
await prisma.message.createMany({
data: messages.map(msg => ({
sessionId: session.id,
timestamp: msg.timestamp,
role: msg.role,
content: msg.content,
order: msg.order,
})),
});
console.log(`Successfully fetched transcript for session ${session.id} (${messages.length} messages)`);
successCount++;
} catch (error) {
console.error(`Error processing session ${session.id}:`, error);
console.error(`Error fetching transcript for session ${session.id}:`, error);
errorCount++;
}
}
console.log("Transcript fetching complete.");
console.log(`Successfully updated: ${successCount} sessions.`);
console.log(`Failed to update: ${errorCount} sessions.`);
console.log(`Successfully fetched: ${successCount} transcripts.`);
console.log(`Failed to fetch: ${errorCount} transcripts.`);
}
main()
// Run the main function
fetchTranscriptsForSessions()
.catch((e) => {
console.error("An error occurred during the script execution:", e);
process.exitCode = 1;


@ -1,68 +0,0 @@
// Fix Trailing Whitespace
// This script removes trailing whitespace from specified file types
import fs from "fs";
import path from "path";
import { fileURLToPath } from "url";
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Configure which file types to process
const fileTypes = [".ts", ".tsx", ".js", ".jsx", ".json", ".md", ".css"];
// Configure directories to ignore
const ignoreDirs = ["node_modules", ".next", ".git", "out", "build", "dist"];
// Recursively process directories
async function processDirectory(dir) {
try {
const files = await fs.promises.readdir(dir, { withFileTypes: true });
for (const file of files) {
const fullPath = path.join(dir, file.name);
// Skip ignored directories
if (file.isDirectory()) {
if (!ignoreDirs.includes(file.name)) {
await processDirectory(fullPath);
}
continue;
}
// Process only files with matching extensions
const ext = path.extname(file.name);
if (!fileTypes.includes(ext)) {
continue;
}
try {
// Read and process the file
const content = await fs.promises.readFile(fullPath, "utf8");
// Remove trailing whitespace from each line
const processedContent = content
.split("\n")
.map((line) => line.replace(/\s+$/, ""))
.join("\n");
// Only write if changes were made
if (processedContent !== content) {
await fs.promises.writeFile(fullPath, processedContent, "utf8");
console.log(`Fixed trailing whitespace in ${fullPath}`);
}
} catch (fileError) {
console.error(`Error processing file ${fullPath}:`, fileError);
}
}
} catch (dirError) {
console.error(`Error reading directory ${dir}:`, dirError);
}
}
// Start processing from root directory
const rootDir = process.cwd();
console.log(`Starting whitespace cleanup from ${rootDir}`);
processDirectory(rootDir)
.then(() => console.log("Whitespace cleanup completed"))
.catch((err) => console.error("Error in whitespace cleanup:", err));


@ -1,38 +0,0 @@
// Simple script to test the manual processing trigger
// Usage: node scripts/manual-trigger-test.js
import fetch from 'node-fetch';
async function testManualTrigger() {
try {
console.log('Testing manual processing trigger...');
const response = await fetch('http://localhost:3000/api/admin/trigger-processing', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
// Note: In a real scenario, you'd need to include authentication cookies
// For testing, you might need to login first and copy the session cookie
},
body: JSON.stringify({
batchSize: 5, // Process max 5 sessions
maxConcurrency: 3 // Use 3 concurrent workers
})
});
const result = await response.json();
if (response.ok) {
console.log('✅ Manual trigger successful:');
console.log(JSON.stringify(result, null, 2));
} else {
console.log('❌ Manual trigger failed:');
console.log(JSON.stringify(result, null, 2));
}
} catch (error) {
console.error('❌ Error testing manual trigger:', error.message);
}
}
testManualTrigger();


@ -1,243 +0,0 @@
// Manual trigger scripts for both schedulers
import { fetchAndStoreSessionsForAllCompanies } from "../lib/csvFetcher.js";
import { processAllUnparsedTranscripts } from "../lib/transcriptParser.js";
import { PrismaClient } from "@prisma/client";
import fetch from "node-fetch";
import { readFileSync } from "fs";
import { fileURLToPath } from "url";
import { dirname, join } from "path";
// Load environment variables from .env.local
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
const envPath = join(__dirname, '..', '.env.local');
try {
const envFile = readFileSync(envPath, 'utf8');
const envVars = envFile.split('\n').filter(line => line.trim() && !line.startsWith('#'));
envVars.forEach(line => {
const [key, ...valueParts] = line.split('=');
if (key && valueParts.length > 0) {
const value = valueParts.join('=').trim();
if (!process.env[key.trim()]) {
process.env[key.trim()] = value;
}
}
});
console.log("✅ Environment variables loaded from .env.local");
} catch (error) {
console.warn("⚠️ Could not load .env.local file:", error.message);
}
const prisma = new PrismaClient();
/**
* Manually trigger the session refresh scheduler
*/
async function triggerSessionRefresh() {
console.log("=== Manual Session Refresh Trigger ===");
try {
await fetchAndStoreSessionsForAllCompanies();
console.log("✅ Session refresh completed successfully");
} catch (error) {
console.error("❌ Session refresh failed:", error);
}
}
/**
* Manually trigger the processing scheduler
*/
async function triggerProcessingScheduler() {
console.log("=== Manual Processing Scheduler Trigger ===");
const OPENAI_API_KEY = process.env.OPENAI_API_KEY;
if (!OPENAI_API_KEY) {
console.error("❌ OPENAI_API_KEY environment variable is not set");
return;
}
try {
// Find sessions that need processing
const sessionsToProcess = await prisma.session.findMany({
where: {
AND: [
{ messages: { some: {} } },
{
OR: [
{ processed: false },
{ processed: null }
]
}
],
},
select: {
id: true,
processed: true,
},
take: 5, // Process 5 sessions for manual testing
});
console.log(`Found ${sessionsToProcess.length} sessions to process:`);
sessionsToProcess.forEach((session) => {
console.log(`- Session ${session.id}: processed=${session.processed}`);
});
if (sessionsToProcess.length === 0) {
console.log("✅ No sessions found requiring processing");
return;
}
// Import and run the processing function
const { processUnprocessedSessions } = await import(
"../lib/processingScheduler.js"
);
await processUnprocessedSessions();
console.log("✅ Processing scheduler completed");
} catch (error) {
console.error("❌ Processing scheduler failed:", error);
}
}
/**
* Manually trigger transcript parsing
*/
async function triggerTranscriptParsing() {
console.log("=== Manual Transcript Parsing Trigger ===");
try {
const result = await processAllUnparsedTranscripts();
console.log(
`✅ Transcript parsing completed: ${result.processed} processed, ${result.errors} errors`
);
} catch (error) {
console.error("❌ Transcript parsing failed:", error);
}
}
/**
* Show current processing status
*/
async function showProcessingStatus() {
console.log("=== Processing Status ===");
try {
const totalSessions = await prisma.session.count();
const processedSessions = await prisma.session.count({
where: { processed: true },
});
const unprocessedSessions = await prisma.session.count({
where: {
OR: [
{ processed: false },
{ processed: null }
]
},
});
const withMessages = await prisma.session.count({
where: {
messages: {
some: {},
},
},
});
const readyForProcessing = await prisma.session.count({
where: {
AND: [
{ messages: { some: {} } },
{
OR: [
{ processed: false },
{ processed: null }
]
}
],
},
});
console.log(`📊 Total sessions: ${totalSessions}`);
console.log(`✅ Processed sessions: ${processedSessions}`);
console.log(`⏳ Unprocessed sessions: ${unprocessedSessions}`);
console.log(`📄 Sessions with messages: ${withMessages}`);
console.log(`🔄 Ready for processing: ${readyForProcessing}`);
// Show some examples of unprocessed sessions
if (readyForProcessing > 0) {
console.log("\n📋 Sample unprocessed sessions:");
const samples = await prisma.session.findMany({
where: {
AND: [
{ messages: { some: {} } },
{
OR: [
{ processed: false },
{ processed: null }
]
}
],
},
select: {
id: true,
processed: true,
startTime: true,
},
take: 3,
});
samples.forEach((session) => {
console.log(
`- ${session.id} (${session.startTime.toISOString()}) - processed: ${session.processed}`
);
});
}
} catch (error) {
console.error("❌ Failed to get processing status:", error);
}
}
// Main execution based on command line argument
const command = process.argv[2];
switch (command) {
case "refresh":
await triggerSessionRefresh();
break;
case "process":
await triggerProcessingScheduler();
break;
case "parse":
await triggerTranscriptParsing();
break;
case "status":
await showProcessingStatus();
break;
case "both":
await triggerSessionRefresh();
console.log("\n" + "=".repeat(50) + "\n");
await triggerProcessingScheduler();
break;
case "all":
await triggerSessionRefresh();
console.log("\n" + "=".repeat(50) + "\n");
await triggerTranscriptParsing();
console.log("\n" + "=".repeat(50) + "\n");
await triggerProcessingScheduler();
break;
default:
console.log("Usage: node scripts/manual-triggers.js [command]");
console.log("Commands:");
console.log(
" refresh - Trigger session refresh (fetch new sessions from CSV)"
);
console.log(" parse - Parse transcripts into structured messages");
console.log(
" process - Trigger processing scheduler (process unprocessed sessions)"
);
console.log(" status - Show current processing status");
console.log(" both - Run both refresh and processing");
console.log(" all - Run refresh, parse, and processing in sequence");
break;
}
await prisma.$disconnect();


@ -1,283 +0,0 @@
// Script to manually process unprocessed sessions with OpenAI
import { PrismaClient } from "@prisma/client";
import fetch from "node-fetch";
const prisma = new PrismaClient();
const OPENAI_API_KEY = process.env.OPENAI_API_KEY;
const OPENAI_API_URL = "https://api.openai.com/v1/chat/completions";
/**
* Processes a session transcript using OpenAI API
* @param {string} sessionId The session ID
* @param {string} transcript The transcript content to process
* @returns {Promise<Object>} Processed data from OpenAI
*/
async function processTranscriptWithOpenAI(sessionId, transcript) {
if (!OPENAI_API_KEY) {
throw new Error("OPENAI_API_KEY environment variable is not set");
}
// Create a system message with instructions
const systemMessage = `
You are an AI assistant tasked with analyzing chat transcripts.
Extract the following information from the transcript:
1. The primary language used by the user (ISO 639-1 code)
2. Number of messages sent by the user
3. Overall sentiment (positive, neutral, or negative)
4. Whether the conversation was escalated
5. Whether HR contact was mentioned or provided
6. The best-fitting category for the conversation from this list:
- Schedule & Hours
- Leave & Vacation
- Sick Leave & Recovery
- Salary & Compensation
- Contract & Hours
- Onboarding
- Offboarding
- Workwear & Staff Pass
- Team & Contacts
- Personal Questions
- Access & Login
- Social questions
- Unrecognized / Other
7. Up to 5 paraphrased questions asked by the user (in English)
8. A brief summary of the conversation (10-300 characters)
Return the data in JSON format matching this schema:
{
"language": "ISO 639-1 code",
"messages_sent": number,
"sentiment": "positive|neutral|negative",
"escalated": boolean,
"forwarded_hr": boolean,
"category": "one of the categories listed above",
"questions": ["question 1", "question 2", ...],
"summary": "brief summary",
"session_id": "${sessionId}"
}
`;
try {
const response = await fetch(OPENAI_API_URL, {
method: "POST",
headers: {
"Content-Type": "application/json",
Authorization: `Bearer ${OPENAI_API_KEY}`,
},
body: JSON.stringify({
model: "gpt-4-turbo",
messages: [
{
role: "system",
content: systemMessage,
},
{
role: "user",
content: transcript,
},
],
temperature: 0.3, // Lower temperature for more consistent results
response_format: { type: "json_object" },
}),
});
if (!response.ok) {
const errorText = await response.text();
throw new Error(`OpenAI API error: ${response.status} - ${errorText}`);
}
const data = await response.json();
const processedData = JSON.parse(data.choices[0].message.content);
// Validate the response against our expected schema
validateOpenAIResponse(processedData);
return processedData;
} catch (error) {
console.error(`Error processing transcript with OpenAI:`, error);
throw error;
}
}
/**
* Validates the OpenAI response against our expected schema
* @param {Object} data The data to validate
*/
function validateOpenAIResponse(data) {
// Check required fields
const requiredFields = [
"language",
"messages_sent",
"sentiment",
"escalated",
"forwarded_hr",
"category",
"questions",
"summary",
"session_id",
];
for (const field of requiredFields) {
if (!(field in data)) {
throw new Error(`Missing required field: ${field}`);
}
}
// Validate field types
if (typeof data.language !== "string" || !/^[a-z]{2}$/.test(data.language)) {
throw new Error(
"Invalid language format. Expected ISO 639-1 code (e.g., 'en')"
);
}
if (typeof data.messages_sent !== "number" || data.messages_sent < 0) {
throw new Error("Invalid messages_sent. Expected non-negative number");
}
if (!["positive", "neutral", "negative"].includes(data.sentiment)) {
throw new Error(
"Invalid sentiment. Expected 'positive', 'neutral', or 'negative'"
);
}
if (typeof data.escalated !== "boolean") {
throw new Error("Invalid escalated. Expected boolean");
}
if (typeof data.forwarded_hr !== "boolean") {
throw new Error("Invalid forwarded_hr. Expected boolean");
}
const validCategories = [
"Schedule & Hours",
"Leave & Vacation",
"Sick Leave & Recovery",
"Salary & Compensation",
"Contract & Hours",
"Onboarding",
"Offboarding",
"Workwear & Staff Pass",
"Team & Contacts",
"Personal Questions",
"Access & Login",
"Social questions",
"Unrecognized / Other",
];
if (!validCategories.includes(data.category)) {
throw new Error(
`Invalid category. Expected one of: ${validCategories.join(", ")}`
);
}
if (!Array.isArray(data.questions)) {
throw new Error("Invalid questions. Expected array of strings");
}
if (
typeof data.summary !== "string" ||
data.summary.length < 10 ||
data.summary.length > 300
) {
throw new Error(
"Invalid summary. Expected string between 10-300 characters"
);
}
if (typeof data.session_id !== "string") {
throw new Error("Invalid session_id. Expected string");
}
}
/**
* Main function to process unprocessed sessions
*/
async function processUnprocessedSessions() {
console.log("Starting to process unprocessed sessions...");
// Find sessions that have transcript content but haven't been processed
const sessionsToProcess = await prisma.session.findMany({
where: {
AND: [
{ transcriptContent: { not: null } },
{ transcriptContent: { not: "" } },
{ processed: { not: true } }, // Either false or null
],
},
select: {
id: true,
transcriptContent: true,
},
});
if (sessionsToProcess.length === 0) {
console.log("No sessions found requiring processing.");
return;
}
console.log(`Found ${sessionsToProcess.length} sessions to process.`);
let successCount = 0;
let errorCount = 0;
for (const session of sessionsToProcess) {
if (!session.transcriptContent) {
// Should not happen due to query, but good for type safety
console.warn(
`Session ${session.id} has no transcript content, skipping.`
);
continue;
}
console.log(`Processing transcript for session ${session.id}...`);
try {
const processedData = await processTranscriptWithOpenAI(
session.id,
session.transcriptContent
);
// Map sentiment string to float value for compatibility with existing data
const sentimentMap = {
positive: 0.8,
neutral: 0.0,
negative: -0.8,
};
// Update the session with processed data
await prisma.session.update({
where: { id: session.id },
data: {
language: processedData.language,
messagesSent: processedData.messages_sent,
sentiment: sentimentMap[processedData.sentiment] || 0,
sentimentCategory: processedData.sentiment,
escalated: processedData.escalated,
forwardedHr: processedData.forwarded_hr,
category: processedData.category,
questions: JSON.stringify(processedData.questions),
summary: processedData.summary,
processed: true,
},
});
console.log(`Successfully processed session ${session.id}.`);
successCount++;
} catch (error) {
console.error(`Error processing session ${session.id}:`, error);
errorCount++;
}
}
console.log("Session processing complete.");
console.log(`Successfully processed: ${successCount} sessions.`);
console.log(`Failed to process: ${errorCount} sessions.`);
}
// Run the main function
processUnprocessedSessions()
.catch((e) => {
console.error("An error occurred during the script execution:", e);
process.exitCode = 1;
})
.finally(async () => {
await prisma.$disconnect();
});


@ -18,11 +18,37 @@ interface OpenAIProcessedData {
session_id: string;
}
/**
* Fetches transcript content from a URL
*/
async function fetchTranscriptContent(
url: string,
username?: string,
password?: string
): Promise<string | null> {
try {
const authHeader =
username && password
? "Basic " + Buffer.from(`${username}:${password}`).toString("base64")
: undefined;
const response = await fetch(url, {
headers: authHeader ? { Authorization: authHeader } : {},
});
if (!response.ok) {
console.warn(`Failed to fetch transcript from ${url}: ${response.statusText}`);
return null;
}
return await response.text();
} catch (error) {
console.warn(`Error fetching transcript from ${url}:`, error);
return null;
}
}
/**
* Processes a session transcript using OpenAI API
* @param sessionId The session ID
* @param transcript The transcript content to process
* @returns Processed data from OpenAI
*/
async function processTranscriptWithOpenAI(
sessionId: string,
@ -32,7 +58,6 @@ async function processTranscriptWithOpenAI(
throw new Error("OPENAI_API_KEY environment variable is not set");
}
// Create a system message with instructions
const systemMessage = `
You are an AI assistant tasked with analyzing chat transcripts.
Extract the following information from the transcript:
@ -91,7 +116,7 @@ async function processTranscriptWithOpenAI(
content: transcript,
},
],
temperature: 0.3, // Lower temperature for more consistent results
temperature: 0.3,
response_format: { type: "json_object" },
}),
});
@ -104,9 +129,7 @@ async function processTranscriptWithOpenAI(
const data = (await response.json()) as any;
const processedData = JSON.parse(data.choices[0].message.content);
// Validate the response against our expected schema
validateOpenAIResponse(processedData);
return processedData;
} catch (error) {
console.error(`Error processing transcript with OpenAI:`, error);
@ -116,22 +139,11 @@ async function processTranscriptWithOpenAI(
/**
* Validates the OpenAI response against our expected schema
* @param data The data to validate
*/
function validateOpenAIResponse(
data: any
): asserts data is OpenAIProcessedData {
// Check required fields
function validateOpenAIResponse(data: any): asserts data is OpenAIProcessedData {
const requiredFields = [
"language",
"messages_sent",
"sentiment",
"escalated",
"forwarded_hr",
"category",
"questions",
"summary",
"session_id",
"language", "messages_sent", "sentiment", "escalated",
"forwarded_hr", "category", "questions", "summary", "session_id"
];
for (const field of requiredFields) {
@ -140,11 +152,8 @@ function validateOpenAIResponse(
}
}
// Validate field types
if (typeof data.language !== "string" || !/^[a-z]{2}$/.test(data.language)) {
throw new Error(
"Invalid language format. Expected ISO 639-1 code (e.g., 'en')"
);
throw new Error("Invalid language format. Expected ISO 639-1 code (e.g., 'en')");
}
if (typeof data.messages_sent !== "number" || data.messages_sent < 0) {
@ -152,9 +161,7 @@ function validateOpenAIResponse(
}
if (!["positive", "neutral", "negative"].includes(data.sentiment)) {
throw new Error(
"Invalid sentiment. Expected 'positive', 'neutral', or 'negative'"
);
throw new Error("Invalid sentiment. Expected 'positive', 'neutral', or 'negative'");
}
if (typeof data.escalated !== "boolean") {
@ -166,39 +173,22 @@ function validateOpenAIResponse(
}
const validCategories = [
"Schedule & Hours",
"Leave & Vacation",
"Sick Leave & Recovery",
"Salary & Compensation",
"Contract & Hours",
"Onboarding",
"Offboarding",
"Workwear & Staff Pass",
"Team & Contacts",
"Personal Questions",
"Access & Login",
"Social questions",
"Unrecognized / Other",
"Schedule & Hours", "Leave & Vacation", "Sick Leave & Recovery",
"Salary & Compensation", "Contract & Hours", "Onboarding", "Offboarding",
"Workwear & Staff Pass", "Team & Contacts", "Personal Questions",
"Access & Login", "Social questions", "Unrecognized / Other"
];
if (!validCategories.includes(data.category)) {
throw new Error(
`Invalid category. Expected one of: ${validCategories.join(", ")}`
);
throw new Error(`Invalid category. Expected one of: ${validCategories.join(", ")}`);
}
if (!Array.isArray(data.questions)) {
throw new Error("Invalid questions. Expected array of strings");
}
if (
typeof data.summary !== "string" ||
data.summary.length < 10 ||
data.summary.length > 300
) {
throw new Error(
"Invalid summary. Expected string between 10-300 characters"
);
if (typeof data.summary !== "string" || data.summary.length < 10 || data.summary.length > 300) {
throw new Error("Invalid summary. Expected string between 10-300 characters");
}
if (typeof data.session_id !== "string") {
@ -207,86 +197,146 @@ function validateOpenAIResponse(
}
/**
* Main function to process unprocessed sessions
* Main function to process SessionImport records that need processing
*/
async function processUnprocessedSessions() {
console.log("Starting to process unprocessed sessions...");
console.log("Starting to process unprocessed SessionImport records...");
// Find sessions that have transcript content but haven't been processed
const sessionsToProcess = await prisma.session.findMany({
// Find SessionImport records that are QUEUED and have transcript URLs
const importsToProcess = await prisma.sessionImport.findMany({
where: {
AND: [
{ transcriptContent: { not: null } },
{ transcriptContent: { not: "" } },
{ processed: { not: true } }, // Either false or null
],
status: "QUEUED",
fullTranscriptUrl: { not: null },
},
select: {
id: true,
transcriptContent: true,
include: {
company: true,
},
});
if (sessionsToProcess.length === 0) {
console.log("No sessions found requiring processing.");
if (importsToProcess.length === 0) {
console.log("No SessionImport records found requiring processing.");
return;
}
console.log(`Found ${sessionsToProcess.length} sessions to process.`);
console.log(`Found ${importsToProcess.length} SessionImport records to process.`);
let successCount = 0;
let errorCount = 0;
for (const session of sessionsToProcess) {
if (!session.transcriptContent) {
// Should not happen due to query, but good for type safety
console.warn(
`Session ${session.id} has no transcript content, skipping.`
);
for (const importRecord of importsToProcess) {
if (!importRecord.fullTranscriptUrl) {
console.warn(`SessionImport ${importRecord.id} has no transcript URL, skipping.`);
continue;
}
console.log(`Processing transcript for session ${session.id}...`);
console.log(`Processing transcript for SessionImport ${importRecord.id}...`);
try {
const processedData = await processTranscriptWithOpenAI(
session.id,
session.transcriptContent
// Mark as processing
await prisma.sessionImport.update({
where: { id: importRecord.id },
data: { status: "PROCESSING" },
});
// Fetch transcript content
const transcriptContent = await fetchTranscriptContent(
importRecord.fullTranscriptUrl,
importRecord.company.csvUsername || undefined,
importRecord.company.csvPassword || undefined
);
// Map sentiment string to float value for compatibility with existing data
const sentimentMap: Record<string, number> = {
positive: 0.8,
neutral: 0.0,
negative: -0.8,
};
if (!transcriptContent) {
throw new Error("Failed to fetch transcript content");
}
// Update the session with processed data
await prisma.session.update({
where: { id: session.id },
data: {
// Process with OpenAI
const processedData = await processTranscriptWithOpenAI(
importRecord.externalSessionId,
transcriptContent
);
// Parse dates from raw strings
const startTime = new Date(importRecord.startTimeRaw);
const endTime = new Date(importRecord.endTimeRaw);
// Create or update Session record
const session = await prisma.session.upsert({
where: { importId: importRecord.id },
update: {
startTime: isNaN(startTime.getTime()) ? new Date() : startTime,
endTime: isNaN(endTime.getTime()) ? new Date() : endTime,
ipAddress: importRecord.ipAddress,
country: importRecord.countryCode,
language: processedData.language,
messagesSent: processedData.messages_sent,
sentiment: sentimentMap[processedData.sentiment] || 0,
sentimentCategory: processedData.sentiment,
sentiment: { positive: 0.8, neutral: 0.0, negative: -0.8 }[processedData.sentiment] || 0,
sentimentCategory: processedData.sentiment.toUpperCase() as "POSITIVE" | "NEUTRAL" | "NEGATIVE",
escalated: processedData.escalated,
forwardedHr: processedData.forwarded_hr,
fullTranscriptUrl: importRecord.fullTranscriptUrl,
avgResponseTime: importRecord.avgResponseTimeSeconds,
tokens: importRecord.tokens,
tokensEur: importRecord.tokensEur,
category: processedData.category,
initialMsg: importRecord.initialMessage,
processed: true,
questions: JSON.stringify(processedData.questions),
summary: processedData.summary,
},
create: {
companyId: importRecord.companyId,
importId: importRecord.id,
startTime: isNaN(startTime.getTime()) ? new Date() : startTime,
endTime: isNaN(endTime.getTime()) ? new Date() : endTime,
ipAddress: importRecord.ipAddress,
country: importRecord.countryCode,
language: processedData.language,
messagesSent: processedData.messages_sent,
sentiment: { positive: 0.8, neutral: 0.0, negative: -0.8 }[processedData.sentiment] || 0,
sentimentCategory: processedData.sentiment.toUpperCase() as "POSITIVE" | "NEUTRAL" | "NEGATIVE",
escalated: processedData.escalated,
forwardedHr: processedData.forwarded_hr,
fullTranscriptUrl: importRecord.fullTranscriptUrl,
avgResponseTime: importRecord.avgResponseTimeSeconds,
tokens: importRecord.tokens,
tokensEur: importRecord.tokensEur,
category: processedData.category,
initialMsg: importRecord.initialMessage,
processed: true,
questions: JSON.stringify(processedData.questions),
summary: processedData.summary,
},
});
console.log(`Successfully processed session ${session.id}.`);
// Mark SessionImport as DONE
await prisma.sessionImport.update({
where: { id: importRecord.id },
data: {
status: "DONE",
processedAt: new Date(),
},
});
console.log(`Successfully processed SessionImport ${importRecord.id} -> Session ${session.id}`);
successCount++;
} catch (error) {
console.error(`Error processing session ${session.id}:`, error);
console.error(`Error processing SessionImport ${importRecord.id}:`, error);
// Mark as ERROR
await prisma.sessionImport.update({
where: { id: importRecord.id },
data: {
status: "ERROR",
errorMsg: error instanceof Error ? error.message : String(error),
},
});
errorCount++;
}
}
console.log("Session processing complete.");
console.log(`Successfully processed: ${successCount} sessions.`);
console.log(`Failed to process: ${errorCount} sessions.`);
console.log("SessionImport processing complete.");
console.log(`Successfully processed: ${successCount} records.`);
console.log(`Failed to process: ${errorCount} records.`);
}
// Run the main function


@ -1,75 +0,0 @@
// Script to check processing status and trigger processing
// Usage: node scripts/test-processing-status.js
import { PrismaClient } from '@prisma/client';
const prisma = new PrismaClient();
async function checkProcessingStatus() {
try {
console.log('🔍 Checking processing status...\n');
// Get processing status
const totalSessions = await prisma.session.count();
const processedSessions = await prisma.session.count({
where: { processed: true }
});
const unprocessedSessions = await prisma.session.count({
where: { processed: false }
});
const sessionsWithMessages = await prisma.session.count({
where: {
processed: false,
messages: { some: {} }
}
});
console.log('📊 Processing Status:');
console.log(` Total sessions: ${totalSessions}`);
console.log(` ✅ Processed: ${processedSessions}`);
console.log(` ⏳ Unprocessed: ${unprocessedSessions}`);
console.log(` 📝 Unprocessed with messages: ${sessionsWithMessages}`);
const processedPercentage = ((processedSessions / totalSessions) * 100).toFixed(1);
console.log(` 📈 Processing progress: ${processedPercentage}%\n`);
// Check recent processing activity
const recentlyProcessed = await prisma.session.findMany({
where: {
processed: true,
createdAt: {
gte: new Date(Date.now() - 60 * 60 * 1000) // Last hour
}
},
orderBy: { createdAt: 'desc' },
take: 5,
select: {
id: true,
createdAt: true,
category: true,
sentiment: true
}
});
if (recentlyProcessed.length > 0) {
console.log('🕒 Recently processed sessions:');
recentlyProcessed.forEach(session => {
const timeAgo = Math.round((Date.now() - session.createdAt.getTime()) / 1000 / 60);
console.log(`${session.id.substring(0, 8)}... (${timeAgo}m ago) - ${session.category || 'No category'}`);
});
} else {
console.log('🕒 No sessions processed in the last hour');
}
console.log('\n✨ Processing system is working correctly!');
console.log('💡 The parallel processing successfully processed sessions.');
console.log('🎯 For manual triggers, you need to be logged in as an admin user.');
} catch (error) {
console.error('❌ Error checking status:', error);
} finally {
await prisma.$disconnect();
}
}
checkProcessingStatus();


@ -1,20 +0,0 @@
// Direct trigger for processing scheduler (bypasses authentication)
// Usage: node scripts/trigger-processing-direct.js
import { processUnprocessedSessions } from '../lib/processingScheduler.js';
async function triggerProcessing() {
try {
console.log('🚀 Manually triggering processing scheduler...\n');
// Process with custom parameters
await processUnprocessedSessions(50, 3); // Process 50 sessions with 3 concurrent workers
console.log('\n✅ Processing trigger completed!');
} catch (error) {
console.error('❌ Error triggering processing:', error);
}
}
triggerProcessing();