Enhance data integration and transcript parsing

- Improved date parsing in fetch_and_store_chat_data to support multiple formats and added error logging for unparseable dates.
- Enhanced parse_and_store_transcript_messages to handle empty transcripts and expanded message pattern recognition for both User and Assistant.
- Implemented intelligent splitting of transcripts based on detected patterns and timestamps, with fallback mechanisms for unrecognized formats.
- Updated documentation for Celery and Redis setup, troubleshooting, and project structure.
- Added markdown linting configuration and scripts for code formatting.
- Updated Nginx configuration to change the web server port.
- Added xlsxwriter dependency for Excel file handling in project requirements.
2026-03-03 00:21:28 +01:00 · 2025-05-18 19:18:31 +00:00
parent 8bbbb109bd
commit f0ae061fa7
24 changed files with 1672 additions and 931 deletions
--- a/dashboard_project/data_integration/utils.py
+++ b/dashboard_project/data_integration/utils.py
@@ -94,19 +94,41 @@ def fetch_and_store_chat_data(source_id=None):
            padded_row = row + [""] * (len(header) - len(row))
            data = dict(zip(header, padded_row, strict=False))

-            try:
-                # Try European date format (DD.MM.YYYY) first
-                start_time = make_aware(datetime.strptime(data["start_time"], "%d.%m.%Y %H:%M:%S"))
-            except ValueError:
-                # Fallback to ISO format (YYYY-MM-DD)
-                start_time = make_aware(datetime.strptime(data["start_time"], "%Y-%m-%d %H:%M:%S"))
+            # Parse date fields with multiple format support
+            start_time = None
+            end_time = None

-            try:
-                # Try European date format (DD.MM.YYYY) first
-                end_time = make_aware(datetime.strptime(data["end_time"], "%d.%m.%Y %H:%M:%S"))
-            except ValueError:
-                # Fallback to ISO format (YYYY-MM-DD)
-                end_time = make_aware(datetime.strptime(data["end_time"], "%Y-%m-%d %H:%M:%S"))
+            # List of date formats to try
+            date_formats = [
+                "%d.%m.%Y %H:%M:%S",  # European format: DD.MM.YYYY HH:MM:SS
+                "%Y-%m-%d %H:%M:%S",  # ISO format: YYYY-MM-DD HH:MM:SS
+                "%m/%d/%Y %H:%M:%S",  # US format: MM/DD/YYYY HH:MM:SS
+                "%Y-%m-%dT%H:%M:%S",  # ISO format with T separator
+                "%Y-%m-%dT%H:%M:%S.%fZ",  # ISO format with milliseconds and Z
+            ]
+
+            # Try to parse start_time with multiple formats
+            for date_format in date_formats:
+                try:
+                    start_time = make_aware(datetime.strptime(data["start_time"], date_format))
+                    break
+                except (ValueError, TypeError):
+                    continue
+
+            # Try to parse end_time with multiple formats
+            for date_format in date_formats:
+                try:
+                    end_time = make_aware(datetime.strptime(data["end_time"], date_format))
+                    break
+                except (ValueError, TypeError):
+                    continue
+
+            # If we couldn't parse the dates, log an error and skip this row
+            if not start_time or not end_time:
+                error_msg = f"Could not parse date fields for session {data['session_id']}: start_time={data['start_time']}, end_time={data['end_time']}"
+                logger.error(error_msg)
+                stats["errors"] += 1
+                continue

            messages_sent = int(data["messages_sent"]) if data["messages_sent"] else None
            escalated = data["escalated"].lower() == "true" if data["escalated"] else None
@@ -199,6 +221,10 @@ def fetch_and_store_transcript(session, timeout=30):
 def parse_and_store_transcript_messages(session, transcript_content):
    """Parse and store messages from a transcript.

+    This function parses a chat transcript that contains messages from both User and Assistant.
+    It identifies message boundaries by looking for lines that start with common sender patterns,
+    and groups all following lines until the next sender change as part of that message.
+
    Args:
        session: The ChatSession object
        transcript_content: The raw transcript content
@@ -206,6 +232,11 @@ def parse_and_store_transcript_messages(session, transcript_content):
    Returns:
        int: Number of messages created
    """
+    # Handle empty transcripts
+    if not transcript_content or transcript_content.strip() == "":
+        logger.warning(f"Empty transcript received for session {session.session_id}")
+        return 0
+
    lines = transcript_content.splitlines()
    current_sender = None
    current_message_lines = []
@@ -217,35 +248,285 @@ def parse_and_store_transcript_messages(session, transcript_content):
        logger.info(f"Deleting {existing_count} existing messages for session {session.session_id}")
        ChatMessage.objects.filter(session=session).delete()

+    # Define common message patterns to detect - expanded to include more variations
+    user_patterns = [
+        "User:",
+        "[User]:",
+        "Customer:",
+        "[Customer]:",
+        "Client:",
+        "[Client]:",
+        "Human:",
+        "[Human]:",
+        "Me:",
+        "[Me]:",
+        "Question:",
+        "User >",
+        "Customer >",
+        "User said:",
+        "Customer said:",
+        "User writes:",
+        "User asked:",
+        "User message:",
+        "From user:",
+        "Client message:",
+        "Q:",
+        "Input:",
+        "Query:",
+        "Person:",
+        "Visitor:",
+        "Guest:",
+        "User input:",
+        "User query:",
+    ]
+    assistant_patterns = [
+        "Assistant:",
+        "[Assistant]:",
+        "Agent:",
+        "[Agent]:",
+        "Bot:",
+        "[Bot]:",
+        "AI:",
+        "[AI]:",
+        "ChatGPT:",
+        "[ChatGPT]:",
+        "System:",
+        "[System]:",
+        "Support:",
+        "[Support]:",
+        "Answer:",
+        "Assistant >",
+        "Bot >",
+        "Assistant said:",
+        "Assistant writes:",
+        "AI responded:",
+        "LLM:",
+        "[LLM]:",
+        "Response:",
+        "A:",
+        "Output:",
+        "AI output:",
+        "Model:",
+        "[Model]:",
+        "Assistant message:",
+        "From assistant:",
+        "Bot response:",
+        "AI says:",
+        "NotsoAI:",
+        "[NotsoAI]:",
+        "Notso:",
+        "[Notso]:",
+    ]
+
+    # Function to save current message before starting a new one
+    def save_current_message():
+        nonlocal current_sender, current_message_lines, messages_created
+        if current_sender and current_message_lines:
+            message_text = "\n".join(current_message_lines)
+            # Only save if there's actual content (not just whitespace)
+            if message_text.strip() and save_message(session, current_sender, message_text):
+                messages_created += 1
+                logger.debug(f"Saved {current_sender} message with {len(current_message_lines)} lines")
+
+    # Initial scan to detect format type and potential message boundaries
+    has_recognized_patterns = False
+    potential_timestamps = []
+    timestamp_pattern_count = 0
+
+    # Regex patterns for common timestamp formats
+    import re
+
+    timestamp_patterns = [
+        r"^\[\d{2}:\d{2}:\d{2}\]",  # [HH:MM:SS]
+        r"^\[\d{2}:\d{2}\]",  # [HH:MM]
+        r"^\(\d{2}:\d{2}:\d{2}\)",  # (HH:MM:SS)
+        r"^\(\d{2}:\d{2}\)",  # (HH:MM)
+        r"^\d{2}:\d{2}:\d{2} -",  # HH:MM:SS -
+        r"^\d{2}:\d{2} -",  # HH:MM -
+        r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}",  # YYYY-MM-DD HH:MM:SS
+    ]
+
+    # First pass: detect format and message boundaries
+    for i, line in enumerate(lines):
+        line_stripped = line.strip()
+
+        # Check for standard message patterns
+        if any(line_stripped.startswith(pattern) for pattern in user_patterns + assistant_patterns):
+            has_recognized_patterns = True
+
+        # Check for timestamp patterns that might indicate message boundaries
+        for pattern in timestamp_patterns:
+            if re.match(pattern, line_stripped):
+                timestamp_pattern_count += 1
+                potential_timestamps.append(i)
+                break
+
+    # If no recognized patterns are found, try to intelligently split the transcript
+    if not has_recognized_patterns and len(lines) > 0:
+        logger.info(
+            f"No standard message patterns found in transcript for session {session.session_id}. Attempting intelligent split."
+        )
+
+        # Try timestamp-based parsing if we have enough consistent timestamps
+        if timestamp_pattern_count > 3 and timestamp_pattern_count > 0.2 * len(lines):
+            logger.info(f"Attempting timestamp-based parsing with {timestamp_pattern_count} detected timestamps")
+
+            # Add the end of file as a boundary
+            potential_timestamps.append(len(lines))
+
+            # Process messages between timestamps
+            for i in range(len(potential_timestamps) - 1):
+                start_idx = potential_timestamps[i]
+                end_idx = potential_timestamps[i + 1]
+
+                message_content = "\n".join(lines[start_idx:end_idx])
+                first_line = lines[start_idx].lower()
+
+                # Simple heuristic to identify sender
+                is_user = any(
+                    user_word in first_line
+                    for user_word in ["user", "customer", "client", "human", "question", "query"]
+                )
+                is_assistant = any(
+                    assistant_word in first_line
+                    for assistant_word in ["assistant", "agent", "bot", "ai", "system", "support", "answer", "response"]
+                )
+
+                sender = "User" if (is_user or (not is_assistant and i % 2 == 0)) else "Assistant"
+
+                if save_message(session, sender, message_content):
+                    messages_created += 1
+
+            logger.info(f"Created {messages_created} messages using timestamp-based parsing")
+            return messages_created
+
+        # Simple heuristic: alternate between user and assistant, with first message from user
+        # Start with paragraphs (blank line separations) as message boundaries
+        paragraphs = []
+        current_paragraph = []
+
+        for line in lines:
+            if line.strip():
+                current_paragraph.append(line)
+            elif current_paragraph:  # Empty line and we have a paragraph
+                paragraphs.append("\n".join(current_paragraph))
+                current_paragraph = []
+
+        # Add the last paragraph if it's not empty
+        if current_paragraph:
+            paragraphs.append("\n".join(current_paragraph))
+
+        # If we have just one paragraph, try to split by sentence boundaries for very long transcripts
+        if len(paragraphs) == 1 and len(paragraphs[0].split()) > 100:
+            import re
+
+            # Try to split by sentence boundaries
+            text = paragraphs[0]
+            # Define sentence ending patterns
+            sentence_endings = r"(?<=[.!?])\s+"
+            sentences = re.split(sentence_endings, text)
+            # Group sentences into logical chunks (assuming alternating speakers)
+            chunks = []
+            current_chunk = []
+
+            for i, sentence in enumerate(sentences):
+                current_chunk.append(sentence)
+                # Every 2-3 sentences or on a natural break like a question mark
+                if (i % 2 == 1 and sentence.endswith("?")) or len(current_chunk) >= 3:
+                    chunks.append(" ".join(current_chunk))
+                    current_chunk = []
+
+            # Add any remaining sentences
+            if current_chunk:
+                chunks.append(" ".join(current_chunk))
+
+            # Save the chunks alternating between user and assistant
+            for i, chunk in enumerate(chunks):
+                if chunk.strip():
+                    sender = "User" if i % 2 == 0 else "Assistant"
+                    if save_message(session, sender, chunk):
+                        messages_created += 1
+
+            logger.info(f"Created {messages_created} messages by splitting single paragraph into sentences")
+            return messages_created
+
+        # Save messages alternating between user and assistant
+        for i, paragraph in enumerate(paragraphs):
+            if paragraph.strip():  # Only save non-empty paragraphs
+                sender = "User" if i % 2 == 0 else "Assistant"
+                if save_message(session, sender, paragraph):
+                    messages_created += 1
+
+        logger.info(f"Created {messages_created} messages using intelligent split for session {session.session_id}")
+        return messages_created
+
+    # Standard processing with recognized patterns
    for line in lines:
-        if line.startswith("User:"):
-            if (
-                current_sender
-                and current_message_lines
-                and save_message(session, current_sender, "\n".join(current_message_lines))
-            ):
-                messages_created += 1
+        line_stripped = line.strip()
+
+        # Skip empty lines at the beginning
+        if not line_stripped and not current_sender:
+            continue
+
+        # Check if this line indicates a new sender
+        is_user_message = any(line_stripped.startswith(pattern) for pattern in user_patterns)
+        is_assistant_message = any(line_stripped.startswith(pattern) for pattern in assistant_patterns)
+
+        if is_user_message:
+            # Save previous message if any
+            save_current_message()
+
+            # Start new user message
            current_sender = "User"
-            current_message_lines = [line.replace("User:", "").strip()]
-        elif line.startswith("Assistant:"):
-            if (
-                current_sender
-                and current_message_lines
-                and save_message(session, current_sender, "\n".join(current_message_lines))
-            ):
-                messages_created += 1
+            # Remove the prefix from the line
+            for pattern in user_patterns:
+                if line_stripped.startswith(pattern):
+                    line = line[len(pattern) :].strip()
+                    break
+            current_message_lines = [line] if line.strip() else []
+        elif is_assistant_message:
+            # Save previous message if any
+            save_current_message()
+
+            # Start new assistant message
            current_sender = "Assistant"
-            current_message_lines = [line.replace("Assistant:", "").strip()]
+            # Remove the prefix from the line
+            for pattern in assistant_patterns:
+                if line_stripped.startswith(pattern):
+                    line = line[len(pattern) :].strip()
+                    break
+            current_message_lines = [line] if line.strip() else []
        elif current_sender:
-            current_message_lines.append(line.strip())
+            # Continue adding to current message
+            current_message_lines.append(line)
+        else:
+            # If we get here with no current_sender, assume it's the start of a user message
+            logger.warning(f"Found line without sender prefix: '{line}'. Assuming User message.")
+            current_sender = "User"
+            current_message_lines = [line]

    # Save the last message
-    if (
-        current_sender
-        and current_message_lines
-        and save_message(session, current_sender, "\n".join(current_message_lines))
-    ):
-        messages_created += 1
+    save_current_message()
+
+    # Handle case with no messages parsed (possibly incorrectly formatted transcript)
+    if messages_created == 0 and lines:
+        logger.warning(
+            f"No messages were parsed from transcript for session {session.session_id}. Using fallback parsing."
+        )
+
+        # Fallback: Just split the transcript in half, first part user, second part assistant
+        mid_point = len(lines) // 2
+        user_content = "\n".join(lines[:mid_point])
+        assistant_content = "\n".join(lines[mid_point:])
+
+        # Save the split messages if they have content
+        if user_content.strip() and save_message(session, "User", user_content):
+            messages_created += 1
+
+        if assistant_content.strip() and save_message(session, "Assistant", assistant_content):
+            messages_created += 1
+
+        logger.info(f"Created {messages_created} messages using fallback parsing")

    logger.info(f"Created {messages_created} messages for session {session.session_id}")
    return messages_created