Enhance data integration and transcript parsing

- Improved date parsing in fetch_and_store_chat_data to support multiple formats and added error logging for unparseable dates.
- Enhanced parse_and_store_transcript_messages to handle empty transcripts and expanded message pattern recognition for both User and Assistant.
- Implemented intelligent splitting of transcripts based on detected patterns and timestamps, with fallback mechanisms for unrecognized formats.
- Updated documentation for Celery and Redis setup, troubleshooting, and project structure.
- Added markdown linting configuration and scripts for code formatting.
- Updated Nginx configuration to change the web server port.
- Added xlsxwriter dependency for Excel file handling in project requirements.
2026-03-03 00:01:28 +01:00 · 2025-05-18 19:18:31 +00:00
parent 8bbbb109bd
commit f0ae061fa7
24 changed files with 1672 additions and 931 deletions
--- a/dashboard_project/dashboard/views_export.py
+++ b/dashboard_project/dashboard/views_export.py
@@ -1,9 +1,11 @@
 # dashboard/views_export.py

 import csv
+import io
 import json
 from datetime import timedelta

+import xlsxwriter
 from django.contrib.auth.decorators import login_required
 from django.db.models import Q
 from django.http import HttpResponse
@@ -207,6 +209,11 @@ def export_chats_json(request):
        data_source = DataSource.objects.get(id=data_source_id)
        filename = f"{data_source.name.replace(' ', '_').lower()}_chat_sessions"

+    # Add company name, date, and timestamp to the filename
+    current_time = timezone.now().strftime("%Y%m%d_%H%M%S")
+    company_name = company.name.replace(" ", "_").lower()
+    filename = f"{company_name}_{filename}_{current_time}"
+
    # Prepare the data for JSON export using list comprehension
    data = [
        {
@@ -248,3 +255,188 @@ def export_chats_json(request):
    json.dump(export_data, response, indent=2)

    return response
+
+
+@login_required
+def export_chats_excel(request):
+    """Export the company's chat sessions as an Excel (.xlsx) attachment.
+
+    GET filters: data_source_id, dashboard_id, view (recent/positive/negative/
+    escalated), start_date, end_date, country, sentiment, escalated ("true").
+    Ids not matching this company are rejected by get_object_or_404.
+
+    Returns an HttpResponse with the workbook, or a 403 without a company.
+    """
+    user = request.user
+    company = user.company
+
+    if not company:
+        return HttpResponse("You are not associated with any company.", status=403)
+
+    # Get and apply filters
+    data_source_id = request.GET.get("data_source_id")
+    dashboard_id = request.GET.get("dashboard_id")
+    view = request.GET.get("view", "all")
+    start_date = request.GET.get("start_date")
+    end_date = request.GET.get("end_date")
+    country = request.GET.get("country")
+    sentiment = request.GET.get("sentiment")
+    escalated = request.GET.get("escalated")
+
+    # Base queryset
+    sessions = ChatSession.objects.filter(data_source__company=company)
+
+    # Apply data source filter if selected (company-scoped lookup, reused below)
+    data_source = dashboard = None
+    if data_source_id:
+        data_source = get_object_or_404(DataSource, id=data_source_id, company=company)
+        sessions = sessions.filter(data_source=data_source)
+
+    # Apply dashboard filter if selected
+    if dashboard_id:
+        dashboard = get_object_or_404(Dashboard, id=dashboard_id, company=company)
+        sessions = sessions.filter(data_source__in=dashboard.data_sources.all())
+
+    # Apply view filter
+    if view == "recent":
+        seven_days_ago = timezone.now() - timedelta(days=7)
+        sessions = sessions.filter(start_time__gte=seven_days_ago)
+    elif view == "positive":
+        sessions = sessions.filter(Q(sentiment__icontains="positive"))
+    elif view == "negative":
+        sessions = sessions.filter(Q(sentiment__icontains="negative"))
+    elif view == "escalated":
+        sessions = sessions.filter(escalated=True)
+
+    # Apply additional filters
+    if start_date:
+        sessions = sessions.filter(start_time__date__gte=start_date)
+    if end_date:
+        sessions = sessions.filter(start_time__date__lte=end_date)
+    if country:
+        sessions = sessions.filter(country__icontains=country)
+    if sentiment:
+        sessions = sessions.filter(sentiment__icontains=sentiment)
+    if escalated:
+        escalated_val = escalated.lower() == "true"
+        sessions = sessions.filter(escalated=escalated_val)
+
+    # Order by most recent first
+    sessions = sessions.order_by("-start_time")
+
+    # Create the filename from the objects validated above (no second query)
+    filename = "chat_sessions"
+    if dashboard is not None:
+        filename = f"{dashboard.name.replace(' ', '_').lower()}_chat_sessions"
+    elif data_source is not None:
+        filename = f"{data_source.name.replace(' ', '_').lower()}_chat_sessions"
+
+    # Add company name, date, and timestamp to the filename
+    current_time = timezone.now().strftime("%Y%m%d_%H%M%S")
+    company_name = company.name.replace(" ", "_").lower()
+    filename = f"{company_name}_{filename}_{current_time}"
+
+    # Create in-memory output file
+    output = io.BytesIO()
+
+    # Excel cannot store timezones; without remove_timezone, xlsxwriter raises
+    # a TypeError when write_datetime() receives a timezone-aware datetime.
+    workbook = xlsxwriter.Workbook(output, {"remove_timezone": True})
+    worksheet = workbook.add_worksheet("Chat Sessions")
+
+    # Add a bold format to use to highlight cells
+    bold = workbook.add_format({"bold": True, "bg_color": "#D9EAD3"})
+    date_format = workbook.add_format({"num_format": "yyyy-mm-dd hh:mm:ss"})
+
+    # Write header row with formatting
+    headers = [
+        "Session ID",
+        "Start Time",
+        "End Time",
+        "IP Address",
+        "Country",
+        "Language",
+        "Messages Sent",
+        "Sentiment",
+        "Escalated",
+        "Forwarded HR",
+        "Full Transcript",
+        "Avg Response Time (s)",
+        "Tokens",
+        "Tokens EUR",
+        "Category",
+        "Initial Message",
+        "User Rating",
+    ]
+
+    for col, header in enumerate(headers):
+        worksheet.write(0, col, header, bold)
+
+    # Write data rows
+    for row_num, session in enumerate(sessions, 1):
+        worksheet.write(row_num, 0, session.session_id)
+        # Timestamps use the date format; missing ones become blank cells
+        for date_col, moment in ((1, session.start_time), (2, session.end_time)):
+            if moment:
+                worksheet.write_datetime(row_num, date_col, moment, date_format)
+            else:
+                worksheet.write(row_num, date_col, None)
+
+        worksheet.write(row_num, 3, session.ip_address)
+        worksheet.write(row_num, 4, session.country)
+        worksheet.write(row_num, 5, session.language)
+        worksheet.write(row_num, 6, session.messages_sent)
+        worksheet.write(row_num, 7, session.sentiment)
+        worksheet.write(row_num, 8, "Yes" if session.escalated else "No")
+        worksheet.write(row_num, 9, "Yes" if session.forwarded_hr else "No")
+        worksheet.write(row_num, 10, session.full_transcript)
+        worksheet.write(row_num, 11, session.avg_response_time)
+        worksheet.write(row_num, 12, session.tokens)
+        worksheet.write(row_num, 13, session.tokens_eur)
+        worksheet.write(row_num, 14, session.category)
+        worksheet.write(row_num, 15, session.initial_msg)
+        worksheet.write(row_num, 16, session.user_rating)
+
+    # Add summary sheet with metadata
+    summary = workbook.add_worksheet("Summary")
+    summary.write(0, 0, "Export Information", bold)
+    summary.write(1, 0, "Company:", bold)
+    summary.write(1, 1, company.name)
+    summary.write(2, 0, "Export Date:", bold)
+    summary.write(2, 1, timezone.now().strftime("%Y-%m-%d %H:%M:%S"))
+    summary.write(3, 0, "Total Records:", bold)
+    summary.write(3, 1, len(sessions))
+
+    # Add filters if used
+    filter_row = 5
+    summary.write(filter_row, 0, "Filters Applied:", bold)
+    filter_row += 1
+
+    if data_source is not None:
+        summary.write(filter_row, 0, "Data Source:")
+        summary.write(filter_row, 1, data_source.name)
+        filter_row += 1
+
+    if dashboard is not None:
+        summary.write(filter_row, 0, "Dashboard:")
+        summary.write(filter_row, 1, dashboard.name)
+        filter_row += 1
+
+    if view != "all":
+        summary.write(filter_row, 0, "View:")
+        summary.write(filter_row, 1, view.title())
+        filter_row += 1
+
+    # Auto-adjust column widths for better readability
+    for i, width in enumerate([20, 20, 20, 15, 15, 10, 12, 15, 10, 12, 30, 15, 10, 10, 20, 50, 10]):
+        worksheet.set_column(i, i, width)
+
+    # Close the workbook
+    workbook.close()
+
+    # Set up the response
+    output.seek(0)
+    response = HttpResponse(output, content_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
+    response["Content-Disposition"] = f'attachment; filename="{filename}.xlsx"'
+
+    return response
--- a/dashboard_project/data_integration/utils.py
+++ b/dashboard_project/data_integration/utils.py
@@ -94,19 +94,41 @@ def fetch_and_store_chat_data(source_id=None):
            padded_row = row + [""] * (len(header) - len(row))
            data = dict(zip(header, padded_row, strict=False))

-            try:
-                # Try European date format (DD.MM.YYYY) first
-                start_time = make_aware(datetime.strptime(data["start_time"], "%d.%m.%Y %H:%M:%S"))
-            except ValueError:
-                # Fallback to ISO format (YYYY-MM-DD)
-                start_time = make_aware(datetime.strptime(data["start_time"], "%Y-%m-%d %H:%M:%S"))
+            # Parse date fields with multiple format support
+            start_time = None
+            end_time = None

-            try:
-                # Try European date format (DD.MM.YYYY) first
-                end_time = make_aware(datetime.strptime(data["end_time"], "%d.%m.%Y %H:%M:%S"))
-            except ValueError:
-                # Fallback to ISO format (YYYY-MM-DD)
-                end_time = make_aware(datetime.strptime(data["end_time"], "%Y-%m-%d %H:%M:%S"))
+            # List of date formats to try
+            date_formats = [
+                "%d.%m.%Y %H:%M:%S",  # European format: DD.MM.YYYY HH:MM:SS
+                "%Y-%m-%d %H:%M:%S",  # ISO format: YYYY-MM-DD HH:MM:SS
+                "%m/%d/%Y %H:%M:%S",  # US format: MM/DD/YYYY HH:MM:SS
+                "%Y-%m-%dT%H:%M:%S",  # ISO format with T separator
+                "%Y-%m-%dT%H:%M:%S.%fZ",  # ISO format with milliseconds and Z
+            ]
+
+            # Try to parse start_time with multiple formats
+            for date_format in date_formats:
+                try:
+                    start_time = make_aware(datetime.strptime(data["start_time"], date_format))
+                    break
+                except (ValueError, TypeError):
+                    continue
+
+            # Try to parse end_time with multiple formats
+            for date_format in date_formats:
+                try:
+                    end_time = make_aware(datetime.strptime(data["end_time"], date_format))
+                    break
+                except (ValueError, TypeError):
+                    continue
+
+            # If we couldn't parse the dates, log an error and skip this row
+            if not start_time or not end_time:
+                error_msg = f"Could not parse date fields for session {data['session_id']}: start_time={data['start_time']}, end_time={data['end_time']}"
+                logger.error(error_msg)
+                stats["errors"] += 1
+                continue

            messages_sent = int(data["messages_sent"]) if data["messages_sent"] else None
            escalated = data["escalated"].lower() == "true" if data["escalated"] else None
@@ -199,6 +221,10 @@ def fetch_and_store_transcript(session, timeout=30):
 def parse_and_store_transcript_messages(session, transcript_content):
    """Parse and store messages from a transcript.

+    This function parses a chat transcript that contains messages from both User and Assistant.
+    It identifies message boundaries by looking for lines that start with common sender patterns,
+    and groups all following lines until the next sender change as part of that message.
+
    Args:
        session: The ChatSession object
        transcript_content: The raw transcript content
@@ -206,6 +232,11 @@ def parse_and_store_transcript_messages(session, transcript_content):
    Returns:
        int: Number of messages created
    """
+    # Handle empty transcripts
+    if not transcript_content or transcript_content.strip() == "":
+        logger.warning(f"Empty transcript received for session {session.session_id}")
+        return 0
+
    lines = transcript_content.splitlines()
    current_sender = None
    current_message_lines = []
@@ -217,35 +248,285 @@ def parse_and_store_transcript_messages(session, transcript_content):
        logger.info(f"Deleting {existing_count} existing messages for session {session.session_id}")
        ChatMessage.objects.filter(session=session).delete()

+    # Define common message patterns to detect - expanded to include more variations
+    user_patterns = [
+        "User:",
+        "[User]:",
+        "Customer:",
+        "[Customer]:",
+        "Client:",
+        "[Client]:",
+        "Human:",
+        "[Human]:",
+        "Me:",
+        "[Me]:",
+        "Question:",
+        "User >",
+        "Customer >",
+        "User said:",
+        "Customer said:",
+        "User writes:",
+        "User asked:",
+        "User message:",
+        "From user:",
+        "Client message:",
+        "Q:",
+        "Input:",
+        "Query:",
+        "Person:",
+        "Visitor:",
+        "Guest:",
+        "User input:",
+        "User query:",
+    ]
+    assistant_patterns = [
+        "Assistant:",
+        "[Assistant]:",
+        "Agent:",
+        "[Agent]:",
+        "Bot:",
+        "[Bot]:",
+        "AI:",
+        "[AI]:",
+        "ChatGPT:",
+        "[ChatGPT]:",
+        "System:",
+        "[System]:",
+        "Support:",
+        "[Support]:",
+        "Answer:",
+        "Assistant >",
+        "Bot >",
+        "Assistant said:",
+        "Assistant writes:",
+        "AI responded:",
+        "LLM:",
+        "[LLM]:",
+        "Response:",
+        "A:",
+        "Output:",
+        "AI output:",
+        "Model:",
+        "[Model]:",
+        "Assistant message:",
+        "From assistant:",
+        "Bot response:",
+        "AI says:",
+        "NotsoAI:",
+        "[NotsoAI]:",
+        "Notso:",
+        "[Notso]:",
+    ]
+
+    # Function to save current message before starting a new one
+    def save_current_message():
+        nonlocal current_sender, current_message_lines, messages_created
+        if current_sender and current_message_lines:
+            message_text = "\n".join(current_message_lines)
+            # Only save if there's actual content (not just whitespace)
+            if message_text.strip() and save_message(session, current_sender, message_text):
+                messages_created += 1
+                logger.debug(f"Saved {current_sender} message with {len(current_message_lines)} lines")
+
+    # Initial scan to detect format type and potential message boundaries
+    has_recognized_patterns = False
+    potential_timestamps = []
+    timestamp_pattern_count = 0
+
+    # Regex patterns for common timestamp formats
+    import re
+
+    timestamp_patterns = [
+        r"^\[\d{2}:\d{2}:\d{2}\]",  # [HH:MM:SS]
+        r"^\[\d{2}:\d{2}\]",  # [HH:MM]
+        r"^\(\d{2}:\d{2}:\d{2}\)",  # (HH:MM:SS)
+        r"^\(\d{2}:\d{2}\)",  # (HH:MM)
+        r"^\d{2}:\d{2}:\d{2} -",  # HH:MM:SS -
+        r"^\d{2}:\d{2} -",  # HH:MM -
+        r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}",  # YYYY-MM-DD HH:MM:SS
+    ]
+
+    # First pass: detect format and message boundaries
+    for i, line in enumerate(lines):
+        line_stripped = line.strip()
+
+        # Check for standard message patterns
+        if any(line_stripped.startswith(pattern) for pattern in user_patterns + assistant_patterns):
+            has_recognized_patterns = True
+
+        # Check for timestamp patterns that might indicate message boundaries
+        for pattern in timestamp_patterns:
+            if re.match(pattern, line_stripped):
+                timestamp_pattern_count += 1
+                potential_timestamps.append(i)
+                break
+
+    # If no recognized patterns are found, try to intelligently split the transcript
+    if not has_recognized_patterns and len(lines) > 0:
+        logger.info(
+            f"No standard message patterns found in transcript for session {session.session_id}. Attempting intelligent split."
+        )
+
+        # Try timestamp-based parsing if we have enough consistent timestamps
+        if timestamp_pattern_count > 3 and timestamp_pattern_count > 0.2 * len(lines):
+            logger.info(f"Attempting timestamp-based parsing with {timestamp_pattern_count} detected timestamps")
+
+            # Add the end of file as a boundary
+            potential_timestamps.append(len(lines))
+
+            # Process messages between timestamps
+            for i in range(len(potential_timestamps) - 1):
+                start_idx = potential_timestamps[i]
+                end_idx = potential_timestamps[i + 1]
+
+                message_content = "\n".join(lines[start_idx:end_idx])
+                first_line = lines[start_idx].lower()
+
+                # Simple heuristic to identify sender
+                is_user = any(
+                    user_word in first_line
+                    for user_word in ["user", "customer", "client", "human", "question", "query"]
+                )
+                is_assistant = any(
+                    assistant_word in first_line
+                    for assistant_word in ["assistant", "agent", "bot", "ai", "system", "support", "answer", "response"]
+                )
+
+                sender = "User" if (is_user or (not is_assistant and i % 2 == 0)) else "Assistant"
+
+                if save_message(session, sender, message_content):
+                    messages_created += 1
+
+            logger.info(f"Created {messages_created} messages using timestamp-based parsing")
+            return messages_created
+
+        # Simple heuristic: alternate between user and assistant, with first message from user
+        # Start with paragraphs (blank line separations) as message boundaries
+        paragraphs = []
+        current_paragraph = []
+
+        for line in lines:
+            if line.strip():
+                current_paragraph.append(line)
+            elif current_paragraph:  # Empty line and we have a paragraph
+                paragraphs.append("\n".join(current_paragraph))
+                current_paragraph = []
+
+        # Add the last paragraph if it's not empty
+        if current_paragraph:
+            paragraphs.append("\n".join(current_paragraph))
+
+        # If we have just one paragraph, try to split by sentence boundaries for very long transcripts
+        if len(paragraphs) == 1 and len(paragraphs[0].split()) > 100:
+            import re
+
+            # Try to split by sentence boundaries
+            text = paragraphs[0]
+            # Define sentence ending patterns
+            sentence_endings = r"(?<=[.!?])\s+"
+            sentences = re.split(sentence_endings, text)
+            # Group sentences into logical chunks (assuming alternating speakers)
+            chunks = []
+            current_chunk = []
+
+            for i, sentence in enumerate(sentences):
+                current_chunk.append(sentence)
+                # Every 2-3 sentences or on a natural break like a question mark
+                if (i % 2 == 1 and sentence.endswith("?")) or len(current_chunk) >= 3:
+                    chunks.append(" ".join(current_chunk))
+                    current_chunk = []
+
+            # Add any remaining sentences
+            if current_chunk:
+                chunks.append(" ".join(current_chunk))
+
+            # Save the chunks alternating between user and assistant
+            for i, chunk in enumerate(chunks):
+                if chunk.strip():
+                    sender = "User" if i % 2 == 0 else "Assistant"
+                    if save_message(session, sender, chunk):
+                        messages_created += 1
+
+            logger.info(f"Created {messages_created} messages by splitting single paragraph into sentences")
+            return messages_created
+
+        # Save messages alternating between user and assistant
+        for i, paragraph in enumerate(paragraphs):
+            if paragraph.strip():  # Only save non-empty paragraphs
+                sender = "User" if i % 2 == 0 else "Assistant"
+                if save_message(session, sender, paragraph):
+                    messages_created += 1
+
+        logger.info(f"Created {messages_created} messages using intelligent split for session {session.session_id}")
+        return messages_created
+
+    # Standard processing with recognized patterns
    for line in lines:
-        if line.startswith("User:"):
-            if (
-                current_sender
-                and current_message_lines
-                and save_message(session, current_sender, "\n".join(current_message_lines))
-            ):
-                messages_created += 1
+        line_stripped = line.strip()
+
+        # Skip empty lines at the beginning
+        if not line_stripped and not current_sender:
+            continue
+
+        # Check if this line indicates a new sender
+        is_user_message = any(line_stripped.startswith(pattern) for pattern in user_patterns)
+        is_assistant_message = any(line_stripped.startswith(pattern) for pattern in assistant_patterns)
+
+        if is_user_message:
+            # Save previous message if any
+            save_current_message()
+
+            # Start new user message
            current_sender = "User"
-            current_message_lines = [line.replace("User:", "").strip()]
-        elif line.startswith("Assistant:"):
-            if (
-                current_sender
-                and current_message_lines
-                and save_message(session, current_sender, "\n".join(current_message_lines))
-            ):
-                messages_created += 1
+            # Remove the prefix from the line
+            for pattern in user_patterns:
+                if line_stripped.startswith(pattern):
+                    line = line[len(pattern) :].strip()
+                    break
+            current_message_lines = [line] if line.strip() else []
+        elif is_assistant_message:
+            # Save previous message if any
+            save_current_message()
+
+            # Start new assistant message
            current_sender = "Assistant"
-            current_message_lines = [line.replace("Assistant:", "").strip()]
+            # Remove the prefix from the line
+            for pattern in assistant_patterns:
+                if line_stripped.startswith(pattern):
+                    line = line[len(pattern) :].strip()
+                    break
+            current_message_lines = [line] if line.strip() else []
        elif current_sender:
-            current_message_lines.append(line.strip())
+            # Continue adding to current message
+            current_message_lines.append(line)
+        else:
+            # If we get here with no current_sender, assume it's the start of a user message
+            logger.warning(f"Found line without sender prefix: '{line}'. Assuming User message.")
+            current_sender = "User"
+            current_message_lines = [line]

    # Save the last message
-    if (
-        current_sender
-        and current_message_lines
-        and save_message(session, current_sender, "\n".join(current_message_lines))
-    ):
-        messages_created += 1
+    save_current_message()
+
+    # Handle case with no messages parsed (possibly incorrectly formatted transcript)
+    if messages_created == 0 and lines:
+        logger.warning(
+            f"No messages were parsed from transcript for session {session.session_id}. Using fallback parsing."
+        )
+
+        # Fallback: Just split the transcript in half, first part user, second part assistant
+        mid_point = len(lines) // 2
+        user_content = "\n".join(lines[:mid_point])
+        assistant_content = "\n".join(lines[mid_point:])
+
+        # Save the split messages if they have content
+        if user_content.strip() and save_message(session, "User", user_content):
+            messages_created += 1
+
+        if assistant_content.strip() and save_message(session, "Assistant", assistant_content):
+            messages_created += 1
+
+        logger.info(f"Created {messages_created} messages using fallback parsing")

    logger.info(f"Created {messages_created} messages for session {session.session_id}")
    return messages_created