mirror of
https://github.com/kjanat/livegraphs-django.git
synced 2026-01-16 07:52:11 +01:00
Enhance data integration and transcript parsing
- Improved date parsing in fetch_and_store_chat_data to support multiple formats and added error logging for unparseable dates.
- Enhanced parse_and_store_transcript_messages to handle empty transcripts and expanded message pattern recognition for both User and Assistant.
- Implemented intelligent splitting of transcripts based on detected patterns and timestamps, with fallback mechanisms for unrecognized formats.
- Updated documentation for Celery and Redis setup, troubleshooting, and project structure.
- Added markdown linting configuration and scripts for code formatting.
- Updated Nginx configuration to change the web server port.
- Added xlsxwriter dependency for Excel file handling in project requirements.
This commit is contained in:
@ -1,9 +1,11 @@
|
||||
# dashboard/views_export.py
|
||||
|
||||
import csv
|
||||
import io
|
||||
import json
|
||||
from datetime import timedelta
|
||||
|
||||
import xlsxwriter
|
||||
from django.contrib.auth.decorators import login_required
|
||||
from django.db.models import Q
|
||||
from django.http import HttpResponse
|
||||
@ -207,6 +209,11 @@ def export_chats_json(request):
|
||||
data_source = DataSource.objects.get(id=data_source_id)
|
||||
filename = f"{data_source.name.replace(' ', '_').lower()}_chat_sessions"
|
||||
|
||||
# Add company name, date, and timestamp to the filename
|
||||
current_time = timezone.now().strftime("%Y%m%d_%H%M%S")
|
||||
company_name = company.name.replace(" ", "_").lower()
|
||||
filename = f"{company_name}_{filename}_{current_time}"
|
||||
|
||||
# Prepare the data for JSON export using list comprehension
|
||||
data = [
|
||||
{
|
||||
@ -248,3 +255,188 @@ def export_chats_json(request):
|
||||
json.dump(export_data, response, indent=2)
|
||||
|
||||
return response
|
||||
|
||||
|
||||
@login_required
def export_chats_excel(request):
    """Export the requesting user's company chat sessions as an .xlsx download.

    The workbook contains a "Chat Sessions" sheet (one row per session, most
    recent first) and a "Summary" sheet with export metadata and the
    data source / dashboard / view filters that were applied.

    Query parameters (all optional, read from ``request.GET``):
        data_source_id: restrict to one data source of the company.
        dashboard_id: restrict to the data sources attached to one dashboard.
        view: "all" (default), "recent" (last 7 days), "positive",
            "negative" or "escalated".
        start_date / end_date: inclusive bounds on the session start date.
        country / sentiment: case-insensitive substring filters.
        escalated: "true" (case-insensitive) selects escalated sessions,
            any other non-empty value selects non-escalated ones.

    Returns:
        HttpResponse: the workbook as an attachment, or a 403 response when
        the user is not associated with a company.

    Raises:
        Http404: via ``get_object_or_404`` when the requested data source or
            dashboard does not exist for the user's company.
    """
    user = request.user
    company = user.company

    if not company:
        return HttpResponse("You are not associated with any company.", status=403)

    # Collect the filter parameters.
    params = request.GET
    data_source_id = params.get("data_source_id")
    dashboard_id = params.get("dashboard_id")
    view = params.get("view", "all")
    start_date = params.get("start_date")
    end_date = params.get("end_date")
    country = params.get("country")
    sentiment = params.get("sentiment")
    escalated = params.get("escalated")

    # Base queryset: only sessions that belong to the user's company.
    sessions = ChatSession.objects.filter(data_source__company=company)

    # The company-scoped lookups below are reused for the filename and the
    # summary sheet, so each object is fetched (and access-checked) only once.
    data_source = None
    if data_source_id:
        data_source = get_object_or_404(DataSource, id=data_source_id, company=company)
        sessions = sessions.filter(data_source=data_source)

    dashboard = None
    if dashboard_id:
        dashboard = get_object_or_404(Dashboard, id=dashboard_id, company=company)
        sessions = sessions.filter(data_source__in=dashboard.data_sources.all())

    # Apply view filter
    if view == "recent":
        seven_days_ago = timezone.now() - timedelta(days=7)
        sessions = sessions.filter(start_time__gte=seven_days_ago)
    elif view == "positive":
        sessions = sessions.filter(Q(sentiment__icontains="positive"))
    elif view == "negative":
        sessions = sessions.filter(Q(sentiment__icontains="negative"))
    elif view == "escalated":
        sessions = sessions.filter(escalated=True)

    # Apply additional filters
    if start_date:
        sessions = sessions.filter(start_time__date__gte=start_date)
    if end_date:
        sessions = sessions.filter(start_time__date__lte=end_date)
    if country:
        sessions = sessions.filter(country__icontains=country)
    if sentiment:
        sessions = sessions.filter(sentiment__icontains=sentiment)
    if escalated:
        sessions = sessions.filter(escalated=escalated.lower() == "true")

    # Order by most recent first
    sessions = sessions.order_by("-start_time")

    # Build the filename: <company>_<dashboard|data source>_chat_sessions_<timestamp>.
    # The dashboard name takes precedence over the data source name.
    filename = "chat_sessions"
    if dashboard is not None:
        filename = f"{dashboard.name.replace(' ', '_').lower()}_chat_sessions"
    elif data_source is not None:
        filename = f"{data_source.name.replace(' ', '_').lower()}_chat_sessions"

    now = timezone.now()
    company_name = company.name.replace(" ", "_").lower()
    filename = f"{company_name}_{filename}_{now.strftime('%Y%m%d_%H%M%S')}"

    # Create in-memory output file
    output = io.BytesIO()

    # remove_timezone: XlsxWriter refuses timezone-aware datetimes, so let it
    #   strip tzinfo instead of raising on aware model datetimes.
    # strings_to_formulas: chat text is untrusted; never let a value starting
    #   with "=" be stored as a live spreadsheet formula.
    workbook = xlsxwriter.Workbook(
        output,
        {"remove_timezone": True, "strings_to_formulas": False},
    )
    worksheet = workbook.add_worksheet("Chat Sessions")

    # Cell formats: highlighted bold for headers/labels, explicit date-time format.
    bold = workbook.add_format({"bold": True, "bg_color": "#D9EAD3"})
    date_format = workbook.add_format({"num_format": "yyyy-mm-dd hh:mm:ss"})

    # Write header row with formatting
    headers = [
        "Session ID",
        "Start Time",
        "End Time",
        "IP Address",
        "Country",
        "Language",
        "Messages Sent",
        "Sentiment",
        "Escalated",
        "Forwarded HR",
        "Full Transcript",
        "Avg Response Time (s)",
        "Tokens",
        "Tokens EUR",
        "Category",
        "Initial Message",
        "User Rating",
    ]
    for col, header in enumerate(headers):
        worksheet.write(0, col, header, bold)

    # Write data rows (row 0 is the header, so data starts at row 1).
    total_records = 0
    for row_num, session in enumerate(sessions, 1):
        total_records = row_num
        worksheet.write(row_num, 0, session.session_id)

        # Columns 1-2: timestamps get the date format; missing values stay blank.
        for col, moment in ((1, session.start_time), (2, session.end_time)):
            if moment:
                worksheet.write_datetime(row_num, col, moment, date_format)
            else:
                worksheet.write(row_num, col, None)

        # Columns 3-16, in the same order as the headers above.
        remaining_values = (
            session.ip_address,
            session.country,
            session.language,
            session.messages_sent,
            session.sentiment,
            "Yes" if session.escalated else "No",
            "Yes" if session.forwarded_hr else "No",
            session.full_transcript,
            session.avg_response_time,
            session.tokens,
            session.tokens_eur,
            session.category,
            session.initial_msg,
            session.user_rating,
        )
        for col, value in enumerate(remaining_values, 3):
            worksheet.write(row_num, col, value)

    # Add summary sheet with metadata
    summary = workbook.add_worksheet("Summary")
    summary.write(0, 0, "Export Information", bold)
    summary.write(1, 0, "Company:", bold)
    summary.write(1, 1, company.name)
    summary.write(2, 0, "Export Date:", bold)
    summary.write(2, 1, now.strftime("%Y-%m-%d %H:%M:%S"))
    summary.write(3, 0, "Total Records:", bold)
    summary.write(3, 1, total_records)

    # List the data source / dashboard / view filters that were used.
    filter_row = 5
    summary.write(filter_row, 0, "Filters Applied:", bold)
    filter_row += 1

    if data_source is not None:
        summary.write(filter_row, 0, "Data Source:")
        summary.write(filter_row, 1, data_source.name)
        filter_row += 1

    if dashboard is not None:
        summary.write(filter_row, 0, "Dashboard:")
        summary.write(filter_row, 1, dashboard.name)
        filter_row += 1

    if view != "all":
        summary.write(filter_row, 0, "View:")
        summary.write(filter_row, 1, view.title())
        filter_row += 1

    # Fixed per-column widths (one entry per header) for readability.
    column_widths = [20, 20, 20, 15, 15, 10, 12, 15, 10, 12, 30, 15, 10, 10, 20, 50, 10]
    for i, width in enumerate(column_widths):
        worksheet.set_column(i, i, width)

    # Closing the workbook flushes the .xlsx data into ``output``.
    workbook.close()

    # Set up the response
    output.seek(0)
    response = HttpResponse(output, content_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
    response["Content-Disposition"] = f'attachment; filename="{filename}.xlsx"'

    return response
|
||||
|
||||
@ -94,19 +94,41 @@ def fetch_and_store_chat_data(source_id=None):
|
||||
padded_row = row + [""] * (len(header) - len(row))
|
||||
data = dict(zip(header, padded_row, strict=False))
|
||||
|
||||
try:
|
||||
# Try European date format (DD.MM.YYYY) first
|
||||
start_time = make_aware(datetime.strptime(data["start_time"], "%d.%m.%Y %H:%M:%S"))
|
||||
except ValueError:
|
||||
# Fallback to ISO format (YYYY-MM-DD)
|
||||
start_time = make_aware(datetime.strptime(data["start_time"], "%Y-%m-%d %H:%M:%S"))
|
||||
# Parse date fields with multiple format support
|
||||
start_time = None
|
||||
end_time = None
|
||||
|
||||
try:
|
||||
# Try European date format (DD.MM.YYYY) first
|
||||
end_time = make_aware(datetime.strptime(data["end_time"], "%d.%m.%Y %H:%M:%S"))
|
||||
except ValueError:
|
||||
# Fallback to ISO format (YYYY-MM-DD)
|
||||
end_time = make_aware(datetime.strptime(data["end_time"], "%Y-%m-%d %H:%M:%S"))
|
||||
# List of date formats to try
|
||||
date_formats = [
|
||||
"%d.%m.%Y %H:%M:%S", # European format: DD.MM.YYYY HH:MM:SS
|
||||
"%Y-%m-%d %H:%M:%S", # ISO format: YYYY-MM-DD HH:MM:SS
|
||||
"%m/%d/%Y %H:%M:%S", # US format: MM/DD/YYYY HH:MM:SS
|
||||
"%Y-%m-%dT%H:%M:%S", # ISO format with T separator
|
||||
"%Y-%m-%dT%H:%M:%S.%fZ", # ISO format with milliseconds and Z
|
||||
]
|
||||
|
||||
# Try to parse start_time with multiple formats
|
||||
for date_format in date_formats:
|
||||
try:
|
||||
start_time = make_aware(datetime.strptime(data["start_time"], date_format))
|
||||
break
|
||||
except (ValueError, TypeError):
|
||||
continue
|
||||
|
||||
# Try to parse end_time with multiple formats
|
||||
for date_format in date_formats:
|
||||
try:
|
||||
end_time = make_aware(datetime.strptime(data["end_time"], date_format))
|
||||
break
|
||||
except (ValueError, TypeError):
|
||||
continue
|
||||
|
||||
# If we couldn't parse the dates, log an error and skip this row
|
||||
if not start_time or not end_time:
|
||||
error_msg = f"Could not parse date fields for session {data['session_id']}: start_time={data['start_time']}, end_time={data['end_time']}"
|
||||
logger.error(error_msg)
|
||||
stats["errors"] += 1
|
||||
continue
|
||||
|
||||
messages_sent = int(data["messages_sent"]) if data["messages_sent"] else None
|
||||
escalated = data["escalated"].lower() == "true" if data["escalated"] else None
|
||||
@ -199,6 +221,10 @@ def fetch_and_store_transcript(session, timeout=30):
|
||||
def parse_and_store_transcript_messages(session, transcript_content):
|
||||
"""Parse and store messages from a transcript.
|
||||
|
||||
This function parses a chat transcript that contains messages from both User and Assistant.
|
||||
It identifies message boundaries by looking for lines that start with common sender patterns,
|
||||
and groups all following lines until the next sender change as part of that message.
|
||||
|
||||
Args:
|
||||
session: The ChatSession object
|
||||
transcript_content: The raw transcript content
|
||||
@ -206,6 +232,11 @@ def parse_and_store_transcript_messages(session, transcript_content):
|
||||
Returns:
|
||||
int: Number of messages created
|
||||
"""
|
||||
# Handle empty transcripts
|
||||
if not transcript_content or transcript_content.strip() == "":
|
||||
logger.warning(f"Empty transcript received for session {session.session_id}")
|
||||
return 0
|
||||
|
||||
lines = transcript_content.splitlines()
|
||||
current_sender = None
|
||||
current_message_lines = []
|
||||
@ -217,35 +248,285 @@ def parse_and_store_transcript_messages(session, transcript_content):
|
||||
logger.info(f"Deleting {existing_count} existing messages for session {session.session_id}")
|
||||
ChatMessage.objects.filter(session=session).delete()
|
||||
|
||||
# Define common message patterns to detect - expanded to include more variations
|
||||
user_patterns = [
|
||||
"User:",
|
||||
"[User]:",
|
||||
"Customer:",
|
||||
"[Customer]:",
|
||||
"Client:",
|
||||
"[Client]:",
|
||||
"Human:",
|
||||
"[Human]:",
|
||||
"Me:",
|
||||
"[Me]:",
|
||||
"Question:",
|
||||
"User >",
|
||||
"Customer >",
|
||||
"User said:",
|
||||
"Customer said:",
|
||||
"User writes:",
|
||||
"User asked:",
|
||||
"User message:",
|
||||
"From user:",
|
||||
"Client message:",
|
||||
"Q:",
|
||||
"Input:",
|
||||
"Query:",
|
||||
"Person:",
|
||||
"Visitor:",
|
||||
"Guest:",
|
||||
"User input:",
|
||||
"User query:",
|
||||
]
|
||||
assistant_patterns = [
|
||||
"Assistant:",
|
||||
"[Assistant]:",
|
||||
"Agent:",
|
||||
"[Agent]:",
|
||||
"Bot:",
|
||||
"[Bot]:",
|
||||
"AI:",
|
||||
"[AI]:",
|
||||
"ChatGPT:",
|
||||
"[ChatGPT]:",
|
||||
"System:",
|
||||
"[System]:",
|
||||
"Support:",
|
||||
"[Support]:",
|
||||
"Answer:",
|
||||
"Assistant >",
|
||||
"Bot >",
|
||||
"Assistant said:",
|
||||
"Assistant writes:",
|
||||
"AI responded:",
|
||||
"LLM:",
|
||||
"[LLM]:",
|
||||
"Response:",
|
||||
"A:",
|
||||
"Output:",
|
||||
"AI output:",
|
||||
"Model:",
|
||||
"[Model]:",
|
||||
"Assistant message:",
|
||||
"From assistant:",
|
||||
"Bot response:",
|
||||
"AI says:",
|
||||
"NotsoAI:",
|
||||
"[NotsoAI]:",
|
||||
"Notso:",
|
||||
"[Notso]:",
|
||||
]
|
||||
|
||||
# Function to save current message before starting a new one
|
||||
def save_current_message():
|
||||
nonlocal current_sender, current_message_lines, messages_created
|
||||
if current_sender and current_message_lines:
|
||||
message_text = "\n".join(current_message_lines)
|
||||
# Only save if there's actual content (not just whitespace)
|
||||
if message_text.strip() and save_message(session, current_sender, message_text):
|
||||
messages_created += 1
|
||||
logger.debug(f"Saved {current_sender} message with {len(current_message_lines)} lines")
|
||||
|
||||
# Initial scan to detect format type and potential message boundaries
|
||||
has_recognized_patterns = False
|
||||
potential_timestamps = []
|
||||
timestamp_pattern_count = 0
|
||||
|
||||
# Regex patterns for common timestamp formats
|
||||
import re
|
||||
|
||||
timestamp_patterns = [
|
||||
r"^\[\d{2}:\d{2}:\d{2}\]", # [HH:MM:SS]
|
||||
r"^\[\d{2}:\d{2}\]", # [HH:MM]
|
||||
r"^\(\d{2}:\d{2}:\d{2}\)", # (HH:MM:SS)
|
||||
r"^\(\d{2}:\d{2}\)", # (HH:MM)
|
||||
r"^\d{2}:\d{2}:\d{2} -", # HH:MM:SS -
|
||||
r"^\d{2}:\d{2} -", # HH:MM -
|
||||
r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", # YYYY-MM-DD HH:MM:SS
|
||||
]
|
||||
|
||||
# First pass: detect format and message boundaries
|
||||
for i, line in enumerate(lines):
|
||||
line_stripped = line.strip()
|
||||
|
||||
# Check for standard message patterns
|
||||
if any(line_stripped.startswith(pattern) for pattern in user_patterns + assistant_patterns):
|
||||
has_recognized_patterns = True
|
||||
|
||||
# Check for timestamp patterns that might indicate message boundaries
|
||||
for pattern in timestamp_patterns:
|
||||
if re.match(pattern, line_stripped):
|
||||
timestamp_pattern_count += 1
|
||||
potential_timestamps.append(i)
|
||||
break
|
||||
|
||||
# If no recognized patterns are found, try to intelligently split the transcript
|
||||
if not has_recognized_patterns and len(lines) > 0:
|
||||
logger.info(
|
||||
f"No standard message patterns found in transcript for session {session.session_id}. Attempting intelligent split."
|
||||
)
|
||||
|
||||
# Try timestamp-based parsing if we have enough consistent timestamps
|
||||
if timestamp_pattern_count > 3 and timestamp_pattern_count > 0.2 * len(lines):
|
||||
logger.info(f"Attempting timestamp-based parsing with {timestamp_pattern_count} detected timestamps")
|
||||
|
||||
# Add the end of file as a boundary
|
||||
potential_timestamps.append(len(lines))
|
||||
|
||||
# Process messages between timestamps
|
||||
for i in range(len(potential_timestamps) - 1):
|
||||
start_idx = potential_timestamps[i]
|
||||
end_idx = potential_timestamps[i + 1]
|
||||
|
||||
message_content = "\n".join(lines[start_idx:end_idx])
|
||||
first_line = lines[start_idx].lower()
|
||||
|
||||
# Simple heuristic to identify sender
|
||||
is_user = any(
|
||||
user_word in first_line
|
||||
for user_word in ["user", "customer", "client", "human", "question", "query"]
|
||||
)
|
||||
is_assistant = any(
|
||||
assistant_word in first_line
|
||||
for assistant_word in ["assistant", "agent", "bot", "ai", "system", "support", "answer", "response"]
|
||||
)
|
||||
|
||||
sender = "User" if (is_user or (not is_assistant and i % 2 == 0)) else "Assistant"
|
||||
|
||||
if save_message(session, sender, message_content):
|
||||
messages_created += 1
|
||||
|
||||
logger.info(f"Created {messages_created} messages using timestamp-based parsing")
|
||||
return messages_created
|
||||
|
||||
# Simple heuristic: alternate between user and assistant, with first message from user
|
||||
# Start with paragraphs (blank line separations) as message boundaries
|
||||
paragraphs = []
|
||||
current_paragraph = []
|
||||
|
||||
for line in lines:
|
||||
if line.strip():
|
||||
current_paragraph.append(line)
|
||||
elif current_paragraph: # Empty line and we have a paragraph
|
||||
paragraphs.append("\n".join(current_paragraph))
|
||||
current_paragraph = []
|
||||
|
||||
# Add the last paragraph if it's not empty
|
||||
if current_paragraph:
|
||||
paragraphs.append("\n".join(current_paragraph))
|
||||
|
||||
# If we have just one paragraph, try to split by sentence boundaries for very long transcripts
|
||||
if len(paragraphs) == 1 and len(paragraphs[0].split()) > 100:
|
||||
import re
|
||||
|
||||
# Try to split by sentence boundaries
|
||||
text = paragraphs[0]
|
||||
# Define sentence ending patterns
|
||||
sentence_endings = r"(?<=[.!?])\s+"
|
||||
sentences = re.split(sentence_endings, text)
|
||||
# Group sentences into logical chunks (assuming alternating speakers)
|
||||
chunks = []
|
||||
current_chunk = []
|
||||
|
||||
for i, sentence in enumerate(sentences):
|
||||
current_chunk.append(sentence)
|
||||
# Every 2-3 sentences or on a natural break like a question mark
|
||||
if (i % 2 == 1 and sentence.endswith("?")) or len(current_chunk) >= 3:
|
||||
chunks.append(" ".join(current_chunk))
|
||||
current_chunk = []
|
||||
|
||||
# Add any remaining sentences
|
||||
if current_chunk:
|
||||
chunks.append(" ".join(current_chunk))
|
||||
|
||||
# Save the chunks alternating between user and assistant
|
||||
for i, chunk in enumerate(chunks):
|
||||
if chunk.strip():
|
||||
sender = "User" if i % 2 == 0 else "Assistant"
|
||||
if save_message(session, sender, chunk):
|
||||
messages_created += 1
|
||||
|
||||
logger.info(f"Created {messages_created} messages by splitting single paragraph into sentences")
|
||||
return messages_created
|
||||
|
||||
# Save messages alternating between user and assistant
|
||||
for i, paragraph in enumerate(paragraphs):
|
||||
if paragraph.strip(): # Only save non-empty paragraphs
|
||||
sender = "User" if i % 2 == 0 else "Assistant"
|
||||
if save_message(session, sender, paragraph):
|
||||
messages_created += 1
|
||||
|
||||
logger.info(f"Created {messages_created} messages using intelligent split for session {session.session_id}")
|
||||
return messages_created
|
||||
|
||||
# Standard processing with recognized patterns
|
||||
for line in lines:
|
||||
if line.startswith("User:"):
|
||||
if (
|
||||
current_sender
|
||||
and current_message_lines
|
||||
and save_message(session, current_sender, "\n".join(current_message_lines))
|
||||
):
|
||||
messages_created += 1
|
||||
line_stripped = line.strip()
|
||||
|
||||
# Skip empty lines at the beginning
|
||||
if not line_stripped and not current_sender:
|
||||
continue
|
||||
|
||||
# Check if this line indicates a new sender
|
||||
is_user_message = any(line_stripped.startswith(pattern) for pattern in user_patterns)
|
||||
is_assistant_message = any(line_stripped.startswith(pattern) for pattern in assistant_patterns)
|
||||
|
||||
if is_user_message:
|
||||
# Save previous message if any
|
||||
save_current_message()
|
||||
|
||||
# Start new user message
|
||||
current_sender = "User"
|
||||
current_message_lines = [line.replace("User:", "").strip()]
|
||||
elif line.startswith("Assistant:"):
|
||||
if (
|
||||
current_sender
|
||||
and current_message_lines
|
||||
and save_message(session, current_sender, "\n".join(current_message_lines))
|
||||
):
|
||||
messages_created += 1
|
||||
# Remove the prefix from the line
|
||||
for pattern in user_patterns:
|
||||
if line_stripped.startswith(pattern):
|
||||
line = line[len(pattern) :].strip()
|
||||
break
|
||||
current_message_lines = [line] if line.strip() else []
|
||||
elif is_assistant_message:
|
||||
# Save previous message if any
|
||||
save_current_message()
|
||||
|
||||
# Start new assistant message
|
||||
current_sender = "Assistant"
|
||||
current_message_lines = [line.replace("Assistant:", "").strip()]
|
||||
# Remove the prefix from the line
|
||||
for pattern in assistant_patterns:
|
||||
if line_stripped.startswith(pattern):
|
||||
line = line[len(pattern) :].strip()
|
||||
break
|
||||
current_message_lines = [line] if line.strip() else []
|
||||
elif current_sender:
|
||||
current_message_lines.append(line.strip())
|
||||
# Continue adding to current message
|
||||
current_message_lines.append(line)
|
||||
else:
|
||||
# If we get here with no current_sender, assume it's the start of a user message
|
||||
logger.warning(f"Found line without sender prefix: '{line}'. Assuming User message.")
|
||||
current_sender = "User"
|
||||
current_message_lines = [line]
|
||||
|
||||
# Save the last message
|
||||
if (
|
||||
current_sender
|
||||
and current_message_lines
|
||||
and save_message(session, current_sender, "\n".join(current_message_lines))
|
||||
):
|
||||
messages_created += 1
|
||||
save_current_message()
|
||||
|
||||
# Handle case with no messages parsed (possibly incorrectly formatted transcript)
|
||||
if messages_created == 0 and lines:
|
||||
logger.warning(
|
||||
f"No messages were parsed from transcript for session {session.session_id}. Using fallback parsing."
|
||||
)
|
||||
|
||||
# Fallback: Just split the transcript in half, first part user, second part assistant
|
||||
mid_point = len(lines) // 2
|
||||
user_content = "\n".join(lines[:mid_point])
|
||||
assistant_content = "\n".join(lines[mid_point:])
|
||||
|
||||
# Save the split messages if they have content
|
||||
if user_content.strip() and save_message(session, "User", user_content):
|
||||
messages_created += 1
|
||||
|
||||
if assistant_content.strip() and save_message(session, "Assistant", assistant_content):
|
||||
messages_created += 1
|
||||
|
||||
logger.info(f"Created {messages_created} messages using fallback parsing")
|
||||
|
||||
logger.info(f"Created {messages_created} messages for session {session.session_id}")
|
||||
return messages_created
|
||||
|
||||
Reference in New Issue
Block a user