Enhance data integration and transcript parsing

- Improved date parsing in fetch_and_store_chat_data to support multiple formats and added error logging for unparseable dates.
- Enhanced parse_and_store_transcript_messages to handle empty transcripts and expanded message pattern recognition for both User and Assistant.
- Implemented intelligent splitting of transcripts based on detected patterns and timestamps, with fallback mechanisms for unrecognized formats.
- Updated documentation for Celery and Redis setup, troubleshooting, and project structure.
- Added markdown linting configuration and scripts for code formatting.
- Updated Nginx configuration to change the web server port.
- Added xlsxwriter dependency for Excel file handling in project requirements.
This commit is contained in:
2025-05-18 19:18:31 +00:00
parent 8bbbb109bd
commit f0ae061fa7
24 changed files with 1672 additions and 931 deletions

View File

@ -94,19 +94,41 @@ def fetch_and_store_chat_data(source_id=None):
padded_row = row + [""] * (len(header) - len(row))
data = dict(zip(header, padded_row, strict=False))
try:
# Try European date format (DD.MM.YYYY) first
start_time = make_aware(datetime.strptime(data["start_time"], "%d.%m.%Y %H:%M:%S"))
except ValueError:
# Fallback to ISO format (YYYY-MM-DD)
start_time = make_aware(datetime.strptime(data["start_time"], "%Y-%m-%d %H:%M:%S"))
# Parse date fields with multiple format support
start_time = None
end_time = None
try:
# Try European date format (DD.MM.YYYY) first
end_time = make_aware(datetime.strptime(data["end_time"], "%d.%m.%Y %H:%M:%S"))
except ValueError:
# Fallback to ISO format (YYYY-MM-DD)
end_time = make_aware(datetime.strptime(data["end_time"], "%Y-%m-%d %H:%M:%S"))
# List of date formats to try
date_formats = [
"%d.%m.%Y %H:%M:%S", # European format: DD.MM.YYYY HH:MM:SS
"%Y-%m-%d %H:%M:%S", # ISO format: YYYY-MM-DD HH:MM:SS
"%m/%d/%Y %H:%M:%S", # US format: MM/DD/YYYY HH:MM:SS
"%Y-%m-%dT%H:%M:%S", # ISO format with T separator
"%Y-%m-%dT%H:%M:%S.%fZ", # ISO format with milliseconds and Z
]
# Try to parse start_time with multiple formats
for date_format in date_formats:
try:
start_time = make_aware(datetime.strptime(data["start_time"], date_format))
break
except (ValueError, TypeError):
continue
# Try to parse end_time with multiple formats
for date_format in date_formats:
try:
end_time = make_aware(datetime.strptime(data["end_time"], date_format))
break
except (ValueError, TypeError):
continue
# If we couldn't parse the dates, log an error and skip this row
if not start_time or not end_time:
error_msg = f"Could not parse date fields for session {data['session_id']}: start_time={data['start_time']}, end_time={data['end_time']}"
logger.error(error_msg)
stats["errors"] += 1
continue
messages_sent = int(data["messages_sent"]) if data["messages_sent"] else None
escalated = data["escalated"].lower() == "true" if data["escalated"] else None
@ -199,6 +221,10 @@ def fetch_and_store_transcript(session, timeout=30):
def parse_and_store_transcript_messages(session, transcript_content):
"""Parse and store messages from a transcript.
This function parses a chat transcript that contains messages from both User and Assistant.
It identifies message boundaries by looking for lines that start with common sender patterns,
and groups all following lines until the next sender change as part of that message.
Args:
session: The ChatSession object
transcript_content: The raw transcript content
@ -206,6 +232,11 @@ def parse_and_store_transcript_messages(session, transcript_content):
Returns:
int: Number of messages created
"""
# Handle empty transcripts
if not transcript_content or transcript_content.strip() == "":
logger.warning(f"Empty transcript received for session {session.session_id}")
return 0
lines = transcript_content.splitlines()
current_sender = None
current_message_lines = []
@ -217,35 +248,285 @@ def parse_and_store_transcript_messages(session, transcript_content):
logger.info(f"Deleting {existing_count} existing messages for session {session.session_id}")
ChatMessage.objects.filter(session=session).delete()
# Define common message patterns to detect - expanded to include more variations
user_patterns = [
"User:",
"[User]:",
"Customer:",
"[Customer]:",
"Client:",
"[Client]:",
"Human:",
"[Human]:",
"Me:",
"[Me]:",
"Question:",
"User >",
"Customer >",
"User said:",
"Customer said:",
"User writes:",
"User asked:",
"User message:",
"From user:",
"Client message:",
"Q:",
"Input:",
"Query:",
"Person:",
"Visitor:",
"Guest:",
"User input:",
"User query:",
]
assistant_patterns = [
"Assistant:",
"[Assistant]:",
"Agent:",
"[Agent]:",
"Bot:",
"[Bot]:",
"AI:",
"[AI]:",
"ChatGPT:",
"[ChatGPT]:",
"System:",
"[System]:",
"Support:",
"[Support]:",
"Answer:",
"Assistant >",
"Bot >",
"Assistant said:",
"Assistant writes:",
"AI responded:",
"LLM:",
"[LLM]:",
"Response:",
"A:",
"Output:",
"AI output:",
"Model:",
"[Model]:",
"Assistant message:",
"From assistant:",
"Bot response:",
"AI says:",
"NotsoAI:",
"[NotsoAI]:",
"Notso:",
"[Notso]:",
]
# Function to save current message before starting a new one
def save_current_message():
nonlocal current_sender, current_message_lines, messages_created
if current_sender and current_message_lines:
message_text = "\n".join(current_message_lines)
# Only save if there's actual content (not just whitespace)
if message_text.strip() and save_message(session, current_sender, message_text):
messages_created += 1
logger.debug(f"Saved {current_sender} message with {len(current_message_lines)} lines")
# Initial scan to detect format type and potential message boundaries
has_recognized_patterns = False
potential_timestamps = []
timestamp_pattern_count = 0
# Regex patterns for common timestamp formats
import re
timestamp_patterns = [
r"^\[\d{2}:\d{2}:\d{2}\]", # [HH:MM:SS]
r"^\[\d{2}:\d{2}\]", # [HH:MM]
r"^\(\d{2}:\d{2}:\d{2}\)", # (HH:MM:SS)
r"^\(\d{2}:\d{2}\)", # (HH:MM)
r"^\d{2}:\d{2}:\d{2} -", # HH:MM:SS -
r"^\d{2}:\d{2} -", # HH:MM -
r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", # YYYY-MM-DD HH:MM:SS
]
# First pass: detect format and message boundaries
for i, line in enumerate(lines):
line_stripped = line.strip()
# Check for standard message patterns
if any(line_stripped.startswith(pattern) for pattern in user_patterns + assistant_patterns):
has_recognized_patterns = True
# Check for timestamp patterns that might indicate message boundaries
for pattern in timestamp_patterns:
if re.match(pattern, line_stripped):
timestamp_pattern_count += 1
potential_timestamps.append(i)
break
# If no recognized patterns are found, try to intelligently split the transcript
if not has_recognized_patterns and len(lines) > 0:
logger.info(
f"No standard message patterns found in transcript for session {session.session_id}. Attempting intelligent split."
)
# Try timestamp-based parsing if we have enough consistent timestamps
if timestamp_pattern_count > 3 and timestamp_pattern_count > 0.2 * len(lines):
logger.info(f"Attempting timestamp-based parsing with {timestamp_pattern_count} detected timestamps")
# Add the end of file as a boundary
potential_timestamps.append(len(lines))
# Process messages between timestamps
for i in range(len(potential_timestamps) - 1):
start_idx = potential_timestamps[i]
end_idx = potential_timestamps[i + 1]
message_content = "\n".join(lines[start_idx:end_idx])
first_line = lines[start_idx].lower()
# Simple heuristic to identify sender
is_user = any(
user_word in first_line
for user_word in ["user", "customer", "client", "human", "question", "query"]
)
is_assistant = any(
assistant_word in first_line
for assistant_word in ["assistant", "agent", "bot", "ai", "system", "support", "answer", "response"]
)
sender = "User" if (is_user or (not is_assistant and i % 2 == 0)) else "Assistant"
if save_message(session, sender, message_content):
messages_created += 1
logger.info(f"Created {messages_created} messages using timestamp-based parsing")
return messages_created
# Simple heuristic: alternate between user and assistant, with first message from user
# Start with paragraphs (blank line separations) as message boundaries
paragraphs = []
current_paragraph = []
for line in lines:
if line.strip():
current_paragraph.append(line)
elif current_paragraph: # Empty line and we have a paragraph
paragraphs.append("\n".join(current_paragraph))
current_paragraph = []
# Add the last paragraph if it's not empty
if current_paragraph:
paragraphs.append("\n".join(current_paragraph))
# If we have just one paragraph, try to split by sentence boundaries for very long transcripts
if len(paragraphs) == 1 and len(paragraphs[0].split()) > 100:
import re
# Try to split by sentence boundaries
text = paragraphs[0]
# Define sentence ending patterns
sentence_endings = r"(?<=[.!?])\s+"
sentences = re.split(sentence_endings, text)
# Group sentences into logical chunks (assuming alternating speakers)
chunks = []
current_chunk = []
for i, sentence in enumerate(sentences):
current_chunk.append(sentence)
# Every 2-3 sentences or on a natural break like a question mark
if (i % 2 == 1 and sentence.endswith("?")) or len(current_chunk) >= 3:
chunks.append(" ".join(current_chunk))
current_chunk = []
# Add any remaining sentences
if current_chunk:
chunks.append(" ".join(current_chunk))
# Save the chunks alternating between user and assistant
for i, chunk in enumerate(chunks):
if chunk.strip():
sender = "User" if i % 2 == 0 else "Assistant"
if save_message(session, sender, chunk):
messages_created += 1
logger.info(f"Created {messages_created} messages by splitting single paragraph into sentences")
return messages_created
# Save messages alternating between user and assistant
for i, paragraph in enumerate(paragraphs):
if paragraph.strip(): # Only save non-empty paragraphs
sender = "User" if i % 2 == 0 else "Assistant"
if save_message(session, sender, paragraph):
messages_created += 1
logger.info(f"Created {messages_created} messages using intelligent split for session {session.session_id}")
return messages_created
# Standard processing with recognized patterns
for line in lines:
if line.startswith("User:"):
if (
current_sender
and current_message_lines
and save_message(session, current_sender, "\n".join(current_message_lines))
):
messages_created += 1
line_stripped = line.strip()
# Skip empty lines at the beginning
if not line_stripped and not current_sender:
continue
# Check if this line indicates a new sender
is_user_message = any(line_stripped.startswith(pattern) for pattern in user_patterns)
is_assistant_message = any(line_stripped.startswith(pattern) for pattern in assistant_patterns)
if is_user_message:
# Save previous message if any
save_current_message()
# Start new user message
current_sender = "User"
current_message_lines = [line.replace("User:", "").strip()]
elif line.startswith("Assistant:"):
if (
current_sender
and current_message_lines
and save_message(session, current_sender, "\n".join(current_message_lines))
):
messages_created += 1
# Remove the prefix from the line
for pattern in user_patterns:
if line_stripped.startswith(pattern):
line = line[len(pattern) :].strip()
break
current_message_lines = [line] if line.strip() else []
elif is_assistant_message:
# Save previous message if any
save_current_message()
# Start new assistant message
current_sender = "Assistant"
current_message_lines = [line.replace("Assistant:", "").strip()]
# Remove the prefix from the line
for pattern in assistant_patterns:
if line_stripped.startswith(pattern):
line = line[len(pattern) :].strip()
break
current_message_lines = [line] if line.strip() else []
elif current_sender:
current_message_lines.append(line.strip())
# Continue adding to current message
current_message_lines.append(line)
else:
# If we get here with no current_sender, assume it's the start of a user message
logger.warning(f"Found line without sender prefix: '{line}'. Assuming User message.")
current_sender = "User"
current_message_lines = [line]
# Save the last message
if (
current_sender
and current_message_lines
and save_message(session, current_sender, "\n".join(current_message_lines))
):
messages_created += 1
save_current_message()
# Handle case with no messages parsed (possibly incorrectly formatted transcript)
if messages_created == 0 and lines:
logger.warning(
f"No messages were parsed from transcript for session {session.session_id}. Using fallback parsing."
)
# Fallback: Just split the transcript in half, first part user, second part assistant
mid_point = len(lines) // 2
user_content = "\n".join(lines[:mid_point])
assistant_content = "\n".join(lines[mid_point:])
# Save the split messages if they have content
if user_content.strip() and save_message(session, "User", user_content):
messages_created += 1
if assistant_content.strip() and save_message(session, "Assistant", assistant_content):
messages_created += 1
logger.info(f"Created {messages_created} messages using fallback parsing")
logger.info(f"Created {messages_created} messages for session {session.session_id}")
return messages_created