Files
livegraphs-django/dashboard_project/data_integration/utils.py

341 lines
12 KiB
Python

import csv
import logging
from datetime import datetime
import bleach
import requests
from bleach.css_sanitizer import CSSSanitizer
from django.utils.timezone import make_aware
from .models import ChatMessage, ChatSession, ExternalDataSource
logger = logging.getLogger(__name__)
# Column names for the CSV feed, in positional order. Rows are mapped onto
# these names by position, so the order here must match the feed's columns.
EXPECTED_HEADERS = [
    "session_id", "start_time", "end_time",
    "ip_address", "country", "language",
    "messages_sent", "sentiment",
    "escalated", "forwarded_hr",
    "full_transcript",
    "avg_response_time", "tokens", "tokens_eur",
    "category", "initial_msg", "user_rating",
]
def _parse_timestamp(value):
    """Parse a feed timestamp string into a timezone-aware datetime.

    The European layout (DD.MM.YYYY HH:MM:SS) is tried first; the ISO-like
    layout (YYYY-MM-DD HH:MM:SS) is the fallback.

    Raises:
        ValueError: If ``value`` matches neither layout.
    """
    try:
        parsed = datetime.strptime(value, "%d.%m.%Y %H:%M:%S")
    except ValueError:
        parsed = datetime.strptime(value, "%Y-%m-%d %H:%M:%S")
    return make_aware(parsed)


def _optional(value, convert):
    """Return ``convert(value)``, or None when ``value`` is empty."""
    return convert(value) if value else None


def _is_true(value):
    """Return True only when ``value`` spells "true" (case-insensitive)."""
    return value.lower() == "true"


def _session_defaults(data):
    """Convert one CSV row mapping into ChatSession field values.

    Raises:
        ValueError: If a timestamp or numeric column cannot be parsed.
    """
    rating = data["user_rating"]
    return {
        "start_time": _parse_timestamp(data["start_time"]),
        "end_time": _parse_timestamp(data["end_time"]),
        "ip_address": data.get("ip_address"),
        "country": data.get("country"),
        "language": data.get("language"),
        "messages_sent": _optional(data["messages_sent"], int),
        "sentiment": data.get("sentiment"),
        "escalated": _optional(data["escalated"], _is_true),
        "forwarded_hr": _optional(data["forwarded_hr"], _is_true),
        "full_transcript_url": data.get("full_transcript"),
        "avg_response_time": _optional(data["avg_response_time"], float),
        "tokens": _optional(data["tokens"], int),
        "tokens_eur": _optional(data["tokens_eur"], float),
        "category": data.get("category"),
        "initial_msg": data.get("initial_msg"),
        # Non-numeric ratings are stored as None rather than failing the row.
        "user_rating": int(rating) if rating and rating.isdigit() else None,
    }


def fetch_and_store_chat_data(source_id=None):
    """Fetch chat data from an external API and store it in the database.

    The response body is parsed as header-less CSV whose columns follow
    ``EXPECTED_HEADERS``. Each row creates or updates a ChatSession keyed by
    ``session_id``; when the row carries a transcript URL, the transcript is
    fetched and stored as well. A row that fails to process is logged and
    counted in ``errors`` without aborting the run.

    Args:
        source_id: Optional ID of specific ExternalDataSource to use.
            If None, will use the first active source.

    Returns:
        dict: On success, the counters ``sessions_created``,
        ``sessions_updated``, ``transcripts_processed`` and ``errors`` plus
        ``success=True``. When no usable source exists or the HTTP request
        fails, ``{"success": False, "error": <message>}``.
    """
    if source_id:
        source = ExternalDataSource.objects.filter(id=source_id, is_active=True).first()
        if not source:
            error_msg = f"Data source with ID {source_id} not found or not active."
            logger.error(error_msg)
            return {"success": False, "error": error_msg}
    else:
        source = ExternalDataSource.objects.filter(is_active=True).first()
        if not source:
            logger.warning("No active data source found.")
            return {"success": False, "error": "No active data source found."}

    stats = {
        "sessions_created": 0,
        "sessions_updated": 0,
        "transcripts_processed": 0,
        "errors": 0,
        "success": True,
    }

    try:
        # Fetch data from API with timeout from source settings or default
        timeout = getattr(source, "timeout", 30)
        username = source.get_auth_username()
        response = requests.get(
            source.api_url,
            auth=(username, source.get_auth_password()) if username else None,
            timeout=timeout,
        )
        response.raise_for_status()
    except requests.RequestException as e:
        error_msg = f"Error fetching data from API {source.api_url}: {e}"
        logger.error(error_msg)
        return {"success": False, "error": error_msg}

    # The feed has no header row, so columns are assigned by position.
    header = EXPECTED_HEADERS
    csv_lines = response.content.decode("utf-8").splitlines()

    for row in csv.reader(csv_lines):
        if not row:  # Skip empty rows
            continue
        try:
            # Pad short rows so every expected column is present in the mapping.
            padded_row = row + [""] * (len(header) - len(row))
            data = dict(zip(header, padded_row, strict=False))

            session, created = ChatSession.objects.update_or_create(
                session_id=data["session_id"],
                defaults=_session_defaults(data),
            )
            if created:
                stats["sessions_created"] += 1
                logger.info("Created session: %s", session.session_id)
            else:
                stats["sessions_updated"] += 1
                logger.info("Updated session: %s", session.session_id)

            # Fetch and process transcript if URL is present
            if session.full_transcript_url:
                transcript_result = fetch_and_store_transcript(session, timeout)
                if transcript_result["success"]:
                    stats["transcripts_processed"] += 1
        except Exception as e:
            # Best effort per row: record the failure and keep going.
            logger.exception("Error processing row: %s. Error: %s", row, e)
            stats["errors"] += 1

    source.last_synced = make_aware(datetime.now())
    source.save()
    # The original message lacked the f-prefix and logged the text "{stats}".
    logger.info("Data sync complete. Stats: %s", stats)
    return stats
def fetch_and_store_transcript(session, timeout=30):
    """Download the transcript of a chat session and store its messages.

    The transcript is requested from ``session.full_transcript_url``, decoded
    as UTF-8 and handed to ``parse_and_store_transcript_messages``. Failures
    are logged and reported in the returned dict instead of being raised.

    Args:
        session: The ChatSession object
        timeout: Timeout in seconds for the request

    Returns:
        dict: ``success`` (bool), ``messages_created`` (int) and ``error``
        (message string, or None when the operation succeeded).
    """
    outcome = {"success": False, "messages_created": 0, "error": None}
    try:
        reply = requests.get(session.full_transcript_url, timeout=timeout)
        reply.raise_for_status()
        transcript_text = reply.content.decode("utf-8")
        stored = parse_and_store_transcript_messages(session, transcript_text)
    except requests.RequestException as e:
        # Network or HTTP-status failure while downloading the transcript.
        failure = f"Error fetching transcript for session {session.session_id}: {e}"
        logger.error(failure)
        outcome["error"] = failure
    except Exception as e:
        # Anything else (decoding, parsing, storing) is logged with traceback.
        failure = f"Error processing transcript for session {session.session_id}: {e}"
        logger.error(failure, exc_info=True)
        outcome["error"] = failure
    else:
        outcome.update(success=True, messages_created=stored)
    return outcome
# Line prefixes that open a new message, paired with the sender to record.
_SPEAKER_PREFIXES = (("User:", "User"), ("Assistant:", "Assistant"))


def _split_speaker_line(line):
    """Return ``(sender, text)`` if ``line`` opens a new message, else None.

    Only the leading marker is removed, so a later "User:" or "Assistant:"
    inside the message text is kept as written.
    """
    for prefix, sender in _SPEAKER_PREFIXES:
        if line.startswith(prefix):
            return sender, line[len(prefix):].strip()
    return None


def _store_buffered_message(session, sender, message_lines):
    """Save the buffered message; return 1 if a row was created, else 0."""
    if sender and message_lines and save_message(session, sender, "\n".join(message_lines)):
        return 1
    return 0


def parse_and_store_transcript_messages(session, transcript_content):
    """Parse and store messages from a transcript.

    A line starting with "User:" or "Assistant:" opens a new message from
    that sender. Following lines are appended to the current message until
    the next marker line. Lines before the first marker are ignored.

    Args:
        session: The ChatSession object
        transcript_content: The raw transcript content

    Returns:
        int: Number of messages created
    """
    # First, delete existing messages for this session to avoid duplicates.
    # NOTE(review): this function does not open a transaction, so a failure
    # while storing leaves the session with fewer messages than before.
    # Confirm whether callers wrap this call.
    existing_count = ChatMessage.objects.filter(session=session).count()
    if existing_count > 0:
        logger.info("Deleting %s existing messages for session %s", existing_count, session.session_id)
        ChatMessage.objects.filter(session=session).delete()

    messages_created = 0
    current_sender = None
    current_message_lines = []

    for line in transcript_content.splitlines():
        opening = _split_speaker_line(line)
        if opening is not None:
            # A new speaker marker closes the message collected so far.
            messages_created += _store_buffered_message(session, current_sender, current_message_lines)
            current_sender, first_line = opening
            current_message_lines = [first_line]
        elif current_sender:
            current_message_lines.append(line.strip())

    # Save the last message
    messages_created += _store_buffered_message(session, current_sender, current_message_lines)

    logger.info("Created %s messages for session %s", messages_created, session.session_id)
    return messages_created
# Sanitizer allow-lists, defined once instead of on every save_message call.
_ALLOWED_CSS_PROPERTIES = [
    "color",
    "background-color",
    "font-family",
    "font-size",
    "font-weight",
    "font-style",
    "text-decoration",
    "text-align",
    "margin",
    "margin-left",
    "margin-right",
    "margin-top",
    "margin-bottom",
    "padding",
    "padding-left",
    "padding-right",
    "padding-top",
    "padding-bottom",
    "border",
    "border-radius",
    "width",
    "height",
    "line-height",
]
_ALLOWED_TAGS = [
    "b",
    "i",
    "u",
    "em",
    "strong",
    "a",
    "br",
    "p",
    "ul",
    "ol",
    "li",
    "span",
    "div",
    "pre",
    "code",
    "blockquote",
]
_ALLOWED_ATTRIBUTES = {
    "a": ["href", "title", "target"],
    "span": ["style", "class"],
    "div": ["style", "class"],
    "p": ["style", "class"],
    "pre": ["style", "class"],
}


def save_message(session, sender, message_text):
    """Save a message for a chat session.

    The raw text is stored unchanged in ``message``. A copy sanitized with
    bleach (allow-listed tags, attributes and CSS properties; disallowed
    markup stripped) is stored in ``safe_html_message``.

    Args:
        session: The ChatSession object
        sender: The sender of the message ("User" or "Assistant")
        message_text: The message text, which may contain HTML

    Returns:
        bool: True if message was created, False otherwise (blank or missing
        text, or an error while sanitizing/storing, which is logged).
    """
    # Treat None like blank text instead of raising AttributeError on .strip().
    if not message_text or not message_text.strip():
        return False
    try:
        safe_html = bleach.clean(
            message_text,
            tags=_ALLOWED_TAGS,
            attributes=_ALLOWED_ATTRIBUTES,
            css_sanitizer=CSSSanitizer(allowed_css_properties=_ALLOWED_CSS_PROPERTIES),
            strip=True,
        )
        ChatMessage.objects.create(
            session=session,
            sender=sender,
            message=message_text,
            safe_html_message=safe_html,
        )
        logger.debug("Stored message for session %s from %s", session.session_id, sender)
        return True
    except Exception as e:
        # Best effort: one bad message must not abort the whole transcript.
        logger.exception("Error saving message for session %s: %s", session.session_id, e)
        return False