mirror of
https://github.com/kjanat/livegraphs-django.git
synced 2026-01-16 06:32:10 +01:00
341 lines
12 KiB
Python
341 lines
12 KiB
Python
import csv
import io
import logging
from datetime import datetime

import bleach
import requests
from bleach.css_sanitizer import CSSSanitizer
from django.utils.timezone import make_aware

from .models import ChatMessage, ChatSession, ExternalDataSource
|
|
|
|
# Module-level logger shared by every function in this file.
logger = logging.getLogger(__name__)

# Column names applied, by position, to each row of the fetched CSV export.
# The order is significant: rows are zipped against this list, so entry N
# names the N-th cell of a row.
EXPECTED_HEADERS = [
    # Session identity and timing
    "session_id", "start_time", "end_time",
    # Visitor details
    "ip_address", "country", "language",
    # Conversation metrics and flags
    "messages_sent", "sentiment", "escalated", "forwarded_hr",
    # Transcript location and cost figures
    "full_transcript", "avg_response_time", "tokens", "tokens_eur",
    # Classification and feedback
    "category", "initial_msg", "user_rating",
]
|
|
|
|
|
|
def _parse_timestamp(value):
    """Parse a timestamp cell from the export into an aware datetime.

    The European layout (DD.MM.YYYY HH:MM:SS) is tried first; if that does
    not match, the ISO-like layout (YYYY-MM-DD HH:MM:SS) is used.

    Args:
        value: The raw timestamp string from a CSV row.

    Returns:
        datetime: The parsed value passed through ``make_aware``.

    Raises:
        ValueError: If ``value`` matches neither layout.
    """
    try:
        parsed = datetime.strptime(value, "%d.%m.%Y %H:%M:%S")
    except ValueError:
        parsed = datetime.strptime(value, "%Y-%m-%d %H:%M:%S")
    return make_aware(parsed)


def fetch_and_store_chat_data(source_id=None):
    """Fetch chat data from an external API and store it in the database.

    The response body is read as header-less CSV; each row is mapped onto
    ``EXPECTED_HEADERS`` by position and upserted as a ``ChatSession`` keyed
    on ``session_id``. When a row carries a transcript URL, the transcript is
    fetched and stored as well. A row that fails to process is logged and
    counted in ``errors`` without aborting the remaining rows.

    Args:
        source_id: Optional ID of specific ExternalDataSource to use.
            If None, will use the first active source.

    Returns:
        dict: On success, the counters ``sessions_created``,
        ``sessions_updated``, ``transcripts_processed`` and ``errors`` plus
        ``success=True``. When no usable source exists or the HTTP request
        fails, ``{"success": False, "error": <message>}``.
    """
    if source_id:
        source = ExternalDataSource.objects.filter(id=source_id, is_active=True).first()
        if not source:
            error_msg = f"Data source with ID {source_id} not found or not active."
            logger.error(error_msg)
            return {"success": False, "error": error_msg}
    else:
        source = ExternalDataSource.objects.filter(is_active=True).first()
        if not source:
            logger.warning("No active data source found.")
            return {"success": False, "error": "No active data source found."}

    stats = {
        "sessions_created": 0,
        "sessions_updated": 0,
        "transcripts_processed": 0,
        "errors": 0,
        "success": True,
    }

    # Use the timeout from the source settings when it defines one, else 30s.
    timeout = getattr(source, "timeout", 30)
    # Basic auth is only sent when the source provides a username.
    username = source.get_auth_username()
    auth = (username, source.get_auth_password()) if username else None

    try:
        response = requests.get(source.api_url, auth=auth, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        error_msg = f"Error fetching data from API {source.api_url}: {e}"
        logger.error(error_msg)
        return {"success": False, "error": error_msg}

    # Feed the csv module a file-like object created with newline="" so that
    # quoted fields containing line breaks stay intact and only real line
    # endings separate rows. Pre-splitting with str.splitlines() loses those
    # embedded breaks and also splits on characters such as U+2028.
    csv_text = response.content.decode("utf-8")
    reader = csv.reader(io.StringIO(csv_text, newline=""))

    # The export has no header row, so the predefined column names are used.
    header = EXPECTED_HEADERS

    for row in reader:
        if not row:  # Skip empty rows
            continue
        try:
            # Pad short rows with empty strings so every expected column is
            # present; zip() drops any cells beyond the known columns.
            padded_row = row + [""] * (len(header) - len(row))
            data = dict(zip(header, padded_row, strict=False))

            start_time = _parse_timestamp(data["start_time"])
            end_time = _parse_timestamp(data["end_time"])

            # Empty cells are stored as None rather than as 0 / False.
            messages_sent = int(data["messages_sent"]) if data["messages_sent"] else None
            escalated = data["escalated"].lower() == "true" if data["escalated"] else None
            forwarded_hr = data["forwarded_hr"].lower() == "true" if data["forwarded_hr"] else None
            avg_response_time = float(data["avg_response_time"]) if data["avg_response_time"] else None
            tokens = int(data["tokens"]) if data["tokens"] else None
            tokens_eur = float(data["tokens_eur"]) if data["tokens_eur"] else None
            # Non-numeric ratings are treated as missing instead of failing the row.
            user_rating = int(data["user_rating"]) if data["user_rating"] and data["user_rating"].isdigit() else None

            session, created = ChatSession.objects.update_or_create(
                session_id=data["session_id"],
                defaults={
                    "start_time": start_time,
                    "end_time": end_time,
                    "ip_address": data.get("ip_address"),
                    "country": data.get("country"),
                    "language": data.get("language"),
                    "messages_sent": messages_sent,
                    "sentiment": data.get("sentiment"),
                    "escalated": escalated,
                    "forwarded_hr": forwarded_hr,
                    "full_transcript_url": data.get("full_transcript"),
                    "avg_response_time": avg_response_time,
                    "tokens": tokens,
                    "tokens_eur": tokens_eur,
                    "category": data.get("category"),
                    "initial_msg": data.get("initial_msg"),
                    "user_rating": user_rating,
                },
            )

            if created:
                stats["sessions_created"] += 1
                logger.info(f"Created session: {session.session_id}")
            else:
                stats["sessions_updated"] += 1
                logger.info(f"Updated session: {session.session_id}")

            # Fetch and process transcript if URL is present
            if session.full_transcript_url:
                transcript_result = fetch_and_store_transcript(session, timeout)
                if transcript_result["success"]:
                    stats["transcripts_processed"] += 1

        except Exception as e:
            # Best effort: one bad row must not stop the rest of the sync.
            logger.error(f"Error processing row: {row}. Error: {e}", exc_info=True)
            stats["errors"] += 1
            continue

    source.last_synced = make_aware(datetime.now())
    source.save()
    logger.info(f"Data sync complete. Stats: {stats}")

    return stats
|
|
|
|
|
|
def fetch_and_store_transcript(session, timeout=30):
    """Download a session's transcript and store its messages.

    The transcript is requested from ``session.full_transcript_url``, decoded
    as UTF-8 and handed to ``parse_and_store_transcript_messages``. Failures
    are logged and reported in the returned dict instead of being raised.

    Args:
        session: The ChatSession object
        timeout: Timeout in seconds for the request

    Returns:
        dict: ``success`` (bool), ``messages_created`` (int) and ``error``
        (the error message, or None when the operation succeeded).
    """
    outcome = {"success": False, "messages_created": 0, "error": None}

    try:
        reply = requests.get(session.full_transcript_url, timeout=timeout)
        reply.raise_for_status()
        stored = parse_and_store_transcript_messages(session, reply.content.decode("utf-8"))
    except requests.RequestException as exc:
        # Transport or HTTP-status problems while downloading the transcript.
        failure = f"Error fetching transcript for session {session.session_id}: {exc}"
        logger.error(failure)
        outcome["error"] = failure
    except Exception as exc:
        # Anything else (decoding, parsing, storing) is logged with a traceback.
        failure = f"Error processing transcript for session {session.session_id}: {exc}"
        logger.error(failure, exc_info=True)
        outcome["error"] = failure
    else:
        outcome["success"] = True
        outcome["messages_created"] = stored

    return outcome
|
|
|
|
|
|
def _iter_transcript_messages(transcript_content):
    """Yield ``(sender, text)`` for each speaker turn found in a transcript.

    A turn starts on a line beginning with ``User:`` or ``Assistant:`` and
    runs until the next such line. Lines are stripped and joined with
    newlines; lines before the first speaker marker are ignored.
    """
    sender = None
    buffered = []

    for line in transcript_content.splitlines():
        for speaker in ("User", "Assistant"):
            prefix = f"{speaker}:"
            if line.startswith(prefix):
                if sender:
                    yield sender, "\n".join(buffered)
                sender = speaker
                # Strip only the leading marker: str.replace() would also
                # delete later occurrences of "User:" / "Assistant:" that
                # are part of the message text itself.
                buffered = [line.removeprefix(prefix).strip()]
                break
        else:
            # Continuation line of the current turn, if one has started.
            if sender:
                buffered.append(line.strip())

    # Emit the final turn, which no following marker terminates.
    if sender:
        yield sender, "\n".join(buffered)


def parse_and_store_transcript_messages(session, transcript_content):
    """Parse and store messages from a transcript.

    Messages already stored for the session are deleted first so that
    re-processing a transcript does not create duplicates. Each parsed turn
    is passed to ``save_message``; only turns it reports as saved are counted.

    Args:
        session: The ChatSession object
        transcript_content: The raw transcript content

    Returns:
        int: Number of messages created
    """
    # First, delete existing messages for this session to avoid duplicates
    existing_messages = ChatMessage.objects.filter(session=session)
    existing_count = existing_messages.count()
    if existing_count > 0:
        logger.info(f"Deleting {existing_count} existing messages for session {session.session_id}")
        existing_messages.delete()

    messages_created = 0
    for sender, text in _iter_transcript_messages(transcript_content):
        if save_message(session, sender, text):
            messages_created += 1

    logger.info(f"Created {messages_created} messages for session {session.session_id}")
    return messages_created
|
|
|
|
|
|
def save_message(session, sender, message_text):
    """Store one transcript message together with a sanitized HTML copy.

    The raw text is kept as-is in ``message``; a copy cleaned with ``bleach``
    (allow-listed tags, attributes and CSS properties, everything else
    stripped) is kept in ``safe_html_message``. Errors are logged and
    reported through the return value rather than raised.

    Args:
        session: The ChatSession object
        sender: The sender of the message ("User" or "Assistant")
        message_text: The message text, which may contain HTML

    Returns:
        bool: True if message was created, False otherwise
    """
    # Whitespace-only messages are never stored.
    if not message_text.strip():
        return False

    # CSS properties that may survive inside a style attribute.
    permitted_css = [
        "color",
        "background-color",
        "font-family",
        "font-size",
        "font-weight",
        "font-style",
        "text-decoration",
        "text-align",
        "margin",
        "margin-left",
        "margin-right",
        "margin-top",
        "margin-bottom",
        "padding",
        "padding-left",
        "padding-right",
        "padding-top",
        "padding-bottom",
        "border",
        "border-radius",
        "width",
        "height",
        "line-height",
    ]

    # Tags kept in the sanitized copy; any other tag is stripped.
    permitted_tags = [
        "b",
        "i",
        "u",
        "em",
        "strong",
        "a",
        "br",
        "p",
        "ul",
        "ol",
        "li",
        "span",
        "div",
        "pre",
        "code",
        "blockquote",
    ]

    # Links keep their target details; block/inline containers keep styling.
    permitted_attributes = {
        "a": ["href", "title", "target"],
        **{tag: ["style", "class"] for tag in ("span", "div", "p", "pre")},
    }

    try:
        sanitized = bleach.clean(
            message_text,
            tags=permitted_tags,
            attributes=permitted_attributes,
            css_sanitizer=CSSSanitizer(allowed_css_properties=permitted_css),
            strip=True,
        )
        ChatMessage.objects.create(
            session=session,
            sender=sender,
            message=message_text,
            safe_html_message=sanitized,
        )
        logger.debug(f"Stored message for session {session.session_id} from {sender}")
        return True
    except Exception as exc:
        logger.error(f"Error saving message for session {session.session_id}: {exc}", exc_info=True)
        return False
|