Implement data integration tasks with Celery:

- Add periodic fetching and manual refresh of chat data
- Add utility functions for data processing and transcript handling
- Create views and URLs for manual data refresh
- Establish Redis and Celery configuration
- Enhance error handling and logging
- Introduce scripts for data cleanup and for fixing dashboard data
- Update documentation for Redis and Celery setup and troubleshooting

2025-05-18 13:33:11 +00:00
parent e8f2d2adc2
commit 8bbbb109bd
63 changed files with 4601 additions and 164 deletions
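
The Redis and Celery wiring mentioned in the commit message (broker, result backend, beat schedule) lives in settings hunks not reproduced below, although the management commands later in this diff reference CELERY_BROKER_URL, CELERY_RESULT_BACKEND, and a REDIS_URL environment variable. A minimal sketch of what that configuration typically looks like; the module path and default URL are assumptions, not the committed values:

# settings.py (sketch; module path and defaults are assumptions)
import os

REDIS_URL = os.environ.get("REDIS_URL", "redis://localhost:6379/0")
CELERY_BROKER_URL = REDIS_URL
CELERY_RESULT_BACKEND = REDIS_URL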

View File

@@ -0,0 +1,125 @@
from django.contrib import admin
from django.shortcuts import redirect
from django.utils.html import format_html
from .models import ChatMessage, ChatSession, ExternalDataSource
from .tasks import refresh_specific_source
@admin.register(ExternalDataSource)
class ExternalDataSourceAdmin(admin.ModelAdmin):
list_display = (
"name",
"api_url",
"is_active",
"last_synced",
"status_badge",
"sync_interval",
"refresh_action",
)
list_filter = ("is_active",)
search_fields = ("name", "api_url")
readonly_fields = ("last_synced", "error_count", "last_error")
fieldsets = (
(None, {"fields": ("name", "api_url", "is_active")}),
(
"Authentication",
{
"fields": ("auth_username", "auth_password"),
"description": "Credentials can also be provided via environment variables.",
},
),
("Sync Settings", {"fields": ("sync_interval", "timeout")}),
("Status", {"fields": ("last_synced", "error_count", "last_error")}),
)
@admin.display(description="Status")
def status_badge(self, obj):
"""Display a colored status badge"""
status = obj.get_status()
if status == "Active":
return format_html(
'<span style="color: white; background-color: green; padding: 3px 8px; border-radius: 10px;">{}</span>',
status,
)
elif status == "Inactive":
return format_html(
'<span style="color: white; background-color: gray; padding: 3px 8px; border-radius: 10px;">{}</span>',
status,
)
elif "Error" in status:
return format_html(
'<span style="color: white; background-color: red; padding: 3px 8px; border-radius: 10px;">{}</span>',
status,
)
else:
return format_html(
'<span style="color: white; background-color: orange; padding: 3px 8px; border-radius: 10px;">{}</span>',
status,
)
@admin.display(description="Actions")
def refresh_action(self, obj):
"""Button to manually refresh a data source"""
if obj.is_active:
url = f"/admin/data_integration/externaldatasource/refresh/{obj.id}/"
return format_html('<a class="button" href="{}">Refresh Now</a>', url)
return "Inactive"
    def refresh_source(self, request, source_id):
        """Queue a task to refresh the source data, then return to the change list."""
        task = refresh_specific_source.delay(source_id)
        self.message_user(request, f"Data refresh task started (Task ID: {task.id})")
        # Admin views must return an HttpResponse; without this the refresh URL raises an error.
        return redirect("admin:data_integration_externaldatasource_changelist")
def get_urls(self):
from django.urls import path
urls = super().get_urls()
custom_urls = [
path(
"refresh/<int:source_id>/",
self.admin_site.admin_view(self.refresh_source),
name="data_integration_externaldatasource_refresh",
),
]
return custom_urls + urls
@admin.register(ChatSession)
class ChatSessionAdmin(admin.ModelAdmin):
list_display = (
"session_id",
"start_time",
"end_time",
"country",
"language",
"messages_sent",
"sentiment",
)
list_filter = ("country", "language", "sentiment")
search_fields = ("session_id", "country", "ip_address")
readonly_fields = ("session_id",)
@admin.register(ChatMessage)
class ChatMessageAdmin(admin.ModelAdmin):
list_display = ("session", "sender", "timestamp", "message_preview")
list_filter = ("sender", "timestamp")
search_fields = ("message", "session__session_id")
readonly_fields = ("safe_html_display",)
@admin.display(description="Message")
def message_preview(self, obj):
"""Show a preview of the message"""
if len(obj.message) > 50:
return obj.message[:50] + "..."
return obj.message
@admin.display(description="Sanitized HTML Preview")
def safe_html_display(self, obj):
"""Display the sanitized HTML"""
if obj.safe_html_message:
return format_html(
'<div style="padding: 10px; border: 1px solid #ccc; background-color: #f9f9f9;">{}</div>',
obj.safe_html_message,
)
return "No HTML content"

View File

@@ -0,0 +1,6 @@
from django.apps import AppConfig
class DataIntegrationConfig(AppConfig):
default_auto_field = "django.db.models.BigAutoField"
name = "data_integration"

View File

@@ -0,0 +1,27 @@
from data_integration.models import ExternalDataSource
from django.core.management.base import BaseCommand
class Command(BaseCommand):
help = "Create default external data source configuration"
def handle(self, *_args, **_options):
if not ExternalDataSource.objects.exists():
source = ExternalDataSource.objects.create( # nosec: B106
name="Notso AI Chat API",
api_url="https://HOST/COMPANY/chats",
auth_username="DEFAULT_USERNAME", # Will be set via environment variables
auth_password="DEFAULT_PASSWORD", # Will be set via environment variables
is_active=True,
sync_interval=int(self.get_env_var("CHAT_DATA_FETCH_INTERVAL", "3600")),
timeout=int(self.get_env_var("FETCH_DATA_TIMEOUT", "300")),
)
self.stdout.write(self.style.SUCCESS(f"Created default external data source: {source.name}"))
else:
self.stdout.write(self.style.SUCCESS("External data source already exists, no action taken."))
def get_env_var(self, name, default):
"""Get environment variable or return default"""
import os
return os.environ.get(name, default)

View File

@@ -0,0 +1,11 @@
from data_integration.utils import fetch_and_store_chat_data
from django.core.management.base import BaseCommand
class Command(BaseCommand):
help = "Fetches chat data from the external API and stores it in the database"
def handle(self, *_args, **_options): # Mark as unused
self.stdout.write(self.style.SUCCESS("Starting data fetch..."))
fetch_and_store_chat_data()
self.stdout.write(self.style.SUCCESS("Successfully fetched and stored chat data."))

View File

@@ -0,0 +1,79 @@
#!/usr/bin/env python
"""
Migration Fix Script for ExternalDataSource
This management command adds the missing fields to ExternalDataSource
model directly using SQL, which is useful if Django migrations
are having issues.
"""
import logging
from django.core.management.base import BaseCommand
from django.db import connection
logger = logging.getLogger(__name__)
class Command(BaseCommand):
help = "Fix missing columns in ExternalDataSource table"
def handle(self, *args, **options): # noqa: ARG002
self.stdout.write("Checking ExternalDataSource schema...")
# Check if columns exist
with connection.cursor() as cursor:
cursor.execute("PRAGMA table_info(data_integration_externaldatasource)")
columns = [col[1] for col in cursor.fetchall()]
missing_columns = []
if "error_count" not in columns:
missing_columns.append("error_count")
if "last_error" not in columns:
missing_columns.append("last_error")
if "sync_interval" not in columns:
missing_columns.append("sync_interval")
if "timeout" not in columns:
missing_columns.append("timeout")
if not missing_columns:
self.stdout.write(self.style.SUCCESS("✅ All columns exist in ExternalDataSource table"))
return
self.stdout.write(f"Missing columns: {', '.join(missing_columns)}")
self.stdout.write("Adding missing columns...")
try:
# Add missing columns with SQLite
for col in missing_columns:
if col == "error_count":
cursor.execute(
"ALTER TABLE data_integration_externaldatasource ADD COLUMN error_count integer DEFAULT 0"
)
elif col == "last_error":
cursor.execute(
"ALTER TABLE data_integration_externaldatasource ADD COLUMN last_error varchar(255) NULL"
)
elif col == "sync_interval":
cursor.execute(
"ALTER TABLE data_integration_externaldatasource ADD COLUMN sync_interval integer DEFAULT 3600"
)
elif col == "timeout":
cursor.execute(
"ALTER TABLE data_integration_externaldatasource ADD COLUMN timeout integer DEFAULT 300"
)
self.stdout.write(
self.style.SUCCESS(f"✅ Successfully added missing columns: {', '.join(missing_columns)}")
)
# Verify columns were added
cursor.execute("PRAGMA table_info(data_integration_externaldatasource)")
updated_columns = [col[1] for col in cursor.fetchall()]
self.stdout.write(f"Current columns: {', '.join(updated_columns)}")
except Exception as e:
self.stdout.write(self.style.ERROR(f"❌ Error adding columns: {e}"))
self.stdout.write(self.style.WARNING("Consider running Django migrations instead:"))
self.stdout.write(" python manage.py makemigrations data_integration")
self.stdout.write(" python manage.py migrate data_integration")

View File

@@ -0,0 +1,47 @@
import logging
from celery.exceptions import TimeoutError as CeleryTimeoutError
from data_integration.tasks import test_task
from django.core.management.base import BaseCommand
from django.utils import timezone
logger = logging.getLogger(__name__)
class Command(BaseCommand):
help = "Test Celery configuration by executing a simple task"
def handle(self, *args, **options): # noqa: ARG002
self.stdout.write(f"Testing Celery configuration at {timezone.now()}")
try:
# Run the test task
self.stdout.write("Submitting test task to Celery...")
result = test_task.delay()
task_id = result.id
self.stdout.write(f"Task submitted with ID: {task_id}")
self.stdout.write("Waiting for task result (this may take a few seconds)...")
# Try to get the result with a timeout
try:
task_result = result.get(timeout=10) # 10 second timeout
self.stdout.write(self.style.SUCCESS(f"✅ Task completed successfully with result: {task_result}"))
return
            except (TimeoutError, CeleryTimeoutError):  # result.get() raises celery.exceptions.TimeoutError
self.stdout.write(
self.style.WARNING(
"⚠️ Task did not complete within the timeout period. "
"This might be normal if Celery worker isn't running."
)
)
self.stdout.write(
"To check task status, run Celery worker in another terminal with:\n"
" make celery\n"
f"And then check status of task {task_id}"
)
except Exception as e:
self.stdout.write(self.style.ERROR(f"❌ Error testing Celery: {e}"))
self.stdout.write("Make sure the Celery broker (Redis or SQLite) is properly configured.")
self.stdout.write("To start Celery, run:\n make celery")

View File

@@ -0,0 +1,69 @@
#!/usr/bin/env python
"""
Test the ExternalDataSource Model Schema
This management command tests if the ExternalDataSource schema has been correctly updated.
"""
import logging
from data_integration.models import ExternalDataSource
from django.core.management.base import BaseCommand
logger = logging.getLogger(__name__)
class Command(BaseCommand):
help = "Test ExternalDataSource model fields"
def handle(self, *args, **options): # noqa: ARG002
self.stdout.write("Testing ExternalDataSource schema...")
try:
# Get or create a test source
source, created = ExternalDataSource.objects.get_or_create(
name="Test Source",
defaults={
"api_url": "https://example.com/api",
"is_active": False,
},
)
if created:
self.stdout.write(f"Created test source with ID: {source.id}")
else:
self.stdout.write(f"Using existing test source with ID: {source.id}")
# Test setting each field
fields_to_test = {
"error_count": 0,
"last_error": "Test error message",
"sync_interval": 7200,
"timeout": 600,
}
for field, value in fields_to_test.items():
try:
setattr(source, field, value)
self.stdout.write(self.style.SUCCESS(f"✅ Successfully set {field} = {value}"))
except AttributeError:
self.stdout.write(self.style.ERROR(f"❌ Field {field} doesn't exist on the model"))
try:
source.save()
self.stdout.write(self.style.SUCCESS("✅ Successfully saved with all fields"))
except Exception as e:
self.stdout.write(self.style.ERROR(f"❌ Error saving model: {e}"))
# Read back the values to verify
refreshed_source = ExternalDataSource.objects.get(id=source.id)
self.stdout.write("\nVerifying saved values:")
for field, expected_value in fields_to_test.items():
actual_value = getattr(refreshed_source, field, "MISSING")
if actual_value == expected_value:
self.stdout.write(self.style.SUCCESS(f"{field} = {actual_value} (correct)"))
else:
self.stdout.write(self.style.ERROR(f"{field} = {actual_value} (expected: {expected_value})"))
except Exception as e:
self.stdout.write(self.style.ERROR(f"❌ Test failed: {e}"))

View File

@@ -0,0 +1,117 @@
import bleach
from bleach.css_sanitizer import CSSSanitizer
from django.core.management.base import BaseCommand
class Command(BaseCommand):
help = "Test the HTML sanitizer with CSS Sanitizer"
def handle(self, *args, **options): # noqa: ARG002
# Create a test HTML string with various style attributes
test_html = """
<div style="color: red; background-color: yellow; transform: rotate(30deg);">
<p style="font-size: 16px; margin: 10px;">
This is a <span style="font-weight: bold; color: blue;">styled</span> paragraph.
</p>
<script>alert('XSS attack');</script>
<a href="javascript:alert('Evil');" style="text-decoration: none;">Dangerous Link</a>
<img src="x" onerror="alert('XSS')" style="border: 1px solid red;">
</div>
"""
# Create CSS sanitizer with allowed properties
css_sanitizer = CSSSanitizer(
allowed_css_properties=[
"color",
"background-color",
"font-family",
"font-size",
"font-weight",
"font-style",
"text-decoration",
"text-align",
"margin",
"margin-left",
"margin-right",
"margin-top",
"margin-bottom",
"padding",
"padding-left",
"padding-right",
"padding-top",
"padding-bottom",
"border",
"border-radius",
"width",
"height",
"line-height",
]
)
# Clean the HTML
cleaned_html = bleach.clean(
test_html,
tags=[
"b",
"i",
"u",
"em",
"strong",
"a",
"br",
"p",
"ul",
"ol",
"li",
"span",
"div",
"pre",
"code",
"blockquote",
],
attributes={
"a": ["href", "title", "target"],
"span": ["style", "class"],
"div": ["style", "class"],
"p": ["style", "class"],
"pre": ["style", "class"],
},
css_sanitizer=css_sanitizer,
strip=True,
)
# Print the results
self.stdout.write(self.style.SUCCESS("Original HTML:"))
self.stdout.write(test_html)
self.stdout.write("\n\n")
self.stdout.write(self.style.SUCCESS("Cleaned HTML:"))
self.stdout.write(cleaned_html)
self.stdout.write("\n\n")
# Check if unsafe attributes and styles were removed
self.stdout.write(self.style.SUCCESS("Security Checks:"))
if "script" not in cleaned_html:
self.stdout.write(self.style.SUCCESS("✓ Script tags removed"))
else:
self.stdout.write(self.style.ERROR("✗ Script tags found"))
if "javascript:" not in cleaned_html:
self.stdout.write(self.style.SUCCESS("✓ JavaScript URLs removed"))
else:
self.stdout.write(self.style.ERROR("✗ JavaScript URLs found"))
if "onerror" not in cleaned_html:
self.stdout.write(self.style.SUCCESS("✓ Event handlers removed"))
else:
self.stdout.write(self.style.ERROR("✗ Event handlers found"))
if "transform" not in cleaned_html:
self.stdout.write(self.style.SUCCESS("✓ Unsafe CSS properties removed"))
else:
self.stdout.write(self.style.ERROR("✗ Unsafe CSS properties found"))
if "img" not in cleaned_html:
self.stdout.write(self.style.SUCCESS("✓ Unsupported tags removed"))
else:
self.stdout.write(self.style.ERROR("✗ Unsupported tags found"))

View File

@@ -0,0 +1,68 @@
import logging
from django.conf import settings
from django.core.management.base import BaseCommand
logger = logging.getLogger(__name__)
class Command(BaseCommand):
help = "Test Redis connection for Celery"
def handle(self, *args, **options): # noqa: ARG002
self.stdout.write("Testing Redis connection...")
try:
import redis
# Get Redis configuration from settings
redis_host = getattr(settings, "REDIS_HOST", "localhost")
redis_port = int(getattr(settings, "REDIS_PORT", 6379))
redis_db = int(getattr(settings, "REDIS_DB", 0))
# Override from environment if set
import os
if "REDIS_URL" in os.environ:
self.stdout.write(f"REDIS_URL environment variable found: {os.environ['REDIS_URL']}")
# Try to connect and ping
redis_client = redis.Redis(host=redis_host, port=redis_port, db=redis_db, socket_connect_timeout=2)
ping_result = redis_client.ping()
if ping_result:
self.stdout.write(
self.style.SUCCESS(
f"✅ Redis connection successful! Connected to {redis_host}:{redis_port}/{redis_db}"
)
)
self.stdout.write(f"Broker URL: {settings.CELERY_BROKER_URL}")
self.stdout.write(f"Result backend: {settings.CELERY_RESULT_BACKEND}")
# Try to set and get a value
test_key = "test_redis_connection"
test_value = "success"
redis_client.set(test_key, test_value)
retrieved_value = redis_client.get(test_key)
if retrieved_value and retrieved_value.decode() == test_value:
self.stdout.write(self.style.SUCCESS("✅ Redis SET/GET test passed!"))
else:
self.stdout.write(
self.style.WARNING(
f"⚠️ Redis SET/GET test failed: Got {retrieved_value} instead of {test_value}"
)
)
# Clean up
redis_client.delete(test_key)
else:
self.stdout.write(self.style.ERROR("❌ Redis ping failed!"))
except redis.exceptions.ConnectionError as e:
self.stdout.write(self.style.ERROR(f"❌ Redis connection error: {e}"))
self.stdout.write("Celery will use SQLite fallback if configured.")
except ImportError:
self.stdout.write(self.style.ERROR("❌ Redis package not installed. Install with: pip install redis"))
except Exception as e:
self.stdout.write(self.style.ERROR(f"❌ Error: {e}"))

View File

@@ -0,0 +1,99 @@
# Generated by Django 5.2.1 on 2025-05-17 21:14
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
initial = True
dependencies = []
operations = [
migrations.CreateModel(
name="ChatSession",
fields=[
(
"id",
models.BigAutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("session_id", models.CharField(max_length=255, unique=True)),
("start_time", models.DateTimeField()),
("end_time", models.DateTimeField()),
("ip_address", models.GenericIPAddressField(blank=True, null=True)),
("country", models.CharField(blank=True, max_length=255, null=True)),
("language", models.CharField(blank=True, max_length=255, null=True)),
("messages_sent", models.IntegerField(blank=True, null=True)),
("sentiment", models.CharField(blank=True, max_length=255, null=True)),
("escalated", models.BooleanField(blank=True, null=True)),
("forwarded_hr", models.BooleanField(blank=True, null=True)),
(
"full_transcript_url",
models.URLField(blank=True, max_length=1024, null=True),
),
("avg_response_time", models.FloatField(blank=True, null=True)),
("tokens", models.IntegerField(blank=True, null=True)),
("tokens_eur", models.FloatField(blank=True, null=True)),
("category", models.CharField(blank=True, max_length=255, null=True)),
("initial_msg", models.TextField(blank=True, null=True)),
("user_rating", models.IntegerField(blank=True, null=True)),
],
),
migrations.CreateModel(
name="ExternalDataSource",
fields=[
(
"id",
models.BigAutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("name", models.CharField(default="External API", max_length=255)),
("api_url", models.URLField(default="https://proto.notso.ai/XY/chats")),
(
"auth_username",
models.CharField(blank=True, max_length=255, null=True),
),
(
"auth_password",
models.CharField(blank=True, max_length=255, null=True),
),
("last_synced", models.DateTimeField(blank=True, null=True)),
("is_active", models.BooleanField(default=True)),
],
),
migrations.CreateModel(
name="ChatMessage",
fields=[
(
"id",
models.BigAutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("timestamp", models.DateTimeField(auto_now_add=True)),
("sender", models.CharField(max_length=255)),
("message", models.TextField()),
("safe_html_message", models.TextField(blank=True, null=True)),
(
"session",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name="messages",
to="data_integration.chatsession",
),
),
],
),
]

View File

@@ -0,0 +1,43 @@
# Generated by Django 5.2.1 on 2025-05-17 22:33
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
("data_integration", "0001_initial"),
]
operations = [
migrations.AddField(
model_name="externaldatasource",
name="error_count",
field=models.IntegerField(default=0),
),
migrations.AddField(
model_name="externaldatasource",
name="last_error",
field=models.CharField(blank=True, max_length=255, null=True),
),
migrations.AddField(
model_name="externaldatasource",
name="sync_interval",
field=models.IntegerField(
default=3600,
help_text="Sync interval in seconds. Default is 3600 (1 hour)",
),
),
migrations.AddField(
model_name="externaldatasource",
name="timeout",
field=models.IntegerField(
default=300,
help_text="Timeout in seconds for each sync operation. Default is 300 (5 minutes)",
),
),
migrations.AlterField(
model_name="externaldatasource",
name="api_url",
field=models.URLField(default="https://proto.notso.ai/jumbo/chats"),
),
]

View File

@@ -0,0 +1,78 @@
import os
from django.db import models
class ChatSession(models.Model):
session_id = models.CharField(max_length=255, unique=True)
start_time = models.DateTimeField()
end_time = models.DateTimeField()
ip_address = models.GenericIPAddressField(null=True, blank=True)
country = models.CharField(max_length=255, null=True, blank=True)
language = models.CharField(max_length=255, null=True, blank=True)
messages_sent = models.IntegerField(null=True, blank=True)
sentiment = models.CharField(max_length=255, null=True, blank=True)
escalated = models.BooleanField(null=True, blank=True)
forwarded_hr = models.BooleanField(null=True, blank=True)
full_transcript_url = models.URLField(max_length=1024, null=True, blank=True)
avg_response_time = models.FloatField(null=True, blank=True)
tokens = models.IntegerField(null=True, blank=True)
tokens_eur = models.FloatField(null=True, blank=True)
category = models.CharField(max_length=255, null=True, blank=True)
initial_msg = models.TextField(null=True, blank=True)
user_rating = models.IntegerField(null=True, blank=True)
def __str__(self):
return self.session_id
class ChatMessage(models.Model):
session = models.ForeignKey(ChatSession, related_name="messages", on_delete=models.CASCADE)
timestamp = models.DateTimeField(auto_now_add=True) # Changed to auto_now_add for simplicity
sender = models.CharField(max_length=255) # "User" or "Assistant"
message = models.TextField()
safe_html_message = models.TextField(blank=True, null=True) # For storing sanitized HTML
def __str__(self):
return f"{self.session.session_id} - {self.sender} at {self.timestamp}"
class ExternalDataSource(models.Model):
name = models.CharField(max_length=255, default="External API")
api_url = models.URLField(default="https://proto.notso.ai/jumbo/chats")
auth_username = models.CharField(max_length=255, blank=True, null=True)
auth_password = models.CharField(
max_length=255, blank=True, null=True
) # Consider using a more secure way to store credentials
last_synced = models.DateTimeField(null=True, blank=True)
is_active = models.BooleanField(default=True)
error_count = models.IntegerField(default=0)
last_error = models.CharField(max_length=255, blank=True, null=True)
sync_interval = models.IntegerField(default=3600, help_text="Sync interval in seconds. Default is 3600 (1 hour)")
timeout = models.IntegerField(
default=300,
help_text="Timeout in seconds for each sync operation. Default is 300 (5 minutes)",
)
def get_auth_username(self):
"""Get username from environment variable if set, otherwise use stored value"""
env_username = os.environ.get("EXTERNAL_API_USERNAME")
return env_username if env_username else self.auth_username
def get_auth_password(self):
"""Get password from environment variable if set, otherwise use stored value"""
env_password = os.environ.get("EXTERNAL_API_PASSWORD")
return env_password if env_password else self.auth_password
def get_status(self):
"""Get the status of this data source"""
if not self.is_active:
return "Inactive"
if not self.last_synced:
return "Never synced"
if self.error_count > 0:
return f"Error ({self.error_count})"
return "Active"
def __str__(self):
return self.name
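
get_auth_username() and get_auth_password() above prefer the EXTERNAL_API_USERNAME / EXTERNAL_API_PASSWORD environment variables over the stored fields; a small usage sketch with an illustrative value:

# Django shell sketch; the credential value is illustrative
import os
from data_integration.models import ExternalDataSource
os.environ["EXTERNAL_API_USERNAME"] = "api-user"
source = ExternalDataSource.objects.first()
print(source.get_auth_username())  # "api-user"; falls back to auth_username when the env var is unset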

View File

@@ -0,0 +1,116 @@
import logging
import os
from celery import shared_task
from django.db import utils as django_db_utils
from django.utils import timezone
from .models import ExternalDataSource
from .utils import fetch_and_store_chat_data
logger = logging.getLogger(__name__)
@shared_task(name="data_integration.tasks.test_task", bind=True)
def test_task(self):
"""A simple test task to verify Celery is working without external dependencies."""
logger.info("Test task executed at %s (task_id: %s)", timezone.now(), self.request.id)
return "Test task completed successfully!"
@shared_task(
name="data_integration.tasks.periodic_fetch_chat_data",
bind=True,
autoretry_for=(Exception,),
retry_kwargs={"max_retries": 3, "countdown": 60},
soft_time_limit=int(os.environ.get("FETCH_DATA_TIMEOUT", 300)), # 5 minutes default
)
def periodic_fetch_chat_data(self):
"""Periodically fetch and process chat data from external sources.
This task:
1. Fetches data from all active external data sources
2. Processes and stores the data in the database
3. Updates the last_synced timestamp on each source
4. Handles errors with retries
"""
logger.info("Starting periodic chat data fetch (task_id: %s)...", self.request.id)
try:
# Get all active data sources
active_sources = ExternalDataSource.objects.filter(is_active=True)
if not active_sources.exists():
logger.warning("No active external data sources found. Skipping fetch.")
return "No active data sources found"
successful_sources = []
failed_sources = []
for source in active_sources:
try:
logger.info(f"Processing source: {source.name} (ID: {source.id})")
fetch_and_store_chat_data(source_id=source.id)
source.last_synced = timezone.now()
# Check if error_count field exists in the model
update_fields = ["last_synced"]
try:
source.error_count = 0
source.last_error = None
update_fields.extend(["error_count", "last_error"])
except AttributeError:
# Fields might not exist yet if migrations haven't been applied
logger.warning("New fields not available. Run migrations to enable error tracking.")
source.save(update_fields=update_fields)
successful_sources.append(source.name)
except Exception as e:
logger.error(f"Error fetching data from source {source.name}: {e}", exc_info=True)
try:
source.error_count = getattr(source, "error_count", 0) + 1
source.last_error = str(e)[:255] # Truncate to fit in the field
source.save(update_fields=["error_count", "last_error"])
except (AttributeError, django_db_utils.OperationalError):
# If fields don't exist, just update last_synced
logger.warning("Could not update error fields. Run migrations to enable error tracking.")
source.last_synced = timezone.now()
source.save(update_fields=["last_synced"])
failed_sources.append(source.name)
if failed_sources and not successful_sources:
# If all sources failed, we should raise an exception to trigger retry
raise Exception(f"All data sources failed: {', '.join(failed_sources)}")
result_message = f"Completed: {len(successful_sources)} successful, {len(failed_sources)} failed"
logger.info(result_message)
return result_message
except Exception as e:
logger.error(f"Error during periodic chat data fetch: {e}", exc_info=True)
raise # Re-raise to trigger Celery retry
@shared_task(name="data_integration.tasks.refresh_specific_source", bind=True)
def refresh_specific_source(self, source_id):
"""Manually refresh a specific data source.
Args:
source_id: ID of the ExternalDataSource to refresh
"""
logger.info(f"Starting manual refresh of data source ID: {source_id} (task_id: {self.request.id})")
try:
source = ExternalDataSource.objects.get(id=source_id)
fetch_and_store_chat_data(source_id=source_id)
source.last_synced = timezone.now()
source.error_count = 0
source.last_error = None
source.save(update_fields=["last_synced", "error_count", "last_error"])
logger.info(f"Manual refresh of data source {source.name} completed successfully")
return f"Successfully refreshed data source: {source.name}"
except ExternalDataSource.DoesNotExist:
logger.error(f"Data source with ID {source_id} does not exist")
return f"Error: Data source with ID {source_id} does not exist"
except Exception as e:
logger.error(
f"Error during manual refresh of data source {source_id}: {e}",
exc_info=True,
)
return f"Error: {str(e)}"

View File

@@ -0,0 +1 @@
# Create your tests here.

View File

@@ -0,0 +1,14 @@
from django.urls import path
from . import views
app_name = "data_integration"
urlpatterns = [
path("manual-refresh/", views.manual_data_refresh, name="manual_data_refresh"),
path(
"refresh/<int:source_id>/",
views.refresh_specific_datasource,
name="refresh_specific_datasource",
),
]
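
These routes only become reachable once the app URLconf is included from the project-level urls.py, which is not shown in this hunk. A minimal sketch; the "data/" prefix is an assumption:

# project urls.py sketch (the "data/" prefix is an assumption)
from django.urls import include, path

urlpatterns = [
    path("data/", include("data_integration.urls")),
]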

View File

@@ -0,0 +1,340 @@
import csv
import logging
from datetime import datetime
import bleach
import requests
from bleach.css_sanitizer import CSSSanitizer
from django.utils.timezone import make_aware
from .models import ChatMessage, ChatSession, ExternalDataSource
logger = logging.getLogger(__name__)
EXPECTED_HEADERS = [
"session_id",
"start_time",
"end_time",
"ip_address",
"country",
"language",
"messages_sent",
"sentiment",
"escalated",
"forwarded_hr",
"full_transcript",
"avg_response_time",
"tokens",
"tokens_eur",
"category",
"initial_msg",
"user_rating",
]
def fetch_and_store_chat_data(source_id=None):
"""Fetch chat data from an external API and store it in the database.
Args:
source_id: Optional ID of specific ExternalDataSource to use.
If None, will use the first active source.
Returns:
dict: Stats about the operation (sessions created, updated, errors)
"""
if source_id:
source = ExternalDataSource.objects.filter(id=source_id, is_active=True).first()
if not source:
logger.error(f"Data source with ID {source_id} not found or not active.")
return {
"success": False,
"error": f"Data source with ID {source_id} not found or not active.",
}
else:
source = ExternalDataSource.objects.filter(is_active=True).first()
if not source:
logger.warning("No active data source found.")
return {"success": False, "error": "No active data source found."}
stats = {
"sessions_created": 0,
"sessions_updated": 0,
"transcripts_processed": 0,
"errors": 0,
"success": True,
}
try:
# Fetch data from API with timeout from source settings or default
timeout = getattr(source, "timeout", 30)
response = requests.get(
source.api_url,
auth=((source.get_auth_username(), source.get_auth_password()) if source.get_auth_username() else None),
timeout=timeout,
)
response.raise_for_status()
except requests.RequestException as e:
error_msg = f"Error fetching data from API {source.api_url}: {e}"
logger.error(error_msg)
return {"success": False, "error": error_msg}
# Process CSV data
csv_data = response.content.decode("utf-8").splitlines()
reader = csv.reader(csv_data)
    # The upstream CSV export has no header row, so use the predefined column list.
    header = EXPECTED_HEADERS
for row in reader:
if not row: # Skip empty rows
continue
try:
# Fix for zip() argument mismatch: pad the row with empty strings if needed
padded_row = row + [""] * (len(header) - len(row))
data = dict(zip(header, padded_row, strict=False))
try:
# Try European date format (DD.MM.YYYY) first
start_time = make_aware(datetime.strptime(data["start_time"], "%d.%m.%Y %H:%M:%S"))
except ValueError:
# Fallback to ISO format (YYYY-MM-DD)
start_time = make_aware(datetime.strptime(data["start_time"], "%Y-%m-%d %H:%M:%S"))
try:
# Try European date format (DD.MM.YYYY) first
end_time = make_aware(datetime.strptime(data["end_time"], "%d.%m.%Y %H:%M:%S"))
except ValueError:
# Fallback to ISO format (YYYY-MM-DD)
end_time = make_aware(datetime.strptime(data["end_time"], "%Y-%m-%d %H:%M:%S"))
messages_sent = int(data["messages_sent"]) if data["messages_sent"] else None
escalated = data["escalated"].lower() == "true" if data["escalated"] else None
forwarded_hr = data["forwarded_hr"].lower() == "true" if data["forwarded_hr"] else None
avg_response_time = float(data["avg_response_time"]) if data["avg_response_time"] else None
tokens = int(data["tokens"]) if data["tokens"] else None
tokens_eur = float(data["tokens_eur"]) if data["tokens_eur"] else None
user_rating = int(data["user_rating"]) if data["user_rating"] and data["user_rating"].isdigit() else None
session, created = ChatSession.objects.update_or_create(
session_id=data["session_id"],
defaults={
"start_time": start_time,
"end_time": end_time,
"ip_address": data.get("ip_address"),
"country": data.get("country"),
"language": data.get("language"),
"messages_sent": messages_sent,
"sentiment": data.get("sentiment"),
"escalated": escalated,
"forwarded_hr": forwarded_hr,
"full_transcript_url": data.get("full_transcript"),
"avg_response_time": avg_response_time,
"tokens": tokens,
"tokens_eur": tokens_eur,
"category": data.get("category"),
"initial_msg": data.get("initial_msg"),
"user_rating": user_rating,
},
)
if created:
stats["sessions_created"] += 1
logger.info(f"Created session: {session.session_id}")
else:
stats["sessions_updated"] += 1
logger.info(f"Updated session: {session.session_id}")
# Fetch and process transcript if URL is present
if session.full_transcript_url:
transcript_result = fetch_and_store_transcript(session, timeout)
if transcript_result["success"]:
stats["transcripts_processed"] += 1
except Exception as e:
logger.error(f"Error processing row: {row}. Error: {e}", exc_info=True)
stats["errors"] += 1
continue
source.last_synced = make_aware(datetime.now())
source.save()
logger.info("Data sync complete. Stats: {stats}")
return stats
def fetch_and_store_transcript(session, timeout=30):
"""Fetch and process transcript for a chat session.
Args:
session: The ChatSession object
timeout: Timeout in seconds for the request
Returns:
dict: Result of the operation
"""
result = {"success": False, "messages_created": 0, "error": None}
try:
transcript_response = requests.get(session.full_transcript_url, timeout=timeout)
transcript_response.raise_for_status()
transcript_content = transcript_response.content.decode("utf-8")
messages_created = parse_and_store_transcript_messages(session, transcript_content)
result["success"] = True
result["messages_created"] = messages_created
return result
except requests.RequestException as e:
error_msg = f"Error fetching transcript for session {session.session_id}: {e}"
logger.error(error_msg)
result["error"] = error_msg
return result
except Exception as e:
error_msg = f"Error processing transcript for session {session.session_id}: {e}"
logger.error(error_msg, exc_info=True)
result["error"] = error_msg
return result
def parse_and_store_transcript_messages(session, transcript_content):
"""Parse and store messages from a transcript.
Args:
session: The ChatSession object
transcript_content: The raw transcript content
Returns:
int: Number of messages created
"""
lines = transcript_content.splitlines()
current_sender = None
current_message_lines = []
messages_created = 0
# First, delete existing messages for this session to avoid duplicates
existing_count = ChatMessage.objects.filter(session=session).count()
if existing_count > 0:
logger.info(f"Deleting {existing_count} existing messages for session {session.session_id}")
ChatMessage.objects.filter(session=session).delete()
for line in lines:
if line.startswith("User:"):
if (
current_sender
and current_message_lines
and save_message(session, current_sender, "\n".join(current_message_lines))
):
messages_created += 1
current_sender = "User"
current_message_lines = [line.replace("User:", "").strip()]
elif line.startswith("Assistant:"):
if (
current_sender
and current_message_lines
and save_message(session, current_sender, "\n".join(current_message_lines))
):
messages_created += 1
current_sender = "Assistant"
current_message_lines = [line.replace("Assistant:", "").strip()]
elif current_sender:
current_message_lines.append(line.strip())
# Save the last message
if (
current_sender
and current_message_lines
and save_message(session, current_sender, "\n".join(current_message_lines))
):
messages_created += 1
logger.info(f"Created {messages_created} messages for session {session.session_id}")
return messages_created
def save_message(session, sender, message_text):
"""Save a message for a chat session.
Args:
session: The ChatSession object
sender: The sender of the message ("User" or "Assistant")
message_text: The message text, which may contain HTML
Returns:
bool: True if message was created, False otherwise
"""
if not message_text.strip():
return False
try:
# Create a CSS sanitizer with allowed CSS properties
css_sanitizer = CSSSanitizer(
allowed_css_properties=[
"color",
"background-color",
"font-family",
"font-size",
"font-weight",
"font-style",
"text-decoration",
"text-align",
"margin",
"margin-left",
"margin-right",
"margin-top",
"margin-bottom",
"padding",
"padding-left",
"padding-right",
"padding-top",
"padding-bottom",
"border",
"border-radius",
"width",
"height",
"line-height",
]
)
# Sanitize HTML content before saving if necessary
safe_html = bleach.clean(
message_text,
tags=[
"b",
"i",
"u",
"em",
"strong",
"a",
"br",
"p",
"ul",
"ol",
"li",
"span",
"div",
"pre",
"code",
"blockquote",
],
attributes={
"a": ["href", "title", "target"],
"span": ["style", "class"],
"div": ["style", "class"],
"p": ["style", "class"],
"pre": ["style", "class"],
},
css_sanitizer=css_sanitizer,
strip=True,
)
ChatMessage.objects.create(
session=session,
sender=sender,
message=message_text,
safe_html_message=safe_html,
)
logger.debug(f"Stored message for session {session.session_id} from {sender}")
return True
except Exception as e:
logger.error(f"Error saving message for session {session.session_id}: {e}", exc_info=True)
return False
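
Outside of Celery, fetch_and_store_chat_data can be exercised directly from a Django shell; a usage sketch assuming at least one active ExternalDataSource is configured:

# python manage.py shell
from data_integration.utils import fetch_and_store_chat_data

stats = fetch_and_store_chat_data()  # uses the first active source when no source_id is passed
print(stats)  # e.g. {"success": True, "sessions_created": 12, "sessions_updated": 3, ...}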

View File

@@ -0,0 +1,54 @@
from django.contrib import messages
from django.contrib.admin.views.decorators import staff_member_required
from django.contrib.auth.decorators import login_required, user_passes_test
from django.shortcuts import get_object_or_404, redirect
from .models import ExternalDataSource
from .tasks import periodic_fetch_chat_data, refresh_specific_source
from .utils import fetch_and_store_chat_data
# Create your views here.
def is_superuser(user):
return user.is_superuser
@login_required
@user_passes_test(is_superuser)
def manual_data_refresh(request):
if request.method == "POST":
try:
# Try to use Celery first
try:
# Asynchronous with Celery
periodic_fetch_chat_data.delay()
messages.success(
request,
"Manual data refresh triggered successfully. The data will be updated shortly.",
)
except Exception:
# Fall back to synchronous if Celery is not available
fetch_and_store_chat_data()
messages.success(
request,
"Manual data refresh completed successfully (synchronous mode).",
)
except Exception as e:
messages.error(request, f"Failed to refresh data: {e}")
return redirect(request.headers.get("referer", "dashboard")) # Redirect to previous page or dashboard
@staff_member_required
def refresh_specific_datasource(request, source_id):
"""View to trigger refresh of a specific data source. Used as a backup for admin URLs."""
source = get_object_or_404(ExternalDataSource, pk=source_id)
try:
# Try to use Celery
task = refresh_specific_source.delay(source_id)
messages.success(request, f"Data refresh task started for {source.name} (Task ID: {task.id})")
except Exception as e:
messages.error(request, f"Failed to refresh data source {source.name}: {e}")
return redirect(request.headers.get("referer", "/admin/data_integration/externaldatasource/"))