|
2 | 2 | import logging |
3 | 3 |
|
4 | 4 | import celery |
| 5 | +import sentry_sdk |
5 | 6 | from celery.exceptions import Ignore |
6 | 7 | from django.conf import settings |
7 | 8 | from django.db.models import Q |
8 | 9 |
|
| 10 | +from learning_resources.content_summarizer import ContentSummarizer |
9 | 11 | from learning_resources.models import ( |
10 | 12 | ContentFile, |
11 | 13 | Course, |
|
32 | 34 | chunks, |
33 | 35 | now_in_utc, |
34 | 36 | ) |
| 37 | +from vector_search.constants import ( |
| 38 | + CONTENT_FILES_COLLECTION_NAME, |
| 39 | + RESOURCES_COLLECTION_NAME, |
| 40 | +) |
35 | 41 | from vector_search.utils import ( |
36 | 42 | embed_learning_resources, |
37 | 43 | embed_topics, |
| 44 | + filter_existing_qdrant_points_by_ids, |
38 | 45 | remove_qdrant_records, |
| 46 | + vector_point_id, |
39 | 47 | ) |
40 | 48 |
|
41 | 49 | log = logging.getLogger(__name__) |
@@ -369,6 +377,139 @@ def remove_run_content_files(run_id): |
369 | 377 |
|
370 | 378 |
|
@app.task
def embeddings_healthcheck():
    """
    Check for missing embeddings and summaries in Qdrant and log warnings to Sentry

    Every published or test-mode learning resource is inspected. For each one,
    the point id of the resource itself and the point ids of the published
    content files with content (from the resource's best run, falling back to
    the published run with the latest start date) are passed to
    filter_existing_qdrant_points_by_ids. The point ids it returns are counted
    as missing embeddings. Content files still lacking summaries are collected
    via _missing_summaries. Counts are logged, and one Sentry message is sent
    for each category that is not empty.
    """
    remaining_content_files = []
    remaining_resources = []
    resource_point_ids = {}
    # Shared across every resource: the id lookup after the loop has to resolve
    # missing points from all runs, not only from the last resource processed.
    content_file_point_ids = {}
    all_resources = LearningResource.objects.filter(
        Q(published=True) | Q(test_mode=True)
    )

    for lr in all_resources:
        # Read best_run once; fall back to the most recently started
        # published run when there is none.
        run = (
            lr.best_run
            or lr.runs.filter(published=True).order_by("-start_date").first()
        )
        point_id = vector_point_id(lr.readable_id)
        resource_point_ids[point_id] = {"resource_id": lr.readable_id, "id": lr.id}
        if not run:
            continue

        # Point ids for this run only, so each Qdrant lookup stays scoped to
        # the run that was just scanned.
        run_point_ids = {}
        for cf in run.content_files.filter(published=True):
            if cf and cf.content:
                # NOTE(review): the ".0" suffix presumably addresses the first
                # chunk of the content file - confirm against the embed code.
                cf_point_id = vector_point_id(
                    f"{lr.readable_id}.{run.run_id}.{cf.key}.0"
                )
                run_point_ids[cf_point_id] = {"key": cf.key, "id": cf.id}
        content_file_point_ids.update(run_point_ids)

        for batch in chunks(run_point_ids.keys(), chunk_size=200):
            remaining_content_files.extend(
                filter_existing_qdrant_points_by_ids(
                    batch, collection_name=CONTENT_FILES_COLLECTION_NAME
                )
            )

    for batch in chunks(
        all_resources.values_list("readable_id", flat=True),
        chunk_size=200,
    ):
        remaining_resources.extend(
            filter_existing_qdrant_points_by_ids(
                [vector_point_id(pid) for pid in batch],
                collection_name=RESOURCES_COLLECTION_NAME,
            )
        )

    # Translate the returned point ids back to database primary keys.
    remaining_content_file_ids = [
        content_file_point_ids.get(p, {}).get("id") for p in remaining_content_files
    ]
    remaining_resource_ids = [
        resource_point_ids.get(p, {}).get("id") for p in remaining_resources
    ]
    missing_summaries = _missing_summaries()
    missing_summary_count = len(missing_summaries)
    log.info(
        "Embeddings healthcheck found %d missing content file embeddings",
        len(remaining_content_files),
    )
    log.info(
        "Embeddings healthcheck found %d missing resource embeddings",
        len(remaining_resources),
    )
    log.info(
        "Embeddings healthcheck found %d missing summaries and flashcards",
        missing_summary_count,
    )

    if remaining_content_files:
        _sentry_healthcheck_log(
            "embeddings",
            "missing_content_file_embeddings",
            {
                "count": len(remaining_content_files),
                "ids": remaining_content_file_ids,
                # Capped at 100 rows to keep the Sentry payload small.
                "run_ids": set(
                    ContentFile.objects.filter(
                        id__in=remaining_content_file_ids
                    ).values_list("run__run_id", flat=True)[:100]
                ),
            },
            f"Warning: {len(remaining_content_files)} missing content file "
            "embeddings detected",
        )

    if remaining_resources:
        _sentry_healthcheck_log(
            "embeddings",
            "missing_learning_resource_embeddings",
            {
                "count": len(remaining_resource_ids),
                "ids": remaining_resource_ids,
                "titles": list(
                    LearningResource.objects.filter(
                        id__in=remaining_resource_ids
                    ).values_list("title", flat=True)
                ),
            },
            f"Warning: {len(remaining_resource_ids)} missing learning resource "
            "embeddings detected",
        )

    if missing_summary_count > 0:
        _sentry_healthcheck_log(
            "embeddings",
            "missing_content_file_summaries",
            {
                "count": missing_summary_count,
                "ids": missing_summaries,
                # Capped at 100 rows to keep the Sentry payload small.
                "run_ids": set(
                    ContentFile.objects.filter(id__in=missing_summaries).values_list(
                        "run__run_id", flat=True
                    )[:100]
                ),
            },
            f"Warning: {missing_summary_count} missing content file summaries "
            "detected",
        )
| 494 | + |
| 495 | + |
def _missing_summaries():
    """
    Return the unprocessed content file ids reported by ContentSummarizer

    The summarizer is given the ids of learning resources that require
    summaries and are either published or in test mode.
    """
    summary_resources = LearningResource.objects.filter(require_summaries=True)
    visible_resources = summary_resources.filter(
        Q(published=True) | Q(test_mode=True)
    )
    resource_ids = visible_resources.values_list("id", flat=True)
    return ContentSummarizer().get_unprocessed_content_file_ids(resource_ids)
| 503 | + |
| 504 | + |
def _sentry_healthcheck_log(healthcheck, alert_type, context, message):
    """
    Send a healthcheck message to Sentry with identifying tags and context

    Args:
        healthcheck (str): value for the "healthcheck" tag
        alert_type (str): value for the "alert_type" tag; also used as the
            name of the context section attached to the event
        context (dict): details attached to the event
        message (str): message captured in Sentry
    """
    # Use a fresh scope so the tags and context apply to this message only.
    with sentry_sdk.new_scope() as scope:
        scope.set_tag("healthcheck", healthcheck)
        scope.set_tag("alert_type", alert_type)
        # Name the context after the alert type so each alert's details are
        # labelled as what they are, not always as content file embeddings.
        scope.set_context(alert_type, context)
        sentry_sdk.capture_message(message)
| 511 | + |
| 512 | + |
372 | 513 | def sync_topics(): |
373 | 514 | """ |
374 | 515 | Sync topics to the Qdrant collection |
|
0 commit comments