embeddings healthcheck (#2676)

shanbady · web-flow · commit 3ba27bdc4edd · 2025-11-07T16:40:57.000-05:00
* stashing changes

* fixing output

* adding sentry logging and docstring

* adding safe getters

* switching off of deprecated push_scope

* added tests

* adding normal log lines and extra context

* truncate run ids

* adding periodic celery task

* adding check for summaries

* adding test
diff --git a/main/settings_celery.py b/main/settings_celery.py
@@ -184,6 +184,12 @@
             "task": "vector_search.tasks.sync_topics",
             "schedule": crontab(minute=0, hour="6,18,23"),  # 2am 2pm and 7pm EST
         },
+        "weekly_check_missing_embeddings": {
+            "task": "vector_search.tasks.embeddings_healthcheck",
+            "schedule": crontab(
+                minute=0, hour=6, day_of_week=6
+            ),  # 2:00am EST on Saturday
+        },
     }
 )
 
diff --git a/vector_search/tasks.py b/vector_search/tasks.py
@@ -2,10 +2,12 @@
 import logging
 
 import celery
+import sentry_sdk
 from celery.exceptions import Ignore
 from django.conf import settings
 from django.db.models import Q
 
+from learning_resources.content_summarizer import ContentSummarizer
 from learning_resources.models import (
     ContentFile,
     Course,
@@ -32,10 +34,16 @@
     chunks,
     now_in_utc,
 )
+from vector_search.constants import (
+    CONTENT_FILES_COLLECTION_NAME,
+    RESOURCES_COLLECTION_NAME,
+)
 from vector_search.utils import (
     embed_learning_resources,
     embed_topics,
+    filter_existing_qdrant_points_by_ids,
     remove_qdrant_records,
+    vector_point_id,
 )
 
 log = logging.getLogger(__name__)
@@ -369,6 +377,139 @@ def remove_run_content_files(run_id):
 
 
 @app.task
+def embeddings_healthcheck():
+    """
+    Check for missing embeddings and summaries in Qdrant and log warnings to Sentry
+
+    For every published or test-mode learning resource, the expected point id
+    of the resource and of the first chunk (".0") of each published content
+    file with content in its best run (or, failing that, its most recently
+    started published run) is computed and passed to
+    filter_existing_qdrant_points_by_ids in batches of 200. The ids returned
+    by that helper are treated as missing. Content files without summaries
+    are collected through _missing_summaries().
+
+    The three counts are always logged; a Sentry message is captured for each
+    category that has at least one missing item.
+    """
+    remaining_content_files = []
+    remaining_resources = []
+    resource_point_ids = {}
+    # Accumulated across ALL resources so that the point ids reported as
+    # missing can be mapped back to ContentFile ids once the loop is done.
+    content_file_point_ids = {}
+    all_resources = LearningResource.objects.filter(
+        Q(published=True) | Q(test_mode=True)
+    )
+
+    for lr in all_resources:
+        run = (
+            lr.best_run
+            or lr.runs.filter(published=True).order_by("-start_date").first()
+        )
+        resource_point_ids[vector_point_id(lr.readable_id)] = {
+            "resource_id": lr.readable_id,
+            "id": lr.id,
+        }
+        if not run:
+            continue
+        # Only the first chunk of each content file is checked.
+        run_point_ids = {
+            vector_point_id(f"{lr.readable_id}.{run.run_id}.{cf.key}.0"): {
+                "key": cf.key,
+                "id": cf.id,
+            }
+            for cf in run.content_files.filter(published=True)
+            if cf and cf.content
+        }
+        content_file_point_ids.update(run_point_ids)
+        for batch in chunks(run_point_ids.keys(), chunk_size=200):
+            remaining_content_files.extend(
+                filter_existing_qdrant_points_by_ids(
+                    batch, collection_name=CONTENT_FILES_COLLECTION_NAME
+                )
+            )
+
+    # Check exactly the point ids recorded above so that every id reported as
+    # missing can be resolved through resource_point_ids.
+    for batch in chunks(resource_point_ids.keys(), chunk_size=200):
+        remaining_resources.extend(
+            filter_existing_qdrant_points_by_ids(
+                list(batch),
+                collection_name=RESOURCES_COLLECTION_NAME,
+            )
+        )
+
+    remaining_content_file_ids = [
+        content_file_point_ids.get(p, {}).get("id") for p in remaining_content_files
+    ]
+    remaining_resource_ids = [
+        resource_point_ids.get(p, {}).get("id") for p in remaining_resources
+    ]
+    missing_summaries = _missing_summaries()
+    log.info(
+        "Embeddings healthcheck found %d missing content file embeddings",
+        len(remaining_content_files),
+    )
+    log.info(
+        "Embeddings healthcheck found %d missing resource embeddings",
+        len(remaining_resources),
+    )
+    log.info(
+        "Embeddings healthcheck found %d missing summaries and flashcards",
+        len(missing_summaries),
+    )
+
+    if len(remaining_content_files) > 0:
+        _sentry_healthcheck_log(
+            "embeddings",
+            "missing_content_file_embeddings",
+            {
+                "count": len(remaining_content_files),
+                "ids": remaining_content_file_ids,
+                # At most 100 rows are read, to keep the Sentry payload small
+                "run_ids": set(
+                    ContentFile.objects.filter(
+                        id__in=remaining_content_file_ids
+                    ).values_list("run__run_id", flat=True)[:100]
+                ),
+            },
+            f"Warning: {len(remaining_content_files)} missing content file "
+            "embeddings detected",
+        )
+
+    if len(remaining_resources) > 0:
+        _sentry_healthcheck_log(
+            "embeddings",
+            "missing_learning_resource_embeddings",
+            {
+                "count": len(remaining_resource_ids),
+                "ids": remaining_resource_ids,
+                "titles": list(
+                    LearningResource.objects.filter(
+                        id__in=remaining_resource_ids
+                    ).values_list("title", flat=True)
+                ),
+            },
+            f"Warning: {len(remaining_resource_ids)} missing learning resource "
+            "embeddings detected",
+        )
+    if len(missing_summaries) > 0:
+        _sentry_healthcheck_log(
+            "embeddings",
+            "missing_content_file_summaries",
+            {
+                "count": len(missing_summaries),
+                "ids": missing_summaries,
+                # At most 100 rows are read, to keep the Sentry payload small
+                "run_ids": set(
+                    ContentFile.objects.filter(id__in=missing_summaries).values_list(
+                        "run__run_id", flat=True
+                    )[:100]
+                ),
+            },
+            f"Warning: {len(missing_summaries)} missing content file summaries "
+            "detected",
+        )
+
+
+def _missing_summaries():
+    """
+    Return what ContentSummarizer.get_unprocessed_content_file_ids reports for
+    the ids of all published or test-mode learning resources that require
+    summaries
+    """
+    content_summarizer = ContentSummarizer()
+    published_or_test = Q(published=True) | Q(test_mode=True)
+    resource_ids = (
+        LearningResource.objects.filter(require_summaries=True)
+        .filter(published_or_test)
+        .values_list("id", flat=True)
+    )
+    return content_summarizer.get_unprocessed_content_file_ids(resource_ids)
+
+
+def _sentry_healthcheck_log(healthcheck, alert_type, context, message):
+    """
+    Capture a healthcheck message in Sentry within a new scope
+
+    Args:
+        healthcheck (str): value of the "healthcheck" tag
+        alert_type (str): value of the "alert_type" tag, also used as the
+            name of the context attached to the event
+        context (dict): extra data attached to the event
+        message (str): message passed to sentry_sdk.capture_message
+    """
+    with sentry_sdk.new_scope() as scope:
+        scope.set_tag("healthcheck", healthcheck)
+        scope.set_tag("alert_type", alert_type)
+        # Name the context after the alert so that resource and summary
+        # alerts are not filed under the content file embeddings context.
+        scope.set_context(alert_type, context)
+        sentry_sdk.capture_message(message)
+
+
 def sync_topics():
     """
     Sync topics to the Qdrant collection
diff --git a/vector_search/tasks_test.py b/vector_search/tasks_test.py
@@ -10,8 +10,10 @@
 )
 from learning_resources.factories import (
     ContentFileFactory,
+    ContentSummarizerConfigurationFactory,
     CourseFactory,
     LearningResourceFactory,
+    LearningResourcePlatformFactory,
     LearningResourceRunFactory,
     ProgramFactory,
 )
@@ -24,8 +26,10 @@
     embed_learning_resources_by_id,
     embed_new_content_files,
     embed_new_learning_resources,
+    embeddings_healthcheck,
     start_embed_resources,
 )
+from vector_search.utils import vector_point_id
 
 pytestmark = pytest.mark.django_db
 
@@ -388,3 +392,90 @@ def test_embed_new_content_files_without_runs(mocker, mocked_celery):
     embedded_ids = generate_embeddings_mock.si.mock_calls[0].args[0]
     for contentfile_id in content_files_without_run:
         assert contentfile_id in embedded_ids
+
+
+def test_embeddings_healthcheck_no_missing_embeddings(mocker):
+    """
+    embeddings_healthcheck should not capture any Sentry message when the
+    Qdrant lookup reports no missing points
+    """
+    resource = LearningResourceFactory.create(published=True)
+    LearningResourceRunFactory.create(learning_resource=resource, published=True)
+    first_run = resource.runs.first()
+    ContentFileFactory.create(run=first_run, content="test", published=True)
+    sentry_mock = mocker.patch("vector_search.tasks.sentry_sdk", autospec=True)
+    mocker.patch(
+        "vector_search.tasks.filter_existing_qdrant_points_by_ids",
+        return_value=[],
+    )
+
+    embeddings_healthcheck()
+
+    capture_calls = sentry_mock.capture_message.call_count
+    assert capture_calls == 0
+
+
+def test_embeddings_healthcheck_missing_both(mocker):
+    """
+    Test embeddings_healthcheck when there are missing content files and learning resources
+    """
+    lr = LearningResourceFactory.create(published=True)
+    LearningResourceRunFactory.create(published=True, learning_resource=lr)
+    cf = ContentFileFactory.create(run=lr.runs.first(), content="test", published=True)
+    mocker.patch(
+        "vector_search.tasks.filter_existing_qdrant_points_by_ids",
+        # embeddings_healthcheck looks up content file points before learning
+        # resource points, so the return values are listed in that order.
+        side_effect=[
+            [vector_point_id(f"{lr.readable_id}.{cf.run.run_id}.{cf.key}.0")],
+            [vector_point_id(lr.readable_id)],
+        ],
+    )
+    mock_sentry = mocker.patch("vector_search.tasks.sentry_sdk.capture_message")
+
+    embeddings_healthcheck()
+
+    # One message for content file embeddings, one for resource embeddings
+    assert mock_sentry.call_count == 2
+
+
+def test_embeddings_healthcheck_missing_summaries(mocker):
+    """
+    embeddings_healthcheck should capture exactly one Sentry message when a
+    content file that matches an active summarizer configuration has an
+    empty summary
+    """
+    extension = ".srt"
+    file_type = "file"
+    platform = LearningResourcePlatformFactory.create()
+    ContentSummarizerConfigurationFactory.create(
+        allowed_extensions=[extension],
+        allowed_content_types=[file_type],
+        is_active=True,
+        llm_model="test",
+        platform__code=platform.code,
+    )
+    resource = LearningResourceFactory.create(
+        published=True, require_summaries=True, platform=platform
+    )
+    # Drop the resource's existing runs so the run created below is its only one
+    resource.runs.all().delete()
+    run = LearningResourceRunFactory.create(
+        published=True,
+        learning_resource=resource,
+    )
+    run.learning_resource = resource
+    run.save()
+
+    ContentFileFactory.create(
+        published=True,
+        content="test",
+        file_extension=extension,
+        summary="",
+        content_type=file_type,
+        run=run,
+    )
+    mocker.patch(
+        "vector_search.tasks.filter_existing_qdrant_points_by_ids",
+    )
+    capture_message_mock = mocker.patch(
+        "vector_search.tasks.sentry_sdk.capture_message"
+    )
+
+    embeddings_healthcheck()
+
+    assert capture_message_mock.call_count == 1
+    first_message = capture_message_mock.mock_calls[0].args[0]
+    assert first_message == "Warning: 1 missing content file summaries detected"

Original file line number	Diff line number	Diff line change
`@@ -184,6 +184,12 @@`
`184`	`184`	`"task": "vector_search.tasks.sync_topics",`
`185`	`185`	`"schedule": crontab(minute=0, hour="6,18,23"), # 2am 2pm and 7pm EST`
`186`	`186`	`},`
	`187`	`+ "weekly_check_missing_embeddings": {`
	`188`	`+ "task": "vector_search.tasks.embeddings_healthcheck",`
	`189`	`+ "schedule": crontab(`
	`190`	`+ minute=0, hour=6, day_of_week=6`
	`191`	`+ ), # 2:00am EST on Saturday`
	`192`	`+ },`
`187`	`193`	`}`
`188`	`194`	`)`
`189`	`195`