Skip to content

Commit 3ba27bd

Browse files
authored
embeddings healthcheck (#2676)
* stashing changes * fixing output * adding sentry logging and docstring * adding safe getters * switching off of deprecated push_scope * added tests * adding normal log lines and extra context * truncate run ids * adding periodic celery task * adding check for summaries * adding test
1 parent a39da81 commit 3ba27bd

File tree

3 files changed

+238
-0
lines changed

3 files changed

+238
-0
lines changed

main/settings_celery.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,12 @@
184184
"task": "vector_search.tasks.sync_topics",
185185
"schedule": crontab(minute=0, hour="6,18,23"), # 2am 2pm and 7pm EST
186186
},
187+
"weekly_check_missing_embeddings": {
188+
"task": "vector_search.tasks.embeddings_healthcheck",
189+
"schedule": crontab(
190+
minute=0, hour=6, day_of_week=6
191+
), # 2:00am EST on Saturday
192+
},
187193
}
188194
)
189195

vector_search/tasks.py

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@
22
import logging
33

44
import celery
5+
import sentry_sdk
56
from celery.exceptions import Ignore
67
from django.conf import settings
78
from django.db.models import Q
89

10+
from learning_resources.content_summarizer import ContentSummarizer
911
from learning_resources.models import (
1012
ContentFile,
1113
Course,
@@ -32,10 +34,16 @@
3234
chunks,
3335
now_in_utc,
3436
)
37+
from vector_search.constants import (
38+
CONTENT_FILES_COLLECTION_NAME,
39+
RESOURCES_COLLECTION_NAME,
40+
)
3541
from vector_search.utils import (
3642
embed_learning_resources,
3743
embed_topics,
44+
filter_existing_qdrant_points_by_ids,
3845
remove_qdrant_records,
46+
vector_point_id,
3947
)
4048

4149
log = logging.getLogger(__name__)
@@ -369,6 +377,139 @@ def remove_run_content_files(run_id):
369377

370378

371379
@app.task
def embeddings_healthcheck():
    """
    Check for missing embeddings and summaries in Qdrant and log warnings to Sentry

    Every published or test-mode learning resource is inspected. The expected
    point id of the resource is derived from its readable_id, and for each
    published content file with content in the selected run a point id is
    derived from "<readable_id>.<run_id>.<key>.0". Those ids are handed to
    filter_existing_qdrant_points_by_ids in batches of 200, and whatever it
    returns is counted as missing. Missing summaries come from
    _missing_summaries(). The three counts are written to the log, and one
    Sentry message is sent for each category with at least one missing item.
    """
    remaining_content_files = []
    remaining_resources = []
    resource_point_ids = {}
    # Point id -> content file info for ALL resources. This must be created
    # once, outside the loop: it is read after the loop to translate the
    # missing point ids back into ContentFile ids. Re-creating it per resource
    # would leave only the last resource's content files resolvable.
    content_file_point_ids = {}
    all_resources = LearningResource.objects.filter(
        Q(published=True) | Q(test_mode=True)
    )

    for lr in all_resources:
        # Prefer the best run, otherwise fall back to the published run with
        # the latest start date.
        run = lr.best_run or (
            lr.runs.filter(published=True).order_by("-start_date").first()
        )
        resource_point_ids[vector_point_id(lr.readable_id)] = {
            "resource_id": lr.readable_id,
            "id": lr.id,
        }
        if not run:
            continue

        # Point ids for this run only, so each run is still checked in its own
        # batches.
        run_point_ids = {
            vector_point_id(f"{lr.readable_id}.{run.run_id}.{cf.key}.0"): {
                "key": cf.key,
                "id": cf.id,
            }
            for cf in run.content_files.filter(published=True)
            if cf and cf.content
        }
        content_file_point_ids.update(run_point_ids)
        for batch in chunks(run_point_ids.keys(), chunk_size=200):
            remaining_content_files.extend(
                filter_existing_qdrant_points_by_ids(
                    batch, collection_name=CONTENT_FILES_COLLECTION_NAME
                )
            )

    for batch in chunks(
        all_resources.values_list("readable_id", flat=True),
        chunk_size=200,
    ):
        remaining_resources.extend(
            filter_existing_qdrant_points_by_ids(
                [vector_point_id(pid) for pid in batch],
                collection_name=RESOURCES_COLLECTION_NAME,
            )
        )

    # Translate point ids back to database ids; the chained .get() calls yield
    # None instead of raising when a point id is not in the lookup.
    remaining_content_file_ids = [
        content_file_point_ids.get(p, {}).get("id") for p in remaining_content_files
    ]
    remaining_resource_ids = [
        resource_point_ids.get(p, {}).get("id") for p in remaining_resources
    ]
    missing_summaries = _missing_summaries()
    log.info(
        "Embeddings healthcheck found %d missing content file embeddings",
        len(remaining_content_files),
    )
    log.info(
        "Embeddings healthcheck found %d missing resource embeddings",
        len(remaining_resources),
    )
    log.info(
        "Embeddings healthcheck found %d missing summaries and flashcards",
        len(missing_summaries),
    )

    if len(remaining_content_files) > 0:
        _sentry_healthcheck_log(
            "embeddings",
            "missing_content_file_embeddings",
            {
                "count": len(remaining_content_files),
                "ids": remaining_content_file_ids,
                # Run ids are read from at most 100 rows and de-duplicated.
                "run_ids": set(
                    ContentFile.objects.filter(
                        id__in=remaining_content_file_ids
                    ).values_list("run__run_id", flat=True)[:100]
                ),
            },
            f"Warning: {len(remaining_content_files)} missing content file "
            "embeddings detected",
        )

    if len(remaining_resources) > 0:
        _sentry_healthcheck_log(
            "embeddings",
            "missing_learning_resource_embeddings",
            {
                "count": len(remaining_resource_ids),
                "ids": remaining_resource_ids,
                "titles": list(
                    LearningResource.objects.filter(
                        id__in=remaining_resource_ids
                    ).values_list("title", flat=True)
                ),
            },
            f"Warning: {len(remaining_resource_ids)} missing learning resource "
            "embeddings detected",
        )
    if len(missing_summaries) > 0:
        _sentry_healthcheck_log(
            "embeddings",
            "missing_content_file_summaries",
            {
                "count": len(missing_summaries),
                "ids": missing_summaries,
                # Run ids are read from at most 100 rows and de-duplicated.
                "run_ids": set(
                    ContentFile.objects.filter(id__in=missing_summaries).values_list(
                        "run__run_id", flat=True
                    )[:100]
                ),
            },
            f"Warning: {len(missing_summaries)} missing content file summaries "
            "detected",
        )
494+
495+
496+
def _missing_summaries():
    """
    Return the unprocessed content file ids reported by ContentSummarizer

    Only learning resources that require summaries and are either published or
    in test mode are passed to the summarizer.
    """
    content_summarizer = ContentSummarizer()
    is_visible = Q(published=True) | Q(test_mode=True)
    eligible_resource_ids = (
        LearningResource.objects.filter(require_summaries=True)
        .filter(is_visible)
        .values_list("id", flat=True)
    )
    return content_summarizer.get_unprocessed_content_file_ids(eligible_resource_ids)
503+
504+
505+
def _sentry_healthcheck_log(healthcheck, alert_type, context, message):
    """
    Send a healthcheck message to Sentry with tags and context attached

    Args:
        healthcheck: value for the "healthcheck" tag
        alert_type: value for the "alert_type" tag; also used as the name of
            the context attached to the message
        context: extra data attached to the message as a Sentry context
        message: the text passed to sentry_sdk.capture_message
    """
    # A fresh scope keeps the tags and context limited to this one message.
    with sentry_sdk.new_scope() as scope:
        scope.set_tag("healthcheck", healthcheck)
        scope.set_tag("alert_type", alert_type)
        # Name the context after the alert so that resource and summary alerts
        # are not filed under "missing_content_file_embeddings".
        scope.set_context(alert_type, context)
        sentry_sdk.capture_message(message)
511+
512+
372513
def sync_topics():
373514
"""
374515
Sync topics to the Qdrant collection

vector_search/tasks_test.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,10 @@
1010
)
1111
from learning_resources.factories import (
1212
ContentFileFactory,
13+
ContentSummarizerConfigurationFactory,
1314
CourseFactory,
1415
LearningResourceFactory,
16+
LearningResourcePlatformFactory,
1517
LearningResourceRunFactory,
1618
ProgramFactory,
1719
)
@@ -24,8 +26,10 @@
2426
embed_learning_resources_by_id,
2527
embed_new_content_files,
2628
embed_new_learning_resources,
29+
embeddings_healthcheck,
2730
start_embed_resources,
2831
)
32+
from vector_search.utils import vector_point_id
2933

3034
pytestmark = pytest.mark.django_db
3135

@@ -388,3 +392,90 @@ def test_embed_new_content_files_without_runs(mocker, mocked_celery):
388392
embedded_ids = generate_embeddings_mock.si.mock_calls[0].args[0]
389393
for contentfile_id in content_files_without_run:
390394
assert contentfile_id in embedded_ids
395+
396+
397+
def test_embeddings_healthcheck_no_missing_embeddings(mocker):
    """
    embeddings_healthcheck should send nothing to Sentry when the Qdrant
    filter reports no missing points
    """
    resource = LearningResourceFactory.create(published=True)
    LearningResourceRunFactory.create(published=True, learning_resource=resource)
    first_run = resource.runs.first()
    ContentFileFactory.create(run=first_run, content="test", published=True)

    # Patch the whole sentry_sdk module as seen from the tasks module.
    sentry_sdk_mock = mocker.patch("vector_search.tasks.sentry_sdk", autospec=True)
    # The mocked filter reports no missing points for any batch.
    mocker.patch(
        "vector_search.tasks.filter_existing_qdrant_points_by_ids",
        return_value=[],
    )

    embeddings_healthcheck()

    capture_calls = sentry_sdk_mock.capture_message.call_count
    assert capture_calls == 0
411+
412+
413+
def test_embeddings_healthcheck_missing_both(mocker):
    """
    Test embeddings_healthcheck when there are missing content files and learning resources
    """
    lr = LearningResourceFactory.create(published=True)
    LearningResourceRunFactory.create(published=True, learning_resource=lr)
    cf = ContentFileFactory.create(run=lr.runs.first(), content="test", published=True)
    mocker.patch(
        "vector_search.tasks.filter_existing_qdrant_points_by_ids",
        # Return values follow the call order in the task: content file
        # batches are checked inside the resource loop, resource batches
        # afterwards.
        side_effect=[
            [
                # Same id format as the task: readable_id, not the database id.
                vector_point_id(
                    f"{cf.run.learning_resource.readable_id}"
                    f".{cf.run.run_id}.{cf.key}.0"
                )
            ],
            [vector_point_id(lr.readable_id)],
        ],
    )
    mock_sentry = mocker.patch("vector_search.tasks.sentry_sdk.capture_message")

    embeddings_healthcheck()

    # One message for content file embeddings, one for resource embeddings.
    assert mock_sentry.call_count == 2
436+
437+
438+
def test_embeddings_healthcheck_missing_summaries(mocker):
    """
    embeddings_healthcheck should send exactly one Sentry message, the
    missing-summaries warning, for a content file with an empty summary
    """
    extensions = [".srt"]
    content_types = ["file"]
    platform = LearningResourcePlatformFactory.create()
    ContentSummarizerConfigurationFactory.create(
        allowed_extensions=extensions,
        allowed_content_types=content_types,
        is_active=True,
        llm_model="test",
        platform__code=platform.code,
    )

    resource = LearningResourceFactory.create(
        published=True, require_summaries=True, platform=platform
    )
    # Drop any existing runs so the run created below is the only one.
    resource.runs.all().delete()
    run = LearningResourceRunFactory.create(
        published=True,
        learning_resource=resource,
    )
    run.learning_resource = resource
    run.save()

    ContentFileFactory.create(
        published=True,
        content="test",
        file_extension=extensions[0],
        summary="",
        content_type=content_types[0],
        run=run,
    )

    # The Qdrant filter is replaced by a bare mock with no return value set.
    mocker.patch(
        "vector_search.tasks.filter_existing_qdrant_points_by_ids",
    )
    capture_message_mock = mocker.patch(
        "vector_search.tasks.sentry_sdk.capture_message"
    )

    embeddings_healthcheck()

    assert capture_message_mock.call_count == 1
    sent_message = capture_message_mock.mock_calls[0].args[0]
    assert sent_message == "Warning: 1 missing content file summaries detected"

0 commit comments

Comments
 (0)