@decorators.action(
    detail=True,
    permission_classes=[HasBuildAPIKey],
    methods=["post"],
)
def healthcheck(self, request, **kwargs):
    """
    Record a healthcheck ping for a build.

    Stores the current time in ``Build.healthcheck`` and answers with
    ``204 No Content``.

    :param request: incoming request; ``request.build_api_key`` is read to
        find the project the API key belongs to.
    :raises Http404: when the build's project slug differs from the slug of
        the project attached to the build API key.
    """
    build = self.get_object()
    # ``Build`` has its own ``project`` relation, so there is no need to
    # traverse ``build.version`` to reach the project.
    project_slug = build.project.slug
    log.debug(
        "Healthcheck received.",
        build_id=build.pk,
        project_slug=project_slug,
    )

    build_api_key = request.build_api_key
    if project_slug != build_api_key.project.slug:
        log.warning(
            "Project slug doesn't match the one attached to the API key.",
            api_key_id=build_api_key.id,
            project_slug=project_slug,
        )
        raise Http404()

    build.healthcheck = timezone.now()
    # Only write the ``healthcheck`` column: this endpoint is hit repeatedly
    # while the build is running, and a full-row save would write back stale
    # values for every other field loaded by ``get_object()``, overwriting
    # any update made to the build in between.
    build.save(update_fields=["healthcheck"])
    return Response(status=status.HTTP_204_NO_CONTENT)
class Migration(migrations.Migration):
    """Add the nullable ``healthcheck`` datetime field to the ``build`` model."""

    # django_safemigrate marker: this migration is declared ``before_deploy``.
    safe = Safe.before_deploy()

    dependencies = [("builds", "0063_alter_buildcommandresult")]

    operations = [
        migrations.AddField(
            model_name="build",
            name="healthcheck",
            field=models.DateTimeField(
                verbose_name="Healthcheck",
                null=True,
                blank=True,
            ),
        ),
    ]
def _run_background_healthcheck(self):
    """
    Run a cURL loop in the background to ping the healthcheck API.

    The API saves the last ping timestamp on each call. Then a periodic Celery task
    checks this value for all the running builds and decides if the build is stalled or not.
    If it's stalled, it terminates those builds and marks them as failed.

    The loop is started inside the build container (``self.container_id``)
    with a detached ``docker exec``, so this method returns immediately.
    It POSTs to the ``build-healthcheck`` endpoint, authenticating with
    ``self.build_api_key``, and sleeps ``settings.RTD_BUILD_HEALTHCHECK_DELAY``
    seconds between pings.
    """
    log.debug("Running build with healthcheck.")

    build_id = self.build.get("id")
    healthcheck_url = reverse("build-healthcheck", kwargs={"pk": build_id})
    if settings.RTD_DOCKER_COMPOSE and "ngrok" in settings.PRODUCTION_DOMAIN:
        # NOTE: we do require using NGROK here to go over internet because I
        # didn't find a way to access the `web` container from inside the
        # container the `build` container created for this particular build
        # (there are 3 containers involved locally here: web, build, and user's build)
        #
        # This shouldn't happen in production, because we are not doing Docker in Docker.
        url = f"http://readthedocs.ngrok.io{healthcheck_url}"
    else:
        url = f"{settings.SLUMBER_API_HOST}{healthcheck_url}"

    script_template = (
        "while true; do "
        'curl --max-time 2 -H "Authorization: Token {token}" -X POST {url}; '
        "sleep {delay}; "
        "done;"
    )
    delay = settings.RTD_BUILD_HEALTHCHECK_DELAY

    # Never log the real token: the build API key is a credential.
    log.debug(
        "Healthcheck command to run.",
        command=script_template.format(token="<redacted>", url=url, delay=delay),
    )

    # Pass the command as an argv list so the script reaches bash as a single
    # argument, without depending on shell-style splitting of a quoted string.
    cmd = [
        "/bin/bash",
        "-c",
        script_template.format(token=self.build_api_key, url=url, delay=delay),
    ]

    client = self.get_client()
    exec_cmd = client.exec_create(
        container=self.container_id,
        cmd=cmd,
        user=settings.RTD_DOCKER_USER,
        stdout=True,
        stderr=True,
    )
    # `detach=True` allows us to run this command in the background
    client.exec_start(exec_id=exec_cmd["Id"], stream=False, detach=True)
@app.task()
def finish_unhealthy_builds():
    """
    Finish builds that stopped reporting healthchecks.

    A build is considered unhealthy if it is not in a final state, its project
    has the ``BUILD_HEALTHCHECK`` feature, and the last healthcheck reported
    was more than ``RTD_BUILD_HEALTHCHECK_TIMEOUT`` seconds ago.

    Each of these builds is marked as ``success=False`` and
    ``state=CANCELLED``, its Celery task is revoked, and a
    ``BUILD_TERMINATED_DUE_INACTIVITY`` notification is attached to it.
    At most 50 builds are processed per run.
    """
    log.debug("Running task to finish inactive builds (no healtcheck received).")
    delta = datetime.timedelta(seconds=settings.RTD_BUILD_HEALTHCHECK_TIMEOUT)
    # NOTE(review): a build whose ``healthcheck`` is still NULL (no ping ever
    # received) does not match ``healthcheck__lt``; and ``finish_inactive_builds``
    # excludes projects with this feature. Confirm such builds are finished
    # by some other path.
    query = (
        ~Q(state__in=BUILD_FINAL_STATES)
        & Q(healthcheck__lt=timezone.now() - delta)
        & Q(project__feature__feature_id=Feature.BUILD_HEALTHCHECK)
    )

    projects_finished = set()
    builds_finished = []
    # ``select_related`` loads the project in the same query, so reading
    # ``build.project.slug`` below does not trigger one query per build.
    builds = Build.objects.filter(query).select_related("project")[:50]
    for build in builds:
        build.success = False
        build.state = BUILD_STATE_CANCELLED
        build.save()

        # Tell Celery to cancel this task in case it's in a zombie state.
        app.control.revoke(build.task_id, signal="SIGINT", terminate=True)

        Notification.objects.add(
            message_id=BuildAppError.BUILD_TERMINATED_DUE_INACTIVITY,
            attached_to=build,
        )

        builds_finished.append(build.pk)
        projects_finished.add(build.project.slug)

    if builds_finished:
        log.info(
            'Builds marked as "Terminated due inactivity" (not healthcheck received).',
            count=len(builds_finished),
            project_slugs=projects_finished,
            build_pks=builds_finished,
        )
"django_celery_beat.schedulers:DatabaseScheduler" CELERYBEAT_SCHEDULE = { + "every-minute-finish-unhealthy-builds": { + "task": "readthedocs.projects.tasks.utils.finish_unhealthy_builds", + "schedule": crontab(minute="*"), + "options": {"queue": "web"}, + }, + # TODO: delete `quarter-finish-inactive-builds` once we are fully + # migrated into build healthcheck "quarter-finish-inactive-builds": { "task": "readthedocs.projects.tasks.utils.finish_inactive_builds", "schedule": crontab(minute="*/15"),