From a828d35405611098e2f2350657f3076df31c8977 Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Thu, 17 Jul 2025 14:55:54 +0200 Subject: [PATCH 01/14] Builds: run a healthcheck command in the background Inside the Docker container that runs all the build process, we execute a simple `curl` command to hit our API at `/build//healthcheck/` every 2 seconds to communicate the backend the build is healthy. The backend runs a periodic task every 5 seconds and check for those builds that haven't had activity in the last 30 seconds and cancel those builds. Closes https://github.com/readthedocs/readthedocs.org/issues/11870 --- readthedocs/api/v2/views/model_views.py | 26 +++++++++++++++ .../builds/migrations/0064_healthcheck.py | 21 ++++++++++++ readthedocs/builds/models.py | 1 + readthedocs/doc_builder/director.py | 2 ++ readthedocs/doc_builder/environments.py | 33 +++++++++++++++++-- readthedocs/projects/tasks/builds.py | 4 ++- 6 files changed, 83 insertions(+), 4 deletions(-) create mode 100644 readthedocs/builds/migrations/0064_healthcheck.py diff --git a/readthedocs/api/v2/views/model_views.py b/readthedocs/api/v2/views/model_views.py index 4fe63641ace..ea5a4659dba 100644 --- a/readthedocs/api/v2/views/model_views.py +++ b/readthedocs/api/v2/views/model_views.py @@ -13,6 +13,7 @@ from django.db.models import When from django.http import Http404 from django.template.loader import render_to_string +from django.utils import timezone from rest_framework import decorators from rest_framework import status from rest_framework import viewsets @@ -297,6 +298,31 @@ def concurrent(self, request, **kwargs): } return Response(data) + @decorators.action( + detail=True, + permission_classes=[HasBuildAPIKey], + methods=["post"], + ) + def healthcheck(self, request, **kwargs): + build = self.get_object() + log.debug( + "Healthcheck received.", + build_id=build.pk, + project_slug=build.version.project.slug, + ) + build_api_key = request.build_api_key + if build.version.project.slug != build_api_key.project.slug: + log.warning( + "Project slug doesn't match the one attached to the API key.", + api_key_id=build_api_key.id, + project_slug=build.version.project.slug, + ) + raise Http404() + + build.healthcheck = timezone.now() + build.save() + return Response(status=status.HTTP_204_NO_CONTENT) + def retrieve(self, *args, **kwargs): """ Retrieves command data from storage. diff --git a/readthedocs/builds/migrations/0064_healthcheck.py b/readthedocs/builds/migrations/0064_healthcheck.py new file mode 100644 index 00000000000..a3398930a5e --- /dev/null +++ b/readthedocs/builds/migrations/0064_healthcheck.py @@ -0,0 +1,21 @@ +# Generated by Django 5.2.4 on 2025-07-17 11:39 + +from django.db import migrations +from django.db import models +from django_safemigrate import Safe + + +class Migration(migrations.Migration): + safe = Safe.before_deploy() + + dependencies = [ + ("builds", "0063_alter_buildcommandresult"), + ] + + operations = [ + migrations.AddField( + model_name="build", + name="healthcheck", + field=models.DateTimeField(blank=True, null=True, verbose_name="Healthcheck"), + ), + ] diff --git a/readthedocs/builds/models.py b/readthedocs/builds/models.py index 1c47d733427..917a0917fc4 100644 --- a/readthedocs/builds/models.py +++ b/readthedocs/builds/models.py @@ -644,6 +644,7 @@ class Build(models.Model): blank=True, ) date = models.DateTimeField(_("Date"), auto_now_add=True, db_index=True) + healthcheck = models.DateTimeField(_("Healthcheck"), null=True, blank=True) success = models.BooleanField(_("Success"), default=True) # TODO: remove these fields (setup, setup_error, output, error, exit_code) diff --git a/readthedocs/doc_builder/director.py b/readthedocs/doc_builder/director.py index ada7d680b13..d7cd30c6238 100644 --- a/readthedocs/doc_builder/director.py +++ b/readthedocs/doc_builder/director.py @@ -120,6 +120,7 @@ def create_vcs_environment(self): environment=self.get_vcs_env_vars(), container_image=settings.RTD_DOCKER_CLONE_IMAGE, api_client=self.data.api_client, + build_api_key=self.data.build_api_key, ) def create_build_environment(self): @@ -130,6 +131,7 @@ def create_build_environment(self): build=self.data.build, environment=self.get_build_env_vars(), api_client=self.data.api_client, + build_api_key=self.data.build_api_key, ) def setup_environment(self): diff --git a/readthedocs/doc_builder/environments.py b/readthedocs/doc_builder/environments.py index e933a60cac8..b11dd36d04e 100644 --- a/readthedocs/doc_builder/environments.py +++ b/readthedocs/doc_builder/environments.py @@ -10,13 +10,13 @@ import structlog from django.conf import settings from django.utils.translation import gettext_lazy as _ +from requests.exceptions import ConnectionError +from requests.exceptions import ReadTimeout + from docker import APIClient from docker.errors import APIError as DockerAPIError from docker.errors import DockerException from docker.errors import NotFound as DockerNotFoundError -from requests.exceptions import ConnectionError -from requests.exceptions import ReadTimeout - from readthedocs.builds.models import BuildCommandResultMixin from readthedocs.core.utils import slugify @@ -573,6 +573,7 @@ class DockerBuildEnvironment(BaseBuildEnvironment): container_time_limit = DOCKER_LIMITS.get("time") def __init__(self, *args, **kwargs): + self.build_api_key = kwargs.pop("build_api_key", None) container_image = kwargs.pop("container_image", None) super().__init__(*args, **kwargs) self.client = None @@ -829,6 +830,32 @@ def create_container(self): runtime="runsc", # gVisor runtime ) client.start(container=self.container_id) + + # Run a healthcheck that will ping our API constantly. + # If we detect the API is not pinged for 10s we mark this build as failed. + build_id = self.build.get("id") + + # NOTE: we do require using NGROK here to go over internet because I + # didn't find a way to access the `web` container from inside the + # container the `build` container created for this particular build. + # + # This shouldn't happen in production, because we are not doing Docker in Docker. + # + # TODO: update this URL to work in production + url = f"http://readthedocs.ngrok.io/api/v2/build/{build_id}/healthcheck/" + cmd = f"/bin/bash -c 'while true; do curl --max-time 2 -H \"Authorization: Token {self.build_api_key}\" -X POST {url}; sleep 2; done;'" + log.debug("Healthcheck.", command=cmd) + exec_cmd = client.exec_create( + container=self.container_id, + cmd=cmd, + user=settings.RTD_DOCKER_USER, + stdout=True, + stderr=True, + ) + # `detach=True` allows us to run this command in the background + client.exec_start(exec_id=exec_cmd["Id"], stream=False, detach=True) + # output = client.exec_start(exec_id=exec_cmd["Id"], stream=False) + # log.warning("Healthcheck.", command=cmd, output=output) except (DockerAPIError, ConnectionError) as exc: raise BuildAppError( BuildAppError.GENERIC_WITH_BUILD_ID, exception_messag=exc.explanation diff --git a/readthedocs/projects/tasks/builds.py b/readthedocs/projects/tasks/builds.py index f688171b4d6..5607d363624 100644 --- a/readthedocs/projects/tasks/builds.py +++ b/readthedocs/projects/tasks/builds.py @@ -108,6 +108,7 @@ class TaskData: # Slumber client to interact with the API v2. api_client: API = None + build_api_key: str = None start_time: timezone.datetime = None environment_class: type[DockerBuildEnvironment] | type[LocalBuildEnvironment] = None @@ -381,7 +382,8 @@ def before_start(self, task_id, args, kwargs): # anymore and we are not using it self.data.environment_class = LocalBuildEnvironment - self.data.api_client = setup_api(kwargs["build_api_key"]) + self.data.build_api_key = kwargs["build_api_key"] + self.data.api_client = setup_api(self.data.build_api_key) self.data.build = self.get_build(self.data.build_pk) self.data.version = self.get_version(self.data.version_pk) From 4e09d7622508c54f6d80a6c849b2d0d86ba559ec Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Thu, 17 Jul 2025 15:13:25 +0200 Subject: [PATCH 02/14] Builds: update `finish_inactive_builds` to use `Build.healthcheck` --- readthedocs/projects/tasks/utils.py | 50 +++++++++++------------------ readthedocs/settings/base.py | 4 +-- 2 files changed, 20 insertions(+), 34 deletions(-) diff --git a/readthedocs/projects/tasks/utils.py b/readthedocs/projects/tasks/utils.py index e5599339c4d..d28355bf43c 100644 --- a/readthedocs/projects/tasks/utils.py +++ b/readthedocs/projects/tasks/utils.py @@ -1,6 +1,7 @@ import datetime import os import re +import signal import boto3 import structlog @@ -100,43 +101,27 @@ def finish_inactive_builds(): """ Finish inactive builds. - A build is consider inactive if it's not in a final state and it has been - "running" for more time that the allowed one (``Project.container_time_limit`` - or ``DOCKER_LIMITS['time']`` plus a 20% of it). + A build is consider inactive if the last healthcheck reported was more than + 30 seconds ago. - These inactive builds will be marked as ``success`` and ``CANCELLED`` with an - ``error`` to be communicated to the user. + These inactive builds will be marked as ``success=False`` and + ``state=CANCELLED`` with an ``error`` to be communicated to the user. """ - # TODO similar to the celery task time limit, we can't infer this from - # Docker settings anymore, because Docker settings are determined on the - # build servers dynamically. - # time_limit = int(DOCKER_LIMITS['time'] * 1.2) - # Set time as maximum celery task time limit + 5m - time_limit = 7200 + 300 - delta = datetime.timedelta(seconds=time_limit) - query = ( - ~Q(state__in=BUILD_FINAL_STATES) - & Q(date__lt=timezone.now() - delta) - & Q(date__gt=timezone.now() - datetime.timedelta(days=1)) - ) + log.debug("Running task to finish inactive builds (no healtcheck received).") + delta = datetime.timedelta(seconds=30) + query = ~Q(state__in=BUILD_FINAL_STATES) & Q(healthcheck__lt=timezone.now() - delta) projects_finished = set() builds_finished = [] builds = Build.objects.filter(query)[:50] for build in builds: - if build.project.container_time_limit: - custom_delta = datetime.timedelta( - seconds=int(build.project.container_time_limit), - ) - if build.date + custom_delta > timezone.now(): - # Do not mark as CANCELLED builds with a custom time limit that wasn't - # expired yet (they are still building the project version) - continue - build.success = False build.state = BUILD_STATE_CANCELLED build.save() + # Tell Celery to cancel this task in case it's in a zombie state. + app.control.revoke(build.task_id, signal=signal.SIGINT, terminate=True) + Notification.objects.add( message_id=BuildAppError.BUILD_TERMINATED_DUE_INACTIVITY, attached_to=build, @@ -145,12 +130,13 @@ def finish_inactive_builds(): builds_finished.append(build.pk) projects_finished.add(build.project.slug) - log.info( - 'Builds marked as "Terminated due inactivity".', - count=len(builds_finished), - project_slugs=projects_finished, - build_pks=builds_finished, - ) + if builds_finished: + log.info( + 'Builds marked as "Terminated due inactivity" (not healthcheck received).', + count=len(builds_finished), + project_slugs=projects_finished, + build_pks=builds_finished, + ) def send_external_build_status(version_type, build_pk, commit, status): diff --git a/readthedocs/settings/base.py b/readthedocs/settings/base.py index 126d61507f8..350fafc85af 100644 --- a/readthedocs/settings/base.py +++ b/readthedocs/settings/base.py @@ -510,9 +510,9 @@ def TEMPLATES(self): CELERY_DEFAULT_QUEUE = "celery" CELERYBEAT_SCHEDULER = "django_celery_beat.schedulers:DatabaseScheduler" CELERYBEAT_SCHEDULE = { - "quarter-finish-inactive-builds": { + "every-minute-finish-inactive-builds": { "task": "readthedocs.projects.tasks.utils.finish_inactive_builds", - "schedule": crontab(minute="*/15"), + "schedule": crontab(minute="*"), "options": {"queue": "web"}, }, "every-day-delete-old-search-queries": { From cf411bc4c0e86ddd2642b4c1aa25711939fd9bfb Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Thu, 17 Jul 2025 15:21:53 +0200 Subject: [PATCH 03/14] Use variables for healthcheck delay and timeout --- readthedocs/doc_builder/environments.py | 2 +- readthedocs/projects/tasks/utils.py | 2 +- readthedocs/settings/base.py | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/readthedocs/doc_builder/environments.py b/readthedocs/doc_builder/environments.py index b11dd36d04e..4d4f5f382ae 100644 --- a/readthedocs/doc_builder/environments.py +++ b/readthedocs/doc_builder/environments.py @@ -843,7 +843,7 @@ def create_container(self): # # TODO: update this URL to work in production url = f"http://readthedocs.ngrok.io/api/v2/build/{build_id}/healthcheck/" - cmd = f"/bin/bash -c 'while true; do curl --max-time 2 -H \"Authorization: Token {self.build_api_key}\" -X POST {url}; sleep 2; done;'" + cmd = f"/bin/bash -c 'while true; do curl --max-time 2 -H \"Authorization: Token {self.build_api_key}\" -X POST {url}; sleep {settings.BUILD_HEALTHCHECK_DELAY}; done;'" log.debug("Healthcheck.", command=cmd) exec_cmd = client.exec_create( container=self.container_id, diff --git a/readthedocs/projects/tasks/utils.py b/readthedocs/projects/tasks/utils.py index d28355bf43c..c1e76040a1b 100644 --- a/readthedocs/projects/tasks/utils.py +++ b/readthedocs/projects/tasks/utils.py @@ -108,7 +108,7 @@ def finish_inactive_builds(): ``state=CANCELLED`` with an ``error`` to be communicated to the user. """ log.debug("Running task to finish inactive builds (no healtcheck received).") - delta = datetime.timedelta(seconds=30) + delta = datetime.timedelta(seconds=settings.RTD_BUILD_HEALTHCHECK_TIMEOUT) query = ~Q(state__in=BUILD_FINAL_STATES) & Q(healthcheck__lt=timezone.now() - delta) projects_finished = set() diff --git a/readthedocs/settings/base.py b/readthedocs/settings/base.py index 350fafc85af..ef5b2b4b441 100644 --- a/readthedocs/settings/base.py +++ b/readthedocs/settings/base.py @@ -144,6 +144,8 @@ def SHOW_DEBUG_TOOLBAR(self): RTD_STABLE = "stable" RTD_STABLE_VERBOSE_NAME = "stable" RTD_CLEAN_AFTER_BUILD = False + RTD_BUILD_HEALTHCHECK_TIMEOUT = 60 # seconds + RTD_BUILD_HEALTHCHECK_DELAY = 5 # seconds RTD_MAX_CONCURRENT_BUILDS = 4 RTD_BUILDS_MAX_RETRIES = 25 RTD_BUILDS_RETRY_DELAY = 5 * 60 # seconds From ca5332cd06ebd142552a7bd870a93b2b9f14e842 Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Thu, 17 Jul 2025 15:23:05 +0200 Subject: [PATCH 04/14] Rename setting --- readthedocs/doc_builder/environments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readthedocs/doc_builder/environments.py b/readthedocs/doc_builder/environments.py index 4d4f5f382ae..1e251c30975 100644 --- a/readthedocs/doc_builder/environments.py +++ b/readthedocs/doc_builder/environments.py @@ -843,7 +843,7 @@ def create_container(self): # # TODO: update this URL to work in production url = f"http://readthedocs.ngrok.io/api/v2/build/{build_id}/healthcheck/" - cmd = f"/bin/bash -c 'while true; do curl --max-time 2 -H \"Authorization: Token {self.build_api_key}\" -X POST {url}; sleep {settings.BUILD_HEALTHCHECK_DELAY}; done;'" + cmd = f"/bin/bash -c 'while true; do curl --max-time 2 -H \"Authorization: Token {self.build_api_key}\" -X POST {url}; sleep {settings.RTD_BUILD_HEALTHCHECK_DELAY}; done;'" log.debug("Healthcheck.", command=cmd) exec_cmd = client.exec_create( container=self.container_id, From 176e187063c1db95a4540a800b0319c2d74c6986 Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Thu, 17 Jul 2025 16:53:19 +0200 Subject: [PATCH 05/14] Use a feature flag for now --- readthedocs/doc_builder/environments.py | 61 +++++++++++++++---------- readthedocs/projects/models.py | 5 ++ 2 files changed, 42 insertions(+), 24 deletions(-) diff --git a/readthedocs/doc_builder/environments.py b/readthedocs/doc_builder/environments.py index 1e251c30975..b1dc83d7099 100644 --- a/readthedocs/doc_builder/environments.py +++ b/readthedocs/doc_builder/environments.py @@ -9,6 +9,7 @@ import structlog from django.conf import settings +from django.urls import reverse from django.utils.translation import gettext_lazy as _ from requests.exceptions import ConnectionError from requests.exceptions import ReadTimeout @@ -19,6 +20,7 @@ from docker.errors import NotFound as DockerNotFoundError from readthedocs.builds.models import BuildCommandResultMixin from readthedocs.core.utils import slugify +from readthedocs.projects.models import Feature from .constants import DOCKER_HOSTNAME_MAX_LEN from .constants import DOCKER_IMAGE @@ -831,32 +833,43 @@ def create_container(self): ) client.start(container=self.container_id) - # Run a healthcheck that will ping our API constantly. - # If we detect the API is not pinged for 10s we mark this build as failed. - build_id = self.build.get("id") + if self.project.has_feature(Feature.BUILD_HEALTHCHECK): + self._run_background_healthcheck() - # NOTE: we do require using NGROK here to go over internet because I - # didn't find a way to access the `web` container from inside the - # container the `build` container created for this particular build. - # - # This shouldn't happen in production, because we are not doing Docker in Docker. - # - # TODO: update this URL to work in production - url = f"http://readthedocs.ngrok.io/api/v2/build/{build_id}/healthcheck/" - cmd = f"/bin/bash -c 'while true; do curl --max-time 2 -H \"Authorization: Token {self.build_api_key}\" -X POST {url}; sleep {settings.RTD_BUILD_HEALTHCHECK_DELAY}; done;'" - log.debug("Healthcheck.", command=cmd) - exec_cmd = client.exec_create( - container=self.container_id, - cmd=cmd, - user=settings.RTD_DOCKER_USER, - stdout=True, - stderr=True, - ) - # `detach=True` allows us to run this command in the background - client.exec_start(exec_id=exec_cmd["Id"], stream=False, detach=True) - # output = client.exec_start(exec_id=exec_cmd["Id"], stream=False) - # log.warning("Healthcheck.", command=cmd, output=output) except (DockerAPIError, ConnectionError) as exc: raise BuildAppError( BuildAppError.GENERIC_WITH_BUILD_ID, exception_messag=exc.explanation ) from exc + + def _run_background_healthcheck(self): + log.info("Running build with healthcheck.") + + # Run a healthcheck that will ping our API constantly. + # If we detect the API is not pinged for 10s we mark this build as failed. + build_id = self.build.get("id") + + healthcheck_url = reverse("build-healthcheck", kwargs={"pk": build_id}) + if settings.RTD_DOCKER_COMPOSE and "ngrok" in settings.PRODUCTION_DOMAIN: + # NOTE: we do require using NGROK here to go over internet because I + # didn't find a way to access the `web` container from inside the + # container the `build` container created for this particular build + # (there are 3 containers involved locally here) + # + # This shouldn't happen in production, because we are not doing Docker in Docker. + url = f"http://readthedocs.ngrok.io{healthcheck_url}" + else: + url = f"{settings.SLUMBER_API_HOST}{healthcheck_url}" + + cmd = f"/bin/bash -c 'while true; do curl --max-time 2 -H \"Authorization: Token {self.build_api_key}\" -X POST {url}; sleep {settings.RTD_BUILD_HEALTHCHECK_DELAY}; done;'" + log.info("Healthcheck command to run.", command=cmd) + + client = self.get_client() + exec_cmd = client.exec_create( + container=self.container_id, + cmd=cmd, + user=settings.RTD_DOCKER_USER, + stdout=True, + stderr=True, + ) + # `detach=True` allows us to run this command in the background + client.exec_start(exec_id=exec_cmd["Id"], stream=False, detach=True) diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py index d6f982b5753..393cfefc136 100644 --- a/readthedocs/projects/models.py +++ b/readthedocs/projects/models.py @@ -1976,6 +1976,7 @@ def add_features(sender, **kwargs): # Build related features SCALE_IN_PROTECTION = "scale_in_prtection" USE_S3_SCOPED_CREDENTIALS_ON_BUILDERS = "use_s3_scoped_credentials_on_builders" + BUILD_HEALTHCHECK = "build_healthcheck" FEATURES = ( ( @@ -2050,6 +2051,10 @@ def add_features(sender, **kwargs): USE_S3_SCOPED_CREDENTIALS_ON_BUILDERS, _("Build: Use S3 scoped credentials for uploading build artifacts."), ), + ( + BUILD_HEALTHCHECK, + _("Build: Use background cURL healthcheck."), + ), ) FEATURES = sorted(FEATURES, key=lambda x: x[1]) From b77e7cfe367c19dbc1752551fc2f83e25d63328a Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Thu, 17 Jul 2025 19:10:56 +0200 Subject: [PATCH 06/14] Use string for the signal See https://github.com/jrobichaud/django-structlog/issues/870 --- readthedocs/projects/tasks/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/readthedocs/projects/tasks/utils.py b/readthedocs/projects/tasks/utils.py index c1e76040a1b..6b8595bc0a0 100644 --- a/readthedocs/projects/tasks/utils.py +++ b/readthedocs/projects/tasks/utils.py @@ -1,7 +1,6 @@ import datetime import os import re -import signal import boto3 import structlog @@ -120,7 +119,7 @@ def finish_inactive_builds(): build.save() # Tell Celery to cancel this task in case it's in a zombie state. - app.control.revoke(build.task_id, signal=signal.SIGINT, terminate=True) + app.control.revoke(build.task_id, signal="SIGINT", terminate=True) Notification.objects.add( message_id=BuildAppError.BUILD_TERMINATED_DUE_INACTIVITY, From 77ef6c475a01631a5884020db4971acd46a486bf Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Mon, 21 Jul 2025 18:26:25 +0200 Subject: [PATCH 07/14] Keep new healthcheck and old behavior working together We differentiate them via a feature flag. --- readthedocs/projects/tasks/utils.py | 67 ++++++++++++++++++++++++++++- readthedocs/settings/base.py | 11 ++++- 2 files changed, 74 insertions(+), 4 deletions(-) diff --git a/readthedocs/projects/tasks/utils.py b/readthedocs/projects/tasks/utils.py index 6b8595bc0a0..ea1261092f0 100644 --- a/readthedocs/projects/tasks/utils.py +++ b/readthedocs/projects/tasks/utils.py @@ -17,6 +17,7 @@ from readthedocs.core.utils.filesystem import safe_rmtree from readthedocs.doc_builder.exceptions import BuildAppError from readthedocs.notifications.models import Notification +from readthedocs.projects.models import Feature from readthedocs.storage import build_media_storage from readthedocs.worker import app @@ -96,7 +97,7 @@ def clean_project_resources(project, version=None, version_slug=None): @app.task() -def finish_inactive_builds(): +def finish_unhealthy_builds(): """ Finish inactive builds. @@ -108,7 +109,11 @@ def finish_inactive_builds(): """ log.debug("Running task to finish inactive builds (no healtcheck received).") delta = datetime.timedelta(seconds=settings.RTD_BUILD_HEALTHCHECK_TIMEOUT) - query = ~Q(state__in=BUILD_FINAL_STATES) & Q(healthcheck__lt=timezone.now() - delta) + query = ( + ~Q(state__in=BUILD_FINAL_STATES) + & Q(healthcheck__lt=timezone.now() - delta) + & Q(project__feature__feature_id=Feature.BUILD_HEALTHCHECK) + ) projects_finished = set() builds_finished = [] @@ -138,6 +143,64 @@ def finish_inactive_builds(): ) +@app.task() +def finish_inactive_builds(): + """ + Finish inactive builds. + + A build is consider inactive if it's not in a final state and it has been + "running" for more time that the allowed one (``Project.container_time_limit`` + or ``DOCKER_LIMITS['time']`` plus a 20% of it). + + These inactive builds will be marked as ``success`` and ``CANCELLED`` with an + ``error`` to be communicated to the user. + """ + # TODO similar to the celery task time limit, we can't infer this from + # Docker settings anymore, because Docker settings are determined on the + # build servers dynamically. + # time_limit = int(DOCKER_LIMITS['time'] * 1.2) + # Set time as maximum celery task time limit + 5m + time_limit = 7200 + 300 + delta = datetime.timedelta(seconds=time_limit) + query = ( + ~Q(state__in=BUILD_FINAL_STATES) + & Q(date__lt=timezone.now() - delta) + & Q(date__gt=timezone.now() - datetime.timedelta(days=1)) + ) + + projects_finished = set() + builds_finished = [] + builds = Build.objects.filter(query)[:50] + for build in builds: + if build.project.container_time_limit: + custom_delta = datetime.timedelta( + seconds=int(build.project.container_time_limit), + ) + if build.date + custom_delta > timezone.now(): + # Do not mark as CANCELLED builds with a custom time limit that wasn't + # expired yet (they are still building the project version) + continue + + build.success = False + build.state = BUILD_STATE_CANCELLED + build.save() + + Notification.objects.add( + message_id=BuildAppError.BUILD_TERMINATED_DUE_INACTIVITY, + attached_to=build, + ) + + builds_finished.append(build.pk) + projects_finished.add(build.project.slug) + + log.info( + 'Builds marked as "Terminated due inactivity".', + count=len(builds_finished), + project_slugs=projects_finished, + build_pks=builds_finished, + ) + + def send_external_build_status(version_type, build_pk, commit, status): """ Check if build is external and Send Build Status for project external versions. diff --git a/readthedocs/settings/base.py b/readthedocs/settings/base.py index ef5b2b4b441..baecb83448a 100644 --- a/readthedocs/settings/base.py +++ b/readthedocs/settings/base.py @@ -512,11 +512,18 @@ def TEMPLATES(self): CELERY_DEFAULT_QUEUE = "celery" CELERYBEAT_SCHEDULER = "django_celery_beat.schedulers:DatabaseScheduler" CELERYBEAT_SCHEDULE = { - "every-minute-finish-inactive-builds": { - "task": "readthedocs.projects.tasks.utils.finish_inactive_builds", + "every-minute-finish-unhealthy-builds": { + "task": "readthedocs.projects.tasks.utils.finish_unhealthy_builds", "schedule": crontab(minute="*"), "options": {"queue": "web"}, }, + # TODO: delete `quarter-finish-inactive-builds` once we are fully + # migrated into build healthcheck + "quarter-finish-inactive-builds": { + "task": "readthedocs.projects.tasks.utils.finish_inactive_builds", + "schedule": crontab(minute="*/15"), + "options": {"queue": "web"}, + }, "every-day-delete-old-search-queries": { "task": "readthedocs.search.tasks.delete_old_search_queries_from_db", "schedule": crontab(minute=0, hour=0), From 181cf8cf349540fd65a0ef8fb71627cde10263c2 Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Mon, 21 Jul 2025 18:27:29 +0200 Subject: [PATCH 08/14] Skip projects with the feature flag in the old check --- readthedocs/projects/tasks/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/readthedocs/projects/tasks/utils.py b/readthedocs/projects/tasks/utils.py index ea1261092f0..9a1bdb3ddea 100644 --- a/readthedocs/projects/tasks/utils.py +++ b/readthedocs/projects/tasks/utils.py @@ -166,6 +166,7 @@ def finish_inactive_builds(): ~Q(state__in=BUILD_FINAL_STATES) & Q(date__lt=timezone.now() - delta) & Q(date__gt=timezone.now() - datetime.timedelta(days=1)) + & ~Q(project__feature__feature_id=Feature.BUILD_HEALTHCHECK) ) projects_finished = set() From 35322d1f4b2edcfd4754f9780e6bb89c452158f1 Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Mon, 21 Jul 2025 18:31:53 +0200 Subject: [PATCH 09/14] Improve comments --- readthedocs/doc_builder/environments.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/readthedocs/doc_builder/environments.py b/readthedocs/doc_builder/environments.py index b1dc83d7099..d9dd842d456 100644 --- a/readthedocs/doc_builder/environments.py +++ b/readthedocs/doc_builder/environments.py @@ -845,7 +845,8 @@ def _run_background_healthcheck(self): log.info("Running build with healthcheck.") # Run a healthcheck that will ping our API constantly. - # If we detect the API is not pinged for 10s we mark this build as failed. + # We run a Celery task to check the running build has a valid healthcheck timestamp, + # if that's not the case, these builds are terminated and marked as failed. build_id = self.build.get("id") healthcheck_url = reverse("build-healthcheck", kwargs={"pk": build_id}) @@ -853,7 +854,7 @@ def _run_background_healthcheck(self): # NOTE: we do require using NGROK here to go over internet because I # didn't find a way to access the `web` container from inside the # container the `build` container created for this particular build - # (there are 3 containers involved locally here) + # (there are 3 containers involved locally here: web, build, and user's build) # # This shouldn't happen in production, because we are not doing Docker in Docker. url = f"http://readthedocs.ngrok.io{healthcheck_url}" From a38475ee33b854fc41329e7437bcb958b9ced82c Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Mon, 21 Jul 2025 18:31:59 +0200 Subject: [PATCH 10/14] Use debug loglevel --- readthedocs/doc_builder/environments.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/readthedocs/doc_builder/environments.py b/readthedocs/doc_builder/environments.py index d9dd842d456..81be324debd 100644 --- a/readthedocs/doc_builder/environments.py +++ b/readthedocs/doc_builder/environments.py @@ -842,7 +842,7 @@ def create_container(self): ) from exc def _run_background_healthcheck(self): - log.info("Running build with healthcheck.") + log.debug("Running build with healthcheck.") # Run a healthcheck that will ping our API constantly. # We run a Celery task to check the running build has a valid healthcheck timestamp, @@ -862,7 +862,7 @@ def _run_background_healthcheck(self): url = f"{settings.SLUMBER_API_HOST}{healthcheck_url}" cmd = f"/bin/bash -c 'while true; do curl --max-time 2 -H \"Authorization: Token {self.build_api_key}\" -X POST {url}; sleep {settings.RTD_BUILD_HEALTHCHECK_DELAY}; done;'" - log.info("Healthcheck command to run.", command=cmd) + log.debug("Healthcheck command to run.", command=cmd) client = self.get_client() exec_cmd = client.exec_create( From 7a2fcb94ca9bf3e8d2dce9e6068f29875cf1a2e5 Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Mon, 21 Jul 2025 18:41:27 +0200 Subject: [PATCH 11/14] Style --- readthedocs/doc_builder/environments.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/readthedocs/doc_builder/environments.py b/readthedocs/doc_builder/environments.py index 81be324debd..27caab2c0b8 100644 --- a/readthedocs/doc_builder/environments.py +++ b/readthedocs/doc_builder/environments.py @@ -11,13 +11,13 @@ from django.conf import settings from django.urls import reverse from django.utils.translation import gettext_lazy as _ -from requests.exceptions import ConnectionError -from requests.exceptions import ReadTimeout - from docker import APIClient from docker.errors import APIError as DockerAPIError from docker.errors import DockerException from docker.errors import NotFound as DockerNotFoundError +from requests.exceptions import ConnectionError +from requests.exceptions import ReadTimeout + from readthedocs.builds.models import BuildCommandResultMixin from readthedocs.core.utils import slugify from readthedocs.projects.models import Feature From d2f656d8ff1643959e02fbb25cefb859123abb6b Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Tue, 22 Jul 2025 09:41:27 +0200 Subject: [PATCH 12/14] Add docstring --- readthedocs/doc_builder/environments.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/readthedocs/doc_builder/environments.py b/readthedocs/doc_builder/environments.py index 27caab2c0b8..794829b11b8 100644 --- a/readthedocs/doc_builder/environments.py +++ b/readthedocs/doc_builder/environments.py @@ -842,13 +842,16 @@ def create_container(self): ) from exc def _run_background_healthcheck(self): + """ + Run a cURL command in the background to ping the healthcheck API. + + The API saves the last ping timestamp on each call. Then a periodic Celery task + checks this value for all the running builds and decide if the build is stalled or not. + If it's stalled, it terminates those builds and mark them as fail. + """ log.debug("Running build with healthcheck.") - # Run a healthcheck that will ping our API constantly. - # We run a Celery task to check the running build has a valid healthcheck timestamp, - # if that's not the case, these builds are terminated and marked as failed. build_id = self.build.get("id") - healthcheck_url = reverse("build-healthcheck", kwargs={"pk": build_id}) if settings.RTD_DOCKER_COMPOSE and "ngrok" in settings.PRODUCTION_DOMAIN: # NOTE: we do require using NGROK here to go over internet because I From 9b1eb5b416cabe8d77764519a2548ead842ff322 Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Tue, 22 Jul 2025 09:45:12 +0200 Subject: [PATCH 13/14] Update docstring --- readthedocs/projects/tasks/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readthedocs/projects/tasks/utils.py b/readthedocs/projects/tasks/utils.py index 9a1bdb3ddea..9a1fa8a0184 100644 --- a/readthedocs/projects/tasks/utils.py +++ b/readthedocs/projects/tasks/utils.py @@ -102,7 +102,7 @@ def finish_unhealthy_builds(): Finish inactive builds. A build is consider inactive if the last healthcheck reported was more than - 30 seconds ago. + RTD_BUILD_HEALTHCHECK_TIMEOUT seconds ago. These inactive builds will be marked as ``success=False`` and ``state=CANCELLED`` with an ``error`` to be communicated to the user. From e6e1022c30c7638f1bc12ce0db2c3c92bc3a6975 Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Tue, 22 Jul 2025 09:49:36 +0200 Subject: [PATCH 14/14] Increase healthcheck delay --- readthedocs/settings/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readthedocs/settings/base.py b/readthedocs/settings/base.py index baecb83448a..fed4f3efb10 100644 --- a/readthedocs/settings/base.py +++ b/readthedocs/settings/base.py @@ -145,7 +145,7 @@ def SHOW_DEBUG_TOOLBAR(self): RTD_STABLE_VERBOSE_NAME = "stable" RTD_CLEAN_AFTER_BUILD = False RTD_BUILD_HEALTHCHECK_TIMEOUT = 60 # seconds - RTD_BUILD_HEALTHCHECK_DELAY = 5 # seconds + RTD_BUILD_HEALTHCHECK_DELAY = 15 # seconds RTD_MAX_CONCURRENT_BUILDS = 4 RTD_BUILDS_MAX_RETRIES = 25 RTD_BUILDS_RETRY_DELAY = 5 * 60 # seconds