Skip to content

Builds: run a healthcheck command in the background #12332

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Jul 22, 2025
Merged
26 changes: 26 additions & 0 deletions readthedocs/api/v2/views/model_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from django.db.models import When
from django.http import Http404
from django.template.loader import render_to_string
from django.utils import timezone
from rest_framework import decorators
from rest_framework import status
from rest_framework import viewsets
Expand Down Expand Up @@ -297,6 +298,31 @@ def concurrent(self, request, **kwargs):
}
return Response(data)

@decorators.action(
detail=True,
permission_classes=[HasBuildAPIKey],
methods=["post"],
)
def healthcheck(self, request, **kwargs):
build = self.get_object()
log.debug(
"Healthcheck received.",
build_id=build.pk,
project_slug=build.version.project.slug,
)
build_api_key = request.build_api_key
if build.version.project.slug != build_api_key.project.slug:
log.warning(
"Project slug doesn't match the one attached to the API key.",
api_key_id=build_api_key.id,
project_slug=build.version.project.slug,
)
raise Http404()

build.healthcheck = timezone.now()
build.save()
return Response(status=status.HTTP_204_NO_CONTENT)

def retrieve(self, *args, **kwargs):
"""
Retrieves command data from storage.
Expand Down
21 changes: 21 additions & 0 deletions readthedocs/builds/migrations/0064_healthcheck.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Generated by Django 5.2.4 on 2025-07-17 11:39

from django.db import migrations
from django.db import models
from django_safemigrate import Safe


class Migration(migrations.Migration):
safe = Safe.before_deploy()

dependencies = [
("builds", "0063_alter_buildcommandresult"),
]

operations = [
migrations.AddField(
model_name="build",
name="healthcheck",
field=models.DateTimeField(blank=True, null=True, verbose_name="Healthcheck"),
),
]
1 change: 1 addition & 0 deletions readthedocs/builds/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -644,6 +644,7 @@ class Build(models.Model):
blank=True,
)
date = models.DateTimeField(_("Date"), auto_now_add=True, db_index=True)
healthcheck = models.DateTimeField(_("Healthcheck"), null=True, blank=True)
success = models.BooleanField(_("Success"), default=True)

# TODO: remove these fields (setup, setup_error, output, error, exit_code)
Expand Down
2 changes: 2 additions & 0 deletions readthedocs/doc_builder/director.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ def create_vcs_environment(self):
environment=self.get_vcs_env_vars(),
container_image=settings.RTD_DOCKER_CLONE_IMAGE,
api_client=self.data.api_client,
build_api_key=self.data.build_api_key,
)

def create_build_environment(self):
Expand All @@ -130,6 +131,7 @@ def create_build_environment(self):
build=self.data.build,
environment=self.get_build_env_vars(),
api_client=self.data.api_client,
build_api_key=self.data.build_api_key,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How was this getting passed into the build before? Seems like we might want to keep using it the same way.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This value is passed as a task argument at https://github.com/readthedocs/readthedocs.org/blob/main/readthedocs/projects/tasks/builds.py#L1027 and then it's used at https://github.com/readthedocs/readthedocs.org/blob/main/readthedocs/projects/tasks/builds.py#L384

Since it's not saved anywhere, we are not able to access it from some inner methods/functions. That's why I had to save this value inside the self.data object we use to share all across the build process we control.

In summary, this value was not used anywhere else than where it was received and that's why we weren't saving it inside the self.data object.

)

def setup_environment(self):
Expand Down
41 changes: 41 additions & 0 deletions readthedocs/doc_builder/environments.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import structlog
from django.conf import settings
from django.urls import reverse
from django.utils.translation import gettext_lazy as _
from docker import APIClient
from docker.errors import APIError as DockerAPIError
Expand All @@ -19,6 +20,7 @@

from readthedocs.builds.models import BuildCommandResultMixin
from readthedocs.core.utils import slugify
from readthedocs.projects.models import Feature

from .constants import DOCKER_HOSTNAME_MAX_LEN
from .constants import DOCKER_IMAGE
Expand Down Expand Up @@ -573,6 +575,7 @@ class DockerBuildEnvironment(BaseBuildEnvironment):
container_time_limit = DOCKER_LIMITS.get("time")

def __init__(self, *args, **kwargs):
self.build_api_key = kwargs.pop("build_api_key", None)
container_image = kwargs.pop("container_image", None)
super().__init__(*args, **kwargs)
self.client = None
Expand Down Expand Up @@ -829,7 +832,45 @@ def create_container(self):
runtime="runsc", # gVisor runtime
)
client.start(container=self.container_id)

if self.project.has_feature(Feature.BUILD_HEALTHCHECK):
self._run_background_healthcheck()

except (DockerAPIError, ConnectionError) as exc:
raise BuildAppError(
BuildAppError.GENERIC_WITH_BUILD_ID, exception_messag=exc.explanation
) from exc

def _run_background_healthcheck(self):
log.debug("Running build with healthcheck.")

# Run a healthcheck that will ping our API constantly.
# We run a Celery task to check the running build has a valid healthcheck timestamp,
# if that's not the case, these builds are terminated and marked as failed.
build_id = self.build.get("id")

healthcheck_url = reverse("build-healthcheck", kwargs={"pk": build_id})
if settings.RTD_DOCKER_COMPOSE and "ngrok" in settings.PRODUCTION_DOMAIN:
# NOTE: we do require using NGROK here to go over internet because I
# didn't find a way to access the `web` container from inside the
# container the `build` container created for this particular build
# (there are 3 containers involved locally here: web, build, and user's build)
#
# This shouldn't happen in production, because we are not doing Docker in Docker.
url = f"http://readthedocs.ngrok.io{healthcheck_url}"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would really like to find a way to make this work locally without configuration, otherwise we will rarely test that it still works.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should also eventually be the default mode too, so we'll definitely need to find out a better solution. But this doesn't need to be figured out just yet, I feel we could even omit this local development hardcoding from our code for now and return to this later.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Me too, but I wasn't able to find that way. Let me know if you have some ideas on what I may be doing wrong here and I can test them.

else:
url = f"{settings.SLUMBER_API_HOST}{healthcheck_url}"

cmd = f"/bin/bash -c 'while true; do curl --max-time 2 -H \"Authorization: Token {self.build_api_key}\" -X POST {url}; sleep {settings.RTD_BUILD_HEALTHCHECK_DELAY}; done;'"
log.debug("Healthcheck command to run.", command=cmd)

client = self.get_client()
exec_cmd = client.exec_create(
container=self.container_id,
cmd=cmd,
user=settings.RTD_DOCKER_USER,
stdout=True,
stderr=True,
)
# `detach=True` allows us to run this command in the background
client.exec_start(exec_id=exec_cmd["Id"], stream=False, detach=True)
5 changes: 5 additions & 0 deletions readthedocs/projects/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1976,6 +1976,7 @@ def add_features(sender, **kwargs):
# Build related features
SCALE_IN_PROTECTION = "scale_in_prtection"
USE_S3_SCOPED_CREDENTIALS_ON_BUILDERS = "use_s3_scoped_credentials_on_builders"
BUILD_HEALTHCHECK = "build_healthcheck"

FEATURES = (
(
Expand Down Expand Up @@ -2050,6 +2051,10 @@ def add_features(sender, **kwargs):
USE_S3_SCOPED_CREDENTIALS_ON_BUILDERS,
_("Build: Use S3 scoped credentials for uploading build artifacts."),
),
(
BUILD_HEALTHCHECK,
_("Build: Use background cURL healthcheck."),
),
)

FEATURES = sorted(FEATURES, key=lambda x: x[1])
Expand Down
4 changes: 3 additions & 1 deletion readthedocs/projects/tasks/builds.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ class TaskData:

# Slumber client to interact with the API v2.
api_client: API = None
build_api_key: str = None

start_time: timezone.datetime = None
environment_class: type[DockerBuildEnvironment] | type[LocalBuildEnvironment] = None
Expand Down Expand Up @@ -381,7 +382,8 @@ def before_start(self, task_id, args, kwargs):
# anymore and we are not using it
self.data.environment_class = LocalBuildEnvironment

self.data.api_client = setup_api(kwargs["build_api_key"])
self.data.build_api_key = kwargs["build_api_key"]
self.data.api_client = setup_api(self.data.build_api_key)

self.data.build = self.get_build(self.data.build_pk)
self.data.version = self.get_version(self.data.version_pk)
Expand Down
49 changes: 49 additions & 0 deletions readthedocs/projects/tasks/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from readthedocs.core.utils.filesystem import safe_rmtree
from readthedocs.doc_builder.exceptions import BuildAppError
from readthedocs.notifications.models import Notification
from readthedocs.projects.models import Feature
from readthedocs.storage import build_media_storage
from readthedocs.worker import app

Expand Down Expand Up @@ -95,6 +96,53 @@ def clean_project_resources(project, version=None, version_slug=None):
project.imported_files.all().delete()


@app.task()
def finish_unhealthy_builds():
"""
Finish inactive builds.

A build is consider inactive if the last healthcheck reported was more than
30 seconds ago.

These inactive builds will be marked as ``success=False`` and
``state=CANCELLED`` with an ``error`` to be communicated to the user.
"""
log.debug("Running task to finish inactive builds (no healtcheck received).")
delta = datetime.timedelta(seconds=settings.RTD_BUILD_HEALTHCHECK_TIMEOUT)
query = (
~Q(state__in=BUILD_FINAL_STATES)
& Q(healthcheck__lt=timezone.now() - delta)
& Q(project__feature__feature_id=Feature.BUILD_HEALTHCHECK)
)

projects_finished = set()
builds_finished = []
builds = Build.objects.filter(query)[:50]
for build in builds:
build.success = False
build.state = BUILD_STATE_CANCELLED
build.save()

# Tell Celery to cancel this task in case it's in a zombie state.
app.control.revoke(build.task_id, signal="SIGINT", terminate=True)

Notification.objects.add(
message_id=BuildAppError.BUILD_TERMINATED_DUE_INACTIVITY,
attached_to=build,
)

builds_finished.append(build.pk)
projects_finished.add(build.project.slug)

if builds_finished:
log.info(
'Builds marked as "Terminated due inactivity" (not healthcheck received).',
count=len(builds_finished),
project_slugs=projects_finished,
build_pks=builds_finished,
)


@app.task()
def finish_inactive_builds():
"""
Expand All @@ -118,6 +166,7 @@ def finish_inactive_builds():
~Q(state__in=BUILD_FINAL_STATES)
& Q(date__lt=timezone.now() - delta)
& Q(date__gt=timezone.now() - datetime.timedelta(days=1))
& ~Q(project__feature__feature_id=Feature.BUILD_HEALTHCHECK)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we think this is part of what's causing our build retrying issue? Otherwise, I think we still want this to apply.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did not see signs that this bug was caused by the task time limit. The task for build healthcheck polling should update the build state if the check fails, so this task shouldn't run on these builds anyways.

If there is a problem with that logic, we should probably use this task as a backup method to ending those builds gracefully.

Copy link
Member Author

@humitos humitos Jul 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, this task should be unrelated to the retrying issue we are suffering.

Here, I'm just removing the builds that using the feature flag to be skipped in the old task since they are managed by the new finish_unhealthy_builds now.

My idea is to delete this old task once we are fully on the healthcheck polling.

)

projects_finished = set()
Expand Down
9 changes: 9 additions & 0 deletions readthedocs/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,8 @@ def SHOW_DEBUG_TOOLBAR(self):
RTD_STABLE = "stable"
RTD_STABLE_VERBOSE_NAME = "stable"
RTD_CLEAN_AFTER_BUILD = False
RTD_BUILD_HEALTHCHECK_TIMEOUT = 60 # seconds
RTD_BUILD_HEALTHCHECK_DELAY = 5 # seconds
RTD_MAX_CONCURRENT_BUILDS = 4
RTD_BUILDS_MAX_RETRIES = 25
RTD_BUILDS_RETRY_DELAY = 5 * 60 # seconds
Expand Down Expand Up @@ -510,6 +512,13 @@ def TEMPLATES(self):
CELERY_DEFAULT_QUEUE = "celery"
CELERYBEAT_SCHEDULER = "django_celery_beat.schedulers:DatabaseScheduler"
CELERYBEAT_SCHEDULE = {
"every-minute-finish-unhealthy-builds": {
"task": "readthedocs.projects.tasks.utils.finish_unhealthy_builds",
"schedule": crontab(minute="*"),
"options": {"queue": "web"},
},
# TODO: delete `quarter-finish-inactive-builds` once we are fully
# migrated into build healthcheck
"quarter-finish-inactive-builds": {
"task": "readthedocs.projects.tasks.utils.finish_inactive_builds",
"schedule": crontab(minute="*/15"),
Expand Down