From 3f2bd1f010507ffee9f4750d0ba05b31bea8e8e7 Mon Sep 17 00:00:00 2001 From: Benjamin Bach Date: Wed, 14 Jun 2023 16:46:23 +0200 Subject: [PATCH 01/21] Adds a feature flag and some initial ability to handle multiple PDFs --- readthedocs/builds/constants.py | 13 ++++++++++ readthedocs/builds/storage.py | 4 +-- readthedocs/doc_builder/backends/sphinx.py | 3 ++- readthedocs/projects/models.py | 2 ++ readthedocs/projects/tasks/builds.py | 25 +++++++++++++++---- .../rtd_tests/tests/test_build_storage.py | 19 ++++++++++++++ readthedocs/storage/rclone.py | 19 ++++++++++++-- readthedocs/telemetry/collectors.py | 4 +-- 8 files changed, 77 insertions(+), 12 deletions(-) diff --git a/readthedocs/builds/constants.py b/readthedocs/builds/constants.py index df1bd6bf7f3..fee73c5c2c8 100644 --- a/readthedocs/builds/constants.py +++ b/readthedocs/builds/constants.py @@ -177,3 +177,16 @@ "epub", "pdf", ) + +# Should replace ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT +# Currently hidden behind a feature flag +ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT_NO_PDF = ( + "htmlzip", + "epub", +) + +# For certain media types, we only want to copy out specific file extensions. +# This is treated case-insensitive. 
+ARTIFACTS_WITH_RESTRICTED_EXTENSIONS = { + "pdf": ["pdf"], +} diff --git a/readthedocs/builds/storage.py b/readthedocs/builds/storage.py index da57c40d244..a8dbe6f389b 100644 --- a/readthedocs/builds/storage.py +++ b/readthedocs/builds/storage.py @@ -132,13 +132,13 @@ def _check_suspicious_path(self, path): def _rclone(self): raise NotImplementedError - def rclone_sync_directory(self, source, destination): + def rclone_sync_directory(self, source, destination, **sync_kwargs): """Sync a directory recursively to storage using rclone sync.""" if destination in ("", "/"): raise SuspiciousFileOperation("Syncing all storage cannot be right") self._check_suspicious_path(source) - return self._rclone.sync(source, destination) + return self._rclone.sync(source, destination, **sync_kwargs) def join(self, directory, filepath): return safe_join(directory, filepath) diff --git a/readthedocs/doc_builder/backends/sphinx.py b/readthedocs/doc_builder/backends/sphinx.py index afa1c253421..26173f119f2 100644 --- a/readthedocs/doc_builder/backends/sphinx.py +++ b/readthedocs/doc_builder/backends/sphinx.py @@ -504,6 +504,7 @@ class PdfBuilder(BaseSphinx): pdf_file_name = None def build(self): + """Runs Sphinx to convert to LaTeX, uses latexmk to build PDFs.""" self.run( *self.get_sphinx_cmd(), "-T", @@ -516,7 +517,7 @@ def build(self): f"language={self.project.language}", # Sphinx's source directory (SOURCEDIR). # We are executing this command at the location of the `conf.py` file (CWD). 
- # TODO: ideally we should execute it from where the repository was clonned, + # TODO: ideally we should execute it from where the repository was cloned, # but that could lead unexpected behavior to some users: # https://github.com/readthedocs/readthedocs.org/pull/9888#issuecomment-1384649346 ".", diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py index 377b1e24034..f1d0fc9a0eb 100644 --- a/readthedocs/projects/models.py +++ b/readthedocs/projects/models.py @@ -1941,6 +1941,7 @@ def add_features(sender, **kwargs): DONT_CREATE_INDEX = "dont_create_index" HOSTING_INTEGRATIONS = "hosting_integrations" NO_CONFIG_FILE_DEPRECATED = "no_config_file" + ENABLE_MULTIPLE_PDFS = "mutliple_pdfs" FEATURES = ( ( @@ -2083,6 +2084,7 @@ def add_features(sender, **kwargs): NO_CONFIG_FILE_DEPRECATED, _("Build: Building without a configuration file is deprecated."), ), + (ENABLE_MULTIPLE_PDFS, _("Build: Enable multiple PDF support during builds.")), ) FEATURES = sorted(FEATURES, key=lambda l: l[1]) diff --git a/readthedocs/projects/tasks/builds.py b/readthedocs/projects/tasks/builds.py index d8328aeaa84..fb5b1b72176 100644 --- a/readthedocs/projects/tasks/builds.py +++ b/readthedocs/projects/tasks/builds.py @@ -21,6 +21,8 @@ from readthedocs.builds.constants import ( ARTIFACT_TYPES, ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT, + ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT_NO_PDF, + ARTIFACTS_WITH_RESTRICTED_EXTENSIONS, BUILD_FINAL_STATES, BUILD_STATE_BUILDING, BUILD_STATE_CLONING, @@ -63,7 +65,7 @@ RepositoryError, SyncRepositoryLocked, ) -from ..models import APIProject, WebHookEvent +from ..models import APIProject, Feature, WebHookEvent from ..signals import before_vcs from .mixins import SyncRepositoryMixin from .search import fileify @@ -539,11 +541,11 @@ def get_valid_artifact_types(self): It performs the following checks on each output format type path: - it exists - it is a directory - - does not contains more than 1 files (only PDF, HTMLZip, 
ePUB) + - does not contains more than 1 files (HTMLZip, ePUB) TODO: remove the limitation of only 1 file. Add support for multiple PDF files in the output directory and - grab them by using glob syntaxt between other files that could be garbage. + grab them by using glob syntax between other files that could be garbage. """ valid_artifacts = [] for artifact_type in ARTIFACT_TYPES: @@ -570,7 +572,13 @@ def get_valid_artifact_types(self): # Check if there are multiple files on artifact directories. # These output format does not support multiple files yet. # In case multiple files are found, the upload for this format is not performed. - if artifact_type in ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT: + if self.data.project.has_feature(Feature.ENABLE_MULTIPLE_PDFS): + single_file_artifacts = ( + ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT_NO_PDF + ) + else: + single_file_artifacts = ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT + if artifact_type in single_file_artifacts: artifact_format_files = len(os.listdir(artifact_directory)) if artifact_format_files > 1: log.error( @@ -876,7 +884,14 @@ def store_build_artifacts(self): version_type=self.data.version.type, ) try: - build_media_storage.rclone_sync_directory(from_path, to_path) + rclone_kwargs = {} + if media_type in ARTIFACTS_WITH_RESTRICTED_EXTENSIONS: + rclone_kwargs[ + "filter_extensions" + ] = ARTIFACTS_WITH_RESTRICTED_EXTENSIONS[media_type] + build_media_storage.rclone_sync_directory( + from_path, to_path, **rclone_kwargs + ) except Exception as exc: # NOTE: the exceptions reported so far are: # - botocore.exceptions:HTTPClientError diff --git a/readthedocs/rtd_tests/tests/test_build_storage.py b/readthedocs/rtd_tests/tests/test_build_storage.py index 78e138e7b7e..33cb979d2a6 100644 --- a/readthedocs/rtd_tests/tests/test_build_storage.py +++ b/readthedocs/rtd_tests/tests/test_build_storage.py @@ -121,6 +121,25 @@ def test_copy_directory_source_outside_docroot(self): with 
pytest.raises(SuspiciousFileOperation, match="outside the docroot"): self.storage.copy_directory(tmp_dir, "files") + def test_sync_directory_with_filter(self): + """Test that we can as the rclone_sync_directory method to only include a specific extension.""" + tmp_files_dir = os.path.join(tempfile.mkdtemp(), "files") + # Copy files_dir (with all test files) into tmp_files_dir + shutil.copytree(files_dir, tmp_files_dir, symlinks=True) + storage_dir = "files" + + tree = [ + ("api", ["index.html"]), + "api.fjson", + "conf.py", + "test.html", + ] + with override_settings(DOCROOT=tmp_files_dir): + self.storage.rclone_sync_directory( + tmp_files_dir, storage_dir, filter_extensions=["html"] + ) + self.assertFileTree(storage_dir, [("api", ["index.html"]), "test.html"]) + def test_delete_directory(self): with override_settings(DOCROOT=files_dir): self.storage.copy_directory(files_dir, "files") diff --git a/readthedocs/storage/rclone.py b/readthedocs/storage/rclone.py index ab88f3fe64d..0676ed9229f 100644 --- a/readthedocs/storage/rclone.py +++ b/readthedocs/storage/rclone.py @@ -112,7 +112,7 @@ def execute(self, subcommand, args, options=None): ) return result - def sync(self, source, destination): + def sync(self, source, destination, filter_extensions=None): """ Run the `rclone sync` command. @@ -120,8 +120,23 @@ def sync(self, source, destination): :params source: Local path to the source directory. :params destination: Remote path to the destination directory. 
+ :params filter_extensions: Only copy files in listed file extensions """ - return self.execute("sync", args=[source, self.get_target(destination)]) + options = [] + # See: + # https://rclone.org/filtering/ + if filter_extensions: + options += ["--ignore-case", "--include"] + # Python escape rule: {{ = { + # What we want to have is for instance '*.{pdf}' + filter_pattern = "*.{{{extensions}}}".format( + extensions=",".join(filter_extensions) + ) + options.append(filter_pattern) + + return self.execute( + "sync", args=[source, self.get_target(destination)], options=options + ) class RCloneLocal(BaseRClone): diff --git a/readthedocs/telemetry/collectors.py b/readthedocs/telemetry/collectors.py index af1202d0cdd..ffd21af044d 100644 --- a/readthedocs/telemetry/collectors.py +++ b/readthedocs/telemetry/collectors.py @@ -17,7 +17,7 @@ class BuildDataCollector: """ Build data collector. - Collect data from a runnig build. + Collect data from a running build. """ def __init__(self, environment): @@ -58,7 +58,7 @@ def run(self, *args, **kwargs): def collect(self): """ - Collect all relevant data from the runnig build. + Collect all relevant data from the running build. Data that can be extracted from the database (project/organization) isn't collected here. 
From f7c3b58e1e78bb42d1260e24da6529a5363e14df Mon Sep 17 00:00:00 2001 From: Benjamin Bach Date: Tue, 20 Jun 2023 19:58:05 +0200 Subject: [PATCH 02/21] WIP: Only copy top-level files --- readthedocs/rtd_tests/tests/test_build_storage.py | 3 ++- readthedocs/storage/rclone.py | 13 +++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/readthedocs/rtd_tests/tests/test_build_storage.py b/readthedocs/rtd_tests/tests/test_build_storage.py index 33cb979d2a6..89ff4f02a61 100644 --- a/readthedocs/rtd_tests/tests/test_build_storage.py +++ b/readthedocs/rtd_tests/tests/test_build_storage.py @@ -138,7 +138,8 @@ def test_sync_directory_with_filter(self): self.storage.rclone_sync_directory( tmp_files_dir, storage_dir, filter_extensions=["html"] ) - self.assertFileTree(storage_dir, [("api", ["index.html"]), "test.html"]) + # We only accept TOP-LEVEL files, so only test.html and not api/index.html + self.assertFileTree(storage_dir, [("test.html",)]) def test_delete_directory(self): with override_settings(DOCROOT=files_dir): diff --git a/readthedocs/storage/rclone.py b/readthedocs/storage/rclone.py index 0676ed9229f..43bfc558630 100644 --- a/readthedocs/storage/rclone.py +++ b/readthedocs/storage/rclone.py @@ -116,20 +116,21 @@ def sync(self, source, destination, filter_extensions=None): """ Run the `rclone sync` command. - See https://rclone.org/commands/rclone_sync/. + See: + https://rclone.org/commands/rclone_sync/ + https://rclone.org/filtering/ :params source: Local path to the source directory. :params destination: Remote path to the destination directory. - :params filter_extensions: Only copy files in listed file extensions + :params filter_extensions: Only top-level copy files with the listed file extensions. 
""" options = [] - # See: - # https://rclone.org/filtering/ + if filter_extensions: options += ["--ignore-case", "--include"] # Python escape rule: {{ = { - # What we want to have is for instance '*.{pdf}' - filter_pattern = "*.{{{extensions}}}".format( + # What we want to have is for instance '/*.{pdf}' + filter_pattern = "/*.{{{extensions}}}".format( extensions=",".join(filter_extensions) ) options.append(filter_pattern) From 8efd48087cf3b7798985a24a679120697af0a98a Mon Sep 17 00:00:00 2001 From: Benjamin Bach Date: Tue, 20 Jun 2023 20:28:56 +0200 Subject: [PATCH 03/21] Fix the test --- readthedocs/rtd_tests/tests/test_build_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readthedocs/rtd_tests/tests/test_build_storage.py b/readthedocs/rtd_tests/tests/test_build_storage.py index 89ff4f02a61..f0809032214 100644 --- a/readthedocs/rtd_tests/tests/test_build_storage.py +++ b/readthedocs/rtd_tests/tests/test_build_storage.py @@ -139,7 +139,7 @@ def test_sync_directory_with_filter(self): tmp_files_dir, storage_dir, filter_extensions=["html"] ) # We only accept TOP-LEVEL files, so only test.html and not api/index.html - self.assertFileTree(storage_dir, [("test.html",)]) + self.assertFileTree(storage_dir, ("test.html",)) def test_delete_directory(self): with override_settings(DOCROOT=files_dir): From abea0cbf38d9e27b7a77ff9e7952ce3cda5f59b0 Mon Sep 17 00:00:00 2001 From: Benjamin Bach Date: Wed, 21 Jun 2023 16:11:41 +0200 Subject: [PATCH 04/21] WIP --- readthedocs/builds/constants.py | 11 +++++++++++ readthedocs/projects/tasks/builds.py | 4 +++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/readthedocs/builds/constants.py b/readthedocs/builds/constants.py index fee73c5c2c8..498938df994 100644 --- a/readthedocs/builds/constants.py +++ b/readthedocs/builds/constants.py @@ -178,6 +178,7 @@ "pdf", ) +# Part of Feature: ENABLE_MULTIPLE_PDFS # Should replace ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT # Currently hidden behind a 
feature flag ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT_NO_PDF = ( @@ -185,8 +186,18 @@ "epub", ) +# Part of Feature: ENABLE_MULTIPLE_PDFS # For certain media types, we only want to copy out specific file extensions. # This is treated case-insensitive. ARTIFACTS_WITH_RESTRICTED_EXTENSIONS = { "pdf": ["pdf"], + "htmlzip": ["zip"], + "epub": ["epub"], } + +# Part of Feature: ENABLE_MULTIPLE_PDFS +ARTIFACTS_SAVED_AS_IMPORTED_FILE = ( + "htmlzip", + "epub", + "pdf", +) \ No newline at end of file diff --git a/readthedocs/projects/tasks/builds.py b/readthedocs/projects/tasks/builds.py index fb5b1b72176..4b84adf7fbc 100644 --- a/readthedocs/projects/tasks/builds.py +++ b/readthedocs/projects/tasks/builds.py @@ -603,6 +603,8 @@ def get_valid_artifact_types(self): return valid_artifacts + def register_imported_file + def on_success(self, retval, task_id, args, kwargs): valid_artifacts = self.get_valid_artifact_types() @@ -736,7 +738,7 @@ def execute(self): data=self.data, ) - # Clonning + # Cloning self.update_build(state=BUILD_STATE_CLONING) # TODO: remove the ``create_vcs_environment`` hack. 
Ideally, this should be From 032c7fd52af273f07cad9a6623b8a665df87aaff Mon Sep 17 00:00:00 2001 From: Benjamin Bach Date: Wed, 21 Jun 2023 17:27:23 +0200 Subject: [PATCH 05/21] WIP: Task and ImportedFile creation --- readthedocs/builds/constants.py | 5 ++-- readthedocs/projects/managers.py | 2 +- readthedocs/projects/models.py | 9 +++++- readthedocs/projects/querysets.py | 14 +++++++++ readthedocs/projects/tasks/builds.py | 45 ++++++++++++++++++++++++++-- readthedocs/projects/tasks/search.py | 30 +++++++++++++++++++ 6 files changed, 99 insertions(+), 6 deletions(-) diff --git a/readthedocs/builds/constants.py b/readthedocs/builds/constants.py index 498938df994..4826e91e01e 100644 --- a/readthedocs/builds/constants.py +++ b/readthedocs/builds/constants.py @@ -196,8 +196,9 @@ } # Part of Feature: ENABLE_MULTIPLE_PDFS -ARTIFACTS_SAVED_AS_IMPORTED_FILE = ( +# These artifacts are understood as "downloadable" and will be indexed as an ImportedFile +DOWNLOADABLE_ARTIFACTS = { "htmlzip", "epub", "pdf", -) \ No newline at end of file +} diff --git a/readthedocs/projects/managers.py b/readthedocs/projects/managers.py index 995ea3da4f2..b3f33f3fb70 100644 --- a/readthedocs/projects/managers.py +++ b/readthedocs/projects/managers.py @@ -4,4 +4,4 @@ class HTMLFileManager(models.Manager): def get_queryset(self): - return super().get_queryset().filter(name__endswith='.html') + return super().get_queryset().html_files() diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py index f1d0fc9a0eb..7ca5f992aaf 100644 --- a/readthedocs/projects/models.py +++ b/readthedocs/projects/models.py @@ -38,6 +38,7 @@ from readthedocs.projects.querysets import ( ChildRelatedProjectQuerySet, FeatureQuerySet, + ImportedFileQuerySet, ProjectQuerySet, RelatedProjectQuerySet, ) @@ -1482,7 +1483,11 @@ class ImportedFile(models.Model): # max_length is set to 4096 because linux has a maximum path length # of 4096 characters for most filesystems (including EXT4). 
# https://github.com/rtfd/readthedocs.org/issues/5061 - path = models.CharField(_('Path'), max_length=4096) + # The path of the file is relative to Project.artifact_path(type, version) + path = models.CharField( + _("Path"), + max_length=4096, + ) commit = models.CharField(_('Commit'), max_length=255) build = models.IntegerField(_('Build id'), null=True) modified_date = models.DateTimeField(_('Modified date'), auto_now=True) @@ -1498,6 +1503,8 @@ class ImportedFile(models.Model): null=True, ) + objects = ImportedFileQuerySet.as_manager() + def get_absolute_url(self): return resolve( project=self.project, diff --git a/readthedocs/projects/querysets.py b/readthedocs/projects/querysets.py index c64e772a97a..c97914d25fc 100644 --- a/readthedocs/projects/querysets.py +++ b/readthedocs/projects/querysets.py @@ -233,3 +233,17 @@ def for_project(self, project): Q(default_true=True, add_date__gt=project.pub_date) | Q(future_default_true=True, add_date__lte=project.pub_date) ).distinct() + + +class ImportedFileQuerySet(models.QuerySet): + def html_files(self): + return self.filter(name__endswith=".html") + + def pdf_files(self): + return self.filter(name__endswith=".pdf") + + def htmlzip_files(self): + return self.filter(name__endswith=".zip") + + def epub_files(self): + return self.filter(name__endswith=".epub") diff --git a/readthedocs/projects/tasks/builds.py b/readthedocs/projects/tasks/builds.py index 0f00e4dc56f..a14a8ae3e33 100644 --- a/readthedocs/projects/tasks/builds.py +++ b/readthedocs/projects/tasks/builds.py @@ -32,6 +32,7 @@ BUILD_STATE_UPLOADING, BUILD_STATUS_FAILURE, BUILD_STATUS_SUCCESS, + DOWNLOADABLE_ARTIFACTS, EXTERNAL, UNDELETABLE_ARTIFACT_TYPES, ) @@ -68,7 +69,7 @@ from ..models import APIProject, Feature, WebHookEvent from ..signals import before_vcs from .mixins import SyncRepositoryMixin -from .search import fileify +from .search import fileify, sync_downloadable_artifacts from .utils import BuildRequest, clean_build, send_external_build_status log 
= structlog.get_logger(__name__) @@ -606,7 +607,44 @@ def get_valid_artifact_types(self): return valid_artifacts - def register_imported_file + def trigger_sync_downloadable_artifacts(self, valid_artifacts): + """ + Triggers the sync_downloadable_artifacts task with the files that were found. + + :param valid_artifacts: list of artifacts allowed by initial validation + :return: None + """ + # A dictionary containing lists of files found for each artifact type. + # Notice that we don't support nesting in sub-folders, so we only need to + # look in the top level folder + # {"pdf": ["file.pdf"]} + + artifact_types = set(valid_artifacts).intersection(DOWNLOADABLE_ARTIFACTS) + + artifacts_found_for_download = { + artifact_type: [] for artifact_type in artifact_types + } + + for artifact_type in artifact_types: + artifact_directory = self.data.project.artifact_path( + version=self.data.version.slug, + type_=artifact_type, + ) + for filename in os.listdir(artifact_directory): + extensions_allowed = ARTIFACTS_WITH_RESTRICTED_EXTENSIONS[artifact_type] + if not os.path.isfile(filename): + continue + if not any(filename.endswith(f".{ext}") for ext in extensions_allowed): + continue + artifacts_found_for_download[artifact_type].append(filename) + + # Store ImportedFiles for artifacts, using their paths in storage + sync_downloadable_artifacts.delay( + version_pk=self.data.version.pk, + commit=self.data.build["commit"], + build=self.data.build["id"], + artifacts_found_for_download=artifacts_found_for_download, + ) def on_success(self, retval, task_id, args, kwargs): valid_artifacts = self.get_valid_artifact_types() @@ -643,6 +681,9 @@ def on_success(self, retval, task_id, args, kwargs): search_ignore=self.data.config.search.ignore, ) + if self.data.project.has_feature(Feature.ENABLE_MULTIPLE_PDFS): + self.trigger_sync_downloadable_artifacts(valid_artifacts) + if not self.data.project.has_valid_clone: self.set_valid_clone() diff --git a/readthedocs/projects/tasks/search.py 
b/readthedocs/projects/tasks/search.py index dc5dec23890..51284b1686e 100644 --- a/readthedocs/projects/tasks/search.py +++ b/readthedocs/projects/tasks/search.py @@ -1,4 +1,5 @@ import json +import os.path from fnmatch import fnmatch import structlog @@ -68,6 +69,35 @@ def fileify(version_pk, commit, build, search_ranking, search_ignore): log.exception('Failed during ImportedFile syncing') +@app.task(queue="web") +def sync_downloadable_artifacts( + version_pk, commit, build, artifacts_found_for_download +): + """ + Create ImportedFile objects for downloadable files. + + Afterwards, ImportedFile objects are used to generate a list of files that can be downloaded + for each documentation version. + """ + version = Version.objects.get_object_or_log(pk=version_pk) + + # Don't index external version builds for now + if not version or version.type == EXTERNAL: + return + + for artifact_type, artifact_path in artifacts_found_for_download: + name = os.path.basename(artifact_path) + ImportedFile.objects.create( + name=name, + project=version.project, + version=version, + path=artifact_path, + commit=commit, + build=build, + ignore=True, + ) + + def _sync_imported_files(version, build): """ Sync/Update/Delete ImportedFiles objects of this version. 
From b89dd232091f3e30dc6a93a8cc36d4ee954c4a48 Mon Sep 17 00:00:00 2001 From: Benjamin Bach Date: Thu, 22 Jun 2023 18:48:29 +0200 Subject: [PATCH 06/21] WIP: Need to hide new behavior behind Feature flag --- readthedocs/doc_builder/backends/sphinx.py | 108 ++++++++++++--------- readthedocs/projects/managers.py | 4 +- readthedocs/projects/tasks/builds.py | 16 ++- readthedocs/projects/tasks/search.py | 34 ++++--- 4 files changed, 102 insertions(+), 60 deletions(-) diff --git a/readthedocs/doc_builder/backends/sphinx.py b/readthedocs/doc_builder/backends/sphinx.py index 26173f119f2..315bc7690f1 100644 --- a/readthedocs/doc_builder/backends/sphinx.py +++ b/readthedocs/doc_builder/backends/sphinx.py @@ -86,6 +86,12 @@ def __init__(self, *args, **kwargs): ), self.relative_output_dir, ) + # Isolate temporary files in the _readthedocs/ folder + self.absolute_host_tmp_root = os.path.join( + self.project.checkout_path(self.version.slug), + "_readthedocs/tmp", + ) + self.absolute_container_output_dir = os.path.join( "$READTHEDOCS_OUTPUT", self.relative_output_dir ) @@ -389,6 +395,7 @@ class LocalMediaBuilder(BaseSphinx): def _post_build(self): """Internal post build to create the ZIP file from the HTML output.""" + target_file = os.path.join( self.absolute_container_output_dir, # TODO: shouldn't this name include the name of the version as well? 
@@ -449,9 +456,12 @@ def _post_build(self): # NOTE: we currently support only one .epub per version epub_filepath = epub_sphinx_filepaths[0] + # Move out the epub file self.run( "mv", epub_filepath, temp_epub_file, cwd=self.project_path, record=False ) + + # Delete everything recursively self.run( "rm", "--recursive", @@ -459,6 +469,8 @@ def _post_build(self): cwd=self.project_path, record=False, ) + + # Create everything again self.run( "mkdir", "--parents", @@ -466,6 +478,8 @@ def _post_build(self): cwd=self.project_path, record=False, ) + + # Restore the epub file self.run( "mv", temp_epub_file, target_file, cwd=self.project_path, record=False ) @@ -501,7 +515,6 @@ class PdfBuilder(BaseSphinx): relative_output_dir = "pdf" sphinx_builder = "latex" - pdf_file_name = None def build(self): """Runs Sphinx to convert to LaTeX, uses latexmk to build PDFs.""" @@ -586,11 +599,11 @@ def _build_latexmk(self, cwd): # When ``-f`` is used, latexmk will continue building if it # encounters errors. We still receive a failure exit code in this # case, but the correct steps should run. - '-f', - '-dvi-', - '-ps-', - f'-jobname={self.project.slug}', - '-interaction=nonstopmode', + "-f", + "-dvi-", + "-ps-", + f"-jobname={self.project.slug}_%A", + "-interaction=nonstopmode", ] cmd_ret = self.build_env.run_command_class( @@ -600,53 +613,60 @@ def _build_latexmk(self, cwd): cwd=self.absolute_host_output_dir, ) - self.pdf_file_name = f"{self.project.slug}.pdf" + pdf_files = glob(os.path.join(self.absolute_host_output_dir, "*.pdf")) + + # There is only 1 PDF file. We will call it project_slug.pdf + # This is the old behavior. 
+ if len(pdf_files) == 1: + os.rename( + pdf_files[0], + os.path.join(self.absolute_host_output_dir, f"{self.project.slug}.pdf"), + ) return cmd_ret.successful def _post_build(self): - """Internal post build to cleanup PDF output directory and leave only one .pdf file.""" + """Internal post build to cleanup PDF output directory and leave only .pdf files.""" - if not self.pdf_file_name: + pdf_files = glob(os.path.join(self.absolute_host_output_dir, "*.pdf")) + if not pdf_files: raise PDFNotFound() - # TODO: merge this with ePUB since it's pretty much the same - temp_pdf_file = f"/tmp/{self.project.slug}-{self.version.slug}.pdf" - target_file = os.path.join( - self.absolute_container_output_dir, - self.pdf_file_name, - ) + if not os.path.exists(self.absolute_host_tmp_root): + os.makedirs(self.absolute_host_tmp_root) + + # We cannot use '*' in commands sent to the host machine, the asterisk gets escaped. + # So we opt for iterating from outside the container + for fname in pdf_files: + os.rename( + fname, + os.path.join(self.absolute_host_tmp_root, os.path.basename(fname)), + ) - # NOTE: we currently support only one .pdf per version - pdf_sphinx_filepath = os.path.join( - self.absolute_container_output_dir, self.pdf_file_name + # Delete everything from the output dir + self.run( + "rm", + "-r", + self.absolute_host_output_dir, + cwd=self.project_path, + record=False, ) - pdf_sphinx_filepath_host = os.path.join( + + # Recreate the output dir + self.run( + "mkdir", + "-p", self.absolute_host_output_dir, - self.pdf_file_name, + cwd=self.project_path, + record=False, ) - if os.path.exists(pdf_sphinx_filepath_host): - self.run( - "mv", - pdf_sphinx_filepath, - temp_pdf_file, - cwd=self.project_path, - record=False, - ) - self.run( - "rm", - "-r", - self.absolute_container_output_dir, - cwd=self.project_path, - record=False, - ) - self.run( - "mkdir", - "-p", - self.absolute_container_output_dir, - cwd=self.project_path, - record=False, - ) - self.run( - "mv", 
temp_pdf_file, target_file, cwd=self.project_path, record=False + + # Move the PDFs back + # We cannot use '*' in commands sent to the host machine, the asterisk gets escaped. + # So we opt for iterating from outside the container + pdf_files = glob(os.path.join(self.absolute_host_tmp_root, "*.pdf")) + for fname in pdf_files: + os.rename( + fname, + os.path.join(self.absolute_host_output_dir, os.path.basename(fname)), ) diff --git a/readthedocs/projects/managers.py b/readthedocs/projects/managers.py index b3f33f3fb70..54542274913 100644 --- a/readthedocs/projects/managers.py +++ b/readthedocs/projects/managers.py @@ -1,7 +1,9 @@ from django.db import models +from .querysets import ImportedFileQuerySet + class HTMLFileManager(models.Manager): def get_queryset(self): - return super().get_queryset().html_files() + return ImportedFileQuerySet(self.model, using=self._db).html_files() diff --git a/readthedocs/projects/tasks/builds.py b/readthedocs/projects/tasks/builds.py index a14a8ae3e33..dcfa0199b46 100644 --- a/readthedocs/projects/tasks/builds.py +++ b/readthedocs/projects/tasks/builds.py @@ -582,6 +582,7 @@ def get_valid_artifact_types(self): ) else: single_file_artifacts = ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT + if artifact_type in single_file_artifacts: artifact_format_files = len(os.listdir(artifact_directory)) if artifact_format_files > 1: @@ -630,11 +631,19 @@ def trigger_sync_downloadable_artifacts(self, valid_artifacts): version=self.data.version.slug, type_=artifact_type, ) - for filename in os.listdir(artifact_directory): + artifact_directory_ls = os.listdir(artifact_directory) + log.info( + "Found artifacts in output directory", + artifact_directory=artifact_directory, + artifact_directory_ls=artifact_directory_ls, + ) + for filename in artifact_directory_ls: + log.info("Found an artifact in output directory", filename=filename) extensions_allowed = ARTIFACTS_WITH_RESTRICTED_EXTENSIONS[artifact_type] - if not os.path.isfile(filename): + if not 
os.path.isfile(os.path.join(artifact_directory, filename)): continue if not any(filename.endswith(f".{ext}") for ext in extensions_allowed): + log.info("Illegal artifact found", filename=filename) continue artifacts_found_for_download[artifact_type].append(filename) @@ -682,7 +691,8 @@ def on_success(self, retval, task_id, args, kwargs): ) if self.data.project.has_feature(Feature.ENABLE_MULTIPLE_PDFS): - self.trigger_sync_downloadable_artifacts(valid_artifacts) + if valid_artifacts: + self.trigger_sync_downloadable_artifacts(valid_artifacts) if not self.data.project.has_valid_clone: self.set_valid_clone() diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py index 51284b1686e..803b86cdd38 100644 --- a/readthedocs/projects/tasks/search.py +++ b/readthedocs/projects/tasks/search.py @@ -40,7 +40,7 @@ def fileify(version_pk, commit, build, search_ranking, search_ignore): return log.info( - 'Creating ImportedFiles', + "Creating ImportedFiles for search indexing", project_slug=version.project.slug, version_slug=version.slug, ) @@ -78,24 +78,34 @@ def sync_downloadable_artifacts( Afterwards, ImportedFile objects are used to generate a list of files that can be downloaded for each documentation version. + + :param artifacts_found_for_download: A dictionary with a list of files for each artifact type. 
+ For example: {"pdf": ["path/to/file1.pdf", "path/to/file2.pdf"]} """ version = Version.objects.get_object_or_log(pk=version_pk) + log.info( + "Creating ImportedFiles for artifact downloads", + project_slug=version.project.slug, + version_slug=version.slug, + artifacts_found_for_download=artifacts_found_for_download, + ) # Don't index external version builds for now if not version or version.type == EXTERNAL: return - for artifact_type, artifact_path in artifacts_found_for_download: - name = os.path.basename(artifact_path) - ImportedFile.objects.create( - name=name, - project=version.project, - version=version, - path=artifact_path, - commit=commit, - build=build, - ignore=True, - ) + for artifact_type, artifact_paths in artifacts_found_for_download.items(): + for fpath in artifact_paths: + name = os.path.basename(fpath) + ImportedFile.objects.create( + name=name, + project=version.project, + version=version, + path=fpath, + commit=commit, + build=build, + ignore=True, + ) def _sync_imported_files(version, build): From 653a0ce1c411b536daf5e9977bde23e0be1ca904 Mon Sep 17 00:00:00 2001 From: Benjamin Bach Date: Fri, 23 Jun 2023 16:52:19 +0200 Subject: [PATCH 07/21] Hide new behaviors behind feature flag --- readthedocs/doc_builder/backends/sphinx.py | 87 ++++++++++++++++--- readthedocs/projects/tasks/builds.py | 19 ++-- .../rtd_tests/tests/test_build_storage.py | 24 +++-- 3 files changed, 105 insertions(+), 25 deletions(-) diff --git a/readthedocs/doc_builder/backends/sphinx.py b/readthedocs/doc_builder/backends/sphinx.py index 315bc7690f1..18420a0ecf2 100644 --- a/readthedocs/doc_builder/backends/sphinx.py +++ b/readthedocs/doc_builder/backends/sphinx.py @@ -86,7 +86,9 @@ def __init__(self, *args, **kwargs): ), self.relative_output_dir, ) + # Isolate temporary files in the _readthedocs/ folder + # Used for new feature ENABLE_MULTIPLE_PDFS self.absolute_host_tmp_root = os.path.join( self.project.checkout_path(self.version.slug), "_readthedocs/tmp", @@ -547,7 +549,10 
@@ def build(self): # Run LaTeX -> PDF conversions success = self._build_latexmk(self.project_path) - self._post_build() + if self.project.has_feature(Feature.ENABLE_MULTIPLE_PDFS): + self._post_build_multiple() + else: + self._post_build() return success def _build_latexmk(self, cwd): @@ -602,10 +607,16 @@ def _build_latexmk(self, cwd): "-f", "-dvi-", "-ps-", - f"-jobname={self.project.slug}_%A", "-interaction=nonstopmode", ] + if self.project.has_feature(Feature.ENABLE_MULTIPLE_PDFS): + cmd.append(f"-jobname={self.project.slug}_%A") + else: + cmd.append( + f"-jobname={self.project.slug}", + ) + cmd_ret = self.build_env.run_command_class( cls=latex_class, cmd=cmd, @@ -613,19 +624,73 @@ def _build_latexmk(self, cwd): cwd=self.absolute_host_output_dir, ) - pdf_files = glob(os.path.join(self.absolute_host_output_dir, "*.pdf")) - - # There is only 1 PDF file. We will call it project_slug.pdf - # This is the old behavior. - if len(pdf_files) == 1: - os.rename( - pdf_files[0], - os.path.join(self.absolute_host_output_dir, f"{self.project.slug}.pdf"), - ) + if self.project.has_feature(Feature.ENABLE_MULTIPLE_PDFS): + pdf_files = glob(os.path.join(self.absolute_host_output_dir, "*.pdf")) + + # There is only 1 PDF file. We will call it project_slug.pdf + # This is the old behavior. 
+ if len(pdf_files) == 1: + os.rename( + pdf_files[0], + os.path.join( + self.absolute_host_output_dir, f"{self.project.slug}.pdf" + ), + ) + else: + self.pdf_file_name = f"{self.project.slug}.pdf" return cmd_ret.successful + # Removed by Feature.ENABLE_MULTIPLE_PDFS def _post_build(self): + """Internal post build to cleanup PDF output directory and leave only one .pdf file.""" + + if not self.pdf_file_name: + raise PDFNotFound() + + # TODO: merge this with ePUB since it's pretty much the same + temp_pdf_file = f"/tmp/{self.project.slug}-{self.version.slug}.pdf" + target_file = os.path.join( + self.absolute_container_output_dir, + self.pdf_file_name, + ) + + # NOTE: we currently support only one .pdf per version + pdf_sphinx_filepath = os.path.join( + self.absolute_container_output_dir, self.pdf_file_name + ) + pdf_sphinx_filepath_host = os.path.join( + self.absolute_host_output_dir, + self.pdf_file_name, + ) + if os.path.exists(pdf_sphinx_filepath_host): + self.run( + "mv", + pdf_sphinx_filepath, + temp_pdf_file, + cwd=self.project_path, + record=False, + ) + self.run( + "rm", + "-r", + self.absolute_container_output_dir, + cwd=self.project_path, + record=False, + ) + self.run( + "mkdir", + "-p", + self.absolute_container_output_dir, + cwd=self.project_path, + record=False, + ) + self.run( + "mv", temp_pdf_file, target_file, cwd=self.project_path, record=False + ) + + # Introduced by Feature.ENABLE_MULTIPLE_PDFS + def _post_build_multiple(self): """Internal post build to cleanup PDF output directory and leave only .pdf files.""" pdf_files = glob(os.path.join(self.absolute_host_output_dir, "*.pdf")) diff --git a/readthedocs/projects/tasks/builds.py b/readthedocs/projects/tasks/builds.py index dcfa0199b46..9c85139ebae 100644 --- a/readthedocs/projects/tasks/builds.py +++ b/readthedocs/projects/tasks/builds.py @@ -940,14 +940,17 @@ def store_build_artifacts(self): version_type=self.data.version.type, ) try: - rclone_kwargs = {} - if media_type in 
ARTIFACTS_WITH_RESTRICTED_EXTENSIONS: - rclone_kwargs[ - "filter_extensions" - ] = ARTIFACTS_WITH_RESTRICTED_EXTENSIONS[media_type] - build_media_storage.rclone_sync_directory( - from_path, to_path, **rclone_kwargs - ) + if self.data.project.has_feature(Feature.ENABLE_MULTIPLE_PDFS): + rclone_kwargs = {} + if media_type in ARTIFACTS_WITH_RESTRICTED_EXTENSIONS: + rclone_kwargs[ + "filter_extensions" + ] = ARTIFACTS_WITH_RESTRICTED_EXTENSIONS[media_type] + build_media_storage.rclone_sync_directory( + from_path, to_path, **rclone_kwargs + ) + else: + build_media_storage.rclone_sync_directory(from_path, to_path) except Exception as exc: # NOTE: the exceptions reported so far are: # - botocore.exceptions:HTTPClientError diff --git a/readthedocs/rtd_tests/tests/test_build_storage.py b/readthedocs/rtd_tests/tests/test_build_storage.py index f0809032214..8beaadcdb54 100644 --- a/readthedocs/rtd_tests/tests/test_build_storage.py +++ b/readthedocs/rtd_tests/tests/test_build_storage.py @@ -128,12 +128,6 @@ def test_sync_directory_with_filter(self): shutil.copytree(files_dir, tmp_files_dir, symlinks=True) storage_dir = "files" - tree = [ - ("api", ["index.html"]), - "api.fjson", - "conf.py", - "test.html", - ] with override_settings(DOCROOT=tmp_files_dir): self.storage.rclone_sync_directory( tmp_files_dir, storage_dir, filter_extensions=["html"] @@ -141,6 +135,24 @@ def test_sync_directory_with_filter(self): # We only accept TOP-LEVEL files, so only test.html and not api/index.html self.assertFileTree(storage_dir, ("test.html",)) + def test_sync_directory_with_restriction(self): + """Test that we can as the rclone_sync_directory method to only include a specific extension.""" + tmp_files_dir = os.path.join(tempfile.mkdtemp(), "files") + # Copy files_dir (with all test files) into tmp_files_dir + shutil.copytree(files_dir, tmp_files_dir, symlinks=True) + storage_dir = "files" + + open(os.path.join(tmp_files_dir, "test.pdf"), "w").write("test") + open(os.path.join(tmp_files_dir, 
"test.exe"), "w").write("test") + + with override_settings(DOCROOT=tmp_files_dir): + self.storage.rclone_sync_directory( + tmp_files_dir, storage_dir, filter_extensions=["pdf"] + ) + # We only accept TOP-LEVEL files, so only test.html and not test.exe + self.assertFileTree(storage_dir, ("test.pdf",)) + + def test_delete_directory(self): with override_settings(DOCROOT=files_dir): self.storage.copy_directory(files_dir, "files") From 216592e00638eda553ee379682367432c7aac104 Mon Sep 17 00:00:00 2001 From: Benjamin Bach Date: Mon, 26 Jun 2023 13:08:30 +0200 Subject: [PATCH 08/21] Melt together two test scenarios --- .../rtd_tests/tests/test_build_storage.py | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/readthedocs/rtd_tests/tests/test_build_storage.py b/readthedocs/rtd_tests/tests/test_build_storage.py index 8beaadcdb54..89b645f921b 100644 --- a/readthedocs/rtd_tests/tests/test_build_storage.py +++ b/readthedocs/rtd_tests/tests/test_build_storage.py @@ -135,23 +135,31 @@ def test_sync_directory_with_filter(self): # We only accept TOP-LEVEL files, so only test.html and not api/index.html self.assertFileTree(storage_dir, ("test.html",)) - def test_sync_directory_with_restriction(self): - """Test that we can as the rclone_sync_directory method to only include a specific extension.""" - tmp_files_dir = os.path.join(tempfile.mkdtemp(), "files") - # Copy files_dir (with all test files) into tmp_files_dir - shutil.copytree(files_dir, tmp_files_dir, symlinks=True) - storage_dir = "files" - + # Copy in a couple of other types of files open(os.path.join(tmp_files_dir, "test.pdf"), "w").write("test") open(os.path.join(tmp_files_dir, "test.exe"), "w").write("test") + # Move the .pdf file with override_settings(DOCROOT=tmp_files_dir): self.storage.rclone_sync_directory( tmp_files_dir, storage_dir, filter_extensions=["pdf"] ) - # We only accept TOP-LEVEL files, so only test.html and not test.exe - self.assertFileTree(storage_dir, 
("test.pdf",)) + # Test that the PDF file is now there + # ...AND the file we copied in the previos step. + self.assertFileTree( + storage_dir, + ( + "test.html", + "test.pdf", + ), + ) + + def test_sync_directory_with_restriction(self): + """Test that we can as the rclone_sync_directory method to only include a specific extension.""" + tmp_files_dir = os.path.join(tempfile.mkdtemp(), "files") + # Copy files_dir (with all test files) into tmp_files_dir + shutil.copytree(files_dir, tmp_files_dir, symlinks=True) def test_delete_directory(self): with override_settings(DOCROOT=files_dir): From 600b948364eddfba73dedc968b3c8718656f3bf6 Mon Sep 17 00:00:00 2001 From: Benjamin Bach Date: Mon, 26 Jun 2023 13:48:24 +0200 Subject: [PATCH 09/21] Adds tests to model/querysets --- readthedocs/projects/constants.py | 9 ++++ readthedocs/projects/models.py | 18 ++++++- readthedocs/rtd_tests/tests/test_project.py | 16 ++++++- .../rtd_tests/tests/test_project_querysets.py | 47 ++++++++++++++++++- 4 files changed, 86 insertions(+), 4 deletions(-) diff --git a/readthedocs/projects/constants.py b/readthedocs/projects/constants.py index 4cae14dd265..210cbd32c8b 100644 --- a/readthedocs/projects/constants.py +++ b/readthedocs/projects/constants.py @@ -47,6 +47,15 @@ MEDIA_TYPE_JSON, ) +# Map extensions to their media type +MEDIA_TYPES_EXTENSIONS = { + MEDIA_TYPE_HTML: ("html",), + MEDIA_TYPE_PDF: ("pdf",), + MEDIA_TYPE_EPUB: ("epub",), + MEDIA_TYPE_HTMLZIP: ("zip",), + MEDIA_TYPE_JSON: ("json",), +} + BUILD_COMMANDS_OUTPUT_PATH = "_readthedocs/" BUILD_COMMANDS_OUTPUT_PATH_HTML = os.path.join(BUILD_COMMANDS_OUTPUT_PATH, "html") diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py index 29e2b2f7d27..3a5d999c13d 100644 --- a/readthedocs/projects/models.py +++ b/readthedocs/projects/models.py @@ -57,11 +57,13 @@ from readthedocs.vcs_support.backends import backend_cls from .constants import ( + BUILD_COMMANDS_OUTPUT_PATH, DOWNLOADABLE_MEDIA_TYPES, MEDIA_TYPE_EPUB, 
MEDIA_TYPE_HTMLZIP, MEDIA_TYPE_PDF, MEDIA_TYPES, + MEDIA_TYPES_EXTENSIONS, ) log = structlog.get_logger(__name__) @@ -915,10 +917,12 @@ def artifact_path(self, type_, version=LATEST): """ The path to the build docs output for the project. - :param type_: one of `html`, `json`, `htmlzip`, `pdf`, `epub`. + :param type_: A type listed in constants.MEDIA_TYPES :param version: slug of the version. """ - return os.path.join(self.checkout_path(version=version), "_readthedocs", type_) + return os.path.join( + self.checkout_path(version=version), BUILD_COMMANDS_OUTPUT_PATH, type_ + ) def conf_file(self, version=LATEST): """Find a Sphinx ``conf.py`` file in the project checkout.""" @@ -1514,6 +1518,16 @@ def get_absolute_url(self): external=False, ) + def get_media_type(self): + """Returns a matching media type constant from constants.MEDIA_TYPES.""" + __, extension = os.path.splitext(self.name) + if not extension: + return + extension = extension.lstrip(".") + for _type, valid_extensions in MEDIA_TYPES_EXTENSIONS.items(): + if extension in valid_extensions: + return _type + def __str__(self): return '{}: {}'.format(self.name, self.project) diff --git a/readthedocs/rtd_tests/tests/test_project.py b/readthedocs/rtd_tests/tests/test_project.py index a01c4d9837d..1a308b7da83 100644 --- a/readthedocs/rtd_tests/tests/test_project.py +++ b/readthedocs/rtd_tests/tests/test_project.py @@ -22,6 +22,7 @@ from readthedocs.builds.models import Build, Version from readthedocs.oauth.services import GitHubService, GitLabService from readthedocs.projects.constants import ( + BUILD_COMMANDS_OUTPUT_PATH, GITHUB_BRAND, GITLAB_BRAND, MEDIA_TYPE_EPUB, @@ -31,7 +32,7 @@ MEDIA_TYPES, ) from readthedocs.projects.exceptions import ProjectConfigurationError -from readthedocs.projects.models import Project +from readthedocs.projects.models import ImportedFile, Project from readthedocs.projects.tasks.utils import finish_inactive_builds from readthedocs.rtd_tests.mocks.paths import fake_paths_by_regex 
@@ -279,6 +280,19 @@ def test_git_service_class_gitlab(self): self.pip.save() self.assertEqual(self.pip.git_service_class(), GitLabService) + def test_artifact_path(self): + project1 = get(Project, main_language_project=None) + + imported_pdf = get( + ImportedFile, + project=project1, + name="test1.pdf", + path="test1.pdf", + ) + assert project1.artifact_path(imported_pdf.get_media_type()).endswith( + f"/{BUILD_COMMANDS_OUTPUT_PATH}pdf" + ) + @mock.patch('readthedocs.projects.forms.trigger_build', mock.MagicMock()) class TestProjectTranslations(ProjectMixin, TestCase): diff --git a/readthedocs/rtd_tests/tests/test_project_querysets.py b/readthedocs/rtd_tests/tests/test_project_querysets.py index 0a9424b22ca..63d0e4f3e46 100644 --- a/readthedocs/rtd_tests/tests/test_project_querysets.py +++ b/readthedocs/rtd_tests/tests/test_project_querysets.py @@ -7,7 +7,7 @@ from readthedocs.organizations.models import Organization from readthedocs.projects.constants import PRIVATE, PUBLIC -from readthedocs.projects.models import Feature, Project +from readthedocs.projects.models import Feature, ImportedFile, Project from readthedocs.projects.querysets import ( ChildRelatedProjectQuerySet, ParentRelatedProjectQuerySet, @@ -292,3 +292,48 @@ def test_feature_multiple_projects(self): [repr(feature)], ordered=False, ) + + def test_importedfile_queryset(self): + """ + Asserts that queryset methods function + :return: + """ + project1 = fixture.get(Project, main_language_project=None) + + imported_pdf = get( + ImportedFile, + project=project1, + name="test1.pdf", + path="test1.pdf", + ) + + imported_epub = get( + ImportedFile, + project=project1, + name="test1.epub", + path="test1.epub", + ) + + imported_html = get( + ImportedFile, + project=project1, + name="test1.html", + path="test1.html", + ) + + imported_htmlzip = get( + ImportedFile, + project=project1, + name="test1.zip", + path="test1.zip", + ) + + assert ImportedFile.objects.html_files().count() == 1 + assert 
ImportedFile.objects.pdf_files().count() == 1 + assert ImportedFile.objects.epub_files().count() == 1 + assert ImportedFile.objects.htmlzip_files().count() == 1 + + assert ImportedFile.objects.html_files().first().name == "test1.html" + assert ImportedFile.objects.pdf_files().first().name == "test1.pdf" + assert ImportedFile.objects.epub_files().first().name == "test1.epub" + assert ImportedFile.objects.htmlzip_files().first().name == "test1.zip" From f452630ae1b368e9faa100daf22290390b778df0 Mon Sep 17 00:00:00 2001 From: Benjamin Bach Date: Mon, 26 Jun 2023 13:51:13 +0200 Subject: [PATCH 10/21] Move constant to where other MEDIA_TYPES are handled --- readthedocs/builds/constants.py | 9 --------- readthedocs/projects/tasks/builds.py | 12 ++++++------ 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/readthedocs/builds/constants.py b/readthedocs/builds/constants.py index 4826e91e01e..750f176b4c7 100644 --- a/readthedocs/builds/constants.py +++ b/readthedocs/builds/constants.py @@ -186,15 +186,6 @@ "epub", ) -# Part of Feature: ENABLE_MULTIPLE_PDFS -# For certain media types, we only want to copy out specific file extensions. -# This is treated case-insensitive. 
-ARTIFACTS_WITH_RESTRICTED_EXTENSIONS = { - "pdf": ["pdf"], - "htmlzip": ["zip"], - "epub": ["epub"], -} - # Part of Feature: ENABLE_MULTIPLE_PDFS # These artifacts are understood as "downloadable" and will be indexed as an ImportedFile DOWNLOADABLE_ARTIFACTS = { diff --git a/readthedocs/projects/tasks/builds.py b/readthedocs/projects/tasks/builds.py index 6c6a5cbc47a..5cf9ea1e978 100644 --- a/readthedocs/projects/tasks/builds.py +++ b/readthedocs/projects/tasks/builds.py @@ -22,7 +22,6 @@ ARTIFACT_TYPES, ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT, ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT_NO_PDF, - ARTIFACTS_WITH_RESTRICTED_EXTENSIONS, BUILD_FINAL_STATES, BUILD_STATE_BUILDING, BUILD_STATE_CLONING, @@ -61,6 +60,7 @@ from readthedocs.telemetry.tasks import save_build_data from readthedocs.worker import app +from ..constants import MEDIA_TYPES_EXTENSIONS from ..exceptions import ( ProjectConfigurationError, RepositoryError, @@ -639,7 +639,7 @@ def trigger_sync_downloadable_artifacts(self, valid_artifacts): ) for filename in artifact_directory_ls: log.info("Found an artifact in output directory", filename=filename) - extensions_allowed = ARTIFACTS_WITH_RESTRICTED_EXTENSIONS[artifact_type] + extensions_allowed = MEDIA_TYPES_EXTENSIONS[artifact_type] if not os.path.isfile(os.path.join(artifact_directory, filename)): continue if not any(filename.endswith(f".{ext}") for ext in extensions_allowed): @@ -947,10 +947,10 @@ def store_build_artifacts(self): try: if self.data.project.has_feature(Feature.ENABLE_MULTIPLE_PDFS): rclone_kwargs = {} - if media_type in ARTIFACTS_WITH_RESTRICTED_EXTENSIONS: - rclone_kwargs[ - "filter_extensions" - ] = ARTIFACTS_WITH_RESTRICTED_EXTENSIONS[media_type] + if media_type in MEDIA_TYPES_EXTENSIONS: + rclone_kwargs["filter_extensions"] = MEDIA_TYPES_EXTENSIONS[ + media_type + ] build_media_storage.rclone_sync_directory( from_path, to_path, **rclone_kwargs ) From 9bfe6f304785fe40cc081211a886853cba849e86 Mon Sep 17 00:00:00 2001 From: 
Benjamin Bach Date: Mon, 26 Jun 2023 14:12:37 +0200 Subject: [PATCH 11/21] Reuse DOWNLOADABLE_MEDIA_TYPES, add comment about removing pdf_file_name --- readthedocs/builds/constants.py | 9 +-------- readthedocs/doc_builder/backends/sphinx.py | 3 +++ readthedocs/projects/tasks/builds.py | 7 ++++--- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/readthedocs/builds/constants.py b/readthedocs/builds/constants.py index 750f176b4c7..694e3ffa010 100644 --- a/readthedocs/builds/constants.py +++ b/readthedocs/builds/constants.py @@ -156,6 +156,7 @@ # All artifact types supported by Read the Docs. # They match the output directory (`_readthedocs/`) +# TODO: Remove this and use constants.MEDIA_TYPES instead ARTIFACT_TYPES = ( "html", "json", @@ -185,11 +186,3 @@ "htmlzip", "epub", ) - -# Part of Feature: ENABLE_MULTIPLE_PDFS -# These artifacts are understood as "downloadable" and will be indexed as an ImportedFile -DOWNLOADABLE_ARTIFACTS = { - "htmlzip", - "epub", - "pdf", -} diff --git a/readthedocs/doc_builder/backends/sphinx.py b/readthedocs/doc_builder/backends/sphinx.py index 18420a0ecf2..128490b912f 100644 --- a/readthedocs/doc_builder/backends/sphinx.py +++ b/readthedocs/doc_builder/backends/sphinx.py @@ -518,6 +518,9 @@ class PdfBuilder(BaseSphinx): relative_output_dir = "pdf" sphinx_builder = "latex" + # TODO: Remove this when dissolving Feature.ENABLE_MULTIPLE_PDFS + pdf_file_name = None + def build(self): """Runs Sphinx to convert to LaTeX, uses latexmk to build PDFs.""" self.run( diff --git a/readthedocs/projects/tasks/builds.py b/readthedocs/projects/tasks/builds.py index 5cf9ea1e978..310fde748de 100644 --- a/readthedocs/projects/tasks/builds.py +++ b/readthedocs/projects/tasks/builds.py @@ -31,7 +31,6 @@ BUILD_STATE_UPLOADING, BUILD_STATUS_FAILURE, BUILD_STATUS_SUCCESS, - DOWNLOADABLE_ARTIFACTS, EXTERNAL, UNDELETABLE_ARTIFACT_TYPES, ) @@ -60,7 +59,7 @@ from readthedocs.telemetry.tasks import save_build_data from readthedocs.worker import app -from 
..constants import MEDIA_TYPES_EXTENSIONS +from ..constants import DOWNLOADABLE_MEDIA_TYPES, MEDIA_TYPES_EXTENSIONS from ..exceptions import ( ProjectConfigurationError, RepositoryError, @@ -620,7 +619,9 @@ def trigger_sync_downloadable_artifacts(self, valid_artifacts): # look in the top level folder # {"pdf": ["file.pdf"]} - artifact_types = set(valid_artifacts).intersection(DOWNLOADABLE_ARTIFACTS) + artifact_types = set(valid_artifacts).intersection( + set(DOWNLOADABLE_MEDIA_TYPES) + ) artifacts_found_for_download = { artifact_type: [] for artifact_type in artifact_types From 65e714bea8cab0fdf932e29cb10ce5f9fcf15924 Mon Sep 17 00:00:00 2001 From: Benjamin Bach Date: Mon, 26 Jun 2023 14:20:54 +0200 Subject: [PATCH 12/21] lint --- readthedocs/telemetry/collectors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readthedocs/telemetry/collectors.py b/readthedocs/telemetry/collectors.py index ffd21af044d..09177848b16 100644 --- a/readthedocs/telemetry/collectors.py +++ b/readthedocs/telemetry/collectors.py @@ -36,7 +36,7 @@ def __init__(self, environment): @staticmethod def _safe_json_loads(content, default=None): - def lowercase(d): # pylint: disable=invalid-name + def lowercase(d): """Convert all dictionary keys to lowercase.""" return {k.lower(): i for k, i in d.items()} From 99ffb3b4fea9a85a35f9ee8282a8d8931b6d5688 Mon Sep 17 00:00:00 2001 From: Benjamin Bach Date: Mon, 26 Jun 2023 15:48:45 +0200 Subject: [PATCH 13/21] WIP --- readthedocs/doc_builder/backends/sphinx.py | 7 +- readthedocs/projects/tests/mockers.py | 8 +- .../projects/tests/test_build_tasks.py | 141 +++++++++++++++++- 3 files changed, 148 insertions(+), 8 deletions(-) diff --git a/readthedocs/doc_builder/backends/sphinx.py b/readthedocs/doc_builder/backends/sphinx.py index 128490b912f..44604f83b69 100644 --- a/readthedocs/doc_builder/backends/sphinx.py +++ b/readthedocs/doc_builder/backends/sphinx.py @@ -6,6 +6,7 @@ import itertools import os +import shutil from glob import glob 
from pathlib import Path @@ -634,7 +635,7 @@ def _build_latexmk(self, cwd): # This is the old behavior. if len(pdf_files) == 1: os.rename( - pdf_files[0], + os.path.join(self.absolute_host_output_dir, pdf_files[0]), os.path.join( self.absolute_host_output_dir, f"{self.project.slug}.pdf" ), @@ -706,7 +707,7 @@ def _post_build_multiple(self): # We cannot use '*' in commands sent to the host machine, the asterisk gets escaped. # So we opt for iterating from outside the container for fname in pdf_files: - os.rename( + shutil.move( fname, os.path.join(self.absolute_host_tmp_root, os.path.basename(fname)), ) @@ -734,7 +735,7 @@ def _post_build_multiple(self): # So we opt for iterating from outside the container pdf_files = glob(os.path.join(self.absolute_host_tmp_root, "*.pdf")) for fname in pdf_files: - os.rename( + shutil.move( fname, os.path.join(self.absolute_host_output_dir, os.path.basename(fname)), ) diff --git a/readthedocs/projects/tests/mockers.py b/readthedocs/projects/tests/mockers.py index c86d27b6c8c..f4c9499c568 100644 --- a/readthedocs/projects/tests/mockers.py +++ b/readthedocs/projects/tests/mockers.py @@ -75,10 +75,10 @@ def _mock_artifact_builders(self): # self.patches['builder.pdf.LatexBuildCommand.output'] = mock.patch( # 'readthedocs.doc_builder.backends.sphinx.LatexBuildCommand.output', # ) - self.patches['builder.pdf.glob'] = mock.patch( - 'readthedocs.doc_builder.backends.sphinx.glob', - return_value=['output.file'], - ) + # self.patches['builder.pdf.glob'] = mock.patch( + # 'readthedocs.doc_builder.backends.sphinx.glob', + # return_value=['output.file'], + # ) self.patches['builder.pdf.os.path.getmtime'] = mock.patch( 'readthedocs.doc_builder.backends.sphinx.os.path.getmtime', diff --git a/readthedocs/projects/tests/test_build_tasks.py b/readthedocs/projects/tests/test_build_tasks.py index 7a085004ff0..bd414c0aa88 100644 --- a/readthedocs/projects/tests/test_build_tasks.py +++ b/readthedocs/projects/tests/test_build_tasks.py @@ -16,7 +16,13 @@ 
from readthedocs.config.config import BuildConfigV2 from readthedocs.doc_builder.exceptions import BuildAppError from readthedocs.projects.exceptions import RepositoryError -from readthedocs.projects.models import EnvironmentVariable, Project, WebHookEvent +from readthedocs.projects.models import ( + EnvironmentVariable, + Feature, + ImportedFile, + Project, + WebHookEvent, +) from readthedocs.projects.tasks.builds import sync_repository_task, update_docs_task from readthedocs.telemetry.models import BuildData @@ -309,6 +315,139 @@ def test_build_updates_documentation_type(self, load_yaml_config): "has_htmlzip": False, } + @mock.patch("readthedocs.doc_builder.director.load_yaml_config") + def test_build_updates_multiple_artifacts(self, load_yaml_config): + """ + Test that Feature.ENABLE_MULTIPLE_PDFS works. + + We mock that several .pdfs were created and verify that ImportedFile objects are in place. + """ + + fixture.get( + Feature, + projects=[self.project], + feature_id=Feature.ENABLE_MULTIPLE_PDFS, + ) + + assert self.version.documentation_type == "sphinx" + load_yaml_config.return_value = self._config_file( + { + "version": 2, + "formats": ["epub", "pdf"], + "sphinx": { + "configuration": "docs/conf.py", + }, + } + ) + + # Create the artifact paths, so that `store_build_artifacts` + # properly runs: https://github.com/readthedocs/readthedocs.org/blob/faa611fad689675f81101ea643770a6b669bf529/readthedocs/projects/tasks/builds.py#L798-L804 + os.makedirs(self.project.artifact_path(version=self.version.slug, type_="html")) + + # epub and pdf output dirs + epub_dir = self.project.artifact_path(version=self.version.slug, type_="epub") + pdf_dir = self.project.artifact_path(version=self.version.slug, type_="pdf") + os.makedirs(epub_dir) + os.makedirs(pdf_dir) + + # Create 1 epub and 2 pdfs + pathlib.Path( + os.path.join( + epub_dir, + f"{self.project.slug}.epub", + ) + ).touch() + pathlib.Path( + os.path.join( + pdf_dir, + f"{self.project.slug}-test1.pdf", + ) + 
).touch()
+        pathlib.Path(
+            os.path.join(
+                pdf_dir,
+                f"{self.project.slug}-test2.pdf",
+            )
+        ).touch()
+
+        self._trigger_update_docs_task()
+
+        # Update version state
+        assert self.requests_mock.request_history[6]._request.method == "PATCH"
+        assert self.requests_mock.request_history[6].path == "/api/v2/build/1/"
+        assert self.requests_mock.request_history[6].json() == {
+            "addons": False,
+            "build_data": None,
+            "built": True,
+            "documentation_type": "sphinx",
+            "has_pdf": True,
+            "has_epub": True,
+            "has_htmlzip": True,
+        }
+
+        assert (
+            ImportedFile.objects.pdf_files()
+            .filter(project=self.project, name=f"{self.project.slug}-test1.pdf")
+            .count()
+            == 1
+        )
+        assert (
+            ImportedFile.objects.pdf_files()
+            .filter(project=self.project, name=f"{self.project.slug}-test2.pdf")
+            .count()
+            == 1
+        )
+
+    @mock.patch("readthedocs.doc_builder.director.load_yaml_config")
+    def test_build_fail_if_no_artifacts(self, load_yaml_config):
+        """
+        Test that Feature.ENABLE_MULTIPLE_PDFS works but fails when there are no PDFs found
+
+        We mock a build where no .pdfs were created and verify that the build fails with an error.
+ """ + assert self.version.documentation_type == "sphinx" + + fixture.get( + Feature, + projects=[self.project], + feature_id=Feature.ENABLE_MULTIPLE_PDFS, + ) + + load_yaml_config.return_value = self._config_file( + { + "version": 2, + "formats": ["epub", "pdf"], + "sphinx": { + "configuration": "docs/conf.py", + }, + } + ) + + # Create the artifact paths, so that `store_build_artifacts` + # properly runs: https://github.com/readthedocs/readthedocs.org/blob/faa611fad689675f81101ea643770a6b669bf529/readthedocs/projects/tasks/builds.py#L798-L804 + os.makedirs(self.project.artifact_path(version=self.version.slug, type_="html")) + + # epub and pdf output dirs + epub_dir = self.project.artifact_path(version=self.version.slug, type_="epub") + pdf_dir = self.project.artifact_path(version=self.version.slug, type_="pdf") + os.makedirs(epub_dir) + os.makedirs(pdf_dir) + + # Create 1 epub and 0 pdfs + pathlib.Path( + os.path.join( + epub_dir, + f"{self.project.slug}.epub", + ) + ).touch() + + self._trigger_update_docs_task() + + output_json = self.requests_mock.request_history[7].json() + + assert "error" in output_json + assert output_json["output_json"] + @pytest.mark.parametrize( "config", [ From 7b11369f4428065248adcaa928e4ec20e2260d4d Mon Sep 17 00:00:00 2001 From: Benjamin Bach Date: Mon, 26 Jun 2023 17:04:52 +0200 Subject: [PATCH 14/21] Finalize test case for testing that multiple ImportedFile objects are created --- readthedocs/projects/tests/test_build_tasks.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/readthedocs/projects/tests/test_build_tasks.py b/readthedocs/projects/tests/test_build_tasks.py index bd414c0aa88..c1898cc1f78 100644 --- a/readthedocs/projects/tests/test_build_tasks.py +++ b/readthedocs/projects/tests/test_build_tasks.py @@ -350,6 +350,15 @@ def test_build_updates_multiple_artifacts(self, load_yaml_config): os.makedirs(epub_dir) os.makedirs(pdf_dir) + # Create a .tex file because of a validation step that 
verifies that intermediate + # .tex files were created. + pathlib.Path( + os.path.join( + pdf_dir, + f"{self.project.slug}-test1.tex", + ) + ).touch() + # Create 1 epub and 2 pdfs pathlib.Path( os.path.join( @@ -373,16 +382,16 @@ def test_build_updates_multiple_artifacts(self, load_yaml_config): self._trigger_update_docs_task() # Update version state - assert self.requests_mock.request_history[6]._request.method == "PATCH" - assert self.requests_mock.request_history[6].path == "/api/v2/build/1/" - assert self.requests_mock.request_history[6].json() == { + assert self.requests_mock.request_history[7]._request.method == "PATCH" + assert self.requests_mock.request_history[7].path == "/api/v2/version/1/" + assert self.requests_mock.request_history[7].json() == { "addons": False, "build_data": None, "built": True, "documentation_type": "sphinx", "has_pdf": True, "has_epub": True, - "has_htmlzip": True, + "has_htmlzip": False, } assert ( From 8fadd42448d5d2b3410c51d3cf9d29febf0ab695 Mon Sep 17 00:00:00 2001 From: Benjamin Bach Date: Mon, 26 Jun 2023 17:19:03 +0200 Subject: [PATCH 15/21] Update failure test case --- readthedocs/projects/tests/test_build_tasks.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/readthedocs/projects/tests/test_build_tasks.py b/readthedocs/projects/tests/test_build_tasks.py index c1898cc1f78..0333345a81e 100644 --- a/readthedocs/projects/tests/test_build_tasks.py +++ b/readthedocs/projects/tests/test_build_tasks.py @@ -442,6 +442,16 @@ def test_build_fail_if_no_artifacts(self, load_yaml_config): os.makedirs(epub_dir) os.makedirs(pdf_dir) + + # Create a .tex file because of a validation step that verifies that intermediate + # .tex files were created. 
+ pathlib.Path( + os.path.join( + pdf_dir, + f"{self.project.slug}-test1.tex", + ) + ).touch() + # Create 1 epub and 0 pdfs pathlib.Path( os.path.join( @@ -452,10 +462,10 @@ def test_build_fail_if_no_artifacts(self, load_yaml_config): self._trigger_update_docs_task() - output_json = self.requests_mock.request_history[7].json() + output_json = self.requests_mock.request_history[6].json() assert "error" in output_json - assert output_json["output_json"] + assert "PDF file was not generated" in output_json["error"] @pytest.mark.parametrize( "config", From d91a455b6802d1db8c94e7e84af05be0a51f76be Mon Sep 17 00:00:00 2001 From: Benjamin Bach Date: Mon, 26 Jun 2023 17:22:06 +0200 Subject: [PATCH 16/21] Add some more test comments --- readthedocs/rtd_tests/tests/test_project.py | 10 +++++++--- readthedocs/rtd_tests/tests/test_project_querysets.py | 5 +---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/readthedocs/rtd_tests/tests/test_project.py b/readthedocs/rtd_tests/tests/test_project.py index 1a308b7da83..74bbd154645 100644 --- a/readthedocs/rtd_tests/tests/test_project.py +++ b/readthedocs/rtd_tests/tests/test_project.py @@ -281,6 +281,7 @@ def test_git_service_class_gitlab(self): self.assertEqual(self.pip.git_service_class(), GitLabService) def test_artifact_path(self): + """Verify that we can generate a meaningful path from known methods and constants.""" project1 = get(Project, main_language_project=None) imported_pdf = get( @@ -289,9 +290,12 @@ def test_artifact_path(self): name="test1.pdf", path="test1.pdf", ) - assert project1.artifact_path(imported_pdf.get_media_type()).endswith( - f"/{BUILD_COMMANDS_OUTPUT_PATH}pdf" - ) + media_path = project1.artifact_path(imported_pdf.get_media_type()) + + # Verify that BUILD_COMMANDS_OUTPUT_PATH is part of the path + assert media_path.endswith(f"/{BUILD_COMMANDS_OUTPUT_PATH}pdf") + # Verify that it ends with /pdf as expected + assert media_path.endswith("/pdf") 
@mock.patch('readthedocs.projects.forms.trigger_build', mock.MagicMock()) diff --git a/readthedocs/rtd_tests/tests/test_project_querysets.py b/readthedocs/rtd_tests/tests/test_project_querysets.py index 63d0e4f3e46..3b9960837b4 100644 --- a/readthedocs/rtd_tests/tests/test_project_querysets.py +++ b/readthedocs/rtd_tests/tests/test_project_querysets.py @@ -294,10 +294,7 @@ def test_feature_multiple_projects(self): ) def test_importedfile_queryset(self): - """ - Asserts that queryset methods function - :return: - """ + """Verify queryset methods for ImportedFile.""" project1 = fixture.get(Project, main_language_project=None) imported_pdf = get( From c610326d591b0a9d2dfa0dbd859621a9f1289c86 Mon Sep 17 00:00:00 2001 From: Benjamin Bach Date: Mon, 26 Jun 2023 17:38:16 +0200 Subject: [PATCH 17/21] comment update --- readthedocs/projects/constants.py | 4 ++-- readthedocs/projects/tasks/builds.py | 9 ++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/readthedocs/projects/constants.py b/readthedocs/projects/constants.py index 210cbd32c8b..e70c7a484d1 100644 --- a/readthedocs/projects/constants.py +++ b/readthedocs/projects/constants.py @@ -47,9 +47,9 @@ MEDIA_TYPE_JSON, ) -# Map extensions to their media type +# Map media types to their know extensions. +# This is used for validating and uploading artifacts. MEDIA_TYPES_EXTENSIONS = { - MEDIA_TYPE_HTML: ("html",), MEDIA_TYPE_PDF: ("pdf",), MEDIA_TYPE_EPUB: ("epub",), MEDIA_TYPE_HTMLZIP: ("zip",), diff --git a/readthedocs/projects/tasks/builds.py b/readthedocs/projects/tasks/builds.py index 310fde748de..7dc07f5b0de 100644 --- a/readthedocs/projects/tasks/builds.py +++ b/readthedocs/projects/tasks/builds.py @@ -610,15 +610,14 @@ def get_valid_artifact_types(self): def trigger_sync_downloadable_artifacts(self, valid_artifacts): """ Triggers the sync_downloadable_artifacts task with the files that were found. + Performs an additional validation on files contained in valid_artifacts. 
- :param valid_artifacts: list of artifacts allowed by initial validation + :param valid_artifacts: A list of artifacts allowed by initial validation. Example: + {"pdf": ["file.pdf"]} :return: None """ - # A dictionary containing lists of files found for each artifact type. - # Notice that we don't support nesting in sub-folders, so we only need to - # look in the top level folder - # {"pdf": ["file.pdf"]} + # We only look at artifacts that are part of DOWNLOADABLE_MEDIA_TYPES artifact_types = set(valid_artifacts).intersection( set(DOWNLOADABLE_MEDIA_TYPES) ) From 01e3c2a9376fec59a472e234fb122cf61efb9338 Mon Sep 17 00:00:00 2001 From: Benjamin Bach Date: Wed, 28 Jun 2023 15:42:36 +0200 Subject: [PATCH 18/21] Mock revoking the API key? --- readthedocs/projects/tests/mockers.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/readthedocs/projects/tests/mockers.py b/readthedocs/projects/tests/mockers.py index c86d27b6c8c..6ad92ea9db9 100644 --- a/readthedocs/projects/tests/mockers.py +++ b/readthedocs/projects/tests/mockers.py @@ -262,3 +262,8 @@ def _mock_api(self): f'{settings.SLUMBER_API_HOST}/api/v2/project/{self.project.pk}/', status_code=201, ) + + self.requestsmock.post( + f"{settings.SLUMBER_API_HOST}/api/v2/revoke/", + status_code=204, + ) From c1cc8eb30f10b6fd476d07fc5a28eb02ea1c2c36 Mon Sep 17 00:00:00 2001 From: Benjamin Bach Date: Wed, 28 Jun 2023 17:22:17 +0200 Subject: [PATCH 19/21] Update tests after we stopped mocking 'glob' --- readthedocs/doc_builder/backends/sphinx.py | 27 ++++++++++++---- .../projects/tests/test_build_tasks.py | 31 +++++++++++++++---- 2 files changed, 46 insertions(+), 12 deletions(-) diff --git a/readthedocs/doc_builder/backends/sphinx.py b/readthedocs/doc_builder/backends/sphinx.py index 44604f83b69..e8c6a378b6c 100644 --- a/readthedocs/doc_builder/backends/sphinx.py +++ b/readthedocs/doc_builder/backends/sphinx.py @@ -452,9 +452,7 @@ def _post_build(self): f"{self.project.slug}.epub", ) - epub_sphinx_filepaths = glob( - 
os.path.join(self.absolute_host_output_dir, "*.epub") - ) + epub_sphinx_filepaths = self._get_epub_files_generated() if epub_sphinx_filepaths: # NOTE: we currently support only one .epub per version epub_filepath = epub_sphinx_filepaths[0] @@ -487,6 +485,14 @@ def _post_build(self): "mv", temp_epub_file, target_file, cwd=self.project_path, record=False ) + def _get_epub_files_generated(self): + """ + Returns a list of *.epub generated by the sphinx build command. + + This is split out in a separate method to be test-friendly. + """ + return glob(os.path.join(self.absolute_host_output_dir, "*.epub")) + class LatexBuildCommand(BuildCommand): @@ -546,9 +552,8 @@ def build(self): bin_path=self.python_env.venv_bin(), ) - tex_files = glob(os.path.join(self.absolute_host_output_dir, "*.tex")) - if not tex_files: - raise BuildUserError("No TeX files were found.") + # Check that we generated .tex files. + self._check_for_files("tex") # Run LaTeX -> PDF conversions success = self._build_latexmk(self.project_path) @@ -559,6 +564,16 @@ def build(self): self._post_build() return success + def _check_for_files(self, extension): + """ + Check that a file pattern exists. + + This method mostly exists so we have a pattern that is test-friend (can be mocked). 
+ """ + tex_files = glob(os.path.join(self.absolute_host_output_dir, f"*.{extension}")) + if not tex_files: + raise BuildUserError("No *.{extension} files were found.") + def _build_latexmk(self, cwd): # These steps are copied from the Makefile generated by Sphinx >= 1.6 # https://github.com/sphinx-doc/sphinx/blob/master/sphinx/texinputs/Makefile_t diff --git a/readthedocs/projects/tests/test_build_tasks.py b/readthedocs/projects/tests/test_build_tasks.py index 0333345a81e..232aadffe10 100644 --- a/readthedocs/projects/tests/test_build_tasks.py +++ b/readthedocs/projects/tests/test_build_tasks.py @@ -26,6 +26,7 @@ from readthedocs.projects.tasks.builds import sync_repository_task, update_docs_task from readthedocs.telemetry.models import BuildData +from ..constants import MEDIA_TYPES_EXTENSIONS from .mockers import BuildEnvironmentMocker @@ -562,10 +563,12 @@ def test_get_env_vars(self, load_yaml_config, build_environment, config, externa @mock.patch("readthedocs.projects.tasks.builds.send_external_build_status") @mock.patch("readthedocs.projects.tasks.builds.UpdateDocsTask.send_notifications") @mock.patch("readthedocs.projects.tasks.builds.clean_build") + @mock.patch("readthedocs.doc_builder.backends.sphinx.PdfBuilder._check_for_files") @mock.patch("readthedocs.doc_builder.director.load_yaml_config") def test_successful_build( self, load_yaml_config, + _check_for_files, clean_build, send_notifications, send_external_build_status, @@ -825,10 +828,22 @@ def test_failed_build( assert revoke_key_request.path == "/api/v2/revoke/" @mock.patch("readthedocs.doc_builder.director.load_yaml_config") + @mock.patch("readthedocs.doc_builder.backends.sphinx.PdfBuilder._check_for_files") + @mock.patch( + "readthedocs.doc_builder.backends.sphinx.EpubBuilder._get_epub_files_generated" + ) def test_build_commands_executed( self, + _get_epub_files_generated, + _check_for_files, load_yaml_config, ): + # This test only works without the feature flag + # Several other tests are also 
made before this feature flag, so we will have to + # adapt them later -- this one will behave differently for sure, so let's make + # sure no one ends up on a long horrible test assertion debugging trip. + assert not self.project.has_feature(Feature.ENABLE_MULTIPLE_PDFS) + load_yaml_config.return_value = self._config_file( { "version": 2, @@ -838,15 +853,19 @@ def test_build_commands_executed( }, } ) + _get_epub_files_generated.return_value = ["some file names that are not used"] # Create the artifact paths, so it's detected by the builder os.makedirs(self.project.artifact_path(version=self.version.slug, type_="html")) - os.makedirs(self.project.artifact_path(version=self.version.slug, type_="json")) - os.makedirs( - self.project.artifact_path(version=self.version.slug, type_="htmlzip") - ) - os.makedirs(self.project.artifact_path(version=self.version.slug, type_="epub")) - os.makedirs(self.project.artifact_path(version=self.version.slug, type_="pdf")) + for f in ("epub", "pdf", "htmlzip", "json"): + extension = MEDIA_TYPES_EXTENSIONS[f] + os.makedirs(self.project.artifact_path(version=self.version.slug, type_=f)) + pathlib.Path( + os.path.join( + self.project.artifact_path(version=self.version.slug, type_=f), + f"{self.project.slug}.{extension}", + ) + ).touch() self._trigger_update_docs_task() From c49bbc15a757a1a03cf46931b10ba3c050d1ecc6 Mon Sep 17 00:00:00 2001 From: Benjamin Bach Date: Wed, 28 Jun 2023 17:30:22 +0200 Subject: [PATCH 20/21] lint --- readthedocs/projects/tasks/builds.py | 1 + readthedocs/projects/tests/test_build_tasks.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/readthedocs/projects/tasks/builds.py b/readthedocs/projects/tasks/builds.py index 7dc07f5b0de..8e6a4e3482a 100644 --- a/readthedocs/projects/tasks/builds.py +++ b/readthedocs/projects/tasks/builds.py @@ -610,6 +610,7 @@ def get_valid_artifact_types(self): def trigger_sync_downloadable_artifacts(self, valid_artifacts): """ Triggers the sync_downloadable_artifacts 
task with the files that were found. + Performs an additional validation on files contained in valid_artifacts. :param valid_artifacts: A list of artifacts allowed by initial validation. Example: diff --git a/readthedocs/projects/tests/test_build_tasks.py b/readthedocs/projects/tests/test_build_tasks.py index 232aadffe10..1489b23820c 100644 --- a/readthedocs/projects/tests/test_build_tasks.py +++ b/readthedocs/projects/tests/test_build_tasks.py @@ -443,7 +443,6 @@ def test_build_fail_if_no_artifacts(self, load_yaml_config): os.makedirs(epub_dir) os.makedirs(pdf_dir) - # Create a .tex file because of a validation step that verifies that intermediate # .tex files were created. pathlib.Path( From 00752803f5055378677e2dc1a85a30492e029dab Mon Sep 17 00:00:00 2001 From: Benjamin Bach Date: Wed, 28 Jun 2023 19:08:40 +0200 Subject: [PATCH 21/21] Add test case test_build_commands_executed_multiple_artifacts with new pattern from feature flag --- readthedocs/doc_builder/backends/sphinx.py | 29 +- readthedocs/projects/tasks/builds.py | 1 + .../projects/tests/test_build_tasks.py | 280 ++++++++++++++++++ 3 files changed, 301 insertions(+), 9 deletions(-) diff --git a/readthedocs/doc_builder/backends/sphinx.py b/readthedocs/doc_builder/backends/sphinx.py index e8c6a378b6c..dc56c436289 100644 --- a/readthedocs/doc_builder/backends/sphinx.py +++ b/readthedocs/doc_builder/backends/sphinx.py @@ -445,7 +445,7 @@ class EpubBuilder(BaseSphinx): relative_output_dir = "epub" def _post_build(self): - """Internal post build to cleanup EPUB output directory and leave only one .epub file.""" + """Internal post build to clean up EPUB output directory and leave only one .epub file.""" temp_epub_file = f"/tmp/{self.project.slug}-{self.version.slug}.epub" target_file = os.path.join( self.absolute_container_output_dir, @@ -712,7 +712,7 @@ def _post_build(self): def _post_build_multiple(self): """Internal post build to cleanup PDF output directory and leave only .pdf files.""" - pdf_files = 
glob(os.path.join(self.absolute_host_output_dir, "*.pdf")) + pdf_files = self._pdf_files_generated() if not pdf_files: raise PDFNotFound() @@ -721,11 +721,13 @@ def _post_build_multiple(self): # We cannot use '*' in commands sent to the host machine, the asterisk gets escaped. # So we opt for iterating from outside the container + pdf_file_names = [] for fname in pdf_files: shutil.move( fname, os.path.join(self.absolute_host_tmp_root, os.path.basename(fname)), ) + pdf_file_names.append(os.path.basename(fname)) # Delete everything from the output dir self.run( @@ -746,11 +748,20 @@ def _post_build_multiple(self): ) # Move the PDFs back - # We cannot use '*' in commands sent to the host machine, the asterisk gets escaped. - # So we opt for iterating from outside the container - pdf_files = glob(os.path.join(self.absolute_host_tmp_root, "*.pdf")) - for fname in pdf_files: - shutil.move( - fname, - os.path.join(self.absolute_host_output_dir, os.path.basename(fname)), + # Note: We cannot use '*' in commands sent to the host machine, the asterisk gets escaped. + for basename in pdf_file_names: + self.run( + "mv", + os.path.join(self.absolute_host_tmp_root, basename), + self.absolute_host_output_dir, + cwd=self.project_path, + record=False, ) + + def _pdf_files_generated(self): + """ + Return a list of pdf files generated by the command. + + This method is mainly here to be test and mock friendly. + """ + return glob(os.path.join(self.absolute_host_output_dir, "*.pdf")) diff --git a/readthedocs/projects/tasks/builds.py b/readthedocs/projects/tasks/builds.py index 8e6a4e3482a..ba59f322789 100644 --- a/readthedocs/projects/tasks/builds.py +++ b/readthedocs/projects/tasks/builds.py @@ -691,6 +691,7 @@ def on_success(self, retval, task_id, args, kwargs): search_ignore=self.data.config.search.ignore, ) + # For the new feature, we trigger a task that will create ImportedFile objects. 
if self.data.project.has_feature(Feature.ENABLE_MULTIPLE_PDFS): if valid_artifacts: self.trigger_sync_downloadable_artifacts(valid_artifacts) diff --git a/readthedocs/projects/tests/test_build_tasks.py b/readthedocs/projects/tests/test_build_tasks.py index 1489b23820c..c3d40ad0641 100644 --- a/readthedocs/projects/tests/test_build_tasks.py +++ b/readthedocs/projects/tests/test_build_tasks.py @@ -1,5 +1,6 @@ import os import pathlib +import tempfile from unittest import mock import django_dynamic_fixture as fixture @@ -1088,6 +1089,285 @@ def test_build_commands_executed( ] ) + @mock.patch("readthedocs.doc_builder.director.load_yaml_config") + @mock.patch("readthedocs.doc_builder.backends.sphinx.PdfBuilder._check_for_files") + @mock.patch( + "readthedocs.doc_builder.backends.sphinx.PdfBuilder._pdf_files_generated" + ) + @mock.patch( + "readthedocs.doc_builder.backends.sphinx.EpubBuilder._get_epub_files_generated" + ) + def test_build_commands_executed_multiple_artifacts( + self, + _get_epub_files_generated, + _pdf_files_generated, + _check_for_files, + load_yaml_config, + ): + """ + This is the new version of test_build_commands_executed with ENABLE_MULTIPLE_PDFS enabled. 
+ """ + fixture.get( + Feature, projects=[self.project], feature_id=Feature.ENABLE_MULTIPLE_PDFS + ) + + load_yaml_config.return_value = self._config_file( + { + "version": 2, + "formats": "all", + "sphinx": { + "configuration": "docs/conf.py", + }, + } + ) + _get_epub_files_generated.return_value = ["output.epub"] + + # Generate fake PDF output + tempdir_for_mock = tempfile.mkdtemp() + open(f"{tempdir_for_mock}/output.pdf", "w").write("") + _pdf_files_generated.return_value = [f"{tempdir_for_mock}/output.pdf"] + + # Create the artifact paths, so it's detected by the builder + os.makedirs(self.project.artifact_path(version=self.version.slug, type_="html")) + for f in ("epub", "pdf", "htmlzip", "json"): + extension = MEDIA_TYPES_EXTENSIONS[f] + os.makedirs(self.project.artifact_path(version=self.version.slug, type_=f)) + pathlib.Path( + os.path.join( + self.project.artifact_path(version=self.version.slug, type_=f), + f"{self.project.slug}.{extension}", + ) + ).touch() + + self._trigger_update_docs_task() + + self.mocker.mocks["git.Backend.run"].assert_has_calls( + [ + mock.call( + "git", "clone", "--no-single-branch", "--depth", "50", mock.ANY, "." + ), + mock.call("git", "checkout", "--force", "a1b2c3"), + mock.call("git", "clean", "-d", "-f", "-f"), + ] + ) + + self.mocker.mocks["environment.run"].assert_has_calls( + [ + mock.call( + "python3.7", + "-mvirtualenv", + mock.ANY, + bin_path=None, + cwd=None, + ), + mock.call( + mock.ANY, + "-m", + "pip", + "install", + "--upgrade", + "--no-cache-dir", + "pip", + "setuptools", + bin_path=mock.ANY, + cwd=mock.ANY, + ), + mock.call( + mock.ANY, + "-m", + "pip", + "install", + "--upgrade", + "--no-cache-dir", + "pillow", + "mock==1.0.1", + "alabaster>=0.7,<0.8,!=0.7.5", + "commonmark==0.9.1", + "recommonmark==0.5.0", + "sphinx<2", + "sphinx-rtd-theme<0.5", + "readthedocs-sphinx-ext<2.3", + "jinja2<3.1.0", + bin_path=mock.ANY, + cwd=mock.ANY, + ), + # FIXME: shouldn't this one be present here? 
It's not now because + # we are mocking `append_conf` which is the one that triggers this + # command. + # + # mock.call( + # 'cat', + # 'docs/conf.py', + # cwd=mock.ANY, + # ), + mock.call( + mock.ANY, + "-m", + "sphinx", + "-T", + "-E", + "-b", + "html", + "-d", + "_build/doctrees", + "-D", + "language=en", + ".", + "$READTHEDOCS_OUTPUT/html", + cwd=mock.ANY, + bin_path=mock.ANY, + ), + mock.call( + mock.ANY, + "-m", + "sphinx", + "-T", + "-E", + "-b", + "readthedocssinglehtmllocalmedia", + "-d", + "_build/doctrees", + "-D", + "language=en", + ".", + "$READTHEDOCS_OUTPUT/htmlzip", + cwd=mock.ANY, + bin_path=mock.ANY, + ), + mock.call( + "mktemp", + "--directory", + record=False, + ), + mock.call( + "mv", + mock.ANY, + mock.ANY, + cwd=mock.ANY, + record=False, + ), + mock.call( + "mkdir", + "--parents", + mock.ANY, + cwd=mock.ANY, + record=False, + ), + mock.call( + "zip", + "--recurse-paths", + "--symlinks", + mock.ANY, + mock.ANY, + cwd=mock.ANY, + record=False, + ), + mock.call( + mock.ANY, + "-m", + "sphinx", + "-T", + "-E", + "-b", + "latex", + "-d", + "_build/doctrees", + "-D", + "language=en", + ".", + "$READTHEDOCS_OUTPUT/pdf", + cwd=mock.ANY, + bin_path=mock.ANY, + ), + mock.call("cat", "latexmkrc", cwd=mock.ANY), + mock.call("rm", "-r", mock.ANY, cwd=mock.ANY, record=False), + mock.call("mkdir", "-p", mock.ANY, cwd=mock.ANY, record=False), + mock.call( + "mv", + mock.ANY, + mock.ANY, + cwd=mock.ANY, + record=False, + ), + mock.call( + mock.ANY, + "-m", + "sphinx", + "-T", + "-E", + "-b", + "epub", + "-d", + "_build/doctrees", + "-D", + "language=en", + ".", + "$READTHEDOCS_OUTPUT/epub", + cwd=mock.ANY, + bin_path=mock.ANY, + ), + mock.call( + "mv", + "output.epub", + "/tmp/project-latest.epub", + cwd=mock.ANY, + record=False, + ), + mock.call( + "rm", + "--recursive", + "$READTHEDOCS_OUTPUT/epub", + cwd=mock.ANY, + record=False, + ), + mock.call( + "mkdir", + "--parents", + "$READTHEDOCS_OUTPUT/epub", + cwd=mock.ANY, + record=False, + ), + mock.call( + 
"mv", + "/tmp/project-latest.epub", + mock.ANY, + cwd=mock.ANY, + record=False, + ), + mock.call( + "test", + "-x", + "_build/html", + record=False, + cwd=mock.ANY, + ), + # FIXME: I think we are hitting this issue here: + # https://github.com/pytest-dev/pytest-mock/issues/234 + mock.call("lsb_release", "--description", record=False, demux=True), + mock.call("python", "--version", record=False, demux=True), + mock.call( + "dpkg-query", + "--showformat", + "${package} ${version}\\n", + "--show", + record=False, + demux=True, + ), + mock.call( + "python", + "-m", + "pip", + "list", + "--pre", + "--local", + "--format", + "json", + record=False, + demux=True, + ), + ] + ) + @mock.patch("readthedocs.doc_builder.director.load_yaml_config") def test_install_apt_packages(self, load_yaml_config): config = BuildConfigV2(