From 1550f6634f252970e01604c17a1c72ef03fb56a7 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Fri, 27 Sep 2024 08:23:35 +0200
Subject: [PATCH] Remove dead code MockStreamingDownloadManager

---
 .../job_runners/config/parquet_and_info.py | 33 +------------------
 1 file changed, 1 insertion(+), 32 deletions(-)

diff --git a/services/worker/src/worker/job_runners/config/parquet_and_info.py b/services/worker/src/worker/job_runners/config/parquet_and_info.py
index 38828e5c9..f3a7509b8 100644
--- a/services/worker/src/worker/job_runners/config/parquet_and_info.py
+++ b/services/worker/src/worker/job_runners/config/parquet_and_info.py
@@ -19,7 +19,6 @@
 import datasets.exceptions
 import datasets.info
 import fsspec
-import numpy as np
 import pyarrow as pa
 import pyarrow.parquet as pq
 from datasets import DownloadConfig, Features, load_dataset_builder
@@ -34,10 +33,8 @@
     FilesIterable,
     get_authentication_headers_for_url,
     http_head,
-    is_relative_path,
-    url_or_path_join,
 )
-from datasets.utils.py_utils import asdict, map_nested
+from datasets.utils.py_utils import asdict
 from fsspec.core import filesystem, url_to_fs
 from fsspec.implementations.http import HTTPFileSystem
 from fsspec.implementations.local import LocalFileOpener, LocalFileSystem
@@ -320,34 +317,6 @@ def _fsspec_request_size(urlpath: str, storage_options: dict[str, Any]) -> Optio
     return None
 
 
-class _MockStreamingDownloadManager(StreamingDownloadManager):  # type: ignore
-    def __init__(self, *args: Any, **kwargs: Any) -> None:
-        super().__init__(*args, **kwargs)
-        self.ext_data_files: list[str] = []
-
-    def download(self, url_or_urls: Any) -> Any:
-        url_or_urls = map_nested(
-            self._download,
-            url_or_urls,
-            map_tuple=True,
-            parallel_min_length=np.inf,
-            # ^ parallel_min_length has int type, but is currently used in datasets for a comparison only
-            # and it works with np.inf. No conversion is involved
-            # (would raise: OverflowError: cannot convert float infinity to integer)
-        )
-        return url_or_urls
-
-    def _download(self, urlpath: Any) -> str:
-        urlpath_str = str(urlpath)
-        if is_relative_path(urlpath_str):
-            # append the relative path to the base_path
-            urlpath_str = url_or_path_join(self._base_path, urlpath_str)
-        elif not urlpath_str.startswith(self._base_path):
-            # it's an external file
-            self.ext_data_files.append(urlpath_str)
-        return urlpath_str
-
-
 def get_writer_batch_size_from_info(ds_config_info: datasets.info.DatasetInfo) -> Optional[int]:
     """
     Get the writer_batch_size that defines the maximum row group size in the parquet files.