From 2806504b3ec0b02ea6b146898a6fb62af5ff71b5 Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Wed, 11 Dec 2024 19:59:07 +0100 Subject: [PATCH 01/18] instate fsspec file system --- jupyter_drives/manager.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index 7c467b7..7633f99 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -17,6 +17,8 @@ from libcloud.storage.providers import get_driver import pyarrow from aiobotocore.session import get_session +import fsspec +import s3fs from .log import get_logger from .base import DrivesConfig @@ -41,12 +43,16 @@ def __init__(self, config: traitlets.config.Config) -> None: self._client = httpx.AsyncClient() self._content_managers = {} self._max_files_listed = 1000 + + # instate fsspec file system + self._file_system = fsspec.filesystem(self._config.provider, asynchronous=True) # initiate aiobotocore session if we are dealing with S3 drives if self._config.provider == 's3': if self._config.access_key_id and self._config.secret_access_key: self._s3_clients = {} self._s3_session = get_session() + self._file_system = s3fs.S3FileSystem(anon=False, asynchronous=True, key=self._config.access_key_id, secret=self._config.secret_access_key, token=self._config.session_token) else: raise tornado.web.HTTPError( status_code= httpx.codes.BAD_REQUEST, From 91c91300f8ecc6e37c254af7c8e38bacadd287fd Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Wed, 11 Dec 2024 21:51:48 +0100 Subject: [PATCH 02/18] refactor check function --- jupyter_drives/manager.py | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index 7633f99..a5b6afa 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -294,15 +294,9 @@ async def get_contents(self, drive_name, path): # dealing with the case of an empty directory, making sure it is not an empty file if emptyDir is True: - ext_list = ['.R', '.bmp', '.csv', '.gif', '.html', '.ipynb', '.jl', '.jpeg', '.jpg', '.json', '.jsonl', '.md', '.ndjson', '.pdf', '.png', '.py', '.svg', '.tif', '.tiff', '.tsv', '.txt', '.webp', '.yaml', '.yml'] - object_name = os.path.basename(path) - # if object doesn't contain . or doesn't end in one of the registered extensions - if object_name.find('.') == -1 or ext_list.count(os.path.splitext(object_name)[1]) == 0: + check = await self._check_object(drive_name, path) + if check == True: data = [] - - # remove upper logic once directories are fixed - check = self._check_object(drive_name, path) - print(check) response = { "data": data @@ -578,7 +572,7 @@ async def _get_drive_location(self, drive_name): return location - def _check_object(self, drive_name, path): + async def _check_object(self, drive_name, path): """Helping function to check if we are dealing with an empty file or directory. Args: @@ -587,17 +581,13 @@ def _check_object(self, drive_name, path): """ isDir = False try: - location = self._content_managers[drive_name]["location"] - if location not in self._s3_clients: - self._s3_clients[location] = self._s3_session.client('s3', location) - - listing = self._s3_clients[location].list_objects_v2(Bucket = drive_name, Prefix = path + '/') - if 'Contents' in listing: - isDir = True + response = await self._file_system._info(drive_name + '/' + path) + if response["type"]=='directory': + isDir = True except Exception as e: raise tornado.web.HTTPError( status_code= httpx.codes.BAD_REQUEST, - reason=f"The following error occured when retriving the drive location: {e}", + reason=f"The following error occured when checking the object information: {e}", ) return isDir From 794e2fcd84c87017ed9d222fa9c2c258409e5342 Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Wed, 11 Dec 2024 22:52:24 +0100 Subject: [PATCH 03/18] refactor create object functionality --- .gitignore | 3 +++ jupyter_drives/handlers.py | 3 ++- jupyter_drives/manager.py | 38 ++++++++++++++++++++++++++++++-------- src/contents.ts | 1 + src/requests.ts | 6 +++++- 5 files changed, 41 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index c8fc73a..2b43863 100644 --- a/.gitignore +++ b/.gitignore @@ -123,3 +123,6 @@ dmypy.json # Yarn cache .yarn/ + +# untitled notebooks +Untitled.ipynb diff --git a/jupyter_drives/handlers.py b/jupyter_drives/handlers.py index 696f082..9b3a775 100644 --- a/jupyter_drives/handlers.py +++ b/jupyter_drives/handlers.py @@ -88,7 +88,8 @@ async def get(self, drive: str = "", path: str = ""): @tornado.web.authenticated async def post(self, drive: str = "", path: str = ""): - result = await self._manager.new_file(drive, path) + body = self.get_json_body() + result = await self._manager.new_file(drive, path, **body) self.finish(result) @tornado.web.authenticated diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index a5b6afa..ba760f7 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -2,7 +2,7 @@ import json import logging from typing import Dict, List, Optional, Tuple, Union, Any -from datetime import timedelta +from datetime import timedelta, datetime import os import tornado @@ -309,27 +309,31 @@ async def get_contents(self, drive_name, path): return response - async def new_file(self, drive_name, path): + async def new_file(self, drive_name, path, type): """Create a new file or directory at the given path. Args: drive_name: name of drive where the new content is created path: path where new content should be created + type: whether we are dealing with a file or a directory """ data = {} try: # eliminate leading and trailing backslashes path = path.strip('/') - # TO DO: switch to mode "created", which is not implemented yet - await obs.put_async(self._content_managers[drive_name]["store"], path, b"", mode = "overwrite") - metadata = await obs.head_async(self._content_managers[drive_name]["store"], path) - + if type == 'directory' and self._config.provider == 's3': + await self._create_dir(drive_name, path) + else: + await self._file_system._touch(drive_name + '/' + path) + metadata = await self._file_system._info(drive_name + '/' + path) + data = { "path": path, "content": "", - "last_modified": metadata["last_modified"].isoformat(), - "size": metadata["size"] + "last_modified": metadata["LastModified"].isoformat() if metadata["type"]=='file' else datetime.now().isoformat(), + "size": metadata["size"], + "type": metadata["type"] } except Exception as e: raise tornado.web.HTTPError( @@ -592,6 +596,24 @@ async def _check_object(self, drive_name, path): return isDir + async def _create_dir(self, drive_name, path): + """Helping function to create an empty directory. + + Args: + drive_name: name of drive where object should be created + path: path to object to create + """ + try: + async with self._s3_session.create_client('s3', aws_secret_access_key=self._config.secret_access_key, aws_access_key_id=self._config.access_key_id, aws_session_token=self._config.session_token) as client: + await client.put_object(Bucket=drive_name, Key=path + '/') + except Exception as e: + raise tornado.web.HTTPError( + status_code= httpx.codes.BAD_REQUEST, + reason=f"The following error occured when creating the directory: {e}", + ) + + return + async def _call_provider( self, url: str, diff --git a/src/contents.ts b/src/contents.ts index c628e1d..d430c36 100644 --- a/src/contents.ts +++ b/src/contents.ts @@ -310,6 +310,7 @@ export class Drive implements Contents.IDrive { data = await createObject(currentDrive.name, { name: name, path: relativePath, + type: options.type, registeredFileTypes: this._registeredFileTypes }); } else { diff --git a/src/requests.ts b/src/requests.ts index 1b616c1..49bf16f 100644 --- a/src/requests.ts +++ b/src/requests.ts @@ -218,6 +218,7 @@ export async function createObject( options: { name: string; path: string; + type: string; registeredFileTypes: IRegisteredFileTypes; } ) { @@ -226,7 +227,10 @@ export async function createObject( : options.name; const response = await requestAPI( 'drives/' + driveName + '/' + path, - 'POST' + 'POST', + { + type: options.type + } ); const [fileType, fileMimeType, fileFormat] = getFileType( From 6341cf5c83518e2ff767eb52960a41c221694fce Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Wed, 11 Dec 2024 23:03:05 +0100 Subject: [PATCH 04/18] add new dependencies --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index e5d9d31..eb2d91e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,8 @@ dependencies = [ "arro3-core>=0.2.1,<0.3", "pyarrow>=18.0.0,<19.0.0", "aiobotocore>=2.15.2,<2.16.0", + "fsspec>=2024.10.0,<2024.11.0", + "s3fs>=2024.10.0,<2024.11.0", "jupyter_server>=2.14.2,<3", "apache-libcloud>=3.8.0, <4", "entrypoints>=0.4, <0.5", From a4d6824bcdeb333ea58988a49e288c4187869ca5 Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Thu, 12 Dec 2024 13:41:43 +0100 Subject: [PATCH 05/18] refactor new object functionality --- jupyter_drives/manager.py | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index ba760f7..4aef1fb 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -50,6 +50,7 @@ def __init__(self, config: traitlets.config.Config) -> None: # initiate aiobotocore session if we are dealing with S3 drives if self._config.provider == 's3': if self._config.access_key_id and self._config.secret_access_key: + self._fixDir_suffix = '/.jupyter-drives-fixDir' # fix for creating directories self._s3_clients = {} self._s3_session = get_session() self._file_system = s3fs.S3FileSystem(anon=False, asynchronous=True, key=self._config.access_key_id, secret=self._config.secret_access_key, token=self._config.session_token) @@ -322,10 +323,12 @@ async def new_file(self, drive_name, path, type): # eliminate leading and trailing backslashes path = path.strip('/') + object_name = drive_name + '/' + path + # in the case of S3 directories, we need to add a suffix to feign the creation of a directory if type == 'directory' and self._config.provider == 's3': - await self._create_dir(drive_name, path) - else: - await self._file_system._touch(drive_name + '/' + path) + object_name = object_name + self._fixDir_suffix + + await self._file_system._touch(object_name) metadata = await self._file_system._info(drive_name + '/' + path) data = { @@ -596,24 +599,6 @@ async def _check_object(self, drive_name, path): return isDir - async def _create_dir(self, drive_name, path): - """Helping function to create an empty directory. - - Args: - drive_name: name of drive where object should be created - path: path to object to create - """ - try: - async with self._s3_session.create_client('s3', aws_secret_access_key=self._config.secret_access_key, aws_access_key_id=self._config.access_key_id, aws_session_token=self._config.session_token) as client: - await client.put_object(Bucket=drive_name, Key=path + '/') - except Exception as e: - raise tornado.web.HTTPError( - status_code= httpx.codes.BAD_REQUEST, - reason=f"The following error occured when creating the directory: {e}", - ) - - return - async def _call_provider( self, url: str, From 1d3d26465529e5bb512b291d6faf4d5d626ab25a Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Thu, 12 Dec 2024 13:47:51 +0100 Subject: [PATCH 06/18] iterate --- jupyter_drives/manager.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index 4aef1fb..451bb1f 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -327,14 +327,14 @@ async def new_file(self, drive_name, path, type): # in the case of S3 directories, we need to add a suffix to feign the creation of a directory if type == 'directory' and self._config.provider == 's3': object_name = object_name + self._fixDir_suffix - + await self._file_system._touch(object_name) - metadata = await self._file_system._info(drive_name + '/' + path) + metadata = await self._file_system._info(object_name) data = { "path": path, "content": "", - "last_modified": metadata["LastModified"].isoformat() if metadata["type"]=='file' else datetime.now().isoformat(), + "last_modified": metadata["LastModified"].isoformat(), "size": metadata["size"], "type": metadata["type"] } From ab1af9154686b99b5b6687a7b0f1b397eb28aa60 Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Thu, 12 Dec 2024 14:17:58 +0100 Subject: [PATCH 07/18] refactor rename functionality --- jupyter_drives/manager.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index 451bb1f..0361be9 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -421,12 +421,19 @@ async def rename_file(self, drive_name, path, new_path): # eliminate leading and trailing backslashes path = path.strip('/') - await obs.rename_async(self._content_managers[drive_name]["store"], path, new_path) - metadata = await obs.head_async(self._content_managers[drive_name]["store"], new_path) + object_name = drive_name + '/' + path + new_object_name = drive_name + '/' + new_path + is_dir = await self._check_object(drive_name, path) + if is_dir == True: + object_name = object_name + self._fixDir_suffix + new_object_name = new_object_name + self._fixDir_suffix + + await self._file_system._mv_file(object_name, new_object_name) + metadata = await self._file_system._info(new_object_name) data = { "path": new_path, - "last_modified": metadata["last_modified"].isoformat(), + "last_modified": metadata["LastModified"].isoformat(), "size": metadata["size"] } except Exception as e: @@ -580,11 +587,11 @@ async def _get_drive_location(self, drive_name): return location async def _check_object(self, drive_name, path): - """Helping function to check if we are dealing with an empty file or directory. + """Helping function to check if we are dealing with a file or directory. Args: drive_name: name of drive where object exists - path: path to object to check + path: path of object to check """ isDir = False try: From 1501aabbe1268fe804b70577d91f32647f93f249 Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Thu, 12 Dec 2024 15:03:17 +0100 Subject: [PATCH 08/18] add helping function to fix directories --- jupyter_drives/manager.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index 0361be9..1ae689b 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -427,6 +427,7 @@ async def rename_file(self, drive_name, path, new_path): if is_dir == True: object_name = object_name + self._fixDir_suffix new_object_name = new_object_name + self._fixDir_suffix + await self._fix_dir(drive_name, path) await self._file_system._mv_file(object_name, new_object_name) metadata = await self._file_system._info(new_object_name) @@ -599,13 +600,38 @@ async def _check_object(self, drive_name, path): if response["type"]=='directory': isDir = True except Exception as e: - raise tornado.web.HTTPError( + raise tornado.web.HTTPError( status_code= httpx.codes.BAD_REQUEST, reason=f"The following error occured when checking the object information: {e}", ) return isDir + async def _fix_dir(self, drive_name, path): + """Helping function to fix a directory. It applies to the S3 folders created in the AWS console. + + Args: + drive_name: name of drive where object exists + path: path of object to fix + """ + try: + check = await self._file_system._exists(drive_name + '/' + path + self._fixDir_suffix) + if check == True: # directory has right format + return + else: # directory was created from console + # delete original object + async with self._s3_session.create_client('s3', aws_secret_access_key=self._config.secret_access_key, aws_access_key_id=self._config.access_key_id, aws_session_token=self._config.session_token) as client: + await client.delete_object(Bucket=drive_name, Key=path+'/') + # create new directory + await self._file_system._touch(drive_name + '/' + path + self._fixDir_suffix) + except Exception as e: + raise tornado.web.HTTPError( + status_code= httpx.codes.BAD_REQUEST, + reason=f"The following error occured when fixing the directory object: {e}", + ) + + return + async def _call_provider( self, url: str, From e36eccd9ee69439f1ded01fa11e7bdd62f547615 Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Thu, 12 Dec 2024 15:51:27 +0100 Subject: [PATCH 09/18] refactor delete functionality --- jupyter_drives/manager.py | 16 ++++++++++++++-- src/requests.ts | 35 +---------------------------------- 2 files changed, 15 insertions(+), 36 deletions(-) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index 1ae689b..989fd7b 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -458,7 +458,17 @@ async def delete_file(self, drive_name, path): try: # eliminate leading and trailing backslashes path = path.strip('/') - await obs.delete_async(self._content_managers[drive_name]["store"], path) + is_dir = await self._file_system._isdir(drive_name + '/' + path) + if is_dir == True: + await self._fix_dir(drive_name, path) + await self._file_system._rm(drive_name + '/' + path, recursive = True) + + # checking for remaining directories and deleting them + stream = obs.list(self._content_managers[drive_name]["store"], path, chunk_size=100, return_arrow=True) + async for batch in stream: + contents_list = pyarrow.record_batch(batch).to_pylist() + for object in contents_list: + await self._fix_dir(drive_name, object["path"], delete_only = True) except Exception as e: raise tornado.web.HTTPError( @@ -607,7 +617,7 @@ async def _check_object(self, drive_name, path): return isDir - async def _fix_dir(self, drive_name, path): + async def _fix_dir(self, drive_name, path, delete_only = False): """Helping function to fix a directory. It applies to the S3 folders created in the AWS console. Args: @@ -622,6 +632,8 @@ async def _fix_dir(self, drive_name, path): # delete original object async with self._s3_session.create_client('s3', aws_secret_access_key=self._config.secret_access_key, aws_access_key_id=self._config.access_key_id, aws_session_token=self._config.session_token) as client: await client.delete_object(Bucket=drive_name, Key=path+'/') + if delete_only == True: + return # create new directory await self._file_system._touch(drive_name + '/' + path + self._fixDir_suffix) except Exception as e: diff --git a/src/requests.ts b/src/requests.ts index 49bf16f..06816e4 100644 --- a/src/requests.ts +++ b/src/requests.ts @@ -267,26 +267,7 @@ export async function deleteObjects( path: string; } ) { - // get list of contents with given prefix (path) - const response = await requestAPI( - 'drives/' + driveName + '/' + options.path, - 'GET' - ); - - // deleting contents of a directory - if (response.data.length !== undefined && response.data.length !== 0) { - await Promise.all( - response.data.map(async (c: any) => { - return Private.deleteSingleObject(driveName, c.path); - }) - ); - } - try { - // always deleting the object (file or main directory) - return Private.deleteSingleObject(driveName, options.path); - } catch (error) { - // deleting failed if directory didn't exist and was only part of a path - } + await requestAPI('drives/' + driveName + '/' + options.path, 'DELETE'); } /** @@ -517,20 +498,6 @@ export const countObjectNameAppearances = async ( }; namespace Private { - /** - * Helping function for deleting files inside - * a directory, in the case of deleting the directory. - * - * @param driveName - * @param objectPath complete path of object to delete - */ - export async function deleteSingleObject( - driveName: string, - objectPath: string - ) { - await requestAPI('drives/' + driveName + '/' + objectPath, 'DELETE'); - } - /** * Helping function for renaming files inside * a directory, in the case of deleting the directory. From 0084da733ff33867709245114d595bafc0848c08 Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Thu, 12 Dec 2024 16:46:20 +0100 Subject: [PATCH 10/18] refactor copy functionality --- jupyter_drives/manager.py | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index 989fd7b..fdde090 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -511,29 +511,26 @@ async def copy_file(self, drive_name, path, to_path, to_drive): # eliminate leading and trailing backslashes path = path.strip('/') - # copy object within same drive + object_name = drive_name + '/' + path + # copy objects within same drive if to_drive == drive_name: - await obs.copy_async(self._content_managers[drive_name]["store"], path, to_path) - metadata = await obs.head_async(self._content_managers[drive_name]["store"], to_path) - # copy object to another drive + to_object_name = drive_name + '/' + to_path + # copy objects to another drive else: - content = b'' - try: - # retrieving contents of file - file = await obs.get_async(self._content_managers[drive_name]["store"], path) - stream = file.stream(min_chunk_size=5 * 1024 * 1024) # 5MB sized chunks - async for buf in stream: - content += buf - except: - # dealing with a directory, no contents to retrieve - pass - - await obs.put_async(self._content_managers[to_drive]["store"], to_path, content) - metadata = await obs.head_async(self._content_managers[to_drive]["store"], to_path) + to_object_name = to_drive + '/' + to_path + + is_dir = await self._check_object(drive_name, path) + if is_dir == True: + object_name = object_name + self._fixDir_suffix + to_object_name = to_object_name + self._fixDir_suffix + await self._fix_dir(drive_name, path) + + await self._file_system._copy(object_name, to_object_name) + metadata = await self._file_system._info(to_object_name) data = { "path": to_path, - "last_modified": metadata["last_modified"].isoformat(), + "last_modified": metadata["LastModified"].isoformat(), "size": metadata["size"] } except Exception as e: From 2ffe341bf0ee055805434478a7a4b205596d2725 Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Thu, 12 Dec 2024 17:56:02 +0100 Subject: [PATCH 11/18] switch check for directory with isdir function --- jupyter_drives/manager.py | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index fdde090..e924d9f 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -295,7 +295,7 @@ async def get_contents(self, drive_name, path): # dealing with the case of an empty directory, making sure it is not an empty file if emptyDir is True: - check = await self._check_object(drive_name, path) + check = await self._isdir(drive_name + '/' + path) if check == True: data = [] @@ -423,7 +423,7 @@ async def rename_file(self, drive_name, path, new_path): object_name = drive_name + '/' + path new_object_name = drive_name + '/' + new_path - is_dir = await self._check_object(drive_name, path) + is_dir = await self._isdir(object_name) if is_dir == True: object_name = object_name + self._fixDir_suffix new_object_name = new_object_name + self._fixDir_suffix @@ -519,7 +519,7 @@ async def copy_file(self, drive_name, path, to_path, to_drive): else: to_object_name = to_drive + '/' + to_path - is_dir = await self._check_object(drive_name, path) + is_dir = await self._isdir(object_name) if is_dir == True: object_name = object_name + self._fixDir_suffix to_object_name = to_object_name + self._fixDir_suffix @@ -594,26 +594,6 @@ async def _get_drive_location(self, drive_name): return location - async def _check_object(self, drive_name, path): - """Helping function to check if we are dealing with a file or directory. - - Args: - drive_name: name of drive where object exists - path: path of object to check - """ - isDir = False - try: - response = await self._file_system._info(drive_name + '/' + path) - if response["type"]=='directory': - isDir = True - except Exception as e: - raise tornado.web.HTTPError( - status_code= httpx.codes.BAD_REQUEST, - reason=f"The following error occured when checking the object information: {e}", - ) - - return isDir - async def _fix_dir(self, drive_name, path, delete_only = False): """Helping function to fix a directory. It applies to the S3 folders created in the AWS console. From 1bd6a59ecdfe9bc5f195c4bbcb34d7d0efe5d4fd Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Thu, 12 Dec 2024 22:13:12 +0100 Subject: [PATCH 12/18] refactor saving functionality --- jupyter_drives/manager.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index e924d9f..32efa46 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -378,9 +378,8 @@ async def save_file(self, drive_name, path, content, options_format, content_for byte_array = bytearray(slice_) byte_arrays.append(byte_array) - # combine byte arrays and wrap in a BytesIO object - formatted_content = BytesIO(b"".join(byte_arrays)) - formatted_content.seek(0) # reset cursor for any further reading + # combine byte arrays + formatted_content = b"".join(byte_arrays) elif options_format == 'text': formatted_content = content.encode("utf-8") else: @@ -388,13 +387,13 @@ async def save_file(self, drive_name, path, content, options_format, content_for if formatted_content is None or formatted_content == '': formatted_content = b'' - await obs.put_async(self._content_managers[drive_name]["store"], path, formatted_content, mode = "overwrite") - metadata = await obs.head_async(self._content_managers[drive_name]["store"], path) + await self._file_system._pipe(drive_name + '/' + path, formatted_content) + metadata = await self._file_system._info(drive_name + '/' + path) data = { "path": path, "content": content, - "last_modified": metadata["last_modified"].isoformat(), + "last_modified": metadata["LastModified"].isoformat(), "size": metadata["size"] } except Exception as e: From c9eae3860a2d9f2389c8d4662caf4cb91b41f39e Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Thu, 12 Dec 2024 22:15:53 +0100 Subject: [PATCH 13/18] iterate on isdir function --- jupyter_drives/manager.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index 32efa46..51f4073 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -295,7 +295,7 @@ async def get_contents(self, drive_name, path): # dealing with the case of an empty directory, making sure it is not an empty file if emptyDir is True: - check = await self._isdir(drive_name + '/' + path) + check = await self._file_system._isdir(drive_name + '/' + path) if check == True: data = [] @@ -422,7 +422,7 @@ async def rename_file(self, drive_name, path, new_path): object_name = drive_name + '/' + path new_object_name = drive_name + '/' + new_path - is_dir = await self._isdir(object_name) + is_dir = await self._file_system._isdir(object_name) if is_dir == True: object_name = object_name + self._fixDir_suffix new_object_name = new_object_name + self._fixDir_suffix @@ -518,7 +518,7 @@ async def copy_file(self, drive_name, path, to_path, to_drive): else: to_object_name = to_drive + '/' + to_path - is_dir = await self._isdir(object_name) + is_dir = await self._file_system._isdir(object_name) if is_dir == True: object_name = object_name + self._fixDir_suffix to_object_name = to_object_name + self._fixDir_suffix From cdf9764064f58bfbb9facf0ec7234541a53e5d45 Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Thu, 12 Dec 2024 22:18:49 +0100 Subject: [PATCH 14/18] remove unused function --- jupyter_drives/manager.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index 51f4073..fdb8619 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -477,25 +477,6 @@ async def delete_file(self, drive_name, path): return - async def check_file(self, drive_name, path): - """Check if an object already exists within a drive. - - Args: - drive_name: name of drive where object exists - path: path where content is located - """ - try: - # eliminate leading and trailing backslashes - path = path.strip('/') - await obs.head_async(self._content_managers[drive_name]["store"], path) - except Exception: - raise tornado.web.HTTPError( - status_code= httpx.codes.NOT_FOUND, - reason="Object does not already exist within drive.", - ) - - return - async def copy_file(self, drive_name, path, to_path, to_drive): """Save file with new content. From cdaf51892666f26533a58a7c4f15a4b85f01a483 Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Thu, 12 Dec 2024 22:56:07 +0100 Subject: [PATCH 15/18] reformat get function --- jupyter_drives/manager.py | 81 ++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 48 deletions(-) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index fdb8619..e342052 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -223,56 +223,47 @@ async def get_contents(self, drive_name, path): try : data = [] - isDir = False - emptyDir = True # assume we are dealing with an empty directory - - chunk_size = 100 - if self._max_files_listed < chunk_size: - chunk_size = self._max_files_listed - no_batches = int(self._max_files_listed/chunk_size) - - # using Arrow lists as they are recommended for large results - # stream will be an async iterable of RecordBatch - current_batch = 0 - stream = obs.list(self._content_managers[drive_name]["store"], path, chunk_size=chunk_size, return_arrow=True) - async for batch in stream: - current_batch += 1 - # reached last batch that can be shown (partially) - if current_batch == no_batches + 1: - remaining_files = self._max_files_listed - no_batches*chunk_size - - # if content exists we are dealing with a directory - if isDir is False and batch: - isDir = True - emptyDir = False + is_dir = await self._file_system._isdir(drive_name + '/' + path) + + if is_dir == True: + chunk_size = 100 + if self._max_files_listed < chunk_size: + chunk_size = self._max_files_listed + no_batches = int(self._max_files_listed/chunk_size) + + # using Arrow lists as they are recommended for large results + # stream will be an async iterable of RecordBatch + current_batch = 0 + stream = obs.list(self._content_managers[drive_name]["store"], path, chunk_size=chunk_size, return_arrow=True) + async for batch in stream: + current_batch += 1 + # reached last batch that can be shown (partially) + if current_batch == no_batches + 1: + remaining_files = self._max_files_listed - no_batches*chunk_size + + contents_list = pyarrow.record_batch(batch).to_pylist() + for object in contents_list: + # when listing the last batch (partially), make sure we don't exceed limit + if current_batch == no_batches + 1: + if remaining_files <= 0: + break + remaining_files -= 1 + data.append({ + "path": object["path"], + "last_modified": object["last_modified"].isoformat(), + "size": object["size"], + }) - contents_list = pyarrow.record_batch(batch).to_pylist() - for object in contents_list: - # when listing the last batch (partially), make sure we don't exceed limit + # check if we reached the limit of files that can be listed if current_batch == no_batches + 1: - if remaining_files <= 0: - break - remaining_files -= 1 - data.append({ - "path": object["path"], - "last_modified": object["last_modified"].isoformat(), - "size": object["size"], - }) + break - # check if we reached the limit of files that can be listed - if current_batch == no_batches + 1: - break - - # check if we are dealing with an empty drive - if isDir is False and path != '': + else: content = b"" # retrieve contents of object obj = await obs.get_async(self._content_managers[drive_name]["store"], path) stream = obj.stream(min_chunk_size=5 * 1024 * 1024) # 5MB sized chunks async for buf in stream: - # if content exists we are dealing with a file - if emptyDir is True and buf: - emptyDir = False content += buf # retrieve metadata of object @@ -293,12 +284,6 @@ async def get_contents(self, drive_name, path): "size": metadata["size"] } - # dealing with the case of an empty directory, making sure it is not an empty file - if emptyDir is True: - check = await self._file_system._isdir(drive_name + '/' + path) - if check == True: - data = [] - response = { "data": data } From c5d089411915b82c9a6cb1af02dd29710f50a79d Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Fri, 13 Dec 2024 15:11:13 +0100 Subject: [PATCH 16/18] iterate on directory suffix --- jupyter_drives/manager.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index e342052..5af8ca0 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -25,6 +25,9 @@ import re +# constant used as suffix to deal with directory objects +EMPTY_DIR_SUFFIX = '/.jupyter_drives_fix_dir' + class JupyterDrivesManager(): """ Jupyter-drives manager class. @@ -49,8 +52,7 @@ def __init__(self, config: traitlets.config.Config) -> None: # initiate aiobotocore session if we are dealing with S3 drives if self._config.provider == 's3': - if self._config.access_key_id and self._config.secret_access_key: - self._fixDir_suffix = '/.jupyter-drives-fixDir' # fix for creating directories + if self._config.access_key_id and self._config.secret_access_key: self._s3_clients = {} self._s3_session = get_session() self._file_system = s3fs.S3FileSystem(anon=False, asynchronous=True, key=self._config.access_key_id, secret=self._config.secret_access_key, token=self._config.session_token) @@ -311,7 +313,7 @@ async def new_file(self, drive_name, path, type): object_name = drive_name + '/' + path # in the case of S3 directories, we need to add a suffix to feign the creation of a directory if type == 'directory' and self._config.provider == 's3': - object_name = object_name + self._fixDir_suffix + object_name = object_name + EMPTY_DIR_SUFFIX await self._file_system._touch(object_name) metadata = await self._file_system._info(object_name) @@ -409,8 +411,8 @@ async def rename_file(self, drive_name, path, new_path): new_object_name = drive_name + '/' + new_path is_dir = await self._file_system._isdir(object_name) if is_dir == True: - object_name = object_name + self._fixDir_suffix - new_object_name = new_object_name + self._fixDir_suffix + object_name = object_name + EMPTY_DIR_SUFFIX + new_object_name = new_object_name + EMPTY_DIR_SUFFIX await self._fix_dir(drive_name, path) await self._file_system._mv_file(object_name, new_object_name) @@ -486,8 +488,8 @@ async def copy_file(self, drive_name, path, to_path, to_drive): is_dir = await self._file_system._isdir(object_name) if is_dir == True: - object_name = object_name + self._fixDir_suffix - to_object_name = to_object_name + self._fixDir_suffix + object_name = object_name + EMPTY_DIR_SUFFIX + to_object_name = to_object_name + EMPTY_DIR_SUFFIX await self._fix_dir(drive_name, path) await self._file_system._copy(object_name, to_object_name) @@ -567,7 +569,7 @@ async def _fix_dir(self, drive_name, path, delete_only = False): path: path of object to fix """ try: - check = await self._file_system._exists(drive_name + '/' + path + self._fixDir_suffix) + check = await self._file_system._exists(drive_name + '/' + path + EMPTY_DIR_SUFFIX) if check == True: # directory has right format return else: # directory was created from console @@ -577,7 +579,7 @@ async def _fix_dir(self, drive_name, path, delete_only = False): if delete_only == True: return # create new directory - await self._file_system._touch(drive_name + '/' + path + self._fixDir_suffix) + await self._file_system._touch(drive_name + '/' + path + EMPTY_DIR_SUFFIX) except Exception as e: raise tornado.web.HTTPError( status_code= httpx.codes.BAD_REQUEST, From 64245452120cdfed2cd0b993d993fe556df66e21 Mon Sep 17 00:00:00 2001 From: Denisa Checiu <91504950+DenisaCG@users.noreply.github.com> Date: Fri, 13 Dec 2024 15:27:31 +0100 Subject: [PATCH 17/18] increase chunk size when retrieving list of contents Co-authored-by: Afshin Taylor Darian --- jupyter_drives/manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index 5af8ca0..76865d6 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -228,7 +228,7 @@ async def get_contents(self, drive_name, path): is_dir = await self._file_system._isdir(drive_name + '/' + path) if is_dir == True: - chunk_size = 100 + chunk_size = 1024 if self._max_files_listed < chunk_size: chunk_size = self._max_files_listed no_batches = int(self._max_files_listed/chunk_size) From 0ae775788e064c91862918d02918e6f579b86e27 Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Fri, 13 Dec 2024 15:28:15 +0100 Subject: [PATCH 18/18] increase maximum number of files listed --- jupyter_drives/manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index 76865d6..209276f 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -45,7 +45,7 @@ def __init__(self, config: traitlets.config.Config) -> None: self._config = DrivesConfig(config=config) self._client = httpx.AsyncClient() self._content_managers = {} - self._max_files_listed = 1000 + self._max_files_listed = 1025 # instate fsspec file system self._file_system = fsspec.filesystem(self._config.provider, asynchronous=True)