
Commit 4a32768

Merge pull request #54 from transformerlab/add/use-fsspec
Switch to using fsspec everywhere
2 parents 3aa9f05 + 53bbb1a commit 4a32768

12 files changed (+646, -264 lines)

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -205,3 +205,6 @@ cython_debug/
 marimo/_static/
 marimo/_lsp/
 __marimo__/
+
+
+scripts/examples/test_fsspec.sh

pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -4,13 +4,13 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "transformerlab"
-version = "0.0.44"
+version = "0.0.45"
 description = "Python SDK for Transformer Lab"
 readme = "README.md"
 requires-python = ">=3.10"
 authors = [{ name = "Transformer Lab", email = "developers@transformerlab.ai" }]
 license = { file = "LICENSE" }
-dependencies = ["werkzeug", "pytest"]
+dependencies = ["werkzeug", "pytest", "wandb", "fsspec", "s3fs"]
 
 [project.urls]
 "Homepage" = "https://github.com/transformerlab/transformerlab-sdk"

src/lab/dataset.py

Lines changed: 10 additions & 6 deletions
@@ -1,15 +1,15 @@
-import os
 from werkzeug.utils import secure_filename
 
 from .dirs import get_datasets_dir
 from .labresource import BaseLabResource
+from . import storage
 
 
 class Dataset(BaseLabResource):
     def get_dir(self):
         """Abstract method on BaseLabResource"""
         dataset_id_safe = secure_filename(str(self.id))
-        return os.path.join(get_datasets_dir(), dataset_id_safe)
+        return storage.join(get_datasets_dir(), dataset_id_safe)
 
     def _default_json(self):
         # Default metadata modeled after API dataset table fields
@@ -45,14 +45,18 @@ def get_metadata(self):
     def list_all():
         results = []
         datasets_dir = get_datasets_dir()
-        if not os.path.isdir(datasets_dir):
+        if not storage.isdir(datasets_dir):
             return results
-        for entry in os.listdir(datasets_dir):
-            full = os.path.join(datasets_dir, entry)
-            if not os.path.isdir(full):
+        try:
+            entries = storage.ls(datasets_dir, detail=False)
+        except Exception:
+            entries = []
+        for full in entries:
+            if not storage.isdir(full):
                 continue
             # Attempt to read index.json (or latest snapshot)
             try:
+                entry = full.rstrip("/").split("/")[-1]
                 ds = Dataset(entry)
                 results.append(ds.get_metadata())
             except Exception:
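One behavioural difference worth calling out in list_all: os.listdir returned bare entry names, while fsspec-style ls returns full paths (storage.ls above follows that convention), so the dataset name is now derived from the tail of each returned path. A small local illustration of that derivation, assuming storage.ls mirrors fsspec's AbstractFileSystem.ls (the /tmp directory is just an example):

import fsspec

fs = fsspec.filesystem("file")               # local backend
for full in fs.ls("/tmp", detail=False):     # returns full paths, not bare names
    entry = full.rstrip("/").split("/")[-1]  # same tail-of-path derivation as Dataset.list_all
    print(entry)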

src/lab/dirs.py

Lines changed: 71 additions & 51 deletions
@@ -3,52 +3,68 @@
 import os
 import contextvars
 from werkzeug.utils import secure_filename
+from . import storage
+from .storage import _current_tfl_storage_uri
 
 # TFL_HOME_DIR
-if "TFL_HOME_DIR" in os.environ:
+if "TFL_HOME_DIR" in os.environ and not (_current_tfl_storage_uri.get() or os.getenv("TFL_STORAGE_URI")):
     HOME_DIR = os.environ["TFL_HOME_DIR"]
     if not os.path.exists(HOME_DIR):
         print(f"Error: Home directory {HOME_DIR} does not exist")
         exit(1)
     print(f"Home directory is set to: {HOME_DIR}")
 else:
-    HOME_DIR = os.path.join(os.path.expanduser("~"), ".transformerlab")
-    os.makedirs(name=HOME_DIR, exist_ok=True)
-    print(f"Using default home directory: {HOME_DIR}")
+    # If TFL_STORAGE_URI is set (via context or env), HOME_DIR concept maps to storage.root_uri()
+    HOME_DIR = storage.root_uri() if (_current_tfl_storage_uri.get() or os.getenv("TFL_STORAGE_URI")) else os.path.join(os.path.expanduser("~"), ".transformerlab")
+    if not (_current_tfl_storage_uri.get() or os.getenv("TFL_STORAGE_URI")):
+        os.makedirs(name=HOME_DIR, exist_ok=True)
+        print(f"Using default home directory: {HOME_DIR}")
 
 # Context var for organization id (set by host app/session)
 _current_org_id: contextvars.ContextVar[str | None] = contextvars.ContextVar(
     "current_org_id", default=None
 )
 
-
 def set_organization_id(organization_id: str | None) -> None:
     _current_org_id.set(organization_id)
+    if organization_id is not None:
+        _current_tfl_storage_uri.set(os.getenv("TFL_API_STORAGE_URI"))
+    else:
+        _current_tfl_storage_uri.set(None)
 
 
 def get_workspace_dir() -> str:
     # Remote SkyPilot workspace override (highest precedence)
     # Only return container workspace path when value is exactly "true"
     if os.getenv("_TFL_REMOTE_SKYPILOT_WORKSPACE") == "true":
+        if os.getenv("TFL_STORAGE_URI") is not None:
+            return storage.root_uri()
+
         return "/workspace"
 
     # Explicit override wins
-    if "TFL_WORKSPACE_DIR" in os.environ:
+    if "TFL_WORKSPACE_DIR" in os.environ and not (_current_tfl_storage_uri.get() is not None and os.getenv("TFL_STORAGE_URI") is not None):
         value = os.environ["TFL_WORKSPACE_DIR"]
         if not os.path.exists(value):
             print(f"Error: Workspace directory {value} does not exist")
             exit(1)
         return value
 
     org_id = _current_org_id.get()
+
     if org_id:
-        path = os.path.join(HOME_DIR, "orgs", org_id, "workspace")
-        os.makedirs(name=path, exist_ok=True)
+        # If the storage URI is set, use it for the org workspace
+        if _current_tfl_storage_uri.get() is not None:
+            return _current_tfl_storage_uri.get()
+        path = storage.join(HOME_DIR, "orgs", org_id, "workspace")
+        storage.makedirs(path, exist_ok=True)
         return path
+
+    if os.getenv("TFL_STORAGE_URI"):
+        return storage.root_uri()
 
-    # Default single-tenant path
-    path = os.path.join(HOME_DIR, "workspace")
-    os.makedirs(name=path, exist_ok=True)
+    path = storage.join(HOME_DIR, "workspace")
+    storage.makedirs(path, exist_ok=True)
     return path
 
 
@@ -79,92 +95,96 @@ def get_workspace_dir() -> str:
 
 
 def get_experiments_dir() -> str:
-    path = os.path.join(get_workspace_dir(), "experiments")
-    os.makedirs(name=path, exist_ok=True)
+    path = storage.join(get_workspace_dir(), "experiments")
+    storage.makedirs(path, exist_ok=True)
     return path
 
 
 def get_jobs_dir() -> str:
     workspace_dir = get_workspace_dir()
-    path = os.path.join(workspace_dir, "jobs")
-    os.makedirs(name=path, exist_ok=True)
+    path = storage.join(workspace_dir, "jobs")
+    storage.makedirs(path, exist_ok=True)
     return path
 
 
 def get_global_log_path() -> str:
-    return os.path.join(get_workspace_dir(), "transformerlab.log")
+    return storage.join(get_workspace_dir(), "transformerlab.log")
 
 
 def get_logs_dir() -> str:
-    path = os.path.join(HOME_DIR, "logs")
-    os.makedirs(name=path, exist_ok=True)
+    path = storage.join(HOME_DIR, "logs")
+    storage.makedirs(path, exist_ok=True)
     return path
 
 
 # TODO: Move this to Experiment
 def experiment_dir_by_name(experiment_name: str) -> str:
     experiments_dir = get_experiments_dir()
-    return os.path.join(experiments_dir, experiment_name)
+    return storage.join(experiments_dir, experiment_name)
 
 
 def get_plugin_dir() -> str:
-    return os.path.join(get_workspace_dir(), "plugins")
+    return storage.join(get_workspace_dir(), "plugins")
 
 
 def plugin_dir_by_name(plugin_name: str) -> str:
     plugin_name = secure_filename(plugin_name)
-    return os.path.join(get_plugin_dir(), plugin_name)
+    return storage.join(get_plugin_dir(), plugin_name)
 
 
 def get_models_dir() -> str:
-    path = os.path.join(get_workspace_dir(), "models")
-    os.makedirs(name=path, exist_ok=True)
+    path = storage.join(get_workspace_dir(), "models")
+    storage.makedirs(path, exist_ok=True)
     return path
 
 
 def get_datasets_dir() -> str:
-    path = os.path.join(get_workspace_dir(), "datasets")
-    os.makedirs(name=path, exist_ok=True)
+    path = storage.join(get_workspace_dir(), "datasets")
+    storage.makedirs(path, exist_ok=True)
     return path
 
 
 def get_tasks_dir() -> str:
-    path = os.path.join(get_workspace_dir(), "tasks")
-    os.makedirs(name=path, exist_ok=True)
+    tfl_storage_uri = _current_tfl_storage_uri.get()
+    if tfl_storage_uri is not None:
+        return storage.join(tfl_storage_uri, "tasks")
+
+    path = storage.join(get_workspace_dir(), "tasks")
+    storage.makedirs(path, exist_ok=True)
     return path
 
 
 def dataset_dir_by_id(dataset_id: str) -> str:
-    return os.path.join(get_datasets_dir(), dataset_id)
+    return storage.join(get_datasets_dir(), dataset_id)
 
 
 def get_temp_dir() -> str:
-    path = os.path.join(get_workspace_dir(), "temp")
-    os.makedirs(name=path, exist_ok=True)
+    path = storage.join(get_workspace_dir(), "temp")
+    storage.makedirs(path, exist_ok=True)
     return path
 
 
 def get_prompt_templates_dir() -> str:
-    path = os.path.join(get_workspace_dir(), "prompt_templates")
-    os.makedirs(name=path, exist_ok=True)
+    path = storage.join(get_workspace_dir(), "prompt_templates")
+    storage.makedirs(path, exist_ok=True)
     return path
 
 
 def get_tools_dir() -> str:
-    path = os.path.join(get_workspace_dir(), "tools")
-    os.makedirs(name=path, exist_ok=True)
+    path = storage.join(get_workspace_dir(), "tools")
+    storage.makedirs(path, exist_ok=True)
     return path
 
 
 def get_batched_prompts_dir() -> str:
-    path = os.path.join(get_workspace_dir(), "batched_prompts")
-    os.makedirs(name=path, exist_ok=True)
+    path = storage.join(get_workspace_dir(), "batched_prompts")
+    storage.makedirs(path, exist_ok=True)
     return path
 
 
 def get_galleries_cache_dir() -> str:
-    path = os.path.join(get_workspace_dir(), "galleries")
-    os.makedirs(name=path, exist_ok=True)
+    path = storage.join(get_workspace_dir(), "galleries")
+    storage.makedirs(path, exist_ok=True)
    return path
 
 
@@ -175,16 +195,16 @@ def get_job_dir(job_id: str | int) -> str:
     instance is not readily available.
     """
     job_id_safe = secure_filename(str(job_id))
-    return os.path.join(get_jobs_dir(), job_id_safe)
+    return storage.join(get_jobs_dir(), job_id_safe)
 
 
 def get_job_artifacts_dir(job_id: str | int) -> str:
     """
     Return the artifacts directory for a specific job, creating it if needed.
     Example: ~/.transformerlab/workspace/jobs/<job_id>/artifacts
     """
-    path = os.path.join(get_job_dir(job_id), "artifacts")
-    os.makedirs(name=path, exist_ok=True)
+    path = storage.join(get_job_dir(job_id), "artifacts")
+    storage.makedirs(path, exist_ok=True)
     return path
 
 
@@ -193,8 +213,8 @@ def get_job_checkpoints_dir(job_id: str | int) -> str:
     Return the checkpoints directory for a specific job, creating it if needed.
     Example: ~/.transformerlab/workspace/jobs/<job_id>/checkpoints
     """
-    path = os.path.join(get_job_dir(job_id), "checkpoints")
-    os.makedirs(name=path, exist_ok=True)
+    path = storage.join(get_job_dir(job_id), "checkpoints")
+    storage.makedirs(path, exist_ok=True)
     return path
 
 
@@ -203,8 +223,8 @@ def get_job_eval_results_dir(job_id: str | int) -> str:
     Return the eval_results directory for a specific job, creating it if needed.
     Example: ~/.transformerlab/workspace/jobs/<job_id>/eval_results
     """
-    path = os.path.join(get_job_dir(job_id), "eval_results")
-    os.makedirs(name=path, exist_ok=True)
+    path = storage.join(get_job_dir(job_id), "eval_results")
+    storage.makedirs(path, exist_ok=True)
     return path
 
 
@@ -215,14 +235,14 @@ def get_job_eval_results_dir(job_id: str | int) -> str:
 async def eval_output_file(experiment_name: str, eval_name: str) -> str:
     experiment_dir = experiment_dir_by_name(experiment_name)
     eval_name = secure_filename(eval_name)
-    p = os.path.join(experiment_dir, "evals", eval_name)
-    os.makedirs(p, exist_ok=True)
-    return os.path.join(p, "output.txt")
+    p = storage.join(experiment_dir, "evals", eval_name)
+    storage.makedirs(p, exist_ok=True)
+    return storage.join(p, "output.txt")
 
 
 async def generation_output_file(experiment_name: str, generation_name: str) -> str:
     experiment_dir = experiment_dir_by_name(experiment_name)
     generation_name = secure_filename(generation_name)
-    p = os.path.join(experiment_dir, "generations", generation_name)
-    os.makedirs(p, exist_ok=True)
-    return os.path.join(p, "output.txt")
+    p = storage.join(experiment_dir, "generations", generation_name)
+    storage.makedirs(p, exist_ok=True)
+    return storage.join(p, "output.txt")
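The storage helper that these call sites rely on (src/lab/storage.py) is not part of this excerpt. As an illustrative sketch only, here is one way a module with this interface (join, makedirs, isdir, ls, root_uri, and the _current_tfl_storage_uri context variable) could be built on fsspec; the function names match the call sites above, but the bodies are assumptions, not the actual implementation:

# Sketch of an fsspec-backed storage helper matching the call sites in this diff.
# Assumed implementation; the real src/lab/storage.py may differ.
import os
import contextvars

import fsspec

_current_tfl_storage_uri: contextvars.ContextVar[str | None] = contextvars.ContextVar(
    "current_tfl_storage_uri", default=None
)


def root_uri() -> str:
    # A per-session/context URI takes precedence over the environment variable.
    return _current_tfl_storage_uri.get() or os.environ["TFL_STORAGE_URI"]


def _fs(path: str):
    # Resolve the filesystem backend from the path or URI scheme
    # ("s3://..." -> s3fs, plain paths -> local filesystem).
    fs, _ = fsspec.core.url_to_fs(path)
    return fs


def join(*parts: str) -> str:
    # Remote URIs always use "/" separators; plain paths fall back to os.path.join.
    if "://" in parts[0]:
        head, tail = parts[0].rstrip("/"), [p.strip("/") for p in parts[1:]]
        return "/".join([head, *tail])
    return os.path.join(*parts)


def makedirs(path: str, exist_ok: bool = True) -> None:
    _fs(path).makedirs(path, exist_ok=exist_ok)


def isdir(path: str) -> bool:
    return _fs(path).isdir(path)


def ls(path: str, detail: bool = False):
    return _fs(path).ls(path, detail=detail)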
