From 4faf7e51cf03e6ee724e0c56e30544403b41fc83 Mon Sep 17 00:00:00 2001
From: Celina Hanouti
Date: Thu, 9 Oct 2025 16:17:08 +0200
Subject: [PATCH 01/10] add inference endpoints cli

---
 docs/source/en/guides/cli.md                 |  28 ++
 docs/source/en/guides/inference_endpoints.md |  43 ++
 src/huggingface_hub/cli/hf.py                |   2 +
 .../cli/inference_endpoints.py               | 416 ++++++++++++++++++
 tests/test_cli.py                            | 196 +++++++++
 5 files changed, 685 insertions(+)
 create mode 100644 src/huggingface_hub/cli/inference_endpoints.py

diff --git a/docs/source/en/guides/cli.md b/docs/source/en/guides/cli.md
index 49c6ab38f7..2e44792bec 100644
--- a/docs/source/en/guides/cli.md
+++ b/docs/source/en/guides/cli.md
@@ -916,3 +916,31 @@ Manage scheduled jobs using
 # Delete a scheduled job
 >>> hf jobs scheduled delete
 ```
+
+## hf inference-endpoints
+
+Use `hf inference-endpoints` to list, deploy, inspect, and manage Inference Endpoints directly from the terminal.
+
+```bash
+# List endpoints in your namespace
+>>> hf inference-endpoints list
+
+# Deploy an endpoint
+>>> hf inference-endpoints deploy hub my-endpoint --repo gpt2 --framework pytorch --accelerator cpu --vendor aws --region us-east-1 --instance-size x2 --instance-type intel-icl
+
+# Deploy an endpoint from Model Catalog
+>>> hf inference-endpoints deploy catalog my-endpoint --repo openai/gpt-oss-120b
+
+# Inspect status and metadata
+>>> hf inference-endpoints inspect my-endpoint
+
+# Pause the endpoint
+>>> hf inference-endpoints pause my-endpoint
+
+# Delete without confirmation prompt
+>>> hf inference-endpoints delete my-endpoint --yes
+
+```
+
+> [!TIP]
+> Add `--namespace` to target an organization, `--token` to override authentication, and use `hf inference-endpoints deploy catalog` to launch an endpoint with an optimized configuration from the Model Catalog.
diff --git a/docs/source/en/guides/inference_endpoints.md b/docs/source/en/guides/inference_endpoints.md
index c89c47621a..925a23b01c 100644
--- a/docs/source/en/guides/inference_endpoints.md
+++ b/docs/source/en/guides/inference_endpoints.md
@@ -33,6 +33,13 @@ The first step is to create an Inference Endpoint using [`create_inference_endpo
 ...     )
 ```
 
+CLI equivalent:
+
+```bash
+hf inference-endpoints deploy hub my-endpoint-name --repo gpt2 --framework pytorch --accelerator cpu --vendor aws --region us-east-1 --instance-size x2 --instance-type intel-icl --task text-generation
+```
+
+
 In this example, we created a `protected` Inference Endpoint named `"my-endpoint-name"`, to serve [gpt2](https://huggingface.co/gpt2) for `text-generation`. A `protected` Inference Endpoint means your token is required to access the API. We also need to provide additional information to configure the hardware requirements, such as vendor, region, accelerator, instance type, and size. You can check out the list of available resources [here](https://api.endpoints.huggingface.cloud/#/v2%3A%3Aprovider/list_vendors).
 
 Alternatively, you can create an Inference Endpoint manually using the [Web interface](https://ui.endpoints.huggingface.co/new) for convenience. Refer to this [guide](https://huggingface.co/docs/inference-endpoints/guides/advanced) for details on advanced settings and their usage.
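+If you plan to deploy from the Model Catalog instead, you can browse the available models first with the `list-catalog` command (shown here as a quick sketch; it prints the raw JSON returned by the API):
+
+```bash
+# List the models available in the Inference Endpoints Model Catalog
+hf inference-endpoints list-catalog
+
+# Then deploy one of them with its optimized configuration
+hf inference-endpoints deploy catalog my-endpoint-name --repo openai/gpt-oss-120b
+```
+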
The value returned by [`create_inference_endpoint`] is an [`InferenceEndpoint`] object: @@ -42,6 +49,12 @@ The value returned by [`create_inference_endpoint`] is an [`InferenceEndpoint`] InferenceEndpoint(name='my-endpoint-name', namespace='Wauplin', repository='gpt2', status='pending', url=None) ``` +Or via CLI: + +```bash +hf inference-endpoints inspect my-endpoint-name +``` + It's a dataclass that holds information about the endpoint. You can access important attributes such as `name`, `repository`, `status`, `task`, `created_at`, `updated_at`, etc. If you need it, you can also access the raw response from the server with `endpoint.raw`. Once your Inference Endpoint is created, you can find it on your [personal dashboard](https://ui.endpoints.huggingface.co/). @@ -101,6 +114,14 @@ InferenceEndpoint(name='my-endpoint-name', namespace='Wauplin', repository='gpt2 [InferenceEndpoint(name='aws-starchat-beta', namespace='huggingface', repository='HuggingFaceH4/starchat-beta', status='paused', url=None), ...] ``` +Or via CLI: + +```bash +hf inference-endpoints inspect my-endpoint-name +hf inference-endpoints list --namespace huggingface +hf inference-endpoints list --namespace '*' +``` + ## Check deployment status In the rest of this guide, we will assume that we have a [`InferenceEndpoint`] object called `endpoint`. You might have noticed that the endpoint has a `status` attribute of type [`InferenceEndpointStatus`]. When the Inference Endpoint is deployed and accessible, the status should be `"running"` and the `url` attribute is set: @@ -117,6 +138,12 @@ Before reaching a `"running"` state, the Inference Endpoint typically goes throu InferenceEndpoint(name='my-endpoint-name', namespace='Wauplin', repository='gpt2', status='pending', url=None) ``` +Or via CLI: + +```bash +hf inference-endpoints inspect my-endpoint-name +``` + Instead of fetching the Inference Endpoint status while waiting for it to run, you can directly call [`~InferenceEndpoint.wait`]. This helper takes as input a `timeout` and a `fetch_every` parameter (in seconds) and will block the thread until the Inference Endpoint is deployed. Default values are respectively `None` (no timeout) and `5` seconds. ```py @@ -189,6 +216,14 @@ InferenceEndpoint(name='my-endpoint-name', namespace='Wauplin', repository='gpt2 # Endpoint is not 'running' but still has a URL and will restart on first call. ``` +or via CLI: + +```bash +hf inference-endpoints pause my-endpoint-name +hf inference-endpoints resume my-endpoint-name +hf inference-endpoints scale-to-zero my-endpoint-name +``` + ### Update model or hardware requirements In some cases, you might also want to update your Inference Endpoint without creating a new one. You can either update the hosted model or the hardware requirements to run the model. You can do this using [`~InferenceEndpoint.update`]: @@ -207,6 +242,14 @@ InferenceEndpoint(name='my-endpoint-name', namespace='Wauplin', repository='gpt2 InferenceEndpoint(name='my-endpoint-name', namespace='Wauplin', repository='gpt2-large', status='pending', url=None) ``` +Or via CLI: + +```bash +hf inference-endpoints update my-endpoint-name --repo gpt2-large +hf inference-endpoints update my-endpoint-name --min-replica 2 --max-replica 6 +hf inference-endpoints update my-endpoint-name --accelerator cpu --instance-size x4 --instance-type intel-icl +``` + ### Delete the endpoint Finally if you won't use the Inference Endpoint anymore, you can simply call [`~InferenceEndpoint.delete()`]. 
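+Or via CLI (a minimal equivalent; `--yes` skips the interactive confirmation prompt, which otherwise asks you to re-type the endpoint name):
+
+```bash
+hf inference-endpoints delete my-endpoint-name --yes
+```
+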
diff --git a/src/huggingface_hub/cli/hf.py b/src/huggingface_hub/cli/hf.py index 8306eff084..47e9d47e73 100644 --- a/src/huggingface_hub/cli/hf.py +++ b/src/huggingface_hub/cli/hf.py @@ -17,6 +17,7 @@ from huggingface_hub.cli.auth import auth_cli from huggingface_hub.cli.cache import cache_cli from huggingface_hub.cli.download import download +from huggingface_hub.cli.inference_endpoints import app as inference_endpoints_cli from huggingface_hub.cli.jobs import jobs_cli from huggingface_hub.cli.lfs import lfs_enable_largefiles, lfs_multipart_upload from huggingface_hub.cli.repo import repo_cli @@ -48,6 +49,7 @@ app.add_typer(repo_cli, name="repo") app.add_typer(repo_files_cli, name="repo-files") app.add_typer(jobs_cli, name="jobs") +app.add_typer(inference_endpoints_cli, name="inference-endpoints") def main(): diff --git a/src/huggingface_hub/cli/inference_endpoints.py b/src/huggingface_hub/cli/inference_endpoints.py new file mode 100644 index 0000000000..5b4034da24 --- /dev/null +++ b/src/huggingface_hub/cli/inference_endpoints.py @@ -0,0 +1,416 @@ +"""CLI commands for Hugging Face Inference Endpoints.""" + +from __future__ import annotations + +import json +from typing import Annotated, Optional + +import typer + +from huggingface_hub._inference_endpoints import InferenceEndpoint +from huggingface_hub.errors import HfHubHTTPError +from huggingface_hub.utils import logging + +from ._cli_utils import TokenOpt, get_hf_api, typer_factory + + +logger = logging.get_logger(__name__) + + +app = typer_factory(help="Manage Hugging Face Inference Endpoints.") + +NameArg = Annotated[ + str, + typer.Argument(help="Endpoint name."), +] + +RepoArg = Annotated[ + Optional[str], + typer.Option( + "--repo", help="The name of the model repository associated with the Inference Endpoint (e.g. 'gpt2')." + ), +] + +NamespaceOpt = Annotated[ + Optional[str], + typer.Option( + "--namespace", + help="The namespace where the Inference Endpoint will be created. Defaults to the current user's namespace.", + ), +] + + +FrameworkOpt = Annotated[ + Optional[str], + typer.Option( + "--framework", + help="The machine learning framework used for the model (e.g. 'custom').", + ), +] + +AcceleratorOpt = Annotated[ + Optional[str], + typer.Option( + "--accelerator", + help="The hardware accelerator to be used for inference (e.g. 'cpu').", + ), +] + +InstanceSizeOpt = Annotated[ + Optional[str], + typer.Option( + "--instance-size", + help="The size or type of the instance to be used for hosting the model (e.g. 'x4').", + ), +] + +InstanceTypeOpt = Annotated[ + Optional[str], + typer.Option( + "--instance-type", + help="The cloud instance type where the Inference Endpoint will be deployed (e.g. 'intel-icl').", + ), +] + +RegionOpt = Annotated[ + Optional[str], + typer.Option( + "--region", + help="The cloud region in which the Inference Endpoint will be created (e.g. 'us-east-1').", + ), +] + +TaskOpt = Annotated[ + Optional[str], + typer.Option( + "--task", + help="The task on which to deploy the model (e.g. 'text-classification').", + ), +] +VendorOpt = Annotated[ + Optional[str], + typer.Option( + "--vendor", + help="The cloud provider or vendor where the Inference Endpoint will be hosted (e.g. 
'aws').", + ), +] + + +def _print_endpoint(endpoint: InferenceEndpoint) -> None: + typer.echo(json.dumps(endpoint.raw, indent=2, sort_keys=True)) + + +@app.command(help="Lists all inference endpoints for the given namespace.") +def list( + namespace: NamespaceOpt = None, + token: TokenOpt = None, +) -> None: + api = get_hf_api(token=token) + try: + endpoints = api.list_inference_endpoints(namespace=namespace, token=token) + except HfHubHTTPError as error: + typer.echo(f"Listing failed: {error}") + raise typer.Exit(code=error.response.status_code) from error + + typer.echo( + json.dumps( + {"items": [endpoint.raw for endpoint in endpoints]}, + indent=2, + sort_keys=True, + ) + ) + + +deploy_app = typer_factory(help="Deploy Inference Endpoints from Hub repositories or the Model Catalog.") + + +@deploy_app.command(name="hub", help="Deploy an Inference Endpoint from a Hub repository.") +def deploy_from_hub( + name: NameArg, + repo: Annotated[ + str, + typer.Option( + "--repo", + help="The name of the model repository associated with the Inference Endpoint (e.g. 'gpt2').", + ), + ], + framework: Annotated[ + str, + typer.Option( + "--framework", + help="The machine learning framework used for the model (e.g. 'custom').", + ), + ], + accelerator: Annotated[ + str, + typer.Option( + "--accelerator", + help="The hardware accelerator to be used for inference (e.g. 'cpu').", + ), + ], + instance_size: Annotated[ + str, + typer.Option( + "--instance-size", + help="The size or type of the instance to be used for hosting the model (e.g. 'x4').", + ), + ], + instance_type: Annotated[ + str, + typer.Option( + "--instance-type", + help="The cloud instance type where the Inference Endpoint will be deployed (e.g. 'intel-icl').", + ), + ], + region: Annotated[ + str, + typer.Option( + "--region", + help="The cloud region in which the Inference Endpoint will be created (e.g. 'us-east-1').", + ), + ], + vendor: Annotated[ + str, + typer.Option( + "--vendor", + help="The cloud provider or vendor where the Inference Endpoint will be hosted (e.g. 'aws').", + ), + ], + *, + namespace: NamespaceOpt = None, + task: TaskOpt = None, + token: TokenOpt = None, +) -> None: + api = get_hf_api(token=token) + try: + endpoint = api.create_inference_endpoint( + name=name, + repository=repo, + framework=framework, + accelerator=accelerator, + instance_size=instance_size, + instance_type=instance_type, + region=region, + vendor=vendor, + namespace=namespace, + task=task, + token=token, + ) + except HfHubHTTPError as error: + typer.echo(f"Deployment failed: {error}") + raise typer.Exit(code=error.response.status_code) from error + + _print_endpoint(endpoint) + + +@deploy_app.command(name="catalog", help="Deploy an Inference Endpoint from the Model Catalog.") +def deploy_from_catalog( + name: NameArg, + repo: Annotated[ + str, + typer.Option( + "--repo", + help="The name of the model repository associated with the Inference Endpoint (e.g. 
'gpt2').", + ), + ], + *, + namespace: NamespaceOpt = None, + token: TokenOpt = None, +) -> None: + api = get_hf_api(token=token) + try: + endpoint = api.create_inference_endpoint_from_catalog( + repo_id=repo, + name=name, + namespace=namespace, + token=token, + ) + except HfHubHTTPError as error: + typer.echo(f"Deployment failed: {error}") + raise typer.Exit(code=error.response.status_code) from error + + _print_endpoint(endpoint) + + +app.add_typer(deploy_app, name="deploy") + + +@app.command(help="Get information about an Inference Endpoint.") +def inspect( + name: NameArg, + namespace: NamespaceOpt = None, + token: TokenOpt = None, +) -> None: + api = get_hf_api(token=token) + try: + endpoint = api.get_inference_endpoint(name=name, namespace=namespace, token=token) + except HfHubHTTPError as error: + typer.echo(f"Fetch failed: {error}") + raise typer.Exit(code=error.response.status_code) from error + + _print_endpoint(endpoint) + + +@app.command(help="Update an existing endpoint.") +def update( + endpoint_name: NameArg, + repo: RepoArg = None, + accelerator: AcceleratorOpt = None, + instance_size: InstanceSizeOpt = None, + instance_type: InstanceTypeOpt = None, + framework: FrameworkOpt = None, + revision: Annotated[ + Optional[str], + typer.Option( + help="The specific model revision to deploy on the Inference Endpoint (e.g. '6c0e6080953db56375760c0471a8c5f2929baf11').", + ), + ] = None, + task: Annotated[ + Optional[str], + typer.Option( + help="The task on which to deploy the model (e.g. 'text-classification').", + ), + ] = None, + min_replica: Annotated[ + Optional[int], + typer.Option( + help="The minimum number of replicas (instances) to keep running for the Inference Endpoint.", + ), + ] = None, + max_replica: Annotated[ + Optional[int], + typer.Option( + help="The maximum number of replicas (instances) to scale to for the Inference Endpoint.", + ), + ] = None, + scale_to_zero_timeout: Annotated[ + Optional[int], + typer.Option( + help="The duration in minutes before an inactive endpoint is scaled to zero.", + ), + ] = None, + namespace: NamespaceOpt = None, + token: TokenOpt = None, +) -> None: + api = get_hf_api(token=token) + try: + endpoint = api.update_inference_endpoint( + name=endpoint_name, + namespace=namespace, + repository=repo, + framework=framework, + revision=revision, + task=task, + accelerator=accelerator, + instance_size=instance_size, + instance_type=instance_type, + min_replica=min_replica, + max_replica=max_replica, + scale_to_zero_timeout=scale_to_zero_timeout, + token=token, + ) + except HfHubHTTPError as error: + typer.echo(f"Update failed: {error}") + raise typer.Exit(code=error.response.status_code) from error + _print_endpoint(endpoint) + + +@app.command(help="Delete an Inference Endpoint permanently.") +def delete( + name: NameArg, + namespace: NamespaceOpt = None, + yes: Annotated[ + bool, + typer.Option( + "--yes", + help="Skip confirmation prompts.", + ), + ] = False, + token: TokenOpt = None, +) -> None: + if not yes: + confirmation = typer.prompt(f"Delete endpoint '{name}'? 
Type the name to confirm.") + if confirmation != name: + typer.echo("Aborted.") + raise typer.Exit(code=2) + + api = get_hf_api(token=token) + try: + api.delete_inference_endpoint(name=name, namespace=namespace, token=token) + except HfHubHTTPError as error: + typer.echo(f"Delete failed: {error}") + raise typer.Exit(code=error.response.status_code) from error + + typer.echo(f"Deleted '{name}'.") + + +@app.command(help="Pause an Inference Endpoint.") +def pause( + name: NameArg, + namespace: NamespaceOpt = None, + token: TokenOpt = None, +) -> None: + api = get_hf_api(token=token) + try: + endpoint = api.pause_inference_endpoint(name=name, namespace=namespace, token=token) + except HfHubHTTPError as error: + typer.echo(f"Pause failed: {error}") + raise typer.Exit(code=error.response.status_code) from error + + _print_endpoint(endpoint) + + +@app.command(help="Resume an Inference Endpoint.") +def resume( + name: NameArg, + namespace: NamespaceOpt = None, + running_ok: Annotated[ + bool, + typer.Option( + help="If `True`, the method will not raise an error if the Inference Endpoint is already running." + ), + ] = True, + token: TokenOpt = None, +) -> None: + api = get_hf_api(token=token) + try: + endpoint = api.resume_inference_endpoint( + name=name, + namespace=namespace, + token=token, + running_ok=running_ok, + ) + except HfHubHTTPError as error: + typer.echo(f"Resume failed: {error}") + raise typer.Exit(code=error.response.status_code) from error + _print_endpoint(endpoint) + + +@app.command(help="Scale an Inference Endpoint to zero.") +def scale_to_zero( + name: NameArg, + namespace: NamespaceOpt = None, + token: TokenOpt = None, +) -> None: + api = get_hf_api(token=token) + try: + endpoint = api.scale_to_zero_inference_endpoint(name=name, namespace=namespace, token=token) + except HfHubHTTPError as error: + typer.echo(f"Scale To Zero failed: {error}") + raise typer.Exit(code=error.response.status_code) from error + + _print_endpoint(endpoint) + + +@app.command(help="List available Catalog models.") +def list_catalog( + token: TokenOpt = None, +) -> None: + api = get_hf_api(token=token) + try: + models = api.list_inference_catalog(token=token) + except HfHubHTTPError as error: + typer.echo(f"Catalog fetch failed: {error}") + raise typer.Exit(code=error.response.status_code) from error + + typer.echo(json.dumps({"models": models}, indent=2, sort_keys=True)) diff --git a/tests/test_cli.py b/tests/test_cli.py index 7ea7d084c6..e45cacde8d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -984,6 +984,202 @@ def test_repo_delete_with_all_options(self, runner: CliRunner) -> None: ) +class TestInferenceEndpointsCommands: + def test_list(self, runner: CliRunner) -> None: + endpoint = Mock(raw={"name": "demo"}) + with patch("huggingface_hub.cli.inference_endpoints.get_hf_api") as api_cls: + api = api_cls.return_value + api.list_inference_endpoints.return_value = [endpoint] + result = runner.invoke(app, ["inference-endpoints", "list"]) + assert result.exit_code == 0 + api_cls.assert_called_once_with(token=None) + api.list_inference_endpoints.assert_called_once_with(namespace=None, token=None) + assert '"items"' in result.stdout + assert '"name": "demo"' in result.stdout + + def test_deploy_from_hub(self, runner: CliRunner) -> None: + endpoint = Mock(raw={"name": "hub"}) + with patch("huggingface_hub.cli.inference_endpoints.get_hf_api") as api_cls: + api = api_cls.return_value + api.create_inference_endpoint.return_value = endpoint + result = runner.invoke( + app, + [ + 
"inference-endpoints", + "deploy", + "hub", + "my-endpoint", + "--repo", + "my-repo", + "--framework", + "custom", + "--accelerator", + "cpu", + "--instance-size", + "x4", + "--instance-type", + "standard", + "--region", + "us-east-1", + "--vendor", + "aws", + ], + ) + assert result.exit_code == 0 + api_cls.assert_called_once_with(token=None) + api.create_inference_endpoint.assert_called_once_with( + name="my-endpoint", + repository="my-repo", + framework="custom", + accelerator="cpu", + instance_size="x4", + instance_type="standard", + region="us-east-1", + vendor="aws", + namespace=None, + token=None, + task=None, + ) + assert '"name": "hub"' in result.stdout + + def test_deploy_from_catalog(self, runner: CliRunner) -> None: + endpoint = Mock(raw={"name": "catalog"}) + with patch("huggingface_hub.cli.inference_endpoints.get_hf_api") as api_cls: + api = api_cls.return_value + api.create_inference_endpoint_from_catalog.return_value = endpoint + result = runner.invoke( + app, + [ + "inference-endpoints", + "deploy", + "catalog", + "catalog-endpoint", + "--repo", + "catalog/model", + ], + ) + assert result.exit_code == 0 + api_cls.assert_called_once_with(token=None) + api.create_inference_endpoint_from_catalog.assert_called_once_with( + repo_id="catalog/model", + name="catalog-endpoint", + namespace=None, + token=None, + ) + assert '"name": "catalog"' in result.stdout + + def test_inspect(self, runner: CliRunner) -> None: + endpoint = Mock(raw={"name": "inspect"}) + with patch("huggingface_hub.cli.inference_endpoints.get_hf_api") as api_cls: + api = api_cls.return_value + api.get_inference_endpoint.return_value = endpoint + result = runner.invoke(app, ["inference-endpoints", "inspect", "my-endpoint"]) + assert result.exit_code == 0 + api_cls.assert_called_once_with(token=None) + api.get_inference_endpoint.assert_called_once_with(name="my-endpoint", namespace=None, token=None) + assert '"name": "inspect"' in result.stdout + + def test_update(self, runner: CliRunner) -> None: + endpoint = Mock(raw={"name": "updated"}) + with patch("huggingface_hub.cli.inference_endpoints.get_hf_api") as api_cls: + api = api_cls.return_value + api.update_inference_endpoint.return_value = endpoint + result = runner.invoke( + app, + [ + "inference-endpoints", + "update", + "my-endpoint", + "--repo", + "my-repo", + "--accelerator", + "gpu", + "--instance-size", + "x4", + ], + ) + assert result.exit_code == 0 + api_cls.assert_called_once_with(token=None) + api.update_inference_endpoint.assert_called_once_with( + name="my-endpoint", + namespace=None, + repository="my-repo", + framework=None, + revision=None, + task=None, + accelerator="gpu", + instance_size="x4", + instance_type=None, + min_replica=None, + max_replica=None, + scale_to_zero_timeout=None, + token=None, + ) + assert '"name": "updated"' in result.stdout + + def test_delete(self, runner: CliRunner) -> None: + with patch("huggingface_hub.cli.inference_endpoints.get_hf_api") as api_cls: + api = api_cls.return_value + result = runner.invoke(app, ["inference-endpoints", "delete", "my-endpoint", "--yes"]) + assert result.exit_code == 0 + api_cls.assert_called_once_with(token=None) + api.delete_inference_endpoint.assert_called_once_with(name="my-endpoint", namespace=None, token=None) + assert "Deleted 'my-endpoint'." 
in result.stdout + + def test_pause(self, runner: CliRunner) -> None: + endpoint = Mock(raw={"name": "paused"}) + with patch("huggingface_hub.cli.inference_endpoints.get_hf_api") as api_cls: + api = api_cls.return_value + api.pause_inference_endpoint.return_value = endpoint + result = runner.invoke(app, ["inference-endpoints", "pause", "my-endpoint"]) + assert result.exit_code == 0 + api_cls.assert_called_once_with(token=None) + api.pause_inference_endpoint.assert_called_once_with(name="my-endpoint", namespace=None, token=None) + assert '"name": "paused"' in result.stdout + + def test_resume(self, runner: CliRunner) -> None: + endpoint = Mock(raw={"name": "resumed"}) + with patch("huggingface_hub.cli.inference_endpoints.get_hf_api") as api_cls: + api = api_cls.return_value + api.resume_inference_endpoint.return_value = endpoint + result = runner.invoke(app, ["inference-endpoints", "resume", "my-endpoint"]) + assert result.exit_code == 0 + api_cls.assert_called_once_with(token=None) + api.resume_inference_endpoint.assert_called_once_with( + name="my-endpoint", + namespace=None, + token=None, + running_ok=True, + ) + assert '"name": "resumed"' in result.stdout + + def test_scale_to_zero(self, runner: CliRunner) -> None: + endpoint = Mock(raw={"name": "zero"}) + with patch("huggingface_hub.cli.inference_endpoints.get_hf_api") as api_cls: + api = api_cls.return_value + api.scale_to_zero_inference_endpoint.return_value = endpoint + result = runner.invoke(app, ["inference-endpoints", "scale-to-zero", "my-endpoint"]) + assert result.exit_code == 0 + api_cls.assert_called_once_with(token=None) + api.scale_to_zero_inference_endpoint.assert_called_once_with( + name="my-endpoint", + namespace=None, + token=None, + ) + assert '"name": "zero"' in result.stdout + + def test_list_catalog(self, runner: CliRunner) -> None: + with patch("huggingface_hub.cli.inference_endpoints.get_hf_api") as api_cls: + api = api_cls.return_value + api.list_inference_catalog.return_value = ["model"] + result = runner.invoke(app, ["inference-endpoints", "list-catalog"]) + assert result.exit_code == 0 + api_cls.assert_called_once_with(token=None) + api.list_inference_catalog.assert_called_once_with(token=None) + assert '"models"' in result.stdout + assert '"model"' in result.stdout + + @contextmanager def tmp_current_directory() -> Generator[str, None, None]: with SoftTemporaryDirectory() as tmp_dir: From 30c13d69a550d7e0f17c623f8792516900c0b691 Mon Sep 17 00:00:00 2001 From: Celina Hanouti Date: Thu, 9 Oct 2025 16:53:29 +0200 Subject: [PATCH 02/10] fix naming --- docs/source/en/guides/cli.md | 11 ++++++----- docs/source/en/guides/inference_endpoints.md | 4 ++-- src/huggingface_hub/cli/inference_endpoints.py | 4 ++-- tests/test_cli.py | 8 ++++---- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/docs/source/en/guides/cli.md b/docs/source/en/guides/cli.md index 2e44792bec..15e5f492ee 100644 --- a/docs/source/en/guides/cli.md +++ b/docs/source/en/guides/cli.md @@ -919,20 +919,21 @@ Manage scheduled jobs using ## hf inference-endpoints -Use `hf inference-endpoints` to list, deploy, inspect, and manage Inference Endpoints directly from the terminal. +Use `hf inference-endpoints` to list, deploy, describe, and manage Inference Endpoints directly from the terminal. 
 ```bash
 # List endpoints in your namespace
 >>> hf inference-endpoints list
 
-# Deploy an endpoint
->>> hf inference-endpoints deploy hub my-endpoint --repo gpt2 --framework pytorch --accelerator cpu --vendor aws --region us-east-1 --instance-size x2 --instance-type intel-icl
 
 # Deploy an endpoint from Model Catalog
 >>> hf inference-endpoints deploy catalog my-endpoint --repo openai/gpt-oss-120b
 
-# Inspect status and metadata
->>> hf inference-endpoints inspect my-endpoint
+# Deploy an endpoint from the Hugging Face Hub
+>>> hf inference-endpoints deploy hub my-endpoint --repo gpt2 --framework pytorch --accelerator cpu --vendor aws --region us-east-1 --instance-size x2 --instance-type intel-icl
+
+# Show status and metadata
+>>> hf inference-endpoints describe my-endpoint
 
 # Pause the endpoint
 >>> hf inference-endpoints pause my-endpoint
diff --git a/docs/source/en/guides/inference_endpoints.md b/docs/source/en/guides/inference_endpoints.md
index 925a23b01c..8c85fdd8ac 100644
--- a/docs/source/en/guides/inference_endpoints.md
+++ b/docs/source/en/guides/inference_endpoints.md
@@ -52,7 +52,7 @@ InferenceEndpoint(name='my-endpoint-name', namespace='Wauplin', repository='gpt2
 Or via CLI:
 
 ```bash
-hf inference-endpoints inspect my-endpoint-name
+hf inference-endpoints describe my-endpoint-name
 ```
 
 It's a dataclass that holds information about the endpoint. You can access important attributes such as `name`, `repository`, `status`, `task`, `created_at`, `updated_at`, etc. If you need it, you can also access the raw response from the server with `endpoint.raw`.
@@ -117,7 +117,7 @@ InferenceEndpoint(name='my-endpoint-name', namespace='Wauplin', repository='gpt2
 Or via CLI:
 
 ```bash
-hf inference-endpoints inspect my-endpoint-name
+hf inference-endpoints describe my-endpoint-name
 hf inference-endpoints list --namespace huggingface
 hf inference-endpoints list --namespace '*'
 ```
diff --git a/src/huggingface_hub/cli/inference_endpoints.py b/src/huggingface_hub/cli/inference_endpoints.py
index 5b4034da24..9488b08ed9 100644
--- a/src/huggingface_hub/cli/inference_endpoints.py
+++ b/src/huggingface_hub/cli/inference_endpoints.py
@@ -121,7 +121,7 @@ def list(
 )
 
 
-deploy_app = typer_factory(help="Deploy Inference Endpoints from Hub repositories or the Model Catalog.")
+deploy_app = typer_factory(help="Deploy Inference Endpoints from the Hub or the Catalog.")
 
 
 @deploy_app.command(name="hub", help="Deploy an Inference Endpoint from a Hub repository.")
@@ -236,7 +236,7 @@ def deploy_from_catalog(
 
 
 @app.command(help="Get information about an Inference Endpoint.")
-def inspect(
+def describe(
     name: NameArg,
     namespace: NamespaceOpt = None,
     token: TokenOpt = None,
diff --git a/tests/test_cli.py b/tests/test_cli.py
index e45cacde8d..886d333269 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1068,16 +1068,16 @@ def test_deploy_from_catalog(self, runner: CliRunner) -> None:
             )
         assert '"name": "catalog"' in result.stdout
 
-    def test_inspect(self, runner: CliRunner) -> None:
-        endpoint = Mock(raw={"name": "inspect"})
+    def test_describe(self, runner: CliRunner) -> None:
+        endpoint = Mock(raw={"name": "describe"})
         with patch("huggingface_hub.cli.inference_endpoints.get_hf_api") as api_cls:
             api = api_cls.return_value
             api.get_inference_endpoint.return_value = endpoint
-            result = runner.invoke(app, ["inference-endpoints", "inspect", "my-endpoint"])
+            result = runner.invoke(app, ["inference-endpoints", "describe", "my-endpoint"])
             assert result.exit_code == 0
             api_cls.assert_called_once_with(token=None)
             api.get_inference_endpoint.assert_called_once_with(name="my-endpoint", 
namespace=None, token=None)
-            assert '"name": "inspect"' in result.stdout
+            assert '"name": "describe"' in result.stdout
 
     def test_update(self, runner: CliRunner) -> None:
         endpoint = Mock(raw={"name": "updated"})

From e670188819f44c1772078c3e2631ee5fb9f03a39 Mon Sep 17 00:00:00 2001
From: Celina Hanouti
Date: Thu, 9 Oct 2025 16:54:06 +0200
Subject: [PATCH 03/10] update docs

---
 docs/source/en/guides/cli.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/guides/cli.md b/docs/source/en/guides/cli.md
index 15e5f492ee..dc67102cc5 100644
--- a/docs/source/en/guides/cli.md
+++ b/docs/source/en/guides/cli.md
@@ -944,4 +944,4 @@ Use `hf inference-endpoints` to list, deploy, describe, and manage Inference End
 ```
 
 > [!TIP]
-> Add `--namespace` to target an organization, `--token` to override authentication, and use `hf inference-endpoints deploy catalog` to launch an endpoint with an optimized configuration from the Model Catalog.
+> Add `--namespace` to target an organization and `--token` to override authentication.

From b49a70a7407787f762c782f4cab777829f6f3db1 Mon Sep 17 00:00:00 2001
From: Celina Hanouti
Date: Thu, 9 Oct 2025 16:56:32 +0200
Subject: [PATCH 04/10] wording

---
 docs/source/en/guides/inference_endpoints.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/en/guides/inference_endpoints.md b/docs/source/en/guides/inference_endpoints.md
index 8c85fdd8ac..ef8a3b5228 100644
--- a/docs/source/en/guides/inference_endpoints.md
+++ b/docs/source/en/guides/inference_endpoints.md
@@ -33,7 +33,7 @@ The first step is to create an Inference Endpoint using [`create_inference_endpo
 ...     )
 ```
 
-CLI equivalent:
+Or via CLI:
 
 ```bash
 hf inference-endpoints deploy hub my-endpoint-name --repo gpt2 --framework pytorch --accelerator cpu --vendor aws --region us-east-1 --instance-size x2 --instance-type intel-icl --task text-generation
@@ -216,7 +216,7 @@ InferenceEndpoint(name='my-endpoint-name', namespace='Wauplin', repository='gpt2
 # Endpoint is not 'running' but still has a URL and will restart on first call.
``` -or via CLI: +Or via CLI: ```bash hf inference-endpoints pause my-endpoint-name From 7b7b12261da5024687bc30360aeb01431158b440 Mon Sep 17 00:00:00 2001 From: Celina Hanouti Date: Thu, 9 Oct 2025 16:57:10 +0200 Subject: [PATCH 05/10] remove logging --- src/huggingface_hub/cli/inference_endpoints.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/huggingface_hub/cli/inference_endpoints.py b/src/huggingface_hub/cli/inference_endpoints.py index 9488b08ed9..62be20fab8 100644 --- a/src/huggingface_hub/cli/inference_endpoints.py +++ b/src/huggingface_hub/cli/inference_endpoints.py @@ -9,14 +9,10 @@ from huggingface_hub._inference_endpoints import InferenceEndpoint from huggingface_hub.errors import HfHubHTTPError -from huggingface_hub.utils import logging from ._cli_utils import TokenOpt, get_hf_api, typer_factory -logger = logging.get_logger(__name__) - - app = typer_factory(help="Manage Hugging Face Inference Endpoints.") NameArg = Annotated[ From 0862c4addf82e36b28bf84f5c80072b3dcdf1610 Mon Sep 17 00:00:00 2001 From: Celina Hanouti Date: Thu, 9 Oct 2025 16:59:53 +0200 Subject: [PATCH 06/10] don't instantiate logger when not needed --- src/huggingface_hub/cli/download.py | 3 --- src/huggingface_hub/cli/jobs.py | 3 --- src/huggingface_hub/cli/repo.py | 15 ++------------- src/huggingface_hub/cli/repo_files.py | 3 --- src/huggingface_hub/cli/upload_large_folder.py | 4 ---- 5 files changed, 2 insertions(+), 26 deletions(-) diff --git a/src/huggingface_hub/cli/download.py b/src/huggingface_hub/cli/download.py index 655e44d0f7..0a80f0caed 100644 --- a/src/huggingface_hub/cli/download.py +++ b/src/huggingface_hub/cli/download.py @@ -49,9 +49,6 @@ from ._cli_utils import RepoIdArg, RepoTypeOpt, RevisionOpt, TokenOpt -logger = logging.get_logger(__name__) - - def download( repo_id: RepoIdArg, filenames: Annotated[ diff --git a/src/huggingface_hub/cli/jobs.py b/src/huggingface_hub/cli/jobs.py index 5b61cd8731..07363a88a6 100644 --- a/src/huggingface_hub/cli/jobs.py +++ b/src/huggingface_hub/cli/jobs.py @@ -63,14 +63,11 @@ from huggingface_hub import SpaceHardware, get_token from huggingface_hub.errors import HfHubHTTPError -from huggingface_hub.utils import logging from huggingface_hub.utils._dotenv import load_dotenv from ._cli_utils import TokenOpt, get_hf_api, typer_factory -logger = logging.get_logger(__name__) - SUGGESTED_FLAVORS = [item.value for item in SpaceHardware if item.value != "zero-a10g"] # Common job-related options diff --git a/src/huggingface_hub/cli/repo.py b/src/huggingface_hub/cli/repo.py index bb67ba9172..9751b22b4e 100644 --- a/src/huggingface_hub/cli/repo.py +++ b/src/huggingface_hub/cli/repo.py @@ -27,22 +27,11 @@ import typer from huggingface_hub.errors import HfHubHTTPError, RepositoryNotFoundError, RevisionNotFoundError -from huggingface_hub.utils import ANSI, logging +from huggingface_hub.utils import ANSI -from ._cli_utils import ( - PrivateOpt, - RepoIdArg, - RepoType, - RepoTypeOpt, - RevisionOpt, - TokenOpt, - get_hf_api, - typer_factory, -) +from ._cli_utils import PrivateOpt, RepoIdArg, RepoType, RepoTypeOpt, RevisionOpt, TokenOpt, get_hf_api, typer_factory -logger = logging.get_logger(__name__) - repo_cli = typer_factory(help="Manage repos on the Hub.") tag_cli = typer_factory(help="Manage tags for a repo on the Hub.") branch_cli = typer_factory(help="Manage branches for a repo on the Hub.") diff --git a/src/huggingface_hub/cli/repo_files.py b/src/huggingface_hub/cli/repo_files.py index 68ffb3e0e8..787be2d48e 100644 --- 
a/src/huggingface_hub/cli/repo_files.py +++ b/src/huggingface_hub/cli/repo_files.py @@ -43,9 +43,6 @@ from ._cli_utils import RepoIdArg, RepoType, RepoTypeOpt, RevisionOpt, TokenOpt, get_hf_api, typer_factory -logger = logging.get_logger(__name__) - - repo_files_cli = typer_factory(help="Manage files in a repo on the Hub.") diff --git a/src/huggingface_hub/cli/upload_large_folder.py b/src/huggingface_hub/cli/upload_large_folder.py index af4fc55836..4484fb1890 100644 --- a/src/huggingface_hub/cli/upload_large_folder.py +++ b/src/huggingface_hub/cli/upload_large_folder.py @@ -19,15 +19,11 @@ import typer -from huggingface_hub import logging from huggingface_hub.utils import ANSI, disable_progress_bars from ._cli_utils import PrivateOpt, RepoIdArg, RepoType, RepoTypeOpt, RevisionOpt, TokenOpt, get_hf_api -logger = logging.get_logger(__name__) - - def upload_large_folder( repo_id: RepoIdArg, local_path: Annotated[ From d81a59cef1f6a4decedb0505102c252b0b6edcf9 Mon Sep 17 00:00:00 2001 From: Celina Hanouti Date: Thu, 9 Oct 2025 17:04:00 +0200 Subject: [PATCH 07/10] refactor --- .../cli/inference_endpoints.py | 116 ++++++------------ 1 file changed, 36 insertions(+), 80 deletions(-) diff --git a/src/huggingface_hub/cli/inference_endpoints.py b/src/huggingface_hub/cli/inference_endpoints.py index 62be20fab8..09337b5f26 100644 --- a/src/huggingface_hub/cli/inference_endpoints.py +++ b/src/huggingface_hub/cli/inference_endpoints.py @@ -20,78 +20,14 @@ typer.Argument(help="Endpoint name."), ] -RepoArg = Annotated[ - Optional[str], - typer.Option( - "--repo", help="The name of the model repository associated with the Inference Endpoint (e.g. 'gpt2')." - ), -] - NamespaceOpt = Annotated[ Optional[str], typer.Option( - "--namespace", help="The namespace where the Inference Endpoint will be created. Defaults to the current user's namespace.", ), ] -FrameworkOpt = Annotated[ - Optional[str], - typer.Option( - "--framework", - help="The machine learning framework used for the model (e.g. 'custom').", - ), -] - -AcceleratorOpt = Annotated[ - Optional[str], - typer.Option( - "--accelerator", - help="The hardware accelerator to be used for inference (e.g. 'cpu').", - ), -] - -InstanceSizeOpt = Annotated[ - Optional[str], - typer.Option( - "--instance-size", - help="The size or type of the instance to be used for hosting the model (e.g. 'x4').", - ), -] - -InstanceTypeOpt = Annotated[ - Optional[str], - typer.Option( - "--instance-type", - help="The cloud instance type where the Inference Endpoint will be deployed (e.g. 'intel-icl').", - ), -] - -RegionOpt = Annotated[ - Optional[str], - typer.Option( - "--region", - help="The cloud region in which the Inference Endpoint will be created (e.g. 'us-east-1').", - ), -] - -TaskOpt = Annotated[ - Optional[str], - typer.Option( - "--task", - help="The task on which to deploy the model (e.g. 'text-classification').", - ), -] -VendorOpt = Annotated[ - Optional[str], - typer.Option( - "--vendor", - help="The cloud provider or vendor where the Inference Endpoint will be hosted (e.g. 'aws').", - ), -] - - def _print_endpoint(endpoint: InferenceEndpoint) -> None: typer.echo(json.dumps(endpoint.raw, indent=2, sort_keys=True)) @@ -126,55 +62,53 @@ def deploy_from_hub( repo: Annotated[ str, typer.Option( - "--repo", help="The name of the model repository associated with the Inference Endpoint (e.g. 'gpt2').", ), ], framework: Annotated[ str, typer.Option( - "--framework", help="The machine learning framework used for the model (e.g. 
'custom').", ), ], accelerator: Annotated[ str, typer.Option( - "--accelerator", help="The hardware accelerator to be used for inference (e.g. 'cpu').", ), ], instance_size: Annotated[ str, typer.Option( - "--instance-size", help="The size or type of the instance to be used for hosting the model (e.g. 'x4').", ), ], instance_type: Annotated[ str, typer.Option( - "--instance-type", help="The cloud instance type where the Inference Endpoint will be deployed (e.g. 'intel-icl').", ), ], region: Annotated[ str, typer.Option( - "--region", help="The cloud region in which the Inference Endpoint will be created (e.g. 'us-east-1').", ), ], vendor: Annotated[ str, typer.Option( - "--vendor", help="The cloud provider or vendor where the Inference Endpoint will be hosted (e.g. 'aws').", ), ], *, namespace: NamespaceOpt = None, - task: TaskOpt = None, + task: Annotated[ + Optional[str], + typer.Option( + help="The task on which to deploy the model (e.g. 'text-classification').", + ), + ] = None, token: TokenOpt = None, ) -> None: api = get_hf_api(token=token) @@ -205,11 +139,9 @@ def deploy_from_catalog( repo: Annotated[ str, typer.Option( - "--repo", help="The name of the model repository associated with the Inference Endpoint (e.g. 'gpt2').", ), ], - *, namespace: NamespaceOpt = None, token: TokenOpt = None, ) -> None: @@ -250,11 +182,36 @@ def describe( @app.command(help="Update an existing endpoint.") def update( endpoint_name: NameArg, - repo: RepoArg = None, - accelerator: AcceleratorOpt = None, - instance_size: InstanceSizeOpt = None, - instance_type: InstanceTypeOpt = None, - framework: FrameworkOpt = None, + repo: Annotated[ + Optional[str], + typer.Option( + help="The name of the model repository associated with the Inference Endpoint (e.g. 'gpt2').", + ), + ] = None, + accelerator: Annotated[ + Optional[str], + typer.Option( + help="The hardware accelerator to be used for inference (e.g. 'cpu').", + ), + ] = None, + instance_size: Annotated[ + Optional[str], + typer.Option( + help="The size or type of the instance to be used for hosting the model (e.g. 'x4').", + ), + ] = None, + instance_type: Annotated[ + Optional[str], + typer.Option( + help="The cloud instance type where the Inference Endpoint will be deployed (e.g. 'intel-icl').", + ), + ] = None, + framework: Annotated[ + Optional[str], + typer.Option( + help="The machine learning framework used for the model (e.g. 
'custom').", + ), + ] = None, revision: Annotated[ Optional[str], typer.Option( @@ -318,7 +275,6 @@ def delete( yes: Annotated[ bool, typer.Option( - "--yes", help="Skip confirmation prompts.", ), ] = False, From 6a50b0b4b07d2b9edacf0ddb7a8e2b30e2d56833 Mon Sep 17 00:00:00 2001 From: Celina Hanouti Date: Thu, 9 Oct 2025 17:13:07 +0200 Subject: [PATCH 08/10] remove unused import --- src/huggingface_hub/cli/inference_endpoints.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/huggingface_hub/cli/inference_endpoints.py b/src/huggingface_hub/cli/inference_endpoints.py index 09337b5f26..3ed8250e55 100644 --- a/src/huggingface_hub/cli/inference_endpoints.py +++ b/src/huggingface_hub/cli/inference_endpoints.py @@ -1,7 +1,5 @@ """CLI commands for Hugging Face Inference Endpoints.""" -from __future__ import annotations - import json from typing import Annotated, Optional From c5b0638905424f4eabda18c66c9966c1e2f671da Mon Sep 17 00:00:00 2001 From: Celina Hanouti Date: Thu, 9 Oct 2025 17:15:51 +0200 Subject: [PATCH 09/10] nit --- src/huggingface_hub/cli/inference_endpoints.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/huggingface_hub/cli/inference_endpoints.py b/src/huggingface_hub/cli/inference_endpoints.py index 3ed8250e55..07db3ed968 100644 --- a/src/huggingface_hub/cli/inference_endpoints.py +++ b/src/huggingface_hub/cli/inference_endpoints.py @@ -30,7 +30,7 @@ def _print_endpoint(endpoint: InferenceEndpoint) -> None: typer.echo(json.dumps(endpoint.raw, indent=2, sort_keys=True)) -@app.command(help="Lists all inference endpoints for the given namespace.") +@app.command(help="Lists all Inference Endpoints for the given namespace.") def list( namespace: NamespaceOpt = None, token: TokenOpt = None, From 5b4111d2e4e934de27ae5ba3321c1e5f0a570c61 Mon Sep 17 00:00:00 2001 From: Celina Hanouti Date: Thu, 9 Oct 2025 18:01:40 +0200 Subject: [PATCH 10/10] nit --- docs/source/en/guides/cli.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/source/en/guides/cli.md b/docs/source/en/guides/cli.md index dc67102cc5..dab79c67ae 100644 --- a/docs/source/en/guides/cli.md +++ b/docs/source/en/guides/cli.md @@ -925,7 +925,6 @@ Use `hf inference-endpoints` to list, deploy, describe, and manage Inference End # Lists endpoints in your namespace >>> hf inference-endpoints list - # Deploy an endpoint from Model Catalog >>> hf inference-endpoints deploy catalog --repo openai/gpt-oss-120b --name my-endpoint @@ -940,7 +939,6 @@ Use `hf inference-endpoints` to list, deploy, describe, and manage Inference End # Delete without confirmation prompt >>> hf inference-endpoints delete my-endpoint --yes - ``` > [!TIP]