From b7e21db33781735d447c8d90cfb5f163cdce2d06 Mon Sep 17 00:00:00 2001
From: axel7083 <42176370+axel7083@users.noreply.github.com>
Date: Fri, 18 Apr 2025 17:11:54 +0200
Subject: [PATCH 1/3] feat(inference): vllm support

Signed-off-by: axel7083 <42176370+axel7083@users.noreply.github.com>
---
 packages/backend/src/assets/ai.json | 7 +
 .../backend/src/assets/inference-images.json | 3 +
 .../backend/src/managers/modelsManager.ts | 3 +
 .../src/managers/playgroundV2Manager.ts | 3 +
 .../src/models/HuggingFaceModelHandler.ts | 1 +
 packages/backend/src/studio.ts | 4 +
 packages/backend/src/workers/provider/VLLM.ts | 162 ++++++++++++++++++
 packages/shared/src/models/IInference.ts | 1 +
 8 files changed, 184 insertions(+)
 create mode 100644 packages/backend/src/workers/provider/VLLM.ts

diff --git a/packages/backend/src/assets/ai.json b/packages/backend/src/assets/ai.json
index 4a6442fb6..eb510d977 100644
--- a/packages/backend/src/assets/ai.json
+++ b/packages/backend/src/assets/ai.json
@@ -526,6 +526,13 @@
       "license": "Apache-2.0",
       "url": "huggingface:/OpenVINO/mistral-7B-instruct-v0.2-int4-ov",
       "backend": "openvino"
+    },
+    {
+      "id": "Qwen/Qwen2-VL-2B-Instruct",
+      "name": "Qwen/Qwen2-VL-2B-Instruct",
+      "description": "Qwen/Qwen2-VL-2B-Instruct",
+      "url": "huggingface:/Qwen/Qwen2-VL-2B-Instruct",
+      "backend": "vllm"
     }
   ],
   "categories": [
diff --git a/packages/backend/src/assets/inference-images.json b/packages/backend/src/assets/inference-images.json
index 880cbc40e..a36044c7b 100644
--- a/packages/backend/src/assets/inference-images.json
+++ b/packages/backend/src/assets/inference-images.json
@@ -8,5 +8,8 @@
   },
   "openvino": {
     "default": "quay.io/ramalama/openvino@sha256:670d91cc322933cc4263606459317cd4ca3fcfb16d59a46b11dcd498c2cd7cb5"
+  },
+  "vllm": {
+    "default": "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.8.4"
   }
 }
diff --git a/packages/backend/src/managers/modelsManager.ts b/packages/backend/src/managers/modelsManager.ts
index 09a26c0cf..74465757b 100644
--- a/packages/backend/src/managers/modelsManager.ts
+++ b/packages/backend/src/managers/modelsManager.ts
@@ -375,6 +375,8 @@ export class ModelsManager implements Disposable {
     model: ModelInfo,
     labels?: { [key: string]: string },
   ): Promise<string> {
+    console.log('[ModelsManager] upload model', model);
+
     // ensure the model upload is not disabled
     if (this.configurationRegistry.getExtensionConfiguration().modelUploadDisabled) {
       console.warn('The model upload is disabled, this may cause the inference server to take a few minutes to start.');
@@ -392,6 +394,7 @@ export class ModelsManager implements Disposable {
 
     // perform download
     const path = uploader.perform(model.id);
+    console.log('[ModelsManager] path got', path);
     await this.updateModelInfos();
 
     return path;
diff --git a/packages/backend/src/managers/playgroundV2Manager.ts b/packages/backend/src/managers/playgroundV2Manager.ts
index b7b4a53ad..78f9cc8ae 100644
--- a/packages/backend/src/managers/playgroundV2Manager.ts
+++ b/packages/backend/src/managers/playgroundV2Manager.ts
@@ -34,6 +34,7 @@ import { McpServerManager } from './playground/McpServerManager';
 import type { ToolSet } from 'ai';
 import { simulateStreamingMiddleware, wrapLanguageModel } from 'ai';
 import { toMcpClients } from '../utils/mcpUtils';
+import { InferenceType } from '@shared/models/IInference';
 
 export class PlaygroundV2Manager implements Disposable {
   readonly #conversationRegistry: ConversationRegistry;
@@ -122,8 +123,10 @@ export class PlaygroundV2Manager implements Disposable {
 
     // create/start inference server if necessary
     const servers = this.inferenceManager.getServers();
+    console.log('servers', servers);
     const server = servers.find(s => s.models.map(mi => mi.id).includes(model.id));
     if (!server) {
+      console.warn(`no server running found with modelId ${model.id}, creating new one`);
       await this.inferenceManager.createInferenceServer(
         await withDefaultConfiguration({
           modelsInfo: [model],
diff --git a/packages/backend/src/models/HuggingFaceModelHandler.ts b/packages/backend/src/models/HuggingFaceModelHandler.ts
index 7c50af411..49c587d97 100644
--- a/packages/backend/src/models/HuggingFaceModelHandler.ts
+++ b/packages/backend/src/models/HuggingFaceModelHandler.ts
@@ -23,6 +23,7 @@ import type { CompletionEvent } from './baseEvent';
 import { getDurationSecondsSince } from '../utils/utils';
 import type { ModelsManager } from '../managers/modelsManager';
 import fs from 'node:fs/promises';
+import { dirname, basename } from 'node:path';
 
 function parseURL(url: string): { repo: string; revision?: string } | undefined {
   const u = URL.parse(url);
diff --git a/packages/backend/src/studio.ts b/packages/backend/src/studio.ts
index 811c1eeaa..dd6b4ef62 100644
--- a/packages/backend/src/studio.ts
+++ b/packages/backend/src/studio.ts
@@ -64,6 +64,7 @@ import { LLAMA_STACK_API_CHANNEL, type LlamaStackAPI } from '@shared/LlamaStackAPI';
 import { LlamaStackManager } from './managers/llama-stack/llamaStackManager';
 import { OpenVINO } from './workers/provider/OpenVINO';
 import os from 'node:os';
+import { VLLM } from './workers/provider/VLLM';
 
 export class Studio {
   readonly #extensionContext: ExtensionContext;
@@ -289,6 +290,9 @@ export class Studio {
         ),
       );
     }
+    this.#extensionContext.subscriptions.push(
+      this.#inferenceProviderRegistry.register(new VLLM(this.#taskRegistry, this.#podmanConnection)),
+    );
 
     /**
      * The inference manager create, stop, manage Inference servers
diff --git a/packages/backend/src/workers/provider/VLLM.ts b/packages/backend/src/workers/provider/VLLM.ts
new file mode 100644
index 000000000..7f2a81e64
--- /dev/null
+++ b/packages/backend/src/workers/provider/VLLM.ts
@@ -0,0 +1,162 @@
+/**********************************************************************
+ * Copyright (C) 2024 Red Hat, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ***********************************************************************/
+
+import { InferenceProvider } from './InferenceProvider';
+import type { TaskRegistry } from '../../registries/TaskRegistry';
+import type { PodmanConnection } from '../../managers/podmanConnection';
+import { type InferenceServer, InferenceType } from '@shared/models/IInference';
+import type { InferenceServerConfig } from '@shared/models/InferenceServerConfig';
+import type { ContainerProviderConnection, MountConfig } from '@podman-desktop/api';
+import * as images from '../../assets/inference-images.json';
+import { LABEL_INFERENCE_SERVER } from '../../utils/inferenceUtils';
+import { DISABLE_SELINUX_LABEL_SECURITY_OPTION } from '../../utils/utils';
+import { basename, dirname } from 'node:path';
+import { join as joinposix } from 'node:path/posix';
+import { getLocalModelFile } from '../../utils/modelsUtils';
+import { SECOND } from './LlamaCppPython';
+
+export class VLLM extends InferenceProvider {
+  constructor(
+    taskRegistry: TaskRegistry,
+    private podmanConnection: PodmanConnection,
+  ) {
+    super(taskRegistry, InferenceType.VLLM, 'vllm');
+  }
+
+  dispose(): void {}
+
+  public enabled = (): boolean => true;
+
+  /**
+   * Here is an example
+   *
+   * podman run -it --rm
+   * -v C:\Users\axels\.cache\huggingface\hub\models--mistralai--Mistral-7B-v0.1:/cache/models--mistralai--Mistral-7B-v0.1
+   * -e HF_HUB_CACHE=/cache
+   * localhost/vllm-cpu-env:latest
+   * --model=/cache/models--mistralai--Mistral-7B-v0.1/snapshots/7231864981174d9bee8c7687c24c8344414eae6b
+   *
+   * @param config
+   */
+  override async perform(config: InferenceServerConfig): Promise<InferenceServer> {
+    if (config.modelsInfo.length !== 1)
+      throw new Error(`only one model is supported, received ${config.modelsInfo.length}`);
+
+    const modelInfo = config.modelsInfo[0];
+    if (modelInfo.backend !== InferenceType.VLLM) {
+      throw new Error(`VLLM requires models with backend type ${InferenceType.VLLM} got ${modelInfo.backend}.`);
+    }
+
+    if (modelInfo.file === undefined) {
+      throw new Error('The model info file provided is undefined');
+    }
+
+    console.log('[VLLM]', config);
+    console.log('[VLLM] modelInfo.file', modelInfo.file.path);
+
+    // something ~/.cache/huggingface/hub/models--facebook--opt-125m/snapshots
+    // modelInfo.file.path
+
+    const fullPath = getLocalModelFile(modelInfo);
+
+    // modelInfo.file.path must be under the form $(HF_HUB_CACHE)/<repo>--<model>/snapshots/<commit>
+    const parent = dirname(fullPath);
+    const commitHash = basename(fullPath);
+    const name = basename(parent);
+    if (name !== 'snapshots') throw new Error('you must provide snapshot path for vllm');
+    const modelCache = dirname(parent);
+
+    let connection: ContainerProviderConnection | undefined;
+    if (config.connection) {
+      connection = this.podmanConnection.getContainerProviderConnection(config.connection);
+    } else {
+      connection = this.podmanConnection.findRunningContainerProviderConnection();
+    }
+
+    if (!connection) throw new Error('no running connection could be found');
+
+    const labels: Record<string, string> = {
+      ...config.labels,
+      [LABEL_INFERENCE_SERVER]: JSON.stringify(config.modelsInfo.map(model => model.id)),
+    };
+
+    const imageInfo = await this.pullImage(connection, config.image ?? images.vllm.default, labels);
+    // https://huggingface.co/docs/transformers/main/en/installation#offline-mode
+    // HF_HUB_OFFLINE in main
+    // TRANSFORMERS_OFFLINE for legacy
+    const envs: string[] = [`HF_HUB_CACHE=/cache`, 'TRANSFORMERS_OFFLINE=1', 'HF_HUB_OFFLINE=1'];
+
+    labels['api'] = `http://localhost:${config.port}/inference`;
+
+    const mounts: MountConfig = [
+      {
+        Target: `/cache/${modelInfo.id}`,
+        Source: modelCache,
+        Type: 'bind',
+      },
+    ];
+
+    const containerInfo = await this.createContainer(
+      imageInfo.engineId,
+      {
+        Image: imageInfo.Id,
+        Detach: true,
+        Labels: labels,
+        HostConfig: {
+          AutoRemove: false,
+          Mounts: mounts,
+          PortBindings: {
+            '8000/tcp': [
+              {
+                HostPort: `${config.port}`,
+              },
+            ],
+          },
+          SecurityOpt: [DISABLE_SELINUX_LABEL_SECURITY_OPTION],
+        },
+        HealthCheck: {
+          // must be the port INSIDE the container not the exposed one
+          Test: ['CMD-SHELL', `curl -sSf localhost:8000/version > /dev/null`],
+          Interval: SECOND * 5,
+          Retries: 4 * 5,
+        },
+        Env: envs,
+        Cmd: [
+          `--model=${joinposix('/cache', modelInfo.id, 'snapshots', commitHash)}`,
+          `--served_model_name=${modelInfo.file.file}`,
+          '--chat-template-content-format=openai',
+        ],
+      },
+      labels,
+    );
+
+    return {
+      models: [modelInfo],
+      status: 'running',
+      connection: {
+        port: config.port,
+      },
+      container: {
+        containerId: containerInfo.id,
+        engineId: containerInfo.engineId,
+      },
+      type: InferenceType.VLLM,
+      labels: labels,
+    };
+  }
+}
diff --git a/packages/shared/src/models/IInference.ts b/packages/shared/src/models/IInference.ts
index 50a0bf5e0..6c167fd7a 100644
--- a/packages/shared/src/models/IInference.ts
+++ b/packages/shared/src/models/IInference.ts
@@ -21,6 +21,7 @@ export enum InferenceType {
   LLAMA_CPP = 'llama-cpp',
   WHISPER_CPP = 'whisper-cpp',
   OPENVINO = 'openvino',
+  VLLM = 'vllm',
   NONE = 'none',
 }
 

From f101d1329624e52a1c37dde2775fc62e1778d51c Mon Sep 17 00:00:00 2001
From: Jeff MAURY
Date: Thu, 15 May 2025 16:24:07 +0200
Subject: [PATCH 2/3] fix: rebase and align with Windows

Signed-off-by: Jeff MAURY
---
 .../src/managers/playgroundV2Manager.ts | 2 +-
 .../src/models/HuggingFaceModelHandler.ts | 1 -
 packages/backend/src/workers/provider/VLLM.ts | 25 +++++++------------
 .../src/pages/InferenceServerDetails.svelte | 1 +
 packages/shared/src/models/IInference.ts | 1 +
 5 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/packages/backend/src/managers/playgroundV2Manager.ts b/packages/backend/src/managers/playgroundV2Manager.ts
index 78f9cc8ae..fef481269 100644
--- a/packages/backend/src/managers/playgroundV2Manager.ts
+++ b/packages/backend/src/managers/playgroundV2Manager.ts
@@ -256,7 +256,7 @@ export class PlaygroundV2Manager implements Disposable {
 
     const start = Date.now();
     streamProcessor
-      .stream(model, tools, options)
+      .stream(model, tools, server.type === InferenceType.VLLM ? {} : options)
       .consumeStream()
       .then(() => {
         this.telemetry.logUsage('playground.message.complete', {
diff --git a/packages/backend/src/models/HuggingFaceModelHandler.ts b/packages/backend/src/models/HuggingFaceModelHandler.ts
index 49c587d97..7c50af411 100644
--- a/packages/backend/src/models/HuggingFaceModelHandler.ts
+++ b/packages/backend/src/models/HuggingFaceModelHandler.ts
@@ -23,7 +23,6 @@ import type { CompletionEvent } from './baseEvent';
 import { getDurationSecondsSince } from '../utils/utils';
 import type { ModelsManager } from '../managers/modelsManager';
 import fs from 'node:fs/promises';
-import { dirname, basename } from 'node:path';
 
 function parseURL(url: string): { repo: string; revision?: string } | undefined {
   const u = URL.parse(url);
diff --git a/packages/backend/src/workers/provider/VLLM.ts b/packages/backend/src/workers/provider/VLLM.ts
index 7f2a81e64..77b8d77cd 100644
--- a/packages/backend/src/workers/provider/VLLM.ts
+++ b/packages/backend/src/workers/provider/VLLM.ts
@@ -25,9 +25,7 @@ import type { ContainerProviderConnection, MountConfig } from '@podman-desktop/api';
 import * as images from '../../assets/inference-images.json';
 import { LABEL_INFERENCE_SERVER } from '../../utils/inferenceUtils';
 import { DISABLE_SELINUX_LABEL_SECURITY_OPTION } from '../../utils/utils';
-import { basename, dirname } from 'node:path';
-import { join as joinposix } from 'node:path/posix';
-import { getLocalModelFile } from '../../utils/modelsUtils';
+import { getHuggingFaceModelMountInfo } from '../../utils/modelsUtils';
 import { SECOND } from './LlamaCppPython';
 
 export class VLLM extends InferenceProvider {
@@ -72,14 +70,9 @@ export class VLLM extends InferenceProvider {
     // something ~/.cache/huggingface/hub/models--facebook--opt-125m/snapshots
     // modelInfo.file.path
 
-    const fullPath = getLocalModelFile(modelInfo);
-
-    // modelInfo.file.path must be under the form $(HF_HUB_CACHE)/<repo>--<model>/snapshots/<commit>
-    const parent = dirname(fullPath);
-    const commitHash = basename(fullPath);
-    const name = basename(parent);
-    if (name !== 'snapshots') throw new Error('you must provide snapshot path for vllm');
-    const modelCache = dirname(parent);
+    // get model mount settings
+    const mountInfo = getHuggingFaceModelMountInfo(modelInfo);
+    const modelCache = mountInfo.suffix ? `/cache/${mountInfo.suffix}` : '/cache';
 
     let connection: ContainerProviderConnection | undefined;
     if (config.connection) {
@@ -101,12 +94,12 @@
     // TRANSFORMERS_OFFLINE for legacy
     const envs: string[] = [`HF_HUB_CACHE=/cache`, 'TRANSFORMERS_OFFLINE=1', 'HF_HUB_OFFLINE=1'];
 
-    labels['api'] = `http://localhost:${config.port}/inference`;
+    labels['api'] = `http://localhost:${config.port}/v1`;
 
     const mounts: MountConfig = [
       {
-        Target: `/cache/${modelInfo.id}`,
-        Source: modelCache,
+        Target: `/cache`,
+        Source: mountInfo.mount,
         Type: 'bind',
       },
     ];
 
     const containerInfo = await this.createContainer(
@@ -137,8 +130,8 @@
         },
         Env: envs,
         Cmd: [
-          `--model=${joinposix('/cache', modelInfo.id, 'snapshots', commitHash)}`,
-          `--served_model_name=${modelInfo.file.file}`,
+          `--model=${modelCache}`,
+          `--served_model_name=${modelInfo.name}`,
           '--chat-template-content-format=openai',
         ],
       },
diff --git a/packages/frontend/src/pages/InferenceServerDetails.svelte b/packages/frontend/src/pages/InferenceServerDetails.svelte
index 0ea8b1d88..8e89db501 100644
--- a/packages/frontend/src/pages/InferenceServerDetails.svelte
+++ b/packages/frontend/src/pages/InferenceServerDetails.svelte
@@ -56,6 +56,7 @@ const generate = async (language: string, variant: string): Promise<void> => {
   let options: RequestOptions | undefined;
   switch (service?.type) {
     case InferenceType.LLAMA_CPP:
+    case InferenceType.VLLM:
       options = {
         url: `http://localhost:${service?.connection.port || '??'}/v1/chat/completions`,
         method: 'POST',
diff --git a/packages/shared/src/models/IInference.ts b/packages/shared/src/models/IInference.ts
index 6c167fd7a..1f9e4e09b 100644
--- a/packages/shared/src/models/IInference.ts
+++ b/packages/shared/src/models/IInference.ts
@@ -29,6 +29,7 @@ const InferenceTypeLabel = {
   'llama-cpp': 'llamacpp',
   'whisper-cpp': 'whispercpp',
   openvino: 'openvino',
+  vllm: 'vLLM',
   none: 'None',
 };
 

From 14e1328438a535ab28ed84715cefa5870118cb30 Mon Sep 17 00:00:00 2001
From: Jeff MAURY
Date: Thu, 19 Jun 2025 17:50:53 +0200
Subject: [PATCH 3/3] fix: use vllm 0.9.0.1 and support ARM arch

Signed-off-by: Jeff MAURY
---
 packages/backend/src/assets/inference-images.json | 2 +-
 packages/backend/src/workers/provider/VLLM.ts     | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/packages/backend/src/assets/inference-images.json b/packages/backend/src/assets/inference-images.json
index a36044c7b..886076b98 100644
--- a/packages/backend/src/assets/inference-images.json
+++ b/packages/backend/src/assets/inference-images.json
@@ -10,6 +10,6 @@
     "default": "quay.io/ramalama/openvino@sha256:670d91cc322933cc4263606459317cd4ca3fcfb16d59a46b11dcd498c2cd7cb5"
   },
   "vllm": {
-    "default": "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.8.4"
+    "default": "quay.io/podman-ai-lab/vllm@sha256:8a2d2894835bcb560b7c76b0c0f0e8d19ef21de5ad0c9508809ba73cfe349780"
   }
 }
diff --git a/packages/backend/src/workers/provider/VLLM.ts b/packages/backend/src/workers/provider/VLLM.ts
index 77b8d77cd..e413aa90e 100644
--- a/packages/backend/src/workers/provider/VLLM.ts
+++ b/packages/backend/src/workers/provider/VLLM.ts
@@ -133,6 +133,7 @@ export class VLLM extends InferenceProvider {
           `--model=${modelCache}`,
           `--served_model_name=${modelInfo.name}`,
           '--chat-template-content-format=openai',
+          '--dtype=float32',
         ],
       },
       labels,
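For reference, a minimal sketch (TypeScript) of how a client could exercise the
OpenAI-compatible endpoint exposed by the vLLM inference server these patches start.
The host port (8000 below) stands in for whatever config.port gets published for
8000/tcp, and the model name assumes the Qwen/Qwen2-VL-2B-Instruct recipe added to
ai.json, since the container registers it through --served_model_name (the model's
name field):

    // queryVllm.ts - assumes a vLLM inference server is reachable on localhost:8000
    async function main(): Promise<void> {
      const response = await fetch('http://localhost:8000/v1/chat/completions', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          // must match the --served_model_name value passed to the container
          model: 'Qwen/Qwen2-VL-2B-Instruct',
          messages: [{ role: 'user', content: 'Say hello in one sentence.' }],
        }),
      });
      const completion = await response.json();
      console.log(completion.choices[0].message.content);
    }

    main().catch(console.error);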