From b7e21db33781735d447c8d90cfb5f163cdce2d06 Mon Sep 17 00:00:00 2001
From: axel7083 <42176370+axel7083@users.noreply.github.com>
Date: Fri, 18 Apr 2025 17:11:54 +0200
Subject: [PATCH 1/3] feat(inference): vllm support

Signed-off-by: axel7083 <42176370+axel7083@users.noreply.github.com>
---
 packages/backend/src/assets/ai.json | 7 +
 .../backend/src/assets/inference-images.json | 3 +
 .../backend/src/managers/modelsManager.ts | 3 +
 .../src/managers/playgroundV2Manager.ts | 3 +
 .../src/models/HuggingFaceModelHandler.ts | 1 +
 packages/backend/src/studio.ts | 4 +
 packages/backend/src/workers/provider/VLLM.ts | 162 ++++++++++++++++++
 packages/shared/src/models/IInference.ts | 1 +
 8 files changed, 184 insertions(+)
 create mode 100644 packages/backend/src/workers/provider/VLLM.ts

diff --git a/packages/backend/src/assets/ai.json b/packages/backend/src/assets/ai.json
index 4a6442fb6..eb510d977 100644
--- a/packages/backend/src/assets/ai.json
+++ b/packages/backend/src/assets/ai.json
@@ -526,6 +526,13 @@
       "license": "Apache-2.0",
       "url": "huggingface:/OpenVINO/mistral-7B-instruct-v0.2-int4-ov",
       "backend": "openvino"
+    },
+    {
+      "id": "Qwen/Qwen2-VL-2B-Instruct",
+      "name": "Qwen/Qwen2-VL-2B-Instruct",
+      "description": "Qwen/Qwen2-VL-2B-Instruct",
+      "url": "huggingface:/Qwen/Qwen2-VL-2B-Instruct",
+      "backend": "vllm"
     }
   ],
   "categories": [
diff --git a/packages/backend/src/assets/inference-images.json b/packages/backend/src/assets/inference-images.json
index 880cbc40e..a36044c7b 100644
--- a/packages/backend/src/assets/inference-images.json
+++ b/packages/backend/src/assets/inference-images.json
@@ -8,5 +8,8 @@
   },
   "openvino": {
     "default": "quay.io/ramalama/openvino@sha256:670d91cc322933cc4263606459317cd4ca3fcfb16d59a46b11dcd498c2cd7cb5"
+  },
+  "vllm": {
+    "default": "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.8.4"
   }
 }
diff --git a/packages/backend/src/managers/modelsManager.ts b/packages/backend/src/managers/modelsManager.ts
index 09a26c0cf..74465757b 100644
--- a/packages/backend/src/managers/modelsManager.ts
+++ b/packages/backend/src/managers/modelsManager.ts
@@ -375,6 +375,8 @@ export class ModelsManager implements Disposable {
     model: ModelInfo,
     labels?: { [key: string]: string },
   ): Promise<string> {
+    console.log('[ModelsManager] upload model', model);
+
     // ensure the model upload is not disabled
     if (this.configurationRegistry.getExtensionConfiguration().modelUploadDisabled) {
       console.warn('The model upload is disabled, this may cause the inference server to take a few minutes to start.');
@@ -392,6 +394,7 @@ export class ModelsManager implements Disposable {
 
     // perform download
     const path = uploader.perform(model.id);
+    console.log('[ModelsManager] path got', path);
     await this.updateModelInfos();
 
     return path;
diff --git a/packages/backend/src/managers/playgroundV2Manager.ts b/packages/backend/src/managers/playgroundV2Manager.ts
index b7b4a53ad..78f9cc8ae 100644
--- a/packages/backend/src/managers/playgroundV2Manager.ts
+++ b/packages/backend/src/managers/playgroundV2Manager.ts
@@ -34,6 +34,7 @@ import { McpServerManager } from './playground/McpServerManager';
 import type { ToolSet } from 'ai';
 import { simulateStreamingMiddleware, wrapLanguageModel } from 'ai';
 import { toMcpClients } from '../utils/mcpUtils';
+import { InferenceType } from '@shared/models/IInference';
 
 export class PlaygroundV2Manager implements Disposable {
   readonly #conversationRegistry: ConversationRegistry;
@@ -122,8 +123,10 @@ export class PlaygroundV2Manager implements Disposable {
 
     // create/start inference server if necessary
     const servers = this.inferenceManager.getServers();
+    console.log('servers', servers);
     const server = servers.find(s => s.models.map(mi => mi.id).includes(model.id));
     if (!server) {
+      console.warn(`no server running found with modelId ${model.id}, creating new one`);
       await this.inferenceManager.createInferenceServer(
         await withDefaultConfiguration({
           modelsInfo: [model],
diff --git a/packages/backend/src/models/HuggingFaceModelHandler.ts b/packages/backend/src/models/HuggingFaceModelHandler.ts
index 7c50af411..49c587d97 100644
--- a/packages/backend/src/models/HuggingFaceModelHandler.ts
+++ b/packages/backend/src/models/HuggingFaceModelHandler.ts
@@ -23,6 +23,7 @@ import type { CompletionEvent } from './baseEvent';
 import { getDurationSecondsSince } from '../utils/utils';
 import type { ModelsManager } from '../managers/modelsManager';
 import fs from 'node:fs/promises';
+import { dirname, basename } from 'node:path';
 
 function parseURL(url: string): { repo: string; revision?: string } | undefined {
   const u = URL.parse(url);
diff --git a/packages/backend/src/studio.ts b/packages/backend/src/studio.ts
index 811c1eeaa..dd6b4ef62 100644
--- a/packages/backend/src/studio.ts
+++ b/packages/backend/src/studio.ts
@@ -64,6 +64,7 @@ import { LLAMA_STACK_API_CHANNEL, type LlamaStackAPI } from '@shared/LlamaStackAPI';
 import { LlamaStackManager } from './managers/llama-stack/llamaStackManager';
 import { OpenVINO } from './workers/provider/OpenVINO';
 import os from 'node:os';
+import { VLLM } from './workers/provider/VLLM';
 
 export class Studio {
   readonly #extensionContext: ExtensionContext;
@@ -289,6 +290,9 @@ export class Studio {
         ),
       );
     }
+    this.#extensionContext.subscriptions.push(
+      this.#inferenceProviderRegistry.register(new VLLM(this.#taskRegistry, this.#podmanConnection)),
+    );
 
     /**
      * The inference manager create, stop, manage Inference servers
diff --git a/packages/backend/src/workers/provider/VLLM.ts b/packages/backend/src/workers/provider/VLLM.ts
new file mode 100644
index 000000000..7f2a81e64
--- /dev/null
+++ b/packages/backend/src/workers/provider/VLLM.ts
@@ -0,0 +1,162 @@
+/**********************************************************************
+ * Copyright (C) 2024 Red Hat, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ***********************************************************************/
+
+import { InferenceProvider } from './InferenceProvider';
+import type { TaskRegistry } from '../../registries/TaskRegistry';
+import type { PodmanConnection } from '../../managers/podmanConnection';
+import { type InferenceServer, InferenceType } from '@shared/models/IInference';
+import type { InferenceServerConfig } from '@shared/models/InferenceServerConfig';
+import type { ContainerProviderConnection, MountConfig } from '@podman-desktop/api';
+import * as images from '../../assets/inference-images.json';
+import { LABEL_INFERENCE_SERVER } from '../../utils/inferenceUtils';
+import { DISABLE_SELINUX_LABEL_SECURITY_OPTION } from '../../utils/utils';
+import { basename, dirname } from 'node:path';
+import { join as joinposix } from 'node:path/posix';
+import { getLocalModelFile } from '../../utils/modelsUtils';
+import { SECOND } from './LlamaCppPython';
+
+export class VLLM extends InferenceProvider {
+  constructor(
+    taskRegistry: TaskRegistry,
+    private podmanConnection: PodmanConnection,
+  ) {
+    super(taskRegistry, InferenceType.VLLM, 'vllm');
+  }
+
+  dispose(): void {}
+
+  public enabled = (): boolean => true;
+
+  /**
+   * Here is an example
+   *
+   * podman run -it --rm
+   * -v C:\Users\axels\.cache\huggingface\hub\models--mistralai--Mistral-7B-v0.1:/cache/models--mistralai--Mistral-7B-v0.1
+   * -e HF_HUB_CACHE=/cache
+   * localhost/vllm-cpu-env:latest
+   * --model=/cache/models--mistralai--Mistral-7B-v0.1/snapshots/7231864981174d9bee8c7687c24c8344414eae6b
+   *
+   * @param config
+   */
+  override async perform(config: InferenceServerConfig): Promise<InferenceServer> {
+    if (config.modelsInfo.length !== 1)
+      throw new Error(`only one model is supported, received ${config.modelsInfo.length}`);
+
+    const modelInfo = config.modelsInfo[0];
+    if (modelInfo.backend !== InferenceType.VLLM) {
+      throw new Error(`VLLM requires models with backend type ${InferenceType.VLLM} got ${modelInfo.backend}.`);
+    }
+
+    if (modelInfo.file === undefined) {
+      throw new Error('The model info file provided is undefined');
+    }
+
+    console.log('[VLLM]', config);
+    console.log('[VLLM] modelInfo.file', modelInfo.file.path);
+
+    // something ~/.cache/huggingface/hub/models--facebook--opt-125m/snapshots
+    // modelInfo.file.path
+
+    const fullPath = getLocalModelFile(modelInfo);
+
+    // modelInfo.file.path must be under the form $(HF_HUB_CACHE)/<repo>--<model>/snapshots/<commit>
+    const parent = dirname(fullPath);
+    const commitHash = basename(fullPath);
+    const name = basename(parent);
+    if (name !== 'snapshots') throw new Error('you must provide snapshot path for vllm');
+    const modelCache = dirname(parent);
+
+    let connection: ContainerProviderConnection | undefined;
+    if (config.connection) {
+      connection = this.podmanConnection.getContainerProviderConnection(config.connection);
+    } else {
+      connection = this.podmanConnection.findRunningContainerProviderConnection();
+    }
+
+    if (!connection) throw new Error('no running connection could be found');
+
+    const labels: Record<string, string> = {
+      ...config.labels,
+      [LABEL_INFERENCE_SERVER]: JSON.stringify(config.modelsInfo.map(model => model.id)),
+    };
+
+    const imageInfo = await this.pullImage(connection, config.image ?? images.vllm.default, labels);
+    // https://huggingface.co/docs/transformers/main/en/installation#offline-mode
+    // HF_HUB_OFFLINE in main
+    // TRANSFORMERS_OFFLINE for legacy
+    const envs: string[] = [`HF_HUB_CACHE=/cache`, 'TRANSFORMERS_OFFLINE=1', 'HF_HUB_OFFLINE=1'];
+
+    labels['api'] = `http://localhost:${config.port}/inference`;
+
+    const mounts: MountConfig = [
+      {
+        Target: `/cache/${modelInfo.id}`,
+        Source: modelCache,
+        Type: 'bind',
+      },
+    ];
+
+    const containerInfo = await this.createContainer(
+      imageInfo.engineId,
+      {
+        Image: imageInfo.Id,
+        Detach: true,
+        Labels: labels,
+        HostConfig: {
+          AutoRemove: false,
+          Mounts: mounts,
+          PortBindings: {
+            '8000/tcp': [
+              {
+                HostPort: `${config.port}`,
+              },
+            ],
+          },
+          SecurityOpt: [DISABLE_SELINUX_LABEL_SECURITY_OPTION],
+        },
+        HealthCheck: {
+          // must be the port INSIDE the container not the exposed one
+          Test: ['CMD-SHELL', `curl -sSf localhost:8000/version > /dev/null`],
+          Interval: SECOND * 5,
+          Retries: 4 * 5,
+        },
+        Env: envs,
+        Cmd: [
+          `--model=${joinposix('/cache', modelInfo.id, 'snapshots', commitHash)}`,
+          `--served_model_name=${modelInfo.file.file}`,
+          '--chat-template-content-format=openai',
+        ],
+      },
+      labels,
+    );
+
+    return {
+      models: [modelInfo],
+      status: 'running',
+      connection: {
+        port: config.port,
+      },
+      container: {
+        containerId: containerInfo.id,
+        engineId: containerInfo.engineId,
+      },
+      type: InferenceType.VLLM,
+      labels: labels,
+    };
+  }
+}
diff --git a/packages/shared/src/models/IInference.ts b/packages/shared/src/models/IInference.ts
index 50a0bf5e0..6c167fd7a 100644
--- a/packages/shared/src/models/IInference.ts
+++ b/packages/shared/src/models/IInference.ts
@@ -21,6 +21,7 @@ export enum InferenceType {
   LLAMA_CPP = 'llama-cpp',
   WHISPER_CPP = 'whisper-cpp',
   OPENVINO = 'openvino',
+  VLLM = 'vllm',
   NONE = 'none',
 }
 

From f101d1329624e52a1c37dde2775fc62e1778d51c Mon Sep 17 00:00:00 2001
From: Jeff MAURY
Date: Thu, 15 May 2025 16:24:07 +0200
Subject: [PATCH 2/3] fix: rebase and align with Windows

Signed-off-by: Jeff MAURY
---
 .../src/managers/playgroundV2Manager.ts | 2 +-
 .../src/models/HuggingFaceModelHandler.ts | 1 -
 packages/backend/src/workers/provider/VLLM.ts | 25 +++++++------------
 .../src/pages/InferenceServerDetails.svelte | 1 +
 packages/shared/src/models/IInference.ts | 1 +
 5 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/packages/backend/src/managers/playgroundV2Manager.ts b/packages/backend/src/managers/playgroundV2Manager.ts
index 78f9cc8ae..fef481269 100644
--- a/packages/backend/src/managers/playgroundV2Manager.ts
+++ b/packages/backend/src/managers/playgroundV2Manager.ts
@@ -256,7 +256,7 @@ export class PlaygroundV2Manager implements Disposable {
 
     const start = Date.now();
     streamProcessor
-      .stream(model, tools, options)
+      .stream(model, tools, server.type === InferenceType.VLLM ? {} : options)
       .consumeStream()
       .then(() => {
         this.telemetry.logUsage('playground.message.complete', {
diff --git a/packages/backend/src/models/HuggingFaceModelHandler.ts b/packages/backend/src/models/HuggingFaceModelHandler.ts
index 49c587d97..7c50af411 100644
--- a/packages/backend/src/models/HuggingFaceModelHandler.ts
+++ b/packages/backend/src/models/HuggingFaceModelHandler.ts
@@ -23,7 +23,6 @@ import type { CompletionEvent } from './baseEvent';
 import { getDurationSecondsSince } from '../utils/utils';
 import type { ModelsManager } from '../managers/modelsManager';
 import fs from 'node:fs/promises';
-import { dirname, basename } from 'node:path';
 
 function parseURL(url: string): { repo: string; revision?: string } | undefined {
   const u = URL.parse(url);
diff --git a/packages/backend/src/workers/provider/VLLM.ts b/packages/backend/src/workers/provider/VLLM.ts
index 7f2a81e64..77b8d77cd 100644
--- a/packages/backend/src/workers/provider/VLLM.ts
+++ b/packages/backend/src/workers/provider/VLLM.ts
@@ -25,9 +25,7 @@ import type { ContainerProviderConnection, MountConfig } from '@podman-desktop/api';
 import * as images from '../../assets/inference-images.json';
 import { LABEL_INFERENCE_SERVER } from '../../utils/inferenceUtils';
 import { DISABLE_SELINUX_LABEL_SECURITY_OPTION } from '../../utils/utils';
-import { basename, dirname } from 'node:path';
-import { join as joinposix } from 'node:path/posix';
-import { getLocalModelFile } from '../../utils/modelsUtils';
+import { getHuggingFaceModelMountInfo } from '../../utils/modelsUtils';
 import { SECOND } from './LlamaCppPython';
 
 export class VLLM extends InferenceProvider {
@@ -72,14 +70,9 @@ export class VLLM extends InferenceProvider {
     // something ~/.cache/huggingface/hub/models--facebook--opt-125m/snapshots
     // modelInfo.file.path
 
-    const fullPath = getLocalModelFile(modelInfo);
-
-    // modelInfo.file.path must be under the form $(HF_HUB_CACHE)/<repo>--<model>/snapshots/<commit>
-    const parent = dirname(fullPath);
-    const commitHash = basename(fullPath);
-    const name = basename(parent);
-    if (name !== 'snapshots') throw new Error('you must provide snapshot path for vllm');
-    const modelCache = dirname(parent);
+    // get model mount settings
+    const mountInfo = getHuggingFaceModelMountInfo(modelInfo);
+    const modelCache = mountInfo.suffix ? `/cache/${mountInfo.suffix}` : '/cache';
 
     let connection: ContainerProviderConnection | undefined;
     if (config.connection) {
@@ -101,12 +94,12 @@
     // TRANSFORMERS_OFFLINE for legacy
     const envs: string[] = [`HF_HUB_CACHE=/cache`, 'TRANSFORMERS_OFFLINE=1', 'HF_HUB_OFFLINE=1'];
 
-    labels['api'] = `http://localhost:${config.port}/inference`;
+    labels['api'] = `http://localhost:${config.port}/v1`;
 
     const mounts: MountConfig = [
       {
-        Target: `/cache/${modelInfo.id}`,
-        Source: modelCache,
+        Target: `/cache`,
+        Source: mountInfo.mount,
         Type: 'bind',
       },
     ];
 
     const containerInfo = await this.createContainer(
@@ -137,8 +130,8 @@
         },
         Env: envs,
         Cmd: [
-          `--model=${joinposix('/cache', modelInfo.id, 'snapshots', commitHash)}`,
-          `--served_model_name=${modelInfo.file.file}`,
+          `--model=${modelCache}`,
+          `--served_model_name=${modelInfo.name}`,
           '--chat-template-content-format=openai',
         ],
       },
diff --git a/packages/frontend/src/pages/InferenceServerDetails.svelte b/packages/frontend/src/pages/InferenceServerDetails.svelte
index 0ea8b1d88..8e89db501 100644
--- a/packages/frontend/src/pages/InferenceServerDetails.svelte
+++ b/packages/frontend/src/pages/InferenceServerDetails.svelte
@@ -56,6 +56,7 @@ const generate = async (language: string, variant: string): Promise<void> => {
   let options: RequestOptions | undefined;
   switch (service?.type) {
     case InferenceType.LLAMA_CPP:
+    case InferenceType.VLLM:
       options = {
         url: `http://localhost:${service?.connection.port || '??'}/v1/chat/completions`,
         method: 'POST',
diff --git a/packages/shared/src/models/IInference.ts b/packages/shared/src/models/IInference.ts
index 6c167fd7a..1f9e4e09b 100644
--- a/packages/shared/src/models/IInference.ts
+++ b/packages/shared/src/models/IInference.ts
@@ -29,6 +29,7 @@ const InferenceTypeLabel = {
   'llama-cpp': 'llamacpp',
   'whisper-cpp': 'whispercpp',
   openvino: 'openvino',
+  vllm: 'vLLM',
   none: 'None',
 };
 

From 14e1328438a535ab28ed84715cefa5870118cb30 Mon Sep 17 00:00:00 2001
From: Jeff MAURY
Date: Thu, 19 Jun 2025 17:50:53 +0200
Subject: [PATCH 3/3] fix: use vllm 0.9.0.1 and support ARM arch

Signed-off-by: Jeff MAURY
---
 packages/backend/src/assets/inference-images.json | 2 +-
 packages/backend/src/workers/provider/VLLM.ts     | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/packages/backend/src/assets/inference-images.json b/packages/backend/src/assets/inference-images.json
index a36044c7b..886076b98 100644
--- a/packages/backend/src/assets/inference-images.json
+++ b/packages/backend/src/assets/inference-images.json
@@ -10,6 +10,6 @@
     "default": "quay.io/ramalama/openvino@sha256:670d91cc322933cc4263606459317cd4ca3fcfb16d59a46b11dcd498c2cd7cb5"
   },
   "vllm": {
-    "default": "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.8.4"
+    "default": "quay.io/podman-ai-lab/vllm@sha256:8a2d2894835bcb560b7c76b0c0f0e8d19ef21de5ad0c9508809ba73cfe349780"
   }
 }
diff --git a/packages/backend/src/workers/provider/VLLM.ts b/packages/backend/src/workers/provider/VLLM.ts
index 77b8d77cd..e413aa90e 100644
--- a/packages/backend/src/workers/provider/VLLM.ts
+++ b/packages/backend/src/workers/provider/VLLM.ts
@@ -133,6 +133,7 @@ export class VLLM extends InferenceProvider {
           `--model=${modelCache}`,
           `--served_model_name=${modelInfo.name}`,
           '--chat-template-content-format=openai',
+          '--dtype=float32',
         ],
       },
       labels,
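For reference, a minimal sketch (TypeScript) of how a client could exercise the
OpenAI-compatible endpoint exposed by the vLLM inference server these patches start.
The host port (8000 below) stands in for whatever config.port gets published for
8000/tcp, and the model name assumes the Qwen/Qwen2-VL-2B-Instruct recipe added to
ai.json, since the container registers it through --served_model_name (the model's
name field):

    // queryVllm.ts - assumes a vLLM inference server is reachable on localhost:8000
    async function main(): Promise<void> {
      const response = await fetch('http://localhost:8000/v1/chat/completions', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          // must match the --served_model_name value passed to the container
          model: 'Qwen/Qwen2-VL-2B-Instruct',
          messages: [{ role: 'user', content: 'Say hello in one sentence.' }],
        }),
      });
      const completion = await response.json();
      console.log(completion.choices[0].message.content);
    }

    main().catch(console.error);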