13 changes: 12 additions & 1 deletion .github/workflows/e2e_tests.yaml
@@ -10,7 +10,7 @@ jobs:
fail-fast: false
matrix:
mode: ["server", "library"]
environment: ["ci", "azure", "vertexai"]
environment: ["ci", "azure", "vertexai", "watsonx"]

name: "E2E: ${{ matrix.mode }} mode / ${{ matrix.environment }}"

@@ -200,6 +200,8 @@ jobs:
VERTEX_AI_PROJECT: ${{ secrets.VERTEX_AI_PROJECT }}
GOOGLE_APPLICATION_CREDENTIALS: ${{ env.GOOGLE_APPLICATION_CREDENTIALS }}
GCP_KEYS_PATH: ${{ env.GCP_KEYS_PATH }}
WATSONX_PROJECT_ID: ${{ secrets.WATSONX_PROJECT_ID }}
WATSONX_API_KEY: ${{ secrets.WATSONX_API_KEY }}
run: |
# Debug: Check if environment variable is available for docker-compose
echo "OPENAI_API_KEY is set: $([ -n "$OPENAI_API_KEY" ] && echo 'YES' || echo 'NO')"
@@ -226,6 +228,8 @@ jobs:
VERTEX_AI_PROJECT: ${{ secrets.VERTEX_AI_PROJECT }}
GOOGLE_APPLICATION_CREDENTIALS: ${{ env.GOOGLE_APPLICATION_CREDENTIALS }}
GCP_KEYS_PATH: ${{ env.GCP_KEYS_PATH }}
WATSONX_PROJECT_ID: ${{ secrets.WATSONX_PROJECT_ID }}
WATSONX_API_KEY: ${{ secrets.WATSONX_API_KEY }}
run: |
echo "Starting service in library mode (1 container)"
docker compose -f docker-compose-library.yaml up -d
@@ -256,6 +260,13 @@ jobs:
exit 1
}

# watsonx has a different convention than "<provider>/<model>"
- name: Set watsonx test overrides
if: matrix.environment == 'watsonx'
run: |
echo "E2E_DEFAULT_MODEL_OVERRIDE=watsonx/watsonx/meta-llama/llama-3-3-70b-instruct" >> $GITHUB_ENV
echo "E2E_DEFAULT_PROVIDER_OVERRIDE=watsonx" >> $GITHUB_ENV

- name: Run e2e tests
env:
TERM: xterm-256color
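The `E2E_DEFAULT_MODEL_OVERRIDE` value above reflects how llama-stack composes fully qualified model identifiers as `<provider>/<provider_model_id>`: the watsonx provider's own model IDs already carry a `watsonx/` prefix (see `provider_model_id` in `examples/watsonx-run.yaml` below), so the composed identifier ends up with the prefix twice. A minimal sketch of that assumed composition; `compose_identifier` is a hypothetical helper, not a function in this repository:

```python
# Hypothetical illustration of the assumed identifier composition;
# the helper name is not part of the repository.
def compose_identifier(provider_id: str, provider_model_id: str) -> str:
    """Compose a llama-stack model identifier as '<provider>/<provider_model_id>'."""
    return f"{provider_id}/{provider_model_id}"

# Typical provider: the provider-side model ID carries no prefix.
assert compose_identifier("azure", "gpt-4.1") == "azure/gpt-4.1"

# watsonx: the provider-side model ID already starts with 'watsonx/', so the
# composed identifier carries the prefix twice, matching the
# E2E_DEFAULT_MODEL_OVERRIDE value set above.
assert (
    compose_identifier("watsonx", "watsonx/meta-llama/llama-3-3-70b-instruct")
    == "watsonx/watsonx/meta-llama/llama-3-3-70b-instruct"
)
```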
2 changes: 2 additions & 0 deletions README.md
@@ -122,6 +122,7 @@ Lightspeed Core Stack is based on the FastAPI framework (Uvicorn). The service i
| OpenAI | https://platform.openai.com |
| Azure OpenAI | https://azure.microsoft.com/en-us/products/ai-services/openai-service |
| Google VertexAI| https://cloud.google.com/vertex-ai |
| IBM WatsonX | https://www.ibm.com/products/watsonx |
| RHOAI (vLLM) | See tests/e2e-prow/rhoai/configs/run.yaml |
| RHEL AI (vLLM) | See tests/e2e/configs/run-rhelai.yaml |

@@ -177,6 +178,7 @@ __Note__: Support for individual models is dependent on the specific inference p
| Azure | gpt-5, gpt-5-mini, gpt-5-nano, gpt-5-chat, gpt-4.1, gpt-4.1-mini, gpt-4.1-nano, o3-mini, o4-mini | Yes | remote::azure | [1](examples/azure-run.yaml) |
| Azure | o1, o1-mini | No | remote::azure | |
| VertexAI | google/gemini-2.0-flash, google/gemini-2.5-flash, google/gemini-2.5-pro [^1] | Yes | remote::vertexai | [1](examples/vertexai-run.yaml) |
| WatsonX | meta-llama/llama-3-3-70b-instruct | Yes | remote::watsonx | [1](examples/watsonx-run.yaml) |

[^1]: The list of models is limited by design in llama-stack; future versions will probably allow more models to be used (see [here](https://github.com/llamastack/llama-stack/blob/release-0.3.x/llama_stack/providers/remote/inference/vertexai/vertexai.py#L54))

4 changes: 4 additions & 0 deletions docker-compose-library.yaml
@@ -34,6 +34,10 @@ services:
- GOOGLE_APPLICATION_CREDENTIALS=${GOOGLE_APPLICATION_CREDENTIALS:-}
- VERTEX_AI_PROJECT=${VERTEX_AI_PROJECT:-}
- VERTEX_AI_LOCATION=${VERTEX_AI_LOCATION:-}
# WatsonX
- WATSONX_BASE_URL=${WATSONX_BASE_URL:-}
- WATSONX_PROJECT_ID=${WATSONX_PROJECT_ID:-}
- WATSONX_API_KEY=${WATSONX_API_KEY:-}
# Enable debug logging if needed
- LLAMA_STACK_LOGGING=${LLAMA_STACK_LOGGING:-}
healthcheck:
4 changes: 4 additions & 0 deletions docker-compose.yaml
@@ -32,6 +32,10 @@ services:
- GOOGLE_APPLICATION_CREDENTIALS=${GOOGLE_APPLICATION_CREDENTIALS:-}
- VERTEX_AI_PROJECT=${VERTEX_AI_PROJECT:-}
- VERTEX_AI_LOCATION=${VERTEX_AI_LOCATION:-}
# WatsonX
- WATSONX_BASE_URL=${WATSONX_BASE_URL:-}
- WATSONX_PROJECT_ID=${WATSONX_PROJECT_ID:-}
- WATSONX_API_KEY=${WATSONX_API_KEY:-}
# Enable debug logging if needed
- LLAMA_STACK_LOGGING=${LLAMA_STACK_LOGGING:-}
networks:
2 changes: 1 addition & 1 deletion docs/providers.md
@@ -55,7 +55,7 @@ The tables below summarize each provider category, containing the following atri
| tgi | remote | `huggingface_hub`, `aiohttp` | ❌ |
| together | remote | `together` | ❌ |
| vertexai | remote | `google-auth` | ✅ |
| watsonx | remote | `ibm_watsonx_ai` | |
| watsonx | remote | `litellm` | |

Red Hat providers:

161 changes: 161 additions & 0 deletions examples/watsonx-run.yaml
@@ -0,0 +1,161 @@
version: 2

apis:
- agents
- batches
- datasetio
- eval
- files
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io

benchmarks: []
conversations_store:
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/storage/conversations.db}
type: sqlite
datasets: []
image_name: starter
# external_providers_dir: /opt/app-root/src/.llama/providers.d
inference_store:
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/storage/inference-store.db}
type: sqlite
metadata_store:
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/storage/registry.db}
type: sqlite

providers:
inference:
- provider_id: watsonx
provider_type: remote::watsonx
config:
url: ${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}
api_key: ${env.WATSONX_API_KEY:=key-not-set}
project_id: ${env.WATSONX_PROJECT_ID:=project-not-set}
timeout: 1200
- config: {}
provider_id: sentence-transformers
provider_type: inline::sentence-transformers
files:
- config:
metadata_store:
table_name: files_metadata
backend: sql_default
storage_dir: ${env.SQLITE_STORE_DIR:=~/.llama/storage/files}
provider_id: meta-reference-files
provider_type: inline::localfs
safety: [] # WARNING: Shields disabled due to infinite loop issue with LLM calls
# - config:
# excluded_categories: []
# provider_id: llama-guard
# provider_type: inline::llama-guard
scoring:
- provider_id: basic
provider_type: inline::basic
config: {}
- provider_id: llm-as-judge
provider_type: inline::llm-as-judge
config: {}
- provider_id: braintrust
provider_type: inline::braintrust
config:
openai_api_key: '********'
tool_runtime:
- config: {} # Enable the RAG tool
provider_id: rag-runtime
provider_type: inline::rag-runtime
vector_io:
- config: # Define the storage backend for RAG
persistence:
namespace: vector_io::faiss
backend: kv_default
provider_id: faiss
provider_type: inline::faiss
agents:
- config:
persistence:
agent_state:
namespace: agents_state
backend: kv_default
responses:
table_name: agents_responses
backend: sql_default
provider_id: meta-reference
provider_type: inline::meta-reference
batches:
- config:
kvstore:
namespace: batches_store
backend: kv_default
provider_id: reference
provider_type: inline::reference
datasetio:
- config:
kvstore:
namespace: huggingface_datasetio
backend: kv_default
provider_id: huggingface
provider_type: remote::huggingface
- config:
kvstore:
namespace: localfs_datasetio
backend: kv_default
provider_id: localfs
provider_type: inline::localfs
eval:
- config:
kvstore:
namespace: eval_store
backend: kv_default
provider_id: meta-reference
provider_type: inline::meta-reference
scoring_fns: []
telemetry:
enabled: true
server:
port: 8321
storage:
backends:
    kv_default: # Define the storage backend type for RAG; in this case the registry and RAG are unified, i.e. information on registered resources (e.g. models, vector_stores) is saved together with the RAG chunks
type: kv_sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/storage/rag/kv_store.db}
sql_default:
type: sql_sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/storage/sql_store.db}
stores:
metadata:
namespace: registry
backend: kv_default
inference:
table_name: inference_store
backend: sql_default
max_write_queue_size: 10000
num_writers: 4
conversations:
table_name: openai_conversations
backend: sql_default
prompts:
namespace: prompts
backend: kv_default
registered_resources:
models:
- model_id: custom-watsonx-model
provider_id: watsonx
model_type: llm
provider_model_id: watsonx/meta-llama/llama-3-3-70b-instruct
shields: [] # WARNING: Shields disabled due to infinite loop issue with LLM calls
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::rag # Register the RAG tool
provider_id: rag-runtime
vector_stores:
default_provider_id: faiss
default_embedding_model: # Define the default embedding model for RAG
provider_id: sentence-transformers
model_id: nomic-ai/nomic-embed-text-v1.5
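The `${env.NAME:=default}` placeholders in this file are resolved from the environment when the configuration is loaded, falling back to the literal after `:=` when the variable is unset (so `WATSONX_BASE_URL` defaults to the us-south endpoint). A minimal sketch of that assumed substitution behaviour; this is an illustration, not llama-stack's actual resolver:

```python
import os
import re

# Hypothetical re-implementation of the assumed `${env.NAME:=default}` semantics;
# llama-stack's real resolver may differ in details.
_PLACEHOLDER = re.compile(r"\$\{env\.([A-Z0-9_]+):=([^}]*)\}")

def resolve(value: str) -> str:
    """Replace ${env.NAME:=default} with the environment value or the default."""
    return _PLACEHOLDER.sub(lambda m: os.environ.get(m.group(1), m.group(2)), value)

# With WATSONX_BASE_URL unset, the default endpoint from the config is used.
print(resolve("${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}"))
```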
7 changes: 4 additions & 3 deletions src/app/endpoints/query.py
@@ -20,9 +20,9 @@
Toolgroup,
ToolgroupAgentToolGroupWithArgs,
)
from llama_stack_client.types.alpha.tool_execution_step import ToolExecutionStep
from llama_stack_client.types.model_list_response import ModelListResponse
from llama_stack_client.types.shared.interleaved_content_item import TextContentItem
from llama_stack_client.types.alpha.tool_execution_step import ToolExecutionStep
from sqlalchemy.exc import SQLAlchemyError

import constants
@@ -41,8 +41,8 @@
ForbiddenResponse,
InternalServerErrorResponse,
NotFoundResponse,
QueryResponse,
PromptTooLongResponse,
QueryResponse,
QuotaExceededResponse,
ReferencedDocument,
ServiceUnavailableResponse,
@@ -540,7 +540,8 @@ def select_model_and_provider_id(
logger.debug("Searching for model: %s, provider: %s", model_id, provider_id)
    # TODO: Create separate validation of provider
if not any(
m.identifier == llama_stack_model_id and m.provider_id == provider_id
m.identifier in (llama_stack_model_id, model_id)
and m.provider_id == provider_id
for m in models
):
message = f"Model {model_id} from provider {provider_id} not found in available models"
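The relaxed membership test accepts a registered identifier that matches either the composed `<provider>/<model>` form or the model ID exactly as the caller supplied it, which is what the watsonx override relies on because its identifier already begins with the provider prefix. A minimal sketch of the check in isolation, assuming `llama_stack_model_id` is built as `f"{provider_id}/{model_id}"` and that registered models expose `identifier` and `provider_id`; `RegisteredModel` and `is_known` are illustrative stand-ins, not code from this module:

```python
# Minimal sketch of the relaxed match; `RegisteredModel` stands in for the
# llama_stack_client model type, which exposes `identifier` and `provider_id`.
from dataclasses import dataclass

@dataclass
class RegisteredModel:
    identifier: str
    provider_id: str

models = [
    RegisteredModel("watsonx/watsonx/meta-llama/llama-3-3-70b-instruct", "watsonx"),
]

def is_known(model_id: str, provider_id: str) -> bool:
    # Assumed composition of the fully qualified identifier.
    llama_stack_model_id = f"{provider_id}/{model_id}"
    return any(
        m.identifier in (llama_stack_model_id, model_id)
        and m.provider_id == provider_id
        for m in models
    )

# Passing the already-qualified identifier now matches via the second form.
assert is_known("watsonx/watsonx/meta-llama/llama-3-3-70b-instruct", "watsonx")
```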