20 changes: 20 additions & 0 deletions README.md
@@ -862,6 +862,26 @@ Service exposes metrics in Prometheus format on `/metrics` endpoint. Scraping th
curl 'http://127.0.0.1:8080/metrics'
```

#### Available Metrics

The service exports several types of metrics:

**API and LLM Metrics:**
- `ols_rest_api_calls_total` - REST API calls counter
- `ols_response_duration_seconds` - Response durations
- `ols_llm_calls_total` - LLM calls counter
- `ols_llm_token_sent_total` / `ols_llm_token_received_total` - Token usage counters

**Quota Metrics** (when quota handlers are configured):
- `ols_quota_limit_total{subject_type, subject_id}` - Total quota allocated per subject
- `ols_quota_available_total{subject_type, subject_id}` - Available quota remaining
- `ols_quota_utilization_percent{subject_type, subject_id}` - Quota utilization percentage
- `ols_token_usage_total{user_id, provider, model, token_type}` - Cumulative token consumption
- `ols_quota_warning_subjects_total{subject_type}` - Number of subjects with >80% quota usage
- `ols_quota_exceeded_subjects_total{subject_type}` - Number of subjects that exceeded quota

Quota metrics are automatically updated when the `/metrics` endpoint is accessed and periodically in the background (every 5 minutes by default). They provide insight into token usage patterns and quota utilization, and help with capacity planning and cost management.
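
As a quick sanity check, the quota gauges can be read straight from the `/metrics` output. The sketch below assumes the service is reachable on `127.0.0.1:8080` (as in the `curl` example above) and that no authentication header is needed in your local setup; adjust the URL and add credentials for other deployments.

```python
import urllib.request

# Fetch the Prometheus exposition text from a locally running OLS instance.
# Assumes the service listens on 127.0.0.1:8080 and needs no auth header.
with urllib.request.urlopen("http://127.0.0.1:8080/metrics") as response:
    payload = response.read().decode("utf-8")

# Print only the quota-related samples; `# HELP`/`# TYPE` lines don't match these prefixes.
for line in payload.splitlines():
    if line.startswith(("ols_quota_", "ols_token_usage_")):
        print(line)
```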

### Gradio UI

There is a minimal Gradio UI you can use when running the OLS server locally. To use it, enable the UI in the `olsconfig.yaml` file:
41 changes: 39 additions & 2 deletions ols/app/metrics/metrics.py
@@ -1,6 +1,7 @@
"""Prometheus metrics that are exposed by REST API."""

from typing import Annotated, Any
import logging
from typing import Annotated, Any, Optional

from fastapi import APIRouter, Depends
from fastapi.responses import PlainTextResponse
@@ -14,9 +15,15 @@
)

from ols import config
from ols.app.metrics.quota_metrics_service import (
get_quota_metrics_collector,
update_quota_metrics_on_request,
)
from ols.src.auth.auth import get_auth_dependency
from ols.utils.config import AppConfig

logger = logging.getLogger(__name__)

router = APIRouter(tags=["metrics"])
auth_dependency = get_auth_dependency(
config.ols_config, virtual_path="/ols-metrics-access"
@@ -56,16 +63,46 @@
)


def get_quota_metrics_dependency() -> Optional[Any]:
"""FastAPI dependency to provide quota metrics collector.

Returns:
        QuotaMetricsCollector instance, or None if quota handlers are not configured or initialization failed.
"""
try:
# Check if quota handlers are configured
if (
config.ols_config.quota_handlers is None
or config.ols_config.quota_handlers.storage is None
):
logger.debug("Quota handlers not configured, skipping quota metrics")
return None

return get_quota_metrics_collector(config.ols_config.quota_handlers.storage)

except Exception as e:
logger.error("Failed to initialize quota metrics collector: %s", e)
# Return None to gracefully degrade - metrics endpoint should still work
return None


@router.get("/metrics", response_class=PlainTextResponse)
def get_metrics(auth: Annotated[Any, Depends(auth_dependency)]) -> PlainTextResponse:
def get_metrics(
auth: Annotated[Any, Depends(auth_dependency)],
quota_collector: Annotated[Optional[Any], Depends(get_quota_metrics_dependency)],
) -> PlainTextResponse:
"""Metrics Endpoint.

Args:
auth: The Authentication handler (FastAPI Depends) that will handle authentication Logic.
quota_collector: The quota metrics collector dependency (optional)

Returns:
Response containing the latest metrics.
"""
# Update quota metrics if collector is available
update_quota_metrics_on_request(quota_collector)

return PlainTextResponse(content=generate_latest(), media_type=CONTENT_TYPE_LATEST)


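Aside (not part of this diff): because the quota collector is wired in as an optional FastAPI dependency, the `/metrics` endpoint keeps working when it resolves to `None`. A hypothetical test sketch of that path is below; the application import path (`ols.app.main`) and the `auth_dependency` override are assumptions about the surrounding codebase.

```python
from fastapi.testclient import TestClient

from ols.app.main import app  # hypothetical import path for the FastAPI app object
from ols.app.metrics import metrics

# Simulate "quota handlers not configured" by forcing the dependency to None,
# and bypass authentication for this local check.
app.dependency_overrides[metrics.get_quota_metrics_dependency] = lambda: None
app.dependency_overrides[metrics.auth_dependency] = lambda: None

client = TestClient(app)
response = client.get("/metrics")

# The endpoint should still return the Prometheus payload, just without quota gauges.
assert response.status_code == 200
```
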
168 changes: 168 additions & 0 deletions ols/app/metrics/quota_metrics_collector.py
@@ -0,0 +1,168 @@
"""Prometheus metrics collector for quota utilization statistics."""

import logging
from typing import Dict, Set

from prometheus_client import Gauge

from ols.app.metrics.quota_metrics_repository import QuotaMetricsRepository

logger = logging.getLogger(__name__)


class QuotaMetricsCollector:
"""Collector for quota-related Prometheus metrics."""

def __init__(self, repository: QuotaMetricsRepository) -> None:
"""Initialize the quota metrics collector."""
self.repository = repository

# Initialize Prometheus metrics
self.quota_limit_total = Gauge(
"ols_quota_limit_total",
"Total quota limit allocated",
["subject_type", "subject_id"],
)

self.quota_available_total = Gauge(
"ols_quota_available_total",
"Available quota remaining",
["subject_type", "subject_id"],
)

self.quota_utilization_percent = Gauge(
"ols_quota_utilization_percent",
"Quota utilization as percentage",
["subject_type", "subject_id"],
)

self.token_usage_total = Gauge(
"ols_token_usage_total",
"Total tokens consumed",
["user_id", "provider", "model", "token_type"],
)

self.quota_warning_subjects_total = Gauge(
"ols_quota_warning_subjects_total",
"Number of subjects with >80% quota usage",
["subject_type"],
)

self.quota_exceeded_subjects_total = Gauge(
"ols_quota_exceeded_subjects_total",
"Number of subjects that exceeded quota",
["subject_type"],
)

logger.info("QuotaMetricsCollector initialized")

def update_quota_metrics(self) -> None:
"""Update quota-related Prometheus metrics."""
try:
# Check database health first
if not self.repository.health_check():
logger.warning(
"Database health check failed, skipping quota metrics update"
)
return

logger.debug("Updating quota metrics")
quota_records = self.repository.get_quota_records()

# Track seen metrics to clear stale ones
seen_quota_metrics: Set[tuple] = set()

# Counters for warning and exceeded subjects
warning_counts: Dict[str, int] = {}
exceeded_counts: Dict[str, int] = {}

for record in quota_records:
subject_type = "user" if record.subject == "u" else "cluster"
subject_id = record.id if record.id else "cluster"

labels = (subject_type, subject_id)
seen_quota_metrics.add(labels)

# Update basic quota metrics
self.quota_limit_total.labels(*labels).set(record.quota_limit)
self.quota_available_total.labels(*labels).set(record.available)
self.quota_utilization_percent.labels(*labels).set(
record.utilization_percent
)

# Track warning and exceeded thresholds
if record.utilization_percent > 100:
exceeded_counts[subject_type] = (
exceeded_counts.get(subject_type, 0) + 1
)
elif record.utilization_percent > 80:
warning_counts[subject_type] = (
warning_counts.get(subject_type, 0) + 1
)

# Update threshold metrics
for subject_type in ["user", "cluster"]:
self.quota_warning_subjects_total.labels(subject_type).set(
warning_counts.get(subject_type, 0)
)
self.quota_exceeded_subjects_total.labels(subject_type).set(
exceeded_counts.get(subject_type, 0)
)

logger.debug("Updated %d quota records", len(quota_records))

except Exception as e:
logger.error("Error updating quota metrics: %s", e)

def update_token_usage_metrics(self) -> None:
"""Update token usage Prometheus metrics."""
try:
# Check database health first
if not self.repository.health_check():
logger.warning(
"Database health check failed, skipping token usage metrics update"
)
return

logger.debug("Updating token usage metrics")
token_records = self.repository.get_token_usage_records()

# Track seen metrics to clear stale ones
seen_token_metrics: Set[tuple] = set()

for record in token_records:
# Update input token metrics
input_labels = (record.user_id, record.provider, record.model, "input")
seen_token_metrics.add(input_labels)
self.token_usage_total.labels(*input_labels).set(record.input_tokens)

# Update output token metrics
output_labels = (
record.user_id,
record.provider,
record.model,
"output",
)
seen_token_metrics.add(output_labels)
self.token_usage_total.labels(*output_labels).set(record.output_tokens)

logger.debug("Updated %d token usage records", len(token_records))

except Exception as e:
logger.error("Error updating token usage metrics: %s", e)

def update_all_metrics(self) -> None:
"""Update all quota-related metrics."""
logger.debug("Starting comprehensive quota metrics update")

try:
self.update_quota_metrics()
except Exception as e:
logger.error("Failed to update quota metrics: %s", e)

try:
self.update_token_usage_metrics()
except Exception as e:
logger.error("Failed to update token usage metrics: %s", e)

logger.debug("Completed quota metrics update")