20 changes: 20 additions & 0 deletions README.md
@@ -862,6 +862,26 @@ Service exposes metrics in Prometheus format on `/metrics` endpoint. Scraping th
curl 'http://127.0.0.1:8080/metrics'
```

#### Available Metrics

The service exports several types of metrics:

**API and LLM Metrics:**
- `ols_rest_api_calls_total` - REST API calls counter
- `ols_response_duration_seconds` - Response durations
- `ols_llm_calls_total` - LLM calls counter
- `ols_llm_token_sent_total` / `ols_llm_token_received_total` - Token usage counters

**Quota Metrics** (when quota handlers are configured):
- `ols_quota_limit_total{subject_type, subject_id}` - Total quota allocated per subject
- `ols_quota_available_total{subject_type, subject_id}` - Available quota remaining
- `ols_quota_utilization_percent{subject_type, subject_id}` - Quota utilization percentage
- `ols_token_usage_total{user_id, provider, model, token_type}` - Cumulative token consumption
- `ols_quota_warning_subjects_total{subject_type}` - Number of subjects with >80% quota usage
- `ols_quota_exceeded_subjects_total{subject_type}` - Number of subjects that exceeded quota

Quota metrics are automatically updated when the `/metrics` endpoint is accessed and periodically in the background (every 5 minutes by default). They provide insight into token usage patterns and quota utilization, and help with capacity planning and cost management.
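
As a quick sanity check, the quota gauges can be read straight from the `/metrics` output. The sketch below assumes the service is reachable on `127.0.0.1:8080` (as in the `curl` example above) and that no authentication header is needed in your local setup; adjust the URL and add credentials for other deployments.

```python
import urllib.request

# Fetch the Prometheus exposition text from a locally running OLS instance.
# Assumes the service listens on 127.0.0.1:8080 and needs no auth header.
with urllib.request.urlopen("http://127.0.0.1:8080/metrics") as response:
    payload = response.read().decode("utf-8")

# Print only the quota-related samples; `# HELP`/`# TYPE` lines don't match these prefixes.
for line in payload.splitlines():
    if line.startswith(("ols_quota_", "ols_token_usage_")):
        print(line)
```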

### Gradio UI

There is a minimal Gradio UI you can use when running the OLS server locally. To use it, enable the UI in the `olsconfig.yaml` file:
41 changes: 39 additions & 2 deletions ols/app/metrics/metrics.py
@@ -1,6 +1,7 @@
"""Prometheus metrics that are exposed by REST API."""

from typing import Annotated, Any
import logging
from typing import Annotated, Any, Optional

from fastapi import APIRouter, Depends
from fastapi.responses import PlainTextResponse
@@ -14,9 +15,15 @@
)

from ols import config
from ols.app.metrics.quota_metrics_service import (
get_quota_metrics_collector,
update_quota_metrics_on_request,
)
from ols.src.auth.auth import get_auth_dependency
from ols.utils.config import AppConfig

logger = logging.getLogger(__name__)

router = APIRouter(tags=["metrics"])
auth_dependency = get_auth_dependency(
config.ols_config, virtual_path="/ols-metrics-access"
@@ -56,16 +63,46 @@
)


def get_quota_metrics_dependency() -> Optional[Any]:
"""FastAPI dependency to provide quota metrics collector.

Returns:
        QuotaMetricsCollector instance, or None if quota handlers are not configured or initialization failed.
"""
try:
# Check if quota handlers are configured
if (
config.ols_config.quota_handlers is None
or config.ols_config.quota_handlers.storage is None
):
logger.debug("Quota handlers not configured, skipping quota metrics")
return None

return get_quota_metrics_collector(config.ols_config.quota_handlers.storage)

except Exception as e:
logger.error("Failed to initialize quota metrics collector: %s", e)
# Return None to gracefully degrade - metrics endpoint should still work
return None


@router.get("/metrics", response_class=PlainTextResponse)
def get_metrics(auth: Annotated[Any, Depends(auth_dependency)]) -> PlainTextResponse:
def get_metrics(
auth: Annotated[Any, Depends(auth_dependency)],
quota_collector: Annotated[Optional[Any], Depends(get_quota_metrics_dependency)],
) -> PlainTextResponse:
"""Metrics Endpoint.

Args:
auth: The Authentication handler (FastAPI Depends) that will handle authentication Logic.
quota_collector: The quota metrics collector dependency (optional)

Returns:
Response containing the latest metrics.
"""
# Update quota metrics if collector is available
update_quota_metrics_on_request(quota_collector)

return PlainTextResponse(content=generate_latest(), media_type=CONTENT_TYPE_LATEST)


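Aside (not part of this diff): because the quota collector is wired in as an optional FastAPI dependency, the `/metrics` endpoint keeps working when it resolves to `None`. A hypothetical test sketch of that path is below; the application import path (`ols.app.main`) and the `auth_dependency` override are assumptions about the surrounding codebase.

```python
from fastapi.testclient import TestClient

from ols.app.main import app  # hypothetical import path for the FastAPI app object
from ols.app.metrics import metrics

# Simulate "quota handlers not configured" by forcing the dependency to None,
# and bypass authentication for this local check.
app.dependency_overrides[metrics.get_quota_metrics_dependency] = lambda: None
app.dependency_overrides[metrics.auth_dependency] = lambda: None

client = TestClient(app)
response = client.get("/metrics")

# The endpoint should still return the Prometheus payload, just without quota gauges.
assert response.status_code == 200
```
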
168 changes: 168 additions & 0 deletions ols/app/metrics/quota_metrics_collector.py
@@ -0,0 +1,168 @@
"""Prometheus metrics collector for quota utilization statistics."""

import logging
from typing import Dict, Set

from prometheus_client import Gauge

from ols.app.metrics.quota_metrics_repository import QuotaMetricsRepository

logger = logging.getLogger(__name__)


class QuotaMetricsCollector:
"""Collector for quota-related Prometheus metrics."""

def __init__(self, repository: QuotaMetricsRepository) -> None:
"""Initialize the quota metrics collector."""
self.repository = repository

# Initialize Prometheus metrics
self.quota_limit_total = Gauge(
"ols_quota_limit_total",
"Total quota limit allocated",
["subject_type", "subject_id"],
)

self.quota_available_total = Gauge(
"ols_quota_available_total",
"Available quota remaining",
["subject_type", "subject_id"],
)

self.quota_utilization_percent = Gauge(
"ols_quota_utilization_percent",
"Quota utilization as percentage",
["subject_type", "subject_id"],
)

self.token_usage_total = Gauge(
"ols_token_usage_total",
"Total tokens consumed",
["user_id", "provider", "model", "token_type"],
)

self.quota_warning_subjects_total = Gauge(
"ols_quota_warning_subjects_total",
"Number of subjects with >80% quota usage",
["subject_type"],
)

self.quota_exceeded_subjects_total = Gauge(
"ols_quota_exceeded_subjects_total",
"Number of subjects that exceeded quota",
["subject_type"],
)

logger.info("QuotaMetricsCollector initialized")

def update_quota_metrics(self) -> None:
"""Update quota-related Prometheus metrics."""
try:
# Check database health first
if not self.repository.health_check():
logger.warning(
"Database health check failed, skipping quota metrics update"
)
return

logger.debug("Updating quota metrics")
quota_records = self.repository.get_quota_records()

# Track seen metrics to clear stale ones
seen_quota_metrics: Set[tuple] = set()

# Counters for warning and exceeded subjects
warning_counts: Dict[str, int] = {}
exceeded_counts: Dict[str, int] = {}

for record in quota_records:
subject_type = "user" if record.subject == "u" else "cluster"
subject_id = record.id if record.id else "cluster"

labels = (subject_type, subject_id)
seen_quota_metrics.add(labels)

# Update basic quota metrics
self.quota_limit_total.labels(*labels).set(record.quota_limit)
self.quota_available_total.labels(*labels).set(record.available)
self.quota_utilization_percent.labels(*labels).set(
record.utilization_percent
)

# Track warning and exceeded thresholds
if record.utilization_percent > 100:
exceeded_counts[subject_type] = (
exceeded_counts.get(subject_type, 0) + 1
)
elif record.utilization_percent > 80:
warning_counts[subject_type] = (
warning_counts.get(subject_type, 0) + 1
)

# Update threshold metrics
for subject_type in ["user", "cluster"]:
self.quota_warning_subjects_total.labels(subject_type).set(
warning_counts.get(subject_type, 0)
)
self.quota_exceeded_subjects_total.labels(subject_type).set(
exceeded_counts.get(subject_type, 0)
)

logger.debug("Updated %d quota records", len(quota_records))

except Exception as e:
logger.error("Error updating quota metrics: %s", e)

def update_token_usage_metrics(self) -> None:
"""Update token usage Prometheus metrics."""
try:
# Check database health first
if not self.repository.health_check():
logger.warning(
"Database health check failed, skipping token usage metrics update"
)
return

logger.debug("Updating token usage metrics")
token_records = self.repository.get_token_usage_records()

# Track seen metrics to clear stale ones
seen_token_metrics: Set[tuple] = set()

for record in token_records:
# Update input token metrics
input_labels = (record.user_id, record.provider, record.model, "input")
seen_token_metrics.add(input_labels)
self.token_usage_total.labels(*input_labels).set(record.input_tokens)

# Update output token metrics
output_labels = (
record.user_id,
record.provider,
record.model,
"output",
)
seen_token_metrics.add(output_labels)
self.token_usage_total.labels(*output_labels).set(record.output_tokens)

logger.debug("Updated %d token usage records", len(token_records))

except Exception as e:
logger.error("Error updating token usage metrics: %s", e)

def update_all_metrics(self) -> None:
"""Update all quota-related metrics."""
logger.debug("Starting comprehensive quota metrics update")

try:
self.update_quota_metrics()
except Exception as e:
logger.error("Failed to update quota metrics: %s", e)

try:
self.update_token_usage_metrics()
except Exception as e:
logger.error("Failed to update token usage metrics: %s", e)

logger.debug("Completed quota metrics update")