diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py
index 00de64131..a072e38d6 100644
--- a/tests/e2e/conftest.py
+++ b/tests/e2e/conftest.py
@@ -18,7 +18,8 @@
 from scripts.upload_artifact_s3 import upload_artifact_s3
 from tests.e2e.utils import client as client_utils
-from tests.e2e.utils import ols_installer
+from tests.e2e.utils import cluster, ols_installer
+from tests.e2e.utils.adapt_ols_config import adapt_ols_config
 from tests.e2e.utils.wait_for_ols import wait_for_ols
 from tests.scripts.must_gather import must_gather
@@ -45,11 +46,38 @@ def pytest_sessionstart():
     # OLS_URL env only needs to be set when running against a local ols instance,
     # when ols is run against a cluster the url is retrieved from the cluster.
     ols_url = os.getenv("OLS_URL", "")
-
     if "localhost" not in ols_url:
         on_cluster = True
         try:
-            ols_url, token, metrics_token = ols_installer.install_ols()
+            result = cluster.run_oc(
+                [
+                    "get",
+                    "clusterserviceversion",
+                    "-n",
+                    "openshift-lightspeed",
+                    "-o",
+                    "json",
+                ]
+            )
+            csv_data = json.loads(result.stdout)
+            print(csv_data)
+
+            if not csv_data["items"]:
+                print("OLS Operator is not installed yet.")
+                ols_url, token, metrics_token = ols_installer.install_ols()
+            else:
+                print("OLS Operator is already installed. Skipping install.")
+                provider = os.getenv("PROVIDER", "openai")
+                creds = os.getenv("PROVIDER_KEY_PATH", "empty")
+                # create the llm api key secret ols will mount
+                provider_list = provider.split()
+                creds_list = creds.split()
+                for i, prov in enumerate(provider_list):
+                    ols_installer.create_secrets(
+                        prov, creds_list[i], len(provider_list)
+                    )
+                ols_url, token, metrics_token = adapt_ols_config()
+
         except Exception as e:
             print(f"Error setting up OLS on cluster: {e}")
             must_gather()
@@ -271,8 +299,8 @@ def get_secret_value(env: str) -> str:
 def pytest_sessionfinish(session):
     """Create datarouter compatible archive to upload into report portal."""
-    # Gather OLS artifacts at the end of tests
-    if on_cluster:
+    # Gather OLS artifacts only if there were failures
+    if on_cluster and session.testsfailed > 0:
         must_gather()
     # Sending reports to report portal
     try:
diff --git a/tests/e2e/utils/adapt_ols_config.py b/tests/e2e/utils/adapt_ols_config.py
new file mode 100644
index 000000000..fb751f671
--- /dev/null
+++ b/tests/e2e/utils/adapt_ols_config.py
@@ -0,0 +1,424 @@
+"""Functions to adapt OLS configuration for different providers.
+
+Handles multi-provider test scenarios dynamically.
+"""
+
+import os
+import time
+
+import yaml
+
+from ols.constants import DEFAULT_CONFIGURATION_FILE
+from tests.e2e.utils import cluster as cluster_utils
+from tests.e2e.utils.constants import OLS_COLLECTOR_DISABLING_FILE
+from tests.e2e.utils.retry import retry_until_timeout_or_success
+from tests.e2e.utils.wait_for_ols import wait_for_ols
+
+
+def apply_olsconfig(provider_list: list[str]) -> None:
+    """Apply the correct OLSConfig CR based on provider configuration.
+
+    Args:
+        provider_list: List of provider names to configure.
+    """
+    if len(provider_list) == 1:
+        provider = provider_list[0]
+        crd_yml_name = f"olsconfig.crd.{provider}"
+        ols_config_suffix = os.getenv("OLS_CONFIG_SUFFIX", "default")
+        if ols_config_suffix != "default":
+            crd_yml_name += f"_{ols_config_suffix}"
+        print(f"Applying olsconfig CR from {crd_yml_name}.yaml")
+        cluster_utils.run_oc(
+            ["apply", "-f", f"tests/config/operator_install/{crd_yml_name}.yaml"],
+            ignore_existing_resource=False,
+        )
+    else:
+        print("Applying evaluation olsconfig CR for multiple providers")
+        cluster_utils.run_oc(
+            [
+                "apply",
+                "-f",
+                "tests/config/operator_install/olsconfig.crd.evaluation.yaml",
+            ],
+            ignore_existing_resource=True,
+        )
+    print("OLSConfig CR applied successfully")
+
+
+def update_ols_configmap() -> None:
+    """Update OLS configmap with additional e2e test configurations.
+
+    Configures logging levels and user data collector settings for testing.
+    """
+    try:
+        # Get the current configmap
+        configmap_yaml = cluster_utils.run_oc(
+            ["get", "cm/olsconfig", "-o", "yaml"]
+        ).stdout
+        configmap = yaml.safe_load(configmap_yaml)
+        olsconfig = yaml.safe_load(configmap["data"][DEFAULT_CONFIGURATION_FILE])
+
+        # Ensure proper logging config for e2e tests
+        if "ols_config" not in olsconfig:
+            olsconfig["ols_config"] = {}
+        if "logging_config" not in olsconfig["ols_config"]:
+            olsconfig["ols_config"]["logging_config"] = {}
+
+        # Set INFO level to avoid redacted logs
+        olsconfig["ols_config"]["logging_config"]["lib_log_level"] = "INFO"
+
+        # Configure user data collection for OLS service (to store data)
+        olsconfig["ols_config"]["user_data_collection"] = {
+            "feedback_disabled": False,
+            "feedback_storage": "/app-root/ols-user-data/feedback",
+            "transcripts_disabled": False,
+            "transcripts_storage": "/app-root/ols-user-data/transcripts",
+        }
+
+        olsconfig["user_data_collector_config"] = {
+            "data_storage": "/app-root/ols-user-data",
+            "log_level": "debug",
+            "collection_interval": 10,
+            "run_without_initial_wait": True,
+            "ingress_env": "stage",
+            "cp_offline_token": os.getenv("CP_OFFLINE_TOKEN", ""),
+        }
+
+        # Update the configmap
+        configmap["data"][DEFAULT_CONFIGURATION_FILE] = yaml.dump(olsconfig)
+        updated_configmap = yaml.dump(configmap)
+        cluster_utils.run_oc(["apply", "-f", "-"], command=updated_configmap)
+        print("OLS configmap updated successfully")
+
+    except Exception as e:
+        raise RuntimeError(
+            f"Failed to update OLS configmap with e2e settings: {e}"
+        ) from e
+
+
+def setup_service_accounts(namespace: str) -> None:
+    """Set up service accounts and access roles.
+
+    Args:
+        namespace: The Kubernetes namespace to create service accounts in.
+    """
+    print("Ensuring 'test-user' service account exists...")
+    cluster_utils.run_oc(
+        ["create", "sa", "test-user", "-n", namespace],
+        ignore_existing_resource=True,
+    )
+
+    print("Ensuring 'metrics-test-user' service account exists...")
+    cluster_utils.run_oc(
+        ["create", "sa", "metrics-test-user", "-n", namespace],
+        ignore_existing_resource=True,
+    )
+
+    print("Granting access roles to service accounts...")
+    cluster_utils.grant_sa_user_access("test-user", "lightspeed-operator-query-access")
+    cluster_utils.grant_sa_user_access(
+        "metrics-test-user", "lightspeed-operator-ols-metrics-reader"
+    )
+
+
+def setup_rbac(namespace: str) -> None:
+    """Set up pod-reader role and binding.
+
+    Args:
+        namespace: The Kubernetes namespace for RBAC configuration.
+    """
+    print("Ensuring 'pod-reader' role and rolebinding exist...")
+    cluster_utils.run_oc(
+        [
+            "create",
+            "role",
+            "pod-reader",
+            "--verb=get,list",
+            "--resource=pods",
+            "--namespace",
+            namespace,
+        ],
+        ignore_existing_resource=True,
+    )
+
+    cluster_utils.run_oc(
+        [
+            "create",
+            "rolebinding",
+            "test-user-pod-reader",
+            "--role=pod-reader",
+            f"--serviceaccount={namespace}:test-user",
+            "--namespace",
+            namespace,
+        ],
+        ignore_existing_resource=True,
+    )
+    print("RBAC setup verified.")
+
+
+def wait_for_deployment() -> None:
+    """Wait for OLS deployment and pods to be ready.
+
+    Ensures the lightspeed-app-server deployment is available and pods are running.
+    """
+    print("Waiting for OLS deployment to be available...")
+    retry_until_timeout_or_success(
+        30,
+        5,
+        lambda: cluster_utils.run_oc(
+            [
+                "get",
+                "deployment",
+                "lightspeed-app-server",
+                "--ignore-not-found",
+                "-o",
+                "name",
+            ]
+        ).stdout.strip()
+        == "deployment.apps/lightspeed-app-server",
+        "Waiting for lightspeed-app-server deployment to be detected",
+    )
+
+    print("Waiting for pods to be ready...")
+    cluster_utils.wait_for_running_pod()
+
+
+def setup_route() -> str:
+    """Set up route and return OLS URL.
+
+    Returns:
+        The HTTPS URL for accessing the OLS service.
+    """
+    try:
+        cluster_utils.run_oc(["delete", "route", "ols"], ignore_existing_resource=False)
+    except Exception:
+        print("No existing route to delete. Continuing...")
+
+    print("Creating route for OLS access")
+    cluster_utils.run_oc(
+        ["create", "-f", "tests/config/operator_install/route.yaml"],
+        ignore_existing_resource=False,
+    )
+
+    url = cluster_utils.run_oc(
+        ["get", "route", "ols", "-o", "jsonpath='{.spec.host}'"]
+    ).stdout.strip("'")
+
+    return f"https://{url}"
+
+
+def adapt_ols_config() -> tuple[str, str, str]:  # noqa: C901 pylint: disable=R0915
+    """Adapt OLS configuration for different providers dynamically.
+
+    Ensures RBAC, service accounts, and OLS route exist for test execution.
+    This function assumes the operator has already been scaled down during initial setup.
+
+    Returns:
+        tuple: (ols_url, token, metrics_token)
+    """
+    print("Adapting OLS configuration for provider switching")
+    provider_env = os.getenv("PROVIDER", "openai")
+    provider_list = provider_env.split() or ["openai"]
+    ols_image = os.getenv("OLS_IMAGE", "")
+    namespace = "openshift-lightspeed"
+
+    print("Checking for existing app server deployment...")
+    try:
+        cluster_utils.run_oc(
+            ["scale", "deployment/lightspeed-app-server", "--replicas", "0"]
+        )
+        retry_until_timeout_or_success(
+            30,
+            3,
+            lambda: not cluster_utils.get_pod_by_prefix(fail_not_found=False),
+            "Waiting for old app server pod to terminate",
+        )
+        print("Old app server scaled down")
+    except Exception as e:
+        print(f"No existing app server to scale down (this is OK): {e}")
+
+    try:
+        cluster_utils.run_oc(["delete", "olsconfig", "cluster", "--ignore-not-found"])
+        print("Old OLSConfig CR removed")
+    except Exception as e:
+        print(f"Could not delete old OLSConfig: {e}")
+
+    try:
+        apply_olsconfig(provider_list)
+        print("New OLSConfig CR applied")
+    except Exception as e:
+        raise RuntimeError(f"Failed to apply OLSConfig CR: {e}") from e
+
+    cluster_utils.run_oc(
+        [
+            "scale",
+            "deployment/lightspeed-operator-controller-manager",
+            "--replicas",
+            "1",
+        ]
+    )
+    # Wait for operator
+    retry_until_timeout_or_success(
+        30,
+        5,
+        lambda: cluster_utils.get_pod_by_prefix(
+            prefix="lightspeed-operator-controller-manager", fail_not_found=False
+        ),
+        "Waiting for operator to start",
+    )
+
+    print("Waiting for operator to reconcile OLSConfig CR (30 seconds)...")
+    time.sleep(30)  # Let operator reconcile CR → deployment + configmap
+
+    # Verify reconciliation happened - check deployment exists AND has pods
+    print("Verifying operator reconciliation completed...")
+    retry_until_timeout_or_success(
+        30,  # Give more time for operator to fully reconcile
+        3,
+        lambda: cluster_utils.run_oc(
+            [
+                "get",
+                "deployment",
+                "lightspeed-app-server",
+                "--ignore-not-found",
+                "-o",
+                "jsonpath={.status.replicas}",
+            ]
+        ).stdout.strip()
+        != "",
+        "Waiting for operator to create deployment with replicas",
+    )
+    cluster_utils.run_oc(
+        [
+            "scale",
+            "deployment/lightspeed-operator-controller-manager",
+            "--replicas",
+            "0",
+        ]
+    )
+
+    retry_until_timeout_or_success(
+        30,
+        3,
+        lambda: not cluster_utils.get_pod_by_prefix(
+            prefix="lightspeed-operator-controller-manager", fail_not_found=False
+        ),
+        "Waiting for operator to scale down",
+    )
+    print("Operator scaled down")
+
+    # Scale down app server to apply e2e configurations
+    print("Scaling down app server to apply e2e configurations...")
+    cluster_utils.run_oc(
+        ["scale", "deployment/lightspeed-app-server", "--replicas", "0"]
+    )
+
+    retry_until_timeout_or_success(
+        30,
+        3,
+        lambda: not cluster_utils.get_pod_by_prefix(fail_not_found=False),
+        "Waiting for app server pod to terminate",
+    )
+    print("App server scaled down")
+
+    # Update configmap with e2e-specific settings - FAIL FAST if this breaks
+    print("Updating configmap with e2e test settings...")
+    update_ols_configmap()
+    print("Configmap updated successfully")
+    # Apply test image
+    if ols_image:
+        print(f"Applying test image: {ols_image}")
+        try:
+            # Patch both containers (api and collector)
+            patch = (
+                f'[{{"op": "replace", "path": "/spec/template/spec/'
+                f'containers/0/image", "value": "{ols_image}"}}]'
+            )
+            cluster_utils.run_oc(
+                [
+                    "patch",
+                    "deployment/lightspeed-app-server",
+                    "--type",
+                    "json",
+                    "-p",
+                    patch,
+                ]
+            )
+
+            # Check if there's a second container and patch it too
+            try:
+                patch = (
+                    f'[{{"op": "replace", "path": "/spec/template/spec/'
+                    f'containers/1/image", "value": "{ols_image}"}}]'
+                )
+                cluster_utils.run_oc(
+                    [
+                        "patch",
+                        "deployment/lightspeed-app-server",
+                        "--type",
+                        "json",
+                        "-p",
+                        patch,
+                    ]
+                )
+            except Exception as e:
+                # Second container might not exist
+                print(f"Note: Could not patch second container: {e}")
+
+            print("Image configuration completed")
+        except Exception as e:
+            print(f"Warning: Could not apply test image: {e}")
+
+    # Scale back up
+    print("Scaling up app server with new configuration...")
+    cluster_utils.run_oc(
+        ["scale", "deployment/lightspeed-app-server", "--replicas", "1"]
+    )
+
+    # Wait for deployment to be ready
+    wait_for_deployment()
+
+    # Ensure service accounts exist
+    try:
+        setup_service_accounts(namespace)
+    except Exception as e:
+        raise RuntimeError(
+            f"Error ensuring service accounts or access roles: {e}"
+        ) from e
+
+    # Ensure pod-reader role and binding exist
+    try:
+        setup_rbac(namespace)
+    except Exception as e:
+        print(f"Warning: Could not ensure pod-reader role/binding: {e}")
+
+    # Disable data collector to avoid interference with tests
+    # Note: OLS will create the required directories automatically when it starts
+    try:
+        pod_name = cluster_utils.get_pod_by_prefix()[0]
+        print(f"Disabling data collector on pod: {pod_name}")
+        cluster_utils.create_file(pod_name, OLS_COLLECTOR_DISABLING_FILE, "")
+        print("Data collector disabled successfully")
+    except Exception as e:
+        print(f"Warning: Could not disable collector: {e}")
+        print("Tests may experience interference from data collector")
+
+    # Fetch tokens for service accounts
+    print("Fetching tokens for service accounts...")
+    token = cluster_utils.get_token_for("test-user")
+    metrics_token = cluster_utils.get_token_for("metrics-test-user")
+
+    # Set up route and get URL
+    ols_url = setup_route()
+
+    # Wait for OLS to be ready
+    print(f"Waiting for OLS to be ready at {ols_url}...")
+    if not wait_for_ols(ols_url, timeout=180):
+        raise RuntimeError("OLS failed to become ready after configuration")
+
+    print("OLS configuration and access setup completed successfully.")
+    return ols_url, token, metrics_token
+
+
+if __name__ == "__main__":
+    adapt_ols_config()
diff --git a/tests/e2e/utils/cluster.py b/tests/e2e/utils/cluster.py
index 1b5727363..d93e968b7 100644
--- a/tests/e2e/utils/cluster.py
+++ b/tests/e2e/utils/cluster.py
@@ -23,14 +23,26 @@ def run_oc(
         )
         return res
     except subprocess.CalledProcessError as e:
-        if ignore_existing_resource and "AlreadyExists" in e.stderr:
-            print(f"Resource already exists: {e}\nproceeding...")
-        else:
-            print(
-                f"Error running oc command {args}: {e}, stdout: {e.output}, stderr: {e.stderr}"
-            )
-            raise
-        return subprocess.CompletedProcess("", 0)
+        if ignore_existing_resource:
+            # Check for various "already exists" error patterns in both stderr and stdout
+            error_text = (e.stderr + " " + e.stdout).lower()
+            if any(
+                pattern in error_text
+                for pattern in [
+                    "alreadyexists",
+                    "already exists",
+                    "already exist",
+                    "conflict",
+                    "resource exists",
+                ]
+            ):
+                print(f"Resource already exists: {e}\nproceeding...")
+                return subprocess.CompletedProcess(e.cmd, 0, stdout="", stderr="")
+
+        print(
+            f"Error running oc command {args}: {e}, stdout: {e.stdout}, stderr: {e.stderr}"
+        )
+        raise
 
 
 def run_oc_and_store_stdout(
@@ -325,8 +337,8 @@ def wait_for_running_pod(
 ):
     """Wait for the selected pod to be in running state."""
     r = retry_until_timeout_or_success(
-        5,
         3,
+        2,
         lambda: len(
             run_oc(
                 [
@@ -343,12 +355,9 @@ def wait_for_running_pod(
     )
 
     # wait for new ols app pod to be created+running
-    # there should be exactly one, if we see more than one it may be an old pod
-    # and we need to wait for it to go away before progressing so we don't try to
-    # interact with it.
     r = retry_until_timeout_or_success(
         OC_COMMAND_RETRY_COUNT,
-        5,
+        3,
         lambda: len(
             get_pod_by_prefix(prefix=name, namespace=namespace, fail_not_found=False)
         )
@@ -358,43 +367,40 @@ def wait_for_running_pod(
     if not r:
         raise Exception("Timed out waiting for new OLS pod to be ready")
 
-    def pod_has_2_containers_ready():
+    def pod_has_containers_ready():
         pods = get_pod_by_prefix(prefix=name, namespace=namespace, fail_not_found=False)
         if not pods:
             return False
-        # Creating exception for disconnected since no data collection
-        disconnected = os.getenv("DISCONNECTED", "")
-        if not disconnected:
-            return (
-                len(
-                    [
-                        container
-                        for container in get_container_ready_status(pods[0])
-                        if container == "true"
-                    ]
-                )
-                == 2
-            )
-        return (
-            len(
-                [
-                    container
-                    for container in get_container_ready_status(pods[0])
-                    if container == "true"
-                ]
-            )
-            == 1
+
+        ready_containers = len(
+            [
+                container
+                for container in get_container_ready_status(pods[0])
+                if container == "true"
+            ]
         )
 
-    # wait for the two containers in the server pod to become ready
+        # Check for tool calling or disconnected mode (both need >=2 containers)
+        disconnected = os.getenv("DISCONNECTED", "")
+        ols_config_suffix = os.getenv("OLS_CONFIG_SUFFIX", "default")
+        tool_calling_enabled = "tool_calling" in ols_config_suffix
+
+        if disconnected:
+            return ready_containers >= 1
+        if tool_calling_enabled:
+            return ready_containers >= 2
+        return ready_containers >= 2
+
+    # wait for the containers in the server pod to become ready
+    # two containers normally, three in case we're running mcp server
     r = retry_until_timeout_or_success(
         OC_COMMAND_RETRY_COUNT,
         5,
-        pod_has_2_containers_ready,
-        "Waiting for two containers in the server pod to become ready",
+        pod_has_containers_ready,
+        "Waiting for containers in the server pod to become ready",
     )
     if not r:
-        raise Exception("Timed out waiting for new two containers to become ready")
+        raise Exception("Timed out waiting for containers to become ready")
 
 
 def get_certificate_secret_name(
diff --git a/tests/e2e/utils/ols_installer.py b/tests/e2e/utils/ols_installer.py
index cffae7f16..6a0ac0c93 100644
--- a/tests/e2e/utils/ols_installer.py
+++ b/tests/e2e/utils/ols_installer.py
@@ -199,7 +199,7 @@ def create_secrets(provider_name: str, creds: str, provider_size: int) -> None:
             [
                 "delete",
                 "secret",
-                provider_name + "creds",
+                "llmcreds",
             ],
         )
     except subprocess.CalledProcessError:
diff --git a/tests/scripts/test-e2e-cluster-periodics.sh b/tests/scripts/test-e2e-cluster-periodics.sh
index 4ef2f9989..39b994b13 100755
--- a/tests/scripts/test-e2e-cluster-periodics.sh
+++ b/tests/scripts/test-e2e-cluster-periodics.sh
@@ -82,7 +82,9 @@ function run_suites() {
     # smoke tests for RHOAI VLLM-compatible provider
    run_suite "rhoai_vllm" "smoketest" "rhoai_vllm" "$OPENAI_PROVIDER_KEY_PATH" "gpt-3.5-turbo" "$OLS_IMAGE" "default"
     (( rc = rc || $? ))
-
+
+    cleanup_ols_operator
+
   fi
 
   set -e
diff --git a/tests/scripts/test-e2e-cluster.sh b/tests/scripts/test-e2e-cluster.sh
index 7d47ac2dc..3a1958222 100755
--- a/tests/scripts/test-e2e-cluster.sh
+++ b/tests/scripts/test-e2e-cluster.sh
@@ -73,6 +73,7 @@ function run_suites() {
     run_suite "quota_limits" "quota_limits" "openai" "$OPENAI_PROVIDER_KEY_PATH" "gpt-4o-mini" "$OLS_IMAGE" "quota"
     (( rc = rc || $? ))
+    cleanup_ols_operator
 
     set -e
diff --git a/tests/scripts/test-evaluation.sh b/tests/scripts/test-evaluation.sh
index efe0c1c11..540af59ff 100755
--- a/tests/scripts/test-evaluation.sh
+++ b/tests/scripts/test-evaluation.sh
@@ -40,6 +40,8 @@ function run_suites() {
     (( rc = rc || $? ))
     set -e
 
+    cleanup_ols_operator
+
     return $rc
 }
diff --git a/tests/scripts/utils.sh b/tests/scripts/utils.sh
index 89730a21e..dee1ad91a 100644
--- a/tests/scripts/utils.sh
+++ b/tests/scripts/utils.sh
@@ -47,8 +47,6 @@ function cleanup_ols_operator() {
 # $7 OLS_CONFIG_SUFFIX
 function run_suite() {
     echo "Preparing to run suite $1"
-
-    cleanup_ols_operator
 
     if [ "$1" = "model_eval" ]; then
         # Run eval tests
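
The behavioral core of the cluster.py change is the loosened "resource already exists" handling in `run_oc`. Below is a minimal sketch of that detection pulled out into a standalone predicate so it can be exercised without a cluster; the helper name `is_already_exists_error` is hypothetical, while the pattern list and the zero-returncode `CompletedProcess` fallback mirror the patch. Note that the "conflict" pattern also matches generic HTTP 409 errors, so callers opting into `ignore_existing_resource=True` accept that breadth.

```python
# Minimal sketch; `is_already_exists_error` is a hypothetical helper name.
# The pattern list is copied from the patched run_oc(). Requires `oc` on
# PATH and a logged-in cluster to run the usage example at the bottom.
import subprocess

_EXISTS_PATTERNS = (
    "alreadyexists",
    "already exists",
    "already exist",
    "conflict",
    "resource exists",
)


def is_already_exists_error(err: subprocess.CalledProcessError) -> bool:
    """Return True when a failed oc call only reported an existing resource."""
    # The patch scans stderr and stdout together, case-insensitively; the
    # or-guards are an addition for commands run without text capture.
    error_text = ((err.stderr or "") + " " + (err.stdout or "")).lower()
    return any(pattern in error_text for pattern in _EXISTS_PATTERNS)


try:
    res = subprocess.run(
        ["oc", "create", "sa", "test-user", "-n", "openshift-lightspeed"],
        capture_output=True,
        text=True,
        check=True,
    )
except subprocess.CalledProcessError as e:
    if is_already_exists_error(e):
        # Same outcome as run_oc(..., ignore_existing_resource=True).
        res = subprocess.CompletedProcess(e.cmd, 0, stdout="", stderr="")
    else:
        raise
```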
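`adapt_ols_config()` leans heavily on `retry_until_timeout_or_success`. A stand-in consistent with how the patch calls it, `(attempts, delay seconds, predicate, description)` returning a truthy value on success, is sketched below; this signature is inferred from the call sites, not from the real `tests/e2e/utils/retry` module, which may differ.

```python
# Stand-in inferred from call sites such as
# retry_until_timeout_or_success(30, 3, lambda: ..., "Waiting ...");
# an assumption, not the actual implementation.
import time
from collections.abc import Callable


def retry_until_timeout_or_success(
    attempts: int,
    delay: float,
    predicate: Callable[[], object],
    description: str = "",
) -> bool:
    """Poll `predicate` up to `attempts` times, sleeping `delay` between tries."""
    for attempt in range(attempts):
        if description:
            print(f"{description} (attempt {attempt + 1}/{attempts})")
        if predicate():
            return True
        time.sleep(delay)
    return False


# Example mirroring the scale-down wait in adapt_ols_config(); the trivially
# true predicate stands in for `not get_pod_by_prefix(fail_not_found=False)`.
if not retry_until_timeout_or_success(
    30, 3, lambda: True, "Waiting for app server pod to terminate"
):
    raise RuntimeError("pod did not terminate in time")
```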
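In conftest.py, the already-installed path splits PROVIDER and PROVIDER_KEY_PATH into parallel space-separated lists and indexes one with the other, which raises IndexError mid-loop if the lists diverge in length. A sketch of the same pairing via `zip(..., strict=True)` (Python 3.10+) is shown below; the print is a placeholder for the real `ols_installer.create_secrets` call.

```python
# Sketch only: zip(..., strict=True) replaces the index-based pairing in
# pytest_sessionstart(); the print stands in for create_secrets().
import os

provider_list = os.getenv("PROVIDER", "openai").split()
creds_list = os.getenv("PROVIDER_KEY_PATH", "empty").split()

# strict=True raises ValueError on a length mismatch instead of silently
# truncating or, as with creds_list[i], raising IndexError partway through.
for prov, creds in zip(provider_list, creds_list, strict=True):
    print(f"create_secrets({prov!r}, {creds!r}, {len(provider_list)})")
```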
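The image override in `adapt_ols_config()` builds its JSON-patch payload with nested f-strings. For reference, the equivalent payload built via `json.dumps` is sketched below with a hypothetical image value; it produces the same document while sidestepping brace-escaping and quoting edge cases.

```python
# Equivalent to the f-string construction in adapt_ols_config(); the image
# value here is hypothetical.
import json

ols_image = "quay.io/example/lightspeed-service-api:dev"
patch = json.dumps(
    [
        {
            "op": "replace",
            "path": "/spec/template/spec/containers/0/image",
            "value": ols_image,
        }
    ]
)
# Passed as: oc patch deployment/lightspeed-app-server --type json -p <patch>
print(patch)
```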