diff --git a/Makefile b/Makefile index 33259cc38..8ab2f2917 100644 --- a/Makefile +++ b/Makefile @@ -3,9 +3,9 @@ .PHONY: help clean clean-env dev dev-http docs install bdist sdist test release check_dists \ clean-images clean-enterprise-gateway clean-demo-base clean-kernel-images clean-enterprise-gateway \ - clean-kernel-py clean-kernel-spark-py clean-kernel-r clean-kernel-spark-r clean-kernel-scala clean-kernel-tf-py \ + clean-kernel-py clean-kernel-spark-py clean-kernel-ray-py clean-kernel-r clean-kernel-spark-r clean-kernel-scala clean-kernel-tf-py \ clean-kernel-tf-gpu-py clean-kernel-image-puller push-images push-enterprise-gateway-demo push-demo-base \ - push-kernel-images push-enterprise-gateway push-kernel-py push-kernel-spark-py push-kernel-r push-kernel-spark-r \ + push-kernel-images push-enterprise-gateway push-kernel-py push-kernel-spark-py push-kernel-ray-py push-kernel-r push-kernel-spark-r \ push-kernel-scala push-kernel-tf-py push-kernel-tf-gpu-py push-kernel-image-puller publish helm-chart SA?=source activate @@ -155,9 +155,9 @@ docker-images: ## Build docker images (includes kernel-based images) kernel-images: ## Build kernel-based docker images # Actual working targets... -docker-images: demo-base enterprise-gateway-demo kernel-images enterprise-gateway kernel-py kernel-spark-py kernel-r kernel-spark-r kernel-scala kernel-tf-py kernel-tf-gpu-py kernel-image-puller +docker-images: demo-base enterprise-gateway-demo kernel-images enterprise-gateway kernel-py kernel-spark-py kernel-ray-py kernel-r kernel-spark-r kernel-scala kernel-tf-py kernel-tf-gpu-py kernel-image-puller -enterprise-gateway-demo kernel-images enterprise-gateway kernel-py kernel-spark-py kernel-r kernel-spark-r kernel-scala kernel-tf-py kernel-tf-gpu-py kernel-image-puller: +enterprise-gateway-demo kernel-images enterprise-gateway kernel-py kernel-spark-py kernel-ray-py kernel-r kernel-spark-r kernel-scala kernel-tf-py kernel-tf-gpu-py kernel-image-puller: make WHEEL_FILE=$(WHEEL_FILE) VERSION=$(VERSION) NO_CACHE=$(NO_CACHE) TAG=$(TAG) SPARK_VERSION=$(SPARK_VERSION) MULTIARCH_BUILD=$(MULTIARCH_BUILD) TARGET_ARCH=$(TARGET_ARCH) -C etc $@ demo-base: @@ -167,14 +167,14 @@ demo-base: clean-images: clean-demo-base ## Remove docker images (includes kernel-based images) clean-kernel-images: ## Remove kernel-based images -clean-images clean-enterprise-gateway-demo clean-kernel-images clean-enterprise-gateway clean-kernel-py clean-kernel-spark-py clean-kernel-r clean-kernel-spark-r clean-kernel-scala clean-kernel-tf-py clean-kernel-tf-gpu-py clean-kernel-image-puller: +clean-images clean-enterprise-gateway-demo clean-kernel-images clean-enterprise-gateway clean-kernel-py clean-kernel-spark-py clean-kernel-ray-py clean-kernel-r clean-kernel-spark-r clean-kernel-scala clean-kernel-tf-py clean-kernel-tf-gpu-py clean-kernel-image-puller: make WHEEL_FILE=$(WHEEL_FILE) VERSION=$(VERSION) TAG=$(TAG) -C etc $@ clean-demo-base: make WHEEL_FILE=$(WHEEL_FILE) VERSION=$(VERSION) TAG=$(SPARK_VERSION) -C etc $@ push-images: push-demo-base -push-images push-enterprise-gateway-demo push-kernel-images push-enterprise-gateway push-kernel-py push-kernel-spark-py push-kernel-r push-kernel-spark-r push-kernel-scala push-kernel-tf-py push-kernel-tf-gpu-py push-kernel-image-puller: +push-images push-enterprise-gateway-demo push-kernel-images push-enterprise-gateway push-kernel-py push-kernel-spark-py push-kernel-ray-py push-kernel-r push-kernel-spark-r push-kernel-scala push-kernel-tf-py push-kernel-tf-gpu-py push-kernel-image-puller: 
make WHEEL_FILE=$(WHEEL_FILE) VERSION=$(VERSION) TAG=$(TAG) -C etc $@ push-demo-base: diff --git a/enterprise_gateway/services/processproxies/container.py b/enterprise_gateway/services/processproxies/container.py index 6378b633f..24ce5ee2d 100644 --- a/enterprise_gateway/services/processproxies/container.py +++ b/enterprise_gateway/services/processproxies/container.py @@ -147,6 +147,8 @@ def poll(self) -> bool | None: # See https://github.com/jupyter-server/enterprise_gateway/issues/827 if container_status in self.get_initial_states(): result = None + + self.log.debug(f">>> container.poll(): {container_status} --> {result}") return result def send_signal(self, signum: int) -> bool | None: @@ -188,6 +190,7 @@ def shutdown_listener(self): async def confirm_remote_startup(self) -> None: """Confirms the container has started and returned necessary connection information.""" + self.log.debug(">>> container.confirm_remote_startup()") self.log.debug("Trying to confirm kernel container startup status") self.start_time = RemoteProcessProxy.get_current_time() i = 0 @@ -197,6 +200,9 @@ async def confirm_remote_startup(self) -> None: await self.handle_timeout() container_status = self.get_container_status(i) + self.log.debug( + f">>> container.confirm_remote_startup() - container_status: {container_status}" + ) if container_status: if container_status in self.get_error_states(): self.log_and_raise( @@ -204,14 +210,24 @@ async def confirm_remote_startup(self) -> None: reason=f"Error starting kernel container; status: '{container_status}'.", ) else: + self.log.debug( + f">>> container.confirm_remote_startup(): is host assigned => {self.assigned_host}" + ) + self.log.debug(">>> should call receive_connection_info()") if self.assigned_host: ready_to_connect = await self.receive_connection_info() + self.log.debug( + f">>> container.confirm_remote_startup(): ready to connect => {ready_to_connect}" + ) self.pid = ( 0 # We won't send process signals for kubernetes lifecycle management ) self.pgid = 0 else: self.detect_launch_failure() + self.log.debug( + f">>> container.confirm_remote_startup(): no host assigned yet (iteration {i})" + ) def get_process_info(self) -> dict[str, Any]: """Captures the base information necessary for kernel persistence relative to containers.""" diff --git a/enterprise_gateway/services/processproxies/crd.py b/enterprise_gateway/services/processproxies/crd.py index 54f24b5ca..4962c8cc1 100644 --- a/enterprise_gateway/services/processproxies/crd.py +++ b/enterprise_gateway/services/processproxies/crd.py @@ -74,11 +74,15 @@ def get_container_status(self, iteration: int | None) -> str: ) if custom_resource: - application_state = custom_resource['status']['applicationState']['state'].lower() + application_state = custom_resource.get("status", {}).get("state", "").lower() + + self.log.debug(f">>> crd.get_container_status: {application_state}") if application_state in self.get_error_states(): exception_text = self._get_exception_text( - custom_resource['status']['applicationState']['errorMessage'] + custom_resource.get("status", {}) .get("applicationState", {}) .get("errorMessage") ) error_message = ( f"CRD submission for kernel {self.kernel_id} failed: {exception_text}" diff --git a/enterprise_gateway/services/processproxies/k8s.py b/enterprise_gateway/services/processproxies/k8s.py index ea4a1822b..14404df2a 100644 --- a/enterprise_gateway/services/processproxies/k8s.py +++ b/enterprise_gateway/services/processproxies/k8s.py @@ -115,6 +115,7 @@ def get_container_status(self,
iteration: int | None) -> str: self.container_name = pod_info.metadata.name if pod_info.status: pod_status = pod_info.status.phase.lower() + self.log.debug(f">>> k8s.get_container_status: {pod_status}") if pod_status == "running" and not self.assigned_host: # Pod is running, capture IP self.assigned_ip = pod_info.status.pod_ip @@ -128,6 +129,7 @@ def get_container_status(self, iteration: int | None) -> str: f"Status: '{pod_status}', Pod IP: '{self.assigned_ip}', KernelID: '{self.kernel_id}'" ) + self.log.debug(f">>> k8s.get_container_status: {pod_status}") return pod_status def delete_managed_object(self, termination_stati: list[str]) -> bool: diff --git a/enterprise_gateway/services/processproxies/processproxy.py b/enterprise_gateway/services/processproxies/processproxy.py index 405adfbca..22e01c486 100644 --- a/enterprise_gateway/services/processproxies/processproxy.py +++ b/enterprise_gateway/services/processproxies/processproxy.py @@ -201,6 +201,7 @@ def register_event(self, kernel_id: str) -> None: async def get_connection_info(self, kernel_id: str) -> dict: """Performs a timeout wait on the event, returning the connection information on completion.""" + self.log.debug(f">>> processproxy.get_connection_info() for kernel_id {kernel_id}") await asyncio.wait_for(self._response_registry[kernel_id].wait(), connection_interval) return self._response_registry.pop(kernel_id).response @@ -1300,9 +1301,13 @@ async def receive_connection_info(self) -> bool: """ # Polls the socket using accept. When data is found, returns ready indicator and encrypted data. ready_to_connect = False - + self.log.debug( + f">>> processproxy.receive_connection_info(): initializing ready to connect as {ready_to_connect}" + ) try: connect_info = await self.response_manager.get_connection_info(self.kernel_id) + self.log.debug(">>> processproxy.receive_connection_info(): connect info received") + self.log.debug(connect_info) self._setup_connection_info(connect_info) ready_to_connect = True except Exception as e: @@ -1320,6 +1325,9 @@ async def receive_connection_info(self) -> bool: self.kill() self.log_and_raise(http_status_code=500, reason=error_message) + self.log.debug( + f">>> processproxy.receive_connection_info(): returning ready to connect {ready_to_connect}" + ) return ready_to_connect def _setup_connection_info(self, connect_info: dict) -> None: diff --git a/enterprise_gateway/services/processproxies/ray_operator.py b/enterprise_gateway/services/processproxies/ray_operator.py new file mode 100644 index 000000000..41cea3c6a --- /dev/null +++ b/enterprise_gateway/services/processproxies/ray_operator.py @@ -0,0 +1,211 @@ +"""A Ray operator process proxy.""" + +from __future__ import annotations + +from typing import Any + +from kubernetes import client + +from ..kernels.remotemanager import RemoteKernelManager +from .k8s import KubernetesProcessProxy + + +class RayOperatorProcessProxy(KubernetesProcessProxy): + """Ray operator process proxy.""" + + object_kind = "RayCluster" + + def __init__(self, kernel_manager: RemoteKernelManager, proxy_config: dict): + """Initialize the proxy.""" + super().__init__(kernel_manager, proxy_config) + self.group = "ray.io" + self.version = "v1alpha1" + self.plural = "rayclusters" + + async def launch_process( + self, kernel_cmd: str, **kwargs: dict[str, Any] | None ) -> RayOperatorProcessProxy: + """Launch the process for a kernel.""" + self.kernel_resource_name = self._determine_kernel_pod_name(**kwargs) + kwargs["env"]["KERNEL_RESOURCE_NAME"]
= self.kernel_resource_name + kwargs["env"]["KERNEL_CRD_GROUP"] = self.group + kwargs["env"]["KERNEL_CRD_VERSION"] = self.version + kwargs["env"]["KERNEL_CRD_PLURAL"] = self.plural + + await super().launch_process(kernel_cmd, **kwargs) + return self + + def get_container_status(self, iteration: int | None) -> str: + """Determines submitted Ray application status and returns unified pod state. + + This method returns the pod status (not CRD status) to maintain compatibility + with the base class lifecycle management. The RayCluster CRD state is checked + first to ensure the cluster is healthy, but we return pod states that the + base class understands: 'pending', 'running', 'failed', etc. + """ + head_pod_status = None + application_state = self._get_application_state() + if application_state: + self.log.debug( + f">>> ray_operator.get_container_status: application_state {application_state}" + ) + + # Check for CRD-level errors first + if application_state in self.get_error_states(): + error_message = ( + f"CRD submission for kernel {self.kernel_id} failed with state: {application_state}" + ) + self.log.error(error_message) + return "failed" # Return pod state, not CRD state + + # If CRD is not ready yet, return "pending" to indicate still launching + if application_state != "ready": + self.log.debug( + f">>> ray_operator.get_container_status: CRD not ready yet, state={application_state}" + ) + return "pending" + + # CRD is ready, now check the actual pod status + kernel_label_selector = "kernel_id=" + self.kernel_id + ",component=kernel" + ret = None + try: + ret = client.CoreV1Api().list_namespaced_pod( + namespace=self.kernel_namespace, label_selector=kernel_label_selector + ) + except client.rest.ApiException as e: + if e.status == 404: + self.log.debug("Resetting cluster connection info as cluster deleted") + self._reset_connection_info() + return None + + if ret and ret.items: + pod_info = ret.items[0] + self.log.debug( + f"Cluster status {application_state}, pod status {pod_info.status.phase.lower()}" + ) + if pod_info.status: + head_pod_status = pod_info.status.phase.lower() + self.log.debug( + f">>> ray_operator.get_container_status: pod_status {head_pod_status}" + ) + if head_pod_status == "running": + self.log.debug( + f"Pod Info name:{pod_info.metadata.name}, pod ip {pod_info.status.pod_ip}, host {self.container_name}" + ) + self.container_name = pod_info.metadata.name + self.assigned_ip = pod_info.status.pod_ip + self.assigned_host = self.container_name + self.assigned_node_ip = pod_info.status.host_ip + + # only log if iteration is not None (otherwise poll() is too noisy) + # check for running state to avoid double logging with superclass + if iteration and head_pod_status != 'running': + self.log.debug( + f"{iteration}: Waiting for CRD status from resource manager {self.object_kind.lower()} in " + f"namespace '{self.kernel_namespace}'. 
Name: '{self.kernel_resource_name}', " + f"Status: CRD='{application_state}', Pod='{head_pod_status}', KernelID: '{self.kernel_id}'" + ) + + # KEY FIX: Return pod status (not CRD state) so base class poll() works correctly + final_status = head_pod_status if head_pod_status else "pending" + self.log.debug( + f">>> ray_operator.get_container_status: returning pod_status={final_status} " + f"(CRD state was {application_state})" + ) + return final_status + + def delete_managed_object(self, termination_stati: list[str]) -> bool: + """Deletes the object managed by this process-proxy. + + A return value of True indicates the object is considered deleted, + otherwise a False or None value is returned. + + Note: the caller is responsible for handling exceptions. + """ + delete_status = client.CustomObjectsApi().delete_namespaced_custom_object( + self.group, + self.version, + self.kernel_namespace, + self.plural, + self.kernel_resource_name, + grace_period_seconds=0, + propagation_policy="Background", + ) + + result = delete_status and delete_status.get("status", None) in termination_stati + if result: + self._reset_connection_info() + return result + + def get_initial_states(self) -> set: + """Return the set of states indicating the container is starting (includes running). + + Note: We return pod states (not CRD states) to maintain compatibility + with the base class poll() implementation, which checks if the status + returned by get_container_status() is in this set. + """ + return {"pending", "running"} + + def get_error_states(self) -> set: + """Return the set of states indicating the RayCluster has failed.""" + # Ray doesn't typically use "failed" state, but we'll include common error states + return {"failed", "error", "unhealthy"} + + def _get_ray_cluster_status(self) -> dict | None: + try: + return client.CustomObjectsApi().get_namespaced_custom_object( + self.group, + self.version, + self.kernel_namespace, + self.plural, + self.kernel_resource_name, + ) + except client.rest.ApiException as e: + if e.status == 404: + self.log.debug("Resetting cluster connection info as cluster deleted") + self._reset_connection_info() + return None + + def _get_application_state(self): + custom_resource = self._get_ray_cluster_status() + + if custom_resource is None: + return None + + if 'status' not in custom_resource or 'state' not in custom_resource['status']: + return None + + return custom_resource['status']['state'].lower() + + def _get_pod_status(self) -> str: + """Get the current status of the kernel pod. + Returns + ------- + str + The pod status in lowercase (e.g., 'pending', 'running', 'failed', 'unknown'). + """ + pod_status = "unknown" + kernel_label_selector = "kernel_id=" + self.kernel_id + ",component=kernel" + ret = client.CoreV1Api().list_namespaced_pod( + namespace=self.kernel_namespace, label_selector=kernel_label_selector + ) + if ret and ret.items: + pod_info = ret.items[0] + self.container_name = pod_info.metadata.name + if pod_info.status: + pod_status = pod_info.status.phase.lower() + self.log.debug(f">>> ray_operator._get_pod_status: {pod_status}") + + return pod_status + + def _reset_connection_info(self): + """Reset all connection-related attributes to their initial state. + This is typically called when a cluster is deleted or connection is lost. 
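+ Once cleared, subsequent get_container_status() calls will re-discover the head pod from scratch.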
+ """ + + self.assigned_host = None + self.container_name = "" + self.assigned_node_ip = None + self.assigned_ip = None diff --git a/enterprise_gateway/services/sessions/kernelsessionmanager.py b/enterprise_gateway/services/sessions/kernelsessionmanager.py index f4e73ca93..f44622bce 100644 --- a/enterprise_gateway/services/sessions/kernelsessionmanager.py +++ b/enterprise_gateway/services/sessions/kernelsessionmanager.py @@ -94,6 +94,7 @@ def create_session(self, kernel_id: str, **kwargs) -> None: Information used for the launch of the kernel """ + self.log.debug(f">>> Creating new session for kernel {kernel_id}") km = self.kernel_manager.get_kernel(kernel_id) # Compose the kernel_session entry @@ -103,11 +104,14 @@ def create_session(self, kernel_id: str, **kwargs) -> None: kernel_session["kernel_name"] = km.kernel_name # Build the inner dictionaries: connection_info, process_proxy and add to kernel_session + self.log.debug(f">>> Getting connection info for kernel {kernel_id}") kernel_session["connection_info"] = km.get_connection_info() kernel_session["launch_args"] = kwargs.copy() + self.log.debug(f">>> Getting process info for kernel {kernel_id}") kernel_session["process_info"] = ( km.process_proxy.get_process_info() if km.process_proxy else {} ) + self.log.debug(f">>> Saving session {kernel_session}") self._save_session(kernel_id, kernel_session) def refresh_session(self, kernel_id: str) -> None: diff --git a/etc/Makefile b/etc/Makefile index 08b54ecb6..9a23c7718 100644 --- a/etc/Makefile +++ b/etc/Makefile @@ -58,7 +58,6 @@ TOREE_LAUNCHER_FILES:=$(shell find kernel-launchers/scala/toree-launcher/src -ty @echo ../build/kernelspecs/{python,R,scala,python_tf,python_tf_gpu}_kubernetes | xargs -t -n 1 cp -r kernel-launchers/kubernetes/* @echo ../build/kernelspecs/spark_{python,R,scala}_kubernetes | xargs -t -n 1 cp -r kernel-launchers/kubernetes/* @echo ../build/kernelspecs/{python,R,scala,python_tf,python_tf_gpu}_docker | xargs -t -n 1 cp -r kernel-launchers/docker/* - @echo ../build/kernelspecs/spark_python_operator | xargs -t -n 1 cp -r kernel-launchers/operators/* # Populate kernel resources. Because tensorflow is also python, it should be last. @echo ../build/kernelspecs/*R* | xargs -t -n 1 cp -r kernel-resources/ir/* @echo ../build/kernelspecs/*scala* | xargs -t -n 1 cp -r kernel-resources/apache_toree/* @@ -66,6 +65,12 @@ TOREE_LAUNCHER_FILES:=$(shell find kernel-launchers/scala/toree-launcher/src -ty @echo ../build/kernelspecs/*tf* | xargs -t -n 1 cp -r kernel-resources/tensorflow/* # Perform the copy again to enable local, per-kernel, overrides cp -r kernelspecs ../build + # Operator kernelspecs get launcher files after the override to preserve scripts + @echo ../build/kernelspecs/spark_python_operator | xargs -t -n 1 cp -r kernel-launchers/operators/* + @rm -f ../build/kernelspecs/spark_python_operator/scripts/ray.io-v1alpha1.yaml.j2 + @echo ../build/kernelspecs/ray_python_operator | xargs -t -n 1 cp -r kernel-launchers/operators/* + @rm -f ../build/kernelspecs/ray_python_operator/scripts/sparkoperator.k8s.io-v1beta2.yaml.j2 + @echo ../build/kernelspecs/ray_python_operator | xargs -t -n 1 cp -r kernel-resources/ray/* @(cd ../build/kernelspecs; find . -name 'kernel.json' -print0 | xargs -0 sed -i.bak "s/VERSION/$(TAG)/g"; find . 
-name *.bak -print0 | xargs -0 rm -f) @mkdir -p ../dist @@ -105,31 +110,31 @@ kernel_image_files: ../build/kernel_image_files # Docker image build section *********************************************** # -KERNEL_IMAGES := kernel-py kernel-spark-py kernel-r kernel-spark-r kernel-scala kernel-tf-py kernel-tf-gpu-py +KERNEL_IMAGES := kernel-py kernel-spark-py kernel-ray-py kernel-r kernel-spark-r kernel-scala kernel-tf-py kernel-tf-gpu-py DOCKER_IMAGES := demo-base enterprise-gateway-demo enterprise-gateway kernel-image-puller $(KERNEL_IMAGES) PUSHED_IMAGES := demo-base enterprise-gateway-demo enterprise-gateway kernel-image-puller $(KERNEL_IMAGES) docker-images: $(DOCKER_IMAGES) kernel-images: $(KERNEL_IMAGES) -push-images: push-enterprise-gateway-demo push-enterprise-gateway push-kernel-py push-kernel-spark-py push-kernel-tf-py push-kernel-r push-kernel-spark-r push-kernel-scala push-kernel-image-puller +push-images: push-enterprise-gateway-demo push-enterprise-gateway push-kernel-py push-kernel-spark-py push-kernel-ray-py push-kernel-tf-py push-kernel-r push-kernel-spark-r push-kernel-scala push-kernel-image-puller clean-images: clean-enterprise-gateway-demo clean-demo-base clean-enterprise-gateway clean-kernel-image-puller clean-kernel-images -clean-kernel-images: clean-kernel-py clean-kernel-spark-py clean-kernel-tf-py clean-kernel-tf-gpu-py clean-kernel-r clean-kernel-spark-r clean-kernel-scala +clean-kernel-images: clean-kernel-py clean-kernel-spark-py clean-kernel-ray-py clean-kernel-tf-py clean-kernel-tf-gpu-py clean-kernel-r clean-kernel-spark-r clean-kernel-scala # Extra dependencies for each docker image... DEPENDS_demo-base: DEPENDS_enterprise-gateway-demo: $(FILE_kernelspecs_all) DEPENDS_enterprise-gateway: $(FILE_kernelspecs_all) DEPENDS_kernel-image-puller: -DEPENDS_kernel-py DEPENDS_kernel-spark-py DEPENDS_kernel-r DEPENDS_kernel-spark-r DEPENDS_kernel-scala DEPENDS_kernel-tf-py DEPENDS_kernel-tf-gpu-py: $(FILE_kernelspecs_kubernetes) $(FILE_kernelspecs_docker) +DEPENDS_kernel-py DEPENDS_kernel-spark-py DEPENDS_kernel-ray-py DEPENDS_kernel-r DEPENDS_kernel-spark-r DEPENDS_kernel-scala DEPENDS_kernel-tf-py DEPENDS_kernel-tf-gpu-py: $(FILE_kernelspecs_kubernetes) $(FILE_kernelspecs_docker) # Extra targets for each docker image... TARGETS_demo-base: TARGETS_kernel-image-puller: TARGETS_enterprise-gateway TARGETS_enterprise-gateway-demo: kernelspecs @make -C .. bdist -TARGETS_kernel-py TARGETS_kernel-spark-py TARGETS_kernel-r TARGETS_kernel-spark-r TARGETS_kernel-scala TARGETS_kernel-tf-py TARGETS_kernel-tf-gpu-py: kernelspecs +TARGETS_kernel-py TARGETS_kernel-spark-py TARGETS_kernel-ray-py TARGETS_kernel-r TARGETS_kernel-spark-r TARGETS_kernel-scala TARGETS_kernel-tf-py TARGETS_kernel-tf-gpu-py: kernelspecs # Extra files for each docker image... 
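# (Each FILES_<image> list below names the dist artifacts staged into that image's docker build context.)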
FILES_demo-base := @@ -138,6 +143,7 @@ FILES_enterprise-gateway-demo := ../dist/jupyter_enterprise_gateway_kernelspecs- FILES_enterprise-gateway := ../dist/jupyter_enterprise_gateway_kernel_image_files* ../dist/jupyter_enterprise_gateway_kernelspecs-* ../dist/jupyter_enterprise_gateway*.whl FILES_kernel-py := ../dist/jupyter_enterprise_gateway_kernel_image_files* FILES_kernel-spark-py := ../dist/jupyter_enterprise_gateway_kernel_image_files* +FILES_kernel-ray-py := ../dist/jupyter_enterprise_gateway_kernel_image_files* FILES_kernel-tf-py := ../dist/jupyter_enterprise_gateway_kernel_image_files* FILES_kernel-tf-gpu-py := ../dist/jupyter_enterprise_gateway_kernel_image_files* FILES_kernel-r := ../dist/jupyter_enterprise_gateway_kernel_image_files* diff --git a/etc/docker/demo-base/Dockerfile b/etc/docker/demo-base/Dockerfile index 9b484c507..47c3aa8a4 100644 --- a/etc/docker/demo-base/Dockerfile +++ b/etc/docker/demo-base/Dockerfile @@ -27,8 +27,8 @@ ENV SHELL=/bin/bash \ ENV HOME=/home/$NB_USER \ PATH=$JAVA_HOME/bin:$ANACONDA_HOME/bin:$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH -ENV SPARK_VER $SPARK_VERSION -ENV HADOOP_VER 3.3.1 +ENV SPARK_VER=$SPARK_VERSION +ENV HADOOP_VER=3.3.1 # INSTALL / DOWNLOAD ALL NEEDED PACKAGES RUN dpkg --purge --force-depends ca-certificates-java \ diff --git a/etc/docker/enterprise-gateway/Dockerfile b/etc/docker/enterprise-gateway/Dockerfile index 08b640771..5341fe714 100644 --- a/etc/docker/enterprise-gateway/Dockerfile +++ b/etc/docker/enterprise-gateway/Dockerfile @@ -4,8 +4,8 @@ FROM $BASE_CONTAINER ARG SPARK_VERSION -ENV SPARK_VER $SPARK_VERSION -ENV SPARK_HOME /opt/spark +ENV SPARK_VER=$SPARK_VERSION +ENV SPARK_HOME=/opt/spark RUN mamba install --quiet --yes \ @@ -20,9 +20,9 @@ RUN mamba install --quiet --yes \ USER root -RUN apt update && apt install -yq curl openjdk-8-jdk +RUN apt update && apt install -yq curl openjdk-8-jdk iputils-ping telnet netcat-openbsd net-tools iproute2 dnsutils -ENV JAVA_HOME /usr/lib/jvm/java +ENV JAVA_HOME=/usr/lib/jvm/java RUN ln -s $(readlink -f /usr/bin/javac | sed "s:/bin/javac::") ${JAVA_HOME} # Download and install Spark @@ -53,6 +53,6 @@ USER jovyan CMD ["/usr/local/bin/start-enterprise-gateway.sh"] -EXPOSE 8888 +EXPOSE 8888 8877 WORKDIR /usr/local/bin diff --git a/etc/docker/kernel-image-puller/Dockerfile b/etc/docker/kernel-image-puller/Dockerfile index 271e60df5..1683aae70 100644 --- a/etc/docker/kernel-image-puller/Dockerfile +++ b/etc/docker/kernel-image-puller/Dockerfile @@ -17,11 +17,11 @@ RUN apt-get update && apt-get install cri-tools RUN echo $PATH # The following environment variables are supported - defaults provided. Override as needed. 
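# e.g. (illustrative): docker run -e KIP_NUM_PULLERS=4 -e KIP_LOG_LEVEL=DEBUG <kernel-image-puller-image>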
-ENV KIP_GATEWAY_HOST http://localhost:8888 -ENV KIP_INTERVAL 300 -ENV KIP_LOG_LEVEL INFO -ENV KIP_NUM_PULLERS 2 -ENV KIP_NUM_RETRIES 3 -ENV KIP_PULL_POLICY 'IfNotPresent' +ENV KIP_GATEWAY_HOST=http://localhost:8888 +ENV KIP_INTERVAL=300 +ENV KIP_LOG_LEVEL=INFO +ENV KIP_NUM_PULLERS=2 +ENV KIP_NUM_RETRIES=3 +ENV KIP_PULL_POLICY='IfNotPresent' CMD [ "python", "./kernel_image_puller.py" ] diff --git a/etc/docker/kernel-py/Dockerfile b/etc/docker/kernel-py/Dockerfile index e967509bb..3f8a6fed1 100644 --- a/etc/docker/kernel-py/Dockerfile +++ b/etc/docker/kernel-py/Dockerfile @@ -5,7 +5,7 @@ FROM $BASE_CONTAINER ENV PATH=$PATH:$CONDA_DIR/bin # Add debugger support -RUN pip install --upgrade ipykernel +RUN pip install --upgrade --no-cache-dir ipykernel RUN conda install --quiet --yes \ cffi \ @@ -29,7 +29,7 @@ RUN chown jovyan:users /usr/local/bin/bootstrap-kernel.sh && \ USER jovyan -ENV KERNEL_LANGUAGE python +ENV KERNEL_LANGUAGE=python # Disble healthcheck inherited from notebook image HEALTHCHECK NONE diff --git a/etc/docker/kernel-r/Dockerfile b/etc/docker/kernel-r/Dockerfile index c615674d5..be8b376df 100644 --- a/etc/docker/kernel-r/Dockerfile +++ b/etc/docker/kernel-r/Dockerfile @@ -25,7 +25,7 @@ RUN chown jovyan:users /usr/local/bin/bootstrap-kernel.sh && \ USER jovyan -ENV KERNEL_LANGUAGE R +ENV KERNEL_LANGUAGE=R # Disble healthcheck inherited from notebook image HEALTHCHECK NONE diff --git a/etc/docker/kernel-ray-py/Dockerfile b/etc/docker/kernel-ray-py/Dockerfile new file mode 100644 index 000000000..cb3dad0bf --- /dev/null +++ b/etc/docker/kernel-ray-py/Dockerfile @@ -0,0 +1,51 @@ +# Ray 2.50.0 with Python 3.11 +# rayproject/ray:2.50.0.714bc0-extra-py311-cpu +ARG BASE_CONTAINER=rayproject/ray:2.50.0.714bc0-extra-py311-cpu +FROM $BASE_CONTAINER + +# Add debugger support +RUN pip install --upgrade --no-cache-dir ipykernel + +RUN pip install --upgrade --no-cache-dir \ + "jupyter_client>=6.1,<7" \ + "jupyter_server>=1.7,<2" \ + "pyzmq>=20.0.0,<25" \ + "ray[data]==2.50.0" \ + ipykernel \ + cffi \ + future \ + pycryptodomex + +ADD jupyter_enterprise_gateway_kernel_image_files*.tar.gz /usr/local/bin/ + +USER root + +RUN apt-get update && apt-get install -yq --no-install-recommends \ + libkrb5-dev \ + iputils-ping \ + telnet \ + netcat-openbsd \ + net-tools \ + iproute2 \ + dnsutils \ + curl \ + less \ + && rm -rf /var/lib/apt/lists/* + +# Set up permissions for ray user (Ray base image uses 'ray' user) +RUN chown ray:users /usr/local/bin/bootstrap-kernel.sh && \ + chmod 0755 /usr/local/bin/bootstrap-kernel.sh && \ + chown -R ray:users /usr/local/bin/kernel-launchers + +USER ray + +ENV KERNEL_LANGUAGE=python +ENV RAY_HOME=/home/ray + +WORKDIR /home/ray + +# Disable any healthcheck inherited from the base image +HEALTHCHECK NONE + + +CMD /usr/local/bin/bootstrap-kernel.sh diff --git a/etc/docker/kernel-ray-py/README.md b/etc/docker/kernel-ray-py/README.md new file mode 100644 index 000000000..da9a403ac --- /dev/null +++ b/etc/docker/kernel-ray-py/README.md @@ -0,0 +1,16 @@ +This image enables the use of an IPython kernel launched from [Jupyter Enterprise Gateway](https://jupyter-enterprise-gateway.readthedocs.io/en/latest/) within a Kubernetes cluster. It is built on the base image [rayproject/ray:2.50.0.714bc0-extra-py311-cpu](https://hub.docker.com/r/rayproject/ray/), and provides [Ray 2.50.0](https://docs.ray.io/) for distributed Python computing. 
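+
+As a quick smoke test, a minimal Ray sketch like the following can be run from a notebook backed by this kernel (it assumes the kernel launcher has already started or joined a Ray cluster, as the RayCluster templates in this repository do):
+
+```python
+import ray
+
+# Attach to the Ray cluster this kernel pod belongs to.
+ray.init(address="auto")
+
+@ray.remote
+def square(x: int) -> int:
+    return x * x
+
+# Executes remotely on the cluster's workers.
+print(ray.get([square.remote(i) for i in range(4)]))  # [0, 1, 4, 9]
+```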
+ +# What it Gives You + +- IPython kernel support (with debugger) +- Ray 2.50.0 for distributed computing +- Python 3.11 environment +- Ray on Kubernetes support from within a Jupyter Notebook + +# Basic Use + +Deploy [enterprise-gateway](https://hub.docker.com/r/elyra/enterprise-gateway/) per its instructions, configured for the appropriate environment. + +Launch a gateway-enabled Jupyter Notebook application against the Enterprise Gateway instance and pick the Ray kernel to use in your notebook. + +For more information, check our [repo](https://github.com/jupyter-server/enterprise_gateway) and [docs](https://jupyter-enterprise-gateway.readthedocs.io/en/latest/). diff --git a/etc/docker/kernel-scala/Dockerfile b/etc/docker/kernel-scala/Dockerfile index d3146da48..e26a91d46 100644 --- a/etc/docker/kernel-scala/Dockerfile +++ b/etc/docker/kernel-scala/Dockerfile @@ -17,5 +17,5 @@ RUN adduser --system -uid 1000 jovyan --ingroup users && \ chown -R jovyan:users /usr/local/bin/kernel-launchers USER jovyan -ENV KERNEL_LANGUAGE scala +ENV KERNEL_LANGUAGE=scala CMD /usr/local/bin/bootstrap-kernel.sh diff --git a/etc/docker/kernel-spark-py/Dockerfile b/etc/docker/kernel-spark-py/Dockerfile index ed6f1a3d0..86ac97193 100644 --- a/etc/docker/kernel-spark-py/Dockerfile +++ b/etc/docker/kernel-spark-py/Dockerfile @@ -7,11 +7,11 @@ FROM $BASE_CONTAINER ARG SPARK_VERSION -ENV SPARK_VER $SPARK_VERSION -ENV SPARK_HOME /opt/spark -ENV KERNEL_LANGUAGE python -ENV R_LIBS_USER $R_LIBS_USER:${SPARK_HOME}/R/lib -ENV PATH $PATH:$SPARK_HOME/bin +ENV SPARK_VER=$SPARK_VERSION +ENV SPARK_HOME=/opt/spark +ENV KERNEL_LANGUAGE=python +ENV R_LIBS_USER=$R_LIBS_USER:${SPARK_HOME}/R/lib +ENV PATH=$PATH:$SPARK_HOME/bin USER root @@ -26,7 +26,7 @@ RUN dpkg --purge --force-depends ca-certificates-java \ libssl-dev \ && rm -rf /var/lib/apt/lists/* -ENV JAVA_HOME /usr/lib/jvm/java +ENV JAVA_HOME=/usr/lib/jvm/java RUN ln -s $(readlink -f /usr/bin/javac | sed "s:/bin/javac::") ${JAVA_HOME} # Download and install Spark diff --git a/etc/docker/kernel-spark-r/Dockerfile b/etc/docker/kernel-spark-r/Dockerfile index 5e92caeaa..df1f6a0ed 100644 --- a/etc/docker/kernel-spark-r/Dockerfile +++ b/etc/docker/kernel-spark-r/Dockerfile @@ -8,11 +8,11 @@ ARG SPARK_VERSION USER root -ENV SPARK_VER $SPARK_VERSION -ENV SPARK_HOME /opt/spark +ENV SPARK_VER=$SPARK_VERSION +ENV SPARK_HOME=/opt/spark ENV KERNEL_LANGUAGE=R -ENV R_LIBS_USER $R_LIBS_USER:${R_HOME}/library:${SPARK_HOME}/R/lib -ENV PATH $PATH:$SPARK_HOME/bin +ENV R_LIBS_USER=$R_LIBS_USER:${R_HOME}/library:${SPARK_HOME}/R/lib +ENV PATH=$PATH:$SPARK_HOME/bin RUN dpkg --purge --force-depends ca-certificates-java \ && apt-get update \ @@ -23,7 +23,7 @@ RUN dpkg --purge --force-depends ca-certificates-java \ libssl-dev \ && rm -rf /var/lib/apt/lists/* -ENV JAVA_HOME /usr/lib/jvm/java +ENV JAVA_HOME=/usr/lib/jvm/java RUN ln -s $(readlink -f /usr/bin/javac | sed "s:/bin/javac::") ${JAVA_HOME} # Download and install Spark diff --git a/etc/docker/kernel-tf-gpu-py/Dockerfile b/etc/docker/kernel-tf-gpu-py/Dockerfile index d6b6c5d27..354465299 100644 --- a/etc/docker/kernel-tf-gpu-py/Dockerfile +++ b/etc/docker/kernel-tf-gpu-py/Dockerfile @@ -27,5 +27,5 @@ RUN adduser --system --uid 1000 --gid 100 jovyan && \ USER jovyan -ENV KERNEL_LANGUAGE python +ENV KERNEL_LANGUAGE=python CMD /usr/local/bin/bootstrap-kernel.sh diff --git a/etc/docker/kernel-tf-py/Dockerfile b/etc/docker/kernel-tf-py/Dockerfile index b6b7e225e..783faa594 100644 --- a/etc/docker/kernel-tf-py/Dockerfile +++ 
b/etc/docker/kernel-tf-py/Dockerfile @@ -4,7 +4,7 @@ ARG BASE_CONTAINER=jupyter/tensorflow-notebook:2023-10-20 FROM $BASE_CONTAINER -ENV KERNEL_LANGUAGE python +ENV KERNEL_LANGUAGE=python ADD jupyter_enterprise_gateway_kernel_image_files*.tar.gz /usr/local/bin/ diff --git a/etc/kernel-launchers/operators/scripts/launch_custom_resource.py b/etc/kernel-launchers/operators/scripts/launch_custom_resource.py index 371d18b2d..9a6e0379a 100644 --- a/etc/kernel-launchers/operators/scripts/launch_custom_resource.py +++ b/etc/kernel-launchers/operators/scripts/launch_custom_resource.py @@ -76,6 +76,7 @@ def launch_custom_resource_kernel( kernel_crd_template = keywords["kernel_crd_group"] + "-" + keywords["kernel_crd_version"] custom_resource_yaml = generate_kernel_custom_resource_yaml(kernel_crd_template, keywords) + print(f">>> Generated YAML \n{custom_resource_yaml}") kernel_namespace = keywords["kernel_namespace"] group = keywords["kernel_crd_group"] diff --git a/etc/kernel-launchers/operators/scripts/ray.io-v1alpha1-all-in-head.yaml.j2 b/etc/kernel-launchers/operators/scripts/ray.io-v1alpha1-all-in-head.yaml.j2 new file mode 100644 index 000000000..ee51806ea --- /dev/null +++ b/etc/kernel-launchers/operators/scripts/ray.io-v1alpha1-all-in-head.yaml.j2 @@ -0,0 +1,318 @@ +apiVersion: ray.io/v1alpha1 +kind: RayCluster +metadata: + labels: + controller-tools.k8s.io: "1.0" + ray.io/cluster-name: "{{ kernel_resource_name }}" + annotations: + ray.io/ft-enabled: "false" # Disable GCS FT for faster startup + name: {{ kernel_resource_name }} +spec: + enableInTreeAutoscaling: true + autoscalerOptions: + upscalingMode: Aggressive + idleTimeoutSeconds: 3600 + imagePullPolicy: Always + resources: + limits: + cpu: 1 + memory: "1Gi" + requests: + cpu: 1 + memory: "1Gi" +########################################## +## HEAD Node group spec +########################################## + headGroupSpec: + serviceType: ClusterIP # optional + # the following params are used to complete the ray start: ray start --head --block --port=6379 ... 
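+ # (KubeRay renders each key/value pair under rayStartParams as a --<key>=<value> flag on the generated "ray start" command)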
+ rayStartParams: + disable-usage-stats: 'true' + dashboard-host: '0.0.0.0' + block: 'true' + template: + metadata: + labels: + kernel_id: "{{ kernel_id }}" + app: enterprise-gateway + component: kernel + annotations: + cluster-autoscaler.kubernetes.io/safe-to-evict: "false" + spec: + restartPolicy: OnFailure + serviceAccountName: "{{ kernel_service_account_name }}" +# nodeSelector: +# node.kubernetes.io/instance-type: m5d.8xlarge + containers: + # Combined Ray head + Jupyter kernel container + - name: ray-head + image: {{ kernel_image }} + imagePullPolicy: Always + securityContext: + allowPrivilegeEscalation: false + runAsUser: 0 + resources: + limits: + cpu: {{ kernel_head_num_cpu_limit | default(3)}} + memory: {{ kernel_head_memory_limit | default("6Gi")}} + requests: + cpu: {{ kernel_head_num_cpu_request | default(2)}} + memory: {{ kernel_head_memory_request | default("4Gi")}} + env: + - name: RAY_ADDRESS + value: "127.0.0.1:6379" + - name: RAY_PORT + value: "6379" + - name: RAY_TMPDIR + value: "/tmp/ray" + - name: SERVE_URI + value: "{{ kernel_serve_url }}" + - name: BUILD_URI + value: "{{ kernel_build_url }}" + - name: EG_LOG_LEVEL + value: "0" + - name: KERNEL_ID + value: "{{ kernel_id }}" + - name: EG_RESPONSE_ADDRESS + value: "{{ eg_response_address }}" + - name: EG_PORT_RANGE + value: "{{ eg_port_range }}" + - name: EG_PUBLIC_KEY + value: "{{ eg_public_key }}" + volumeMounts: + - name: ray-logs + mountPath: /tmp/ray + ports: + - containerPort: 6379 + name: gcs + protocol: TCP + - containerPort: 8000 + name: serve + protocol: TCP + - containerPort: 8265 + name: dashboard + protocol: TCP + - containerPort: 10001 + name: client + protocol: TCP + - containerPort: 44217 + name: as-metrics + protocol: TCP + - containerPort: 44227 + name: dash-metrics + protocol: TCP + command: + - "/bin/bash" + - "-c" + - | + set -e + echo "Starting Ray head node..." + + # Ensure /tmp/ray directory exists and has correct permissions + mkdir -p /tmp/ray + chmod 777 /tmp/ray + + # Set RAY_TMPDIR to ensure consistent session directory + export RAY_TMPDIR=/tmp/ray + + # Start Ray head in the background + ray start --head \ + --port=6379 \ + --dashboard-host=0.0.0.0 \ + --dashboard-port=8265 \ + --ray-client-server-port=10001 \ + --disable-usage-stats \ + --block & + + RAY_PID=$! + echo "Ray head started with PID $RAY_PID" + + # Wait for Ray GCS to be ready + echo "Waiting for Ray GCS to be ready on port 6379..." + timeout=60 + elapsed=0 + while ! nc -z 127.0.0.1 6379; do + if [ $elapsed -ge $timeout ]; then + echo "ERROR: Ray GCS failed to start within ${timeout}s" + exit 1 + fi + echo "Waiting for Ray GCS... (${elapsed}s/${timeout}s)" + sleep 2 + elapsed=$((elapsed + 2)) + done + echo "Ray GCS is ready!" + + # Check Ray cluster status (non-fatal) + ray status || echo "Warning: ray status command failed, but GCS is available" + + # Wait for Ray dashboard to be ready + echo "Waiting for Ray dashboard to be ready on port 8265..." + timeout=60 + elapsed=0 + while ! nc -z 127.0.0.1 8265; do + if [ $elapsed -ge $timeout ]; then + echo "WARNING: Ray dashboard not ready within ${timeout}s, continuing anyway..." + break + fi + sleep 2 + elapsed=$((elapsed + 2)) + done + echo "Ray dashboard is ready!" + + # Wait for Raylet to be fully initialized (check dashboard agent health) + echo "Waiting for Raylet to be ready..." + timeout=60 + elapsed=0 + while ! 
wget --tries 1 -T 2 -q -O- http://127.0.0.1:52365/api/local_raylet_healthz 2>/dev/null | grep -q success; do + if [ $elapsed -ge $timeout ]; then + echo "WARNING: Raylet not ready within ${timeout}s, continuing anyway..." + break + fi + echo "Waiting for Raylet... (${elapsed}s/${timeout}s)" + sleep 2 + elapsed=$((elapsed + 2)) + done + echo "Raylet is ready!" + + # Launch Jupyter kernel in the foreground + echo "Launching Jupyter kernel..." + exec python /usr/local/bin/kernel-launchers/python/scripts/launch_ipykernel.py \ + --kernel-id "$KERNEL_ID" \ + --response-address "$EG_RESPONSE_ADDRESS" \ + --port-range "$EG_PORT_RANGE" \ + --public-key "$EG_PUBLIC_KEY" \ + --cluster-type "ray" + startupProbe: + httpGet: + path: / + port: 8265 + initialDelaySeconds: 15 + periodSeconds: 5 + timeoutSeconds: 5 + failureThreshold: 24 + successThreshold: 1 + readinessProbe: + httpGet: + path: / + port: 8265 + initialDelaySeconds: 0 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 3 + successThreshold: 1 + livenessProbe: + httpGet: + path: / + port: 8265 + initialDelaySeconds: 30 + periodSeconds: 20 + timeoutSeconds: 5 + failureThreshold: 3 + successThreshold: 1 + volumes: + - name: ray-logs + emptyDir: {} +########################################## +## CPU Workers group specs +########################################## + workerGroupSpecs: + - replicas: 1 + minReplicas: 1 + maxReplicas: {{ kernel_num_cpu_worker or 1 }} + groupName: cpu-group + rayStartParams: + block: 'true' + template: + metadata: + annotations: + ray.io/compute-image: {{ kernel_image }} + cluster-autoscaler.kubernetes.io/safe-to-evict: "false" + spec: + serviceAccountName: "{{ kernel_service_account_name }}" +# nodeSelector: +# node.kubernetes.io/instance-type: {{ kernel_cpu_instance_type | default("m5d.8xlarge")}} + initContainers: + - name: init + image: busybox:1.28 + command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done"] + containers: + - name: ray-cpu-worker + image: {{ kernel_image }} + imagePullPolicy: Always + resources: + limits: + cpu: {{ kernel_cpu_worker_num_cpu_limit | default(2)}} + memory: {{ kernel_cpu_worker_num_memory_limit | default("4Gi")}} + requests: + cpu: {{ kernel_cpu_worker_num_cpu_request | default("1")}} + memory: {{ kernel_cpu_worker_num_memory_request | default("2Gi")}} + volumeMounts: + - name: ray-logs + mountPath: /tmp/ray + securityContext: + allowPrivilegeEscalation: false + runAsUser: 0 + # Liveness probe with longer initial delay for dashboard agent startup + livenessProbe: + exec: + command: + - bash + - -c + - wget --tries 1 -T 2 -q -O- http://localhost:52365/api/local_raylet_healthz | grep success + initialDelaySeconds: 45 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 20 + successThreshold: 1 + # Readiness probe with appropriate timing + readinessProbe: + exec: + command: + - bash + - -c + - wget --tries 1 -T 2 -q -O- http://localhost:52365/api/local_raylet_healthz | grep success + initialDelaySeconds: 30 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 10 + successThreshold: 1 + volumes: + - name: ray-logs + emptyDir: {} +########################################## +## GPU Workers node groups +########################################## +# - replicas: 0 +# minReplicas: 0 +# maxReplicas: {{ kernel_num_gpu_worker or 0 }} +# groupName: gpu-group +# rayStartParams: +# block: 'true' +# template: +# spec: +# serviceAccountName: "{{ 
kernel_service_account_name }}" +## nodeSelector: +## node.kubernetes.io/instance-type: {{ kernel_gpu_instance_type | default("g5.4xlarge")}} +# initContainers: +# - name: init +# image: busybox:1.28 +# command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done"] +# containers: +# - name: ray-gpu-worker +# image: {{ kernel_image }} +# imagePullPolicy: Always +# env: +# - name: PIP_INDEX_URL +# value: https://pypi.org +# resources: +# limits: +# cpu: {{ kernel_gpu_worker_num_cpu_limit | default(1)}} +# memory: {{ kernel_gpu_worker_num_memory_limit | default("1Gi")}} +# nvidia.com/gpu: {{ kernel_gpu_worker_num_gpu | default(0)}} +# requests: +# cpu: {{kernel_gpu_worker_num_cpu_request | default(0)}} +# memory: {{ kernel_gpu_worker_num_memory_request | default("512Mi")}} +# nvidia.com/gpu: {{ kernel_gpu_worker_num_gpu | default(0)}} +# securityContext: +# allowPrivilegeEscalation: false +# runAsUser: 0 diff --git a/etc/kernel-launchers/operators/scripts/ray.io-v1alpha1.yaml.j2 b/etc/kernel-launchers/operators/scripts/ray.io-v1alpha1.yaml.j2 new file mode 100644 index 000000000..fda5d4c18 --- /dev/null +++ b/etc/kernel-launchers/operators/scripts/ray.io-v1alpha1.yaml.j2 @@ -0,0 +1,223 @@ +apiVersion: ray.io/v1alpha1 +kind: RayCluster +metadata: + labels: + controller-tools.k8s.io: "1.0" + ray.io/cluster-name: "{{ kernel_resource_name }}" + annotations: + ray.io/ft-enabled: "false" # Disable GCS FT for faster startup + name: {{ kernel_resource_name }} +spec: + enableInTreeAutoscaling: true + autoscalerOptions: + upscalingMode: Aggressive + idleTimeoutSeconds: 3600 + imagePullPolicy: Always + resources: + limits: + cpu: 1 + memory: "1Gi" + requests: + cpu: 1 + memory: "1Gi" +########################################## +## HEAD Node group spec +########################################## + headGroupSpec: + serviceType: ClusterIP # optional + # the following params are used to complete the ray start: ray start --head --block --port=6379 ... 
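+ # (as in the all-in-head template above, each rayStartParams entry becomes a --<key>=<value> flag on the head node's "ray start" invocation)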
+ rayStartParams: + disable-usage-stats: 'true' + dashboard-host: '0.0.0.0' + block: 'true' + template: + metadata: + labels: + kernel_id: "{{ kernel_id }}" + app: enterprise-gateway + component: kernel + annotations: + cluster-autoscaler.kubernetes.io/safe-to-evict: "false" + spec: + restartPolicy: OnFailure + serviceAccountName: "{{ kernel_service_account_name }}" +# nodeSelector: +# node.kubernetes.io/instance-type: m5d.8xlarge + containers: + # The Ray head container + - name: ray-head + image: {{ kernel_image }} + imagePullPolicy: Always + securityContext: + allowPrivilegeEscalation: false + runAsUser: 0 + resources: + limits: + cpu: {{ kernel_head_num_cpu_limit | default(2)}} + memory: {{ kernel_head_memory_limit | default("4Gi")}} + requests: + cpu: {{ kernel_head_num_cpu_request | default(2)}} + memory: {{ kernel_head_memory_request | default("4Gi")}} + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8000 + name: serve + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 44217 + name: as-metrics # autoscaler + - containerPort: 44227 + name: dash-metrics # dashboard + startupProbe: + exec: + command: + - /bin/sh + - -c + - | + # Check GCS port 6379 + nc -z localhost 6379 || exit 1 + # Check Ray API cluster status (verifies GCS is ready) + wget -q -O- --timeout=5 http://localhost:8265/api/cluster_status 2>/dev/null | grep -q "ALIVE" || exit 1 + # Check dashboard port 8265 + wget -q --spider --timeout=30 http://localhost:8265/ || exit 1 + initialDelaySeconds: 15 + periodSeconds: 5 + timeoutSeconds: 5 + failureThreshold: 24 + successThreshold: 1 + readinessProbe: + exec: + command: + - /bin/sh + - -c + - | + # Check GCS port 6379 + nc -z localhost 6379 || exit 1 + # Check Ray API cluster status (verifies GCS is ready) + wget -q -O- --timeout=5 http://localhost:8265/api/cluster_status 2>/dev/null | grep -q "ALIVE" || exit 1 + # Check dashboard port 8265 + wget -q --spider --timeout=30 http://localhost:8265/ || exit 1 + initialDelaySeconds: 0 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 3 + successThreshold: 1 + livenessProbe: + httpGet: + path: / + port: 8265 + initialDelaySeconds: 30 + periodSeconds: 20 + timeoutSeconds: 5 + failureThreshold: 3 + successThreshold: 1 + - name: ray-kernel + image: {{ kernel_image }} + imagePullPolicy: Always + securityContext: + allowPrivilegeEscalation: false + runAsUser: 0 + env: + - name: RAY_ADDRESS + value: "127.0.0.1:6379" + - name: RAY_PORT + value: "6379" + - name: SERVE_URI + value: "{{ kernel_serve_url }}" + - name: BUILD_URI + value: "{{ kernel_build_url }}" + - name: EG_LOG_LEVEL + value: "0" +# - name: PIP_INDEX_URL +# value: https://pypi.org + command: + - "/bin/sh" + - "-c" + - "python /usr/local/bin/kernel-launchers/python/scripts/launch_ipykernel.py --kernel-id {{ kernel_id }} --response-address {{ eg_response_address }} --port-range {{ eg_port_range }} --public-key {{ eg_public_key }}" +########################################## +## CPU Workers group specs +########################################## + workerGroupSpecs: + - replicas: 1 + minReplicas: 1 + maxReplicas: {{ kernel_num_cpu_worker or 1 }} + groupName: cpu-group + rayStartParams: + block: 'true' + template: + metadata: + annotations: + ray.io/compute-image: {{ kernel_image }} + cluster-autoscaler.kubernetes.io/safe-to-evict: "false" + spec: + serviceAccountName: "{{ kernel_service_account_name }}" +# nodeSelector: +# node.kubernetes.io/instance-type: {{ kernel_cpu_instance_type | 
default("m5d.8xlarge")}} + initContainers: + - name: init + image: busybox:1.28 + command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done"] + containers: + - name: ray-cpu-worker + image: {{ kernel_image }} + imagePullPolicy: Always +# env: +# - name: PIP_INDEX_URL +# value: https://pypi.org + resources: + limits: + cpu: {{ kernel_cpu_worker_num_cpu_limit | default(1)}} + memory: {{ kernel_cpu_worker_num_memory_limit | default("1Gi")}} + requests: + cpu: {{ kernel_cpu_worker_num_cpu_request | default("500m")}} + memory: {{ kernel_cpu_worker_num_memory_request | default("1Gi")}} +# volumeMounts: +# - name: ray-logs +# mountPath: /tmp/ray + securityContext: + allowPrivilegeEscalation: false + runAsUser: 0 + # volumes: + # - name: ray-logs + # hostPath: + # path: "/mnt/data" +########################################## +## GPU Workers node groups +########################################## +# - replicas: 0 +# minReplicas: 0 +# maxReplicas: {{ kernel_num_gpu_worker or 0 }} +# groupName: gpu-group +# rayStartParams: +# block: 'true' +# template: +# spec: +# serviceAccountName: "{{ kernel_service_account_name }}" +## nodeSelector: +## node.kubernetes.io/instance-type: {{ kernel_gpu_instance_type | default("g5.4xlarge")}} +# initContainers: +# - name: init +# image: busybox:1.28 +# command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done"] +# containers: +# - name: ray-gpu-worker +# image: {{ kernel_image }} +# imagePullPolicy: Always +# env: +# - name: PIP_INDEX_URL +# value: https://pypi.org +# resources: +# limits: +# cpu: {{ kernel_gpu_worker_num_cpu_limit | default(1)}} +# memory: {{ kernel_gpu_worker_num_memory_limit | default("1Gi")}} +# nvidia.com/gpu: {{ kernel_gpu_worker_num_gpu | default(0)}} +# requests: +# cpu: {{kernel_gpu_worker_num_cpu_request | default(0)}} +# memory: {{ kernel_gpu_worker_num_memory_request | default("512Mi")}} +# nvidia.com/gpu: {{ kernel_gpu_worker_num_gpu | default(0)}} +# securityContext: +# allowPrivilegeEscalation: false +# runAsUser: 0 diff --git a/etc/kernel-launchers/python/scripts/launch_ipykernel.py b/etc/kernel-launchers/python/scripts/launch_ipykernel.py index 4ed8b3b1b..b1344f788 100644 --- a/etc/kernel-launchers/python/scripts/launch_ipykernel.py +++ b/etc/kernel-launchers/python/scripts/launch_ipykernel.py @@ -63,7 +63,7 @@ def initialize_namespace(namespace, cluster_type="spark"): Parameters ---------- - cluster_type : {'spark', 'dask', 'none'} + cluster_type : {'spark', 'ray', 'dask', 'none'} The cluster type to initialize. ``'none'`` results in no variables in the initial namespace. """ @@ -116,6 +116,20 @@ def sql(query): init_thread.start() + elif cluster_type == "ray": + try: + pass + # import ray + + # ray.init(address="127.0.0.1:6379") + # print(ray.cluster_resources()) + except ImportError: + logger.info( + "A Ray init was desired but the ray distribution is not present. " + "Ray.init will not occur." 
+ ) + return + elif cluster_type == "dask": import dask_yarn @@ -517,7 +531,7 @@ def start_ipython( "--cluster-type", dest="cluster_type", nargs="?", - help="the kind of cluster to initialize: spark, dask, or none", + help="the kind of cluster to initialize: spark, ray, dask, or none", ) parser.add_argument( "--kernel-class-name", @@ -570,7 +584,7 @@ def start_ipython( dest="rpp_cluster_type", nargs="?", default="spark", - help="the kind of cluster to initialize: spark, dask, or none (deprecated)", + help="the kind of cluster to initialize: spark, ray, dask, or none (deprecated)", ) arguments = vars(parser.parse_args()) diff --git a/etc/kernel-resources/ray/logo-64x64.png b/etc/kernel-resources/ray/logo-64x64.png new file mode 100644 index 000000000..9917a4c0d Binary files /dev/null and b/etc/kernel-resources/ray/logo-64x64.png differ diff --git a/etc/kernelspecs/ray_python_operator/kernel.json b/etc/kernelspecs/ray_python_operator/kernel.json new file mode 100644 index 000000000..129c5ec7f --- /dev/null +++ b/etc/kernelspecs/ray_python_operator/kernel.json @@ -0,0 +1,25 @@ +{ + "language": "python", + "display_name": "Ray Operator (Python)", + "metadata": { + "process_proxy": { + "class_name": "enterprise_gateway.services.processproxies.ray_operator.RayOperatorProcessProxy", + "config": { + "image_name": "lresende/kernel-ray-py:VERSION", + "executor_image_name": "lresende/kernel-ray-py:VERSION" + } + } + }, + "argv": [ + "python", + "/usr/local/share/jupyter/kernels/ray_python_operator/scripts/launch_custom_resource.py", + "--RemoteProcessProxy.kernel-id", + "{kernel_id}", + "--RemoteProcessProxy.port-range", + "{port_range}", + "--RemoteProcessProxy.response-address", + "{response_address}", + "--RemoteProcessProxy.public-key", + "{public_key}" + ] +} diff --git a/etc/kubernetes/helm/enterprise-gateway/templates/deployment.yaml b/etc/kubernetes/helm/enterprise-gateway/templates/deployment.yaml index 5edef9bf4..03e676965 100644 --- a/etc/kubernetes/helm/enterprise-gateway/templates/deployment.yaml +++ b/etc/kubernetes/helm/enterprise-gateway/templates/deployment.yaml @@ -73,6 +73,8 @@ spec: value: !!str {{ .Values.kernel.launchTimeout }} - name: EG_KERNEL_INFO_TIMEOUT value: !!str {{ .Values.kernel.infoTimeout }} + - name: EG_REQUEST_TIMEOUT + value: !!str {{ .Values.kernel.requestTimeout }} - name: EG_ALLOWED_KERNELS value: {{ toJson .Values.kernel.allowedKernels | squote }} - name: EG_DEFAULT_KERNEL_NAME diff --git a/etc/kubernetes/helm/enterprise-gateway/templates/eg-clusterrole.yaml b/etc/kubernetes/helm/enterprise-gateway/templates/eg-clusterrole.yaml index 11a0abac5..be06575fb 100644 --- a/etc/kubernetes/helm/enterprise-gateway/templates/eg-clusterrole.yaml +++ b/etc/kubernetes/helm/enterprise-gateway/templates/eg-clusterrole.yaml @@ -23,6 +23,9 @@ rules: - apiGroups: ["sparkoperator.k8s.io"] resources: ["sparkapplications", "sparkapplications/status", "scheduledsparkapplications", "scheduledsparkapplications/status"] verbs: ["get", "watch", "list", "create", "delete"] + - apiGroups: ["ray.io"] + resources: ["rayclusters", "rayclusters/status", "rayjobs", "rayjobs/status", "rayservices", "rayservices/status"] + verbs: ["get", "watch", "list", "create", "delete"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole @@ -43,6 +46,6 @@ rules: resources: ["configmaps"] verbs: ["list", "create"] - apiGroups: [""] - resources: ["services", "persistentvolumeclaims"] + resources: ["services", "persistentvolumes", "persistentvolumeclaims"] verbs: ["list"] {{- end }} diff --git 
a/etc/kubernetes/helm/enterprise-gateway/values.yaml b/etc/kubernetes/helm/enterprise-gateway/values.yaml index 493bb3ebf..b65c95be1 100644 --- a/etc/kubernetes/helm/enterprise-gateway/values.yaml +++ b/etc/kubernetes/helm/enterprise-gateway/values.yaml @@ -89,6 +89,8 @@ kernel: shareGatewayNamespace: false # Timeout for kernel launching in seconds. launchTimeout: 60 + infoTimeout: 60 + requestTimeout: 60 # Timeout for an idle kernel before its culled in seconds. Default is 1 hour. cullIdleTimeout: 3600 # Whether to cull idle kernels with connecting clients diff --git a/etc/minikube/ray/README.md b/etc/minikube/ray/README.md new file mode 100644 index 000000000..a53232457 --- /dev/null +++ b/etc/minikube/ray/README.md @@ -0,0 +1,368 @@ +# Ray + Jupyter Enterprise Gateway + JupyterHub on Minikube + +This directory contains scripts and configuration files to deploy a complete Jupyter development environment on Minikube with Ray cluster support for distributed computing. + +## Overview + +This setup provides: + +- **Minikube Kubernetes Cluster**: Local Kubernetes environment for testing and development +- **Ray Operator**: Manages Ray clusters for distributed Python workloads +- **Jupyter Enterprise Gateway**: Enables remote kernel execution on Ray and Kubernetes +- **JupyterHub**: Multi-user notebook server with custom spawner integration + +## Architecture + +``` +┌─────────────────────────────────────────────────────────┐ +│ Minikube Cluster │ +│ │ +│ ┌──────────────┐ ┌─────────────────────────┐ │ +│ │ JupyterHub │────────▶│ Enterprise Gateway │ │ +│ │ (hub ns) │ │ (enterprise-gateway ns)│ │ +│ └──────────────┘ └─────────────────────────┘ │ +│ │ │ │ +│ │ │ │ +│ ▼ ▼ │ +│ ┌──────────────┐ ┌─────────────────────────┐ │ +│ │ User Pods │ │ Ray Kernels │ │ +│ │ (Notebooks) │ │ (ray_python_operator) │ │ +│ └──────────────┘ └─────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────┐ │ +│ │ KubeRay Operator │ │ +│ │ (Ray Clusters) │ │ +│ └─────────────────────────┘ │ +└─────────────────────────────────────────────────────────┘ +``` + +## Prerequisites + +Before running the installation, ensure you have: + +- **Docker Desktop**: Running and accessible +- **Minikube**: Installed (`brew install minikube` on macOS) +- **kubectl**: Kubernetes command-line tool +- **Helm 3**: Package manager for Kubernetes +- **EG_HOME**: Environment variable pointing to Enterprise Gateway repository root + +```bash +# Example setup +export EG_HOME=/Users/lresende/opensource/jupyter/enterprise-gateway +``` + +## Installation + +### Initial Cluster Setup + +Run `install-minikube-ray.sh` to create a complete new cluster from scratch: + +```bash +./install-minikube-ray.sh +``` + +**What this script does:** + +1. **Launches Docker Desktop** (if not running) +1. **Stops existing Minikube cluster** named `ray` (if it exists) +1. **Starts Minikube** with: + - Profile: `ray` + - Driver: `docker` + - Kubernetes version: `v1.31` + - Memory: `12GB` +1. **Installs KubeRay Operator** (v1.5.0) via Helm + - Manages Ray cluster lifecycle + - Handles Ray pod scheduling and scaling +1. **Deploys Enterprise Gateway** via Helm chart + - Uses local development build from `$EG_HOME/dist/` + - Configured with `enterprise-gateway-minikube-helm.yaml` + - Default kernel: `ray_python_operator` + - Service exposed on NodePort 30088 +1. **Applies Network Policy** (`enterprise-gateway-network.yaml`) + - Allows all ingress/egress for Enterprise Gateway namespace +1. 
+### Notes on the Installation Script
+
+The script includes two options for cluster management (lines 3-4):
+
+```bash
+minikube -p ray stop # keeps the existing cluster and updates it
+# minikube -p ray stop && minikube -p ray delete # always creates a fresh cluster
+```
+
+- **Default behavior**: Stops and restarts the existing cluster (preserves state)
+- **Alternative**: Uncomment line 4 to completely delete and recreate the cluster
+
+## Development Workflow
+
+### Building and Updating Images
+
+Use `update-minikube-ray.sh` when you've made changes to Enterprise Gateway or the kernel images:
+
+```bash
+./update-minikube-ray.sh
+```
+
+**What this script does:**
+
+1. **Navigates to `$EG_HOME`** and builds the distributions:
+
+   ```bash
+   make clean dist
+   ```
+
+   - Creates the Helm chart tarball for Enterprise Gateway
+
+1. **Builds and pushes Docker images**:
+
+   ```bash
+   make clean-enterprise-gateway enterprise-gateway push-enterprise-gateway \
+        clean-kernel-ray-py kernel-ray-py push-kernel-ray-py \
+        HUB_ORG=lresende TAG=dev
+   ```
+
+   - Builds `lresende/enterprise-gateway:dev`
+   - Builds `lresende/kernel-ray-py:dev`
+   - Pushes both to DockerHub (requires authentication)
+
+1. **Loads the images into Minikube**:
+
+   ```bash
+   minikube image load lresende/enterprise-gateway:dev
+   minikube image load lresende/kernel-ray-py:dev
+   ```
+
+   - Makes the images available to Kubernetes without pulling from a registry
+
+1. **Restarts the Enterprise Gateway deployment**:
+
+   ```bash
+   kubectl rollout restart deployment/enterprise-gateway -n enterprise-gateway
+   ```
+
+   - Picks up the new image versions
+   - Zero-downtime rolling update
+
+1. **Displays the Enterprise Gateway service URL** for verification
+
+**When to use this script:**
+
+- After modifying Enterprise Gateway source code
+- After updating kernel image definitions
+- When testing new features or bug fixes
+- Before creating pull requests
+
+**Note**: The script assumes you have push access to the `lresende` DockerHub organization. Modify `HUB_ORG` in the script if using a different registry.
+
+## Configuration Files
+
+### enterprise-gateway-minikube-helm.yaml
+
+Helm values for the Enterprise Gateway deployment:
+
+- **Image**: `lresende/enterprise-gateway:dev`
+- **Kernel Configuration**:
+  - Allowed kernels: `ray_python_operator`, `python_kubernetes`
+  - Default kernel: `ray_python_operator`
+  - Launch/info/request timeouts: 500 seconds (generous values that help when debugging)
+  - Idle timeout: 3600 seconds (1 hour)
+- **Service**: NodePort on 30088 (HTTP) and 30077 (responses)
+- **RBAC**: Enabled with the `enterprise-gateway-sa` service account
+- **KIP**: Enabled for pre-pulling kernel images from Docker Hub
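+To see what these values render to before deploying, run `helm template` against the chart tarball (a sketch; it assumes `make dist` has already produced the tarball under `$EG_HOME/dist/`):
+
+```bash
+# Render the chart locally and confirm the request timeout is wired through
+# to the EG_REQUEST_TIMEOUT environment variable of the deployment
+helm template enterprise-gateway "$EG_HOME"/dist/jupyter_enterprise_gateway_helm-*.tar.gz \
+     --namespace enterprise-gateway \
+     --values enterprise-gateway-minikube-helm.yaml | grep -A 1 'EG_REQUEST_TIMEOUT'
+```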
+### jupyterhub-config.yaml
+
+JupyterHub configuration with a custom spawner:
+
+- **Database**: In-memory SQLite (not for production!)
+- **Authenticator**: Admin users configured (root, jovyan, lresende)
+- **Custom Spawner**: `CustomKubeSpawner` extends KubeSpawner
+  - Sets `JUPYTER_GATEWAY_URL` to the Enterprise Gateway service
+  - Configures the kernel namespace: `enterprise-gateway`
+  - Sets the service account: `enterprise-gateway-sa`
+  - Passes the username to kernels via environment variables
+- **Single-user Image**: `quay.io/jupyterhub/k8s-singleuser-sample:4.3.1`
+- **Storage**: Ephemeral (type: none) - notebooks are not persisted
+- **Default UI**: JupyterLab (`/lab`)
+
+### enterprise-gateway-network.yaml
+
+Kubernetes NetworkPolicy that allows all traffic to/from the Enterprise Gateway namespace:
+
+- Required for Enterprise Gateway to communicate with kernels
+- Allows kernels in the same namespace to connect back to the gateway
+- In production, consider more restrictive policies
+
+## Usage
+
+### Access JupyterHub
+
+1. After installation, open the URL displayed by the script
+1. Log in with username `root`, `jovyan`, or `lresende` (any password works in dev mode)
+1. Wait for your user pod to start (the first launch may take 1-2 minutes)
+
+### Launch a Ray Kernel
+
+1. In JupyterLab, create a new notebook
+1. Select the **Ray Operator (Python)** kernel (`ray_python_operator`)
+1. Run Python code that executes on Ray:
+
+```python
+import ray
+
+ray.init(address='auto')
+
+@ray.remote
+def compute_pi(n):
+    import random
+    count = sum(1 for _ in range(n)
+                if random.random()**2 + random.random()**2 <= 1)
+    return 4.0 * count / n
+
+# Distributed computation across the Ray cluster
+futures = [compute_pi.remote(1000000) for _ in range(10)]
+results = ray.get(futures)
+print(f"Pi estimate: {sum(results) / len(results)}")
+```
+
+### Verify Cluster Status
+
+```bash
+# Check all pods across namespaces
+kubectl get pods --all-namespaces
+
+# Check Enterprise Gateway logs
+kubectl logs -n enterprise-gateway deployment/enterprise-gateway -f
+
+# Check JupyterHub logs
+kubectl logs -n hub deployment/hub -f
+
+# List running Ray clusters
+kubectl get rayclusters --all-namespaces
+
+# Access Enterprise Gateway directly
+minikube -p ray service enterprise-gateway -n enterprise-gateway --url
+```
+
+## Troubleshooting
+
+### Minikube won't start
+
+```bash
+# Clean up and retry
+minikube -p ray delete
+./install-minikube-ray.sh
+```
+
+### Pods stuck in ImagePullBackOff
+
+```bash
+# Verify the images are loaded
+minikube -p ray image ls | grep lresende
+
+# Re-run the update script
+./update-minikube-ray.sh
+```
+
+### Kernels fail to start
+
+Check the timeout settings and logs:
+
+```bash
+# View Enterprise Gateway logs
+kubectl logs -n enterprise-gateway deployment/enterprise-gateway --tail=100
+
+# Check kernel pods
+kubectl get pods -n enterprise-gateway -l kernel_id
+
+# Describe a failing pod (append the pod name to narrow the output)
+kubectl describe pod -n enterprise-gateway
+```
+
+### JupyterHub can't connect to Enterprise Gateway
+
+Verify the service URL configuration:
+
+```bash
+# Inspect the Enterprise Gateway service
+kubectl get svc -n enterprise-gateway enterprise-gateway
+
+# The in-cluster address is http://enterprise-gateway.enterprise-gateway:8888,
+# which must match JUPYTER_GATEWAY_URL in jupyterhub-config.yaml
+```
+
+### Ray cluster issues
+
+```bash
+# Check the KubeRay operator
+kubectl get pods -l app.kubernetes.io/name=kuberay-operator
+
+# View the operator logs
+kubectl logs -l app.kubernetes.io/name=kuberay-operator -f
+```
+
+## Cleanup
+
+### Stop the cluster (preserves state)
+
+```bash
+minikube -p ray stop
+```
+
+### Delete everything
+
+```bash
+minikube -p ray delete
+```
+
+This removes all data, configurations, and the Minikube VM.
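+### Remove the components only (keeps the cluster)
+
+If you want to tear down the deployed components but keep the Minikube cluster itself, the individual Helm releases can be uninstalled instead (a sketch; the release names assume the defaults used by `install-minikube-ray.sh`):
+
+```bash
+helm uninstall hub -n hub
+helm uninstall enterprise-gateway -n enterprise-gateway
+helm uninstall kuberay-operator # installed into the default namespace
+```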
+## Development Tips
+
+1. **Faster iteration**: Use `imagePullPolicy: Never` in the Helm configs during development to force local image usage
+
+1. **Debug mode**: Both Enterprise Gateway and JupyterHub are configured with debug logging enabled
+
+1. **Resource monitoring**:
+
+   ```bash
+   # Watch resource usage
+   kubectl top nodes
+   kubectl top pods --all-namespaces
+   ```
+
+1. **Port forwarding** (alternative to NodePort):
+
+   ```bash
+   kubectl port-forward -n hub svc/proxy-public 8000:80
+   kubectl port-forward -n enterprise-gateway svc/enterprise-gateway 8888:8888
+   ```
+
+1. **Multi-arch builds**: Uncomment the `MULTIARCH_BUILD=true` line in `update-minikube-ray.sh` if building for both ARM and x86 architectures
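+1. **Kernelspec inspection**: To confirm the gateway pod actually contains the `ray_python_operator` kernelspec, list the kernels directory inside the running pod (a sketch; the path matches the one referenced by this kernelspec's `kernel.json`):
+
+   ```bash
+   kubectl exec -n enterprise-gateway deploy/enterprise-gateway -- \
+       ls /usr/local/share/jupyter/kernels
+   ```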
+ - name: "http-response" + port: 8877 + targetPort: 8877 + nodePort: 30077 # optional nodePort +kip: + enabled: true + image: elyra/kernel-image-puller:3.2.3 + imagePullPolicy: Always + pullPolicy: Always + defaultContainerRegistry: "docker.io" diff --git a/etc/minikube/ray/enterprise-gateway-network.yaml b/etc/minikube/ray/enterprise-gateway-network.yaml new file mode 100644 index 000000000..eda43d813 --- /dev/null +++ b/etc/minikube/ray/enterprise-gateway-network.yaml @@ -0,0 +1,14 @@ +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: enterprise-gateway-allow-all + namespace: enterprise-gateway +spec: + podSelector: {} + ingress: + - {} + egress: + - {} + policyTypes: + - Ingress + - Egress diff --git a/etc/minikube/ray/install-minikube-ray.sh b/etc/minikube/ray/install-minikube-ray.sh new file mode 100755 index 000000000..c5e3cbdc9 --- /dev/null +++ b/etc/minikube/ray/install-minikube-ray.sh @@ -0,0 +1,25 @@ +open -a Docker + +minikube -p ray stop # this keep updating the existing cluster +# minikube -p ray stop && minikube -p ray delete # this always create a new cluster + +# Launch Minikube +minikube start -p ray --driver=docker --kubernetes-version=v1.31 --memory=12000 +minikube profile ray + +# Ray operator helm +helm repo add kuberay https://ray-project.github.io/kuberay-helm/ +helm repo update + +helm install kuberay-operator kuberay/kuberay-operator --version 1.5.1 --create-namespace --wait + +helm upgrade --install enterprise-gateway ../../../dist/jupyter_enterprise_gateway_helm-*.tar.gz --namespace enterprise-gateway --values enterprise-gateway-minikube-helm.yaml --create-namespace --wait +kubectl apply -f enterprise-gateway-network.yaml +# minikube service --url enterprise-gateway -n enterprise-gateway + +# Install JupyterHub +helm repo add jupyterhub https://jupyterhub.github.io/helm-chart/ +helm repo update + +helm upgrade --install hub jupyterhub/jupyterhub --namespace hub --version 4.3.1 --values jupyterhub-config.yaml --create-namespace --timeout 10m --wait +minikube service --url proxy-public -n hub diff --git a/etc/minikube/ray/jupyterhub-config.yaml b/etc/minikube/ray/jupyterhub-config.yaml new file mode 100644 index 000000000..a58d92c82 --- /dev/null +++ b/etc/minikube/ray/jupyterhub-config.yaml @@ -0,0 +1,65 @@ +hub: + db: + type: sqlite-memory + config: + Authenticator: + admin_users: + - root + - jovyan + - lresende + extraConfig: + customSpawner: | + from kubespawner import KubeSpawner + from tornado import gen + import yaml + + class CustomKubeSpawner(KubeSpawner): + def get_env(self): + env = super().get_env() + env['EG_HTTP_USER'] = self.user.name + env['KERNEL_USERNAME'] = self.user.name + env['KERNEL_NAMESPACE'] = "default" + env['KERNEL_NAMESPACE'] = "enterprise-gateway" + env['KERNEL_SERVICE_ACCOUNT_NAME'] = "spark-operator-spark" + env['KERNEL_SERVICE_ACCOUNT_NAME'] = "enterprise-gateway-sa" + return env + c.JupyterHub.spawner_class = CustomKubeSpawner + c.Spawner.start_timeout = 500 + +proxy: + secretToken: "992a30653af09eebd07ff22588fbc295e35aea2ce4396230d9a9a0634b9b000e" + service: + type: NodePort + +ingress: + enabled: false + + # annotations: + # kubernetes.io/ingress.class: "nginx" + # hosts: + # - lresende-alien + +singleuser: + defaultUrl: "/lab" + image: + # name: quay.io/elyra/elyra + # tag: 3.15.0 + name: quay.io/jupyterhub/k8s-singleuser-sample + tag: 4.3.1 + # disable this in a production environment + pullPolicy: "Always" + storage: + type: none +# storage: +# dynamic: +# storageClass: nfs-dynamic + extraEnv: + 
+pushd "$EG_HOME"
+make clean dist
+#make clean-enterprise-gateway enterprise-gateway clean-kernel-ray-py kernel-ray-py HUB_ORG=lresende TAG=dev MULTIARCH_BUILD=true
+make clean-enterprise-gateway enterprise-gateway push-enterprise-gateway clean-kernel-ray-py kernel-ray-py push-kernel-ray-py HUB_ORG=lresende TAG=dev
+popd
+minikube image load lresende/enterprise-gateway:dev
+minikube image load lresende/kernel-ray-py:dev
+kubectl --context=ray -n enterprise-gateway rollout restart deployment/enterprise-gateway
+minikube service --url proxy-public -n hub
diff --git a/pyproject.toml b/pyproject.toml
index 829c6be41..424a25e1e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,7 +33,7 @@ dependencies = [
     "pycryptodomex>=3.9.7",
     "pyzmq>=20.0,<25.0",  # Pyzmq 25 removes deprecated code that jupyter_client 6 uses, remove if v6 gets updated
     "requests>=2.14.2",
-    "tornado>=6.1",
+    "tornado>=6.5.2",
     "traitlets>=5.3.0",
     "watchdog>=2.1.3",
     "yarn-api-client>=1.0"