Skip to content

Commit 6c49b85

Browse files
committed
cocalc-api: tweak test config
1 parent 0d70752 commit 6c49b85

File tree

2 files changed

+196
-68
lines changed

2 files changed

+196
-68
lines changed

src/python/cocalc-api/tests/conftest.py

Lines changed: 164 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,50 @@
1010
from cocalc_api import Hub, Project
1111

1212
from psycopg2 import pool as pg_pool
13+
from typing import Callable, TypeVar, Any
1314

1415
# Database configuration examples (DRY principle)
1516
PGHOST_SOCKET_EXAMPLE = "/path/to/cocalc-data/socket"
1617
PGHOST_NETWORK_EXAMPLE = "localhost"
1718

19+
T = TypeVar("T")


def retry_with_backoff(
    func: Callable[[], T],
    max_retries: int = 3,
    retry_delay: float = 5,
    error_condition: Callable[[RuntimeError], bool] = lambda e: "timeout" in str(e).lower(),
    *,
    backoff_factor: float = 1.0,
) -> T:
    """
    Call ``func`` and retry it when it fails with a retryable ``RuntimeError``.

    This helper is useful for operations that may time out on the first
    attempt due to cold starts (e.g., kernel launches).

    Args:
        func: Zero-argument callable that performs the operation.
        max_retries: Maximum number of attempts, including the first one
            (default: 3). Must be at least 1.
        retry_delay: Seconds to wait before the second attempt (default: 5).
        error_condition: Predicate deciding whether a ``RuntimeError`` should
            trigger a retry. Defaults to checking for "timeout" in the error
            message (case-insensitive).
        backoff_factor: Multiplier applied to the delay after every retry.
            The default of 1.0 keeps the delay constant; pass e.g. 2.0 for
            exponential backoff.

    Returns:
        The result of the first successful ``func()`` call.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1 (otherwise the helper
            would return ``None`` without ever calling ``func``).
        RuntimeError: Re-raised from ``func`` when ``error_condition`` rejects
            the error or when the last attempt fails. Exceptions of any other
            type propagate immediately without retrying.
    """
    # Local import so the helper does not depend on a module-level import.
    import time

    if max_retries < 1:
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    delay = retry_delay
    for attempt in range(1, max_retries + 1):
        try:
            return func()
        except RuntimeError as e:
            # Give up on non-retryable errors and once the attempts are used up.
            if not error_condition(e) or attempt == max_retries:
                raise
            print(f"Attempt {attempt} timed out, retrying in {delay:g}s...")
            time.sleep(delay)
            delay *= backoff_factor

    # Every loop iteration either returns or raises; this only satisfies
    # static analysers that cannot prove it.
    raise AssertionError("unreachable")
56+
1857

1958
def assert_valid_uuid(value, description="value"):
2059
"""
@@ -125,39 +164,110 @@ def project_client(temporary_project, api_key, cocalc_host):
125164
return Project(project_id=temporary_project['project_id'], api_key=api_key, host=cocalc_host)
126165

127166

167+
@pytest.fixture(autouse=True)
def cleanup_kernels_after_test(request, project_client):
    """
    Stop surplus Jupyter kernels after each test of a Jupyter test class.

    Kernel accumulation happens because the kernel pool reuses kernels, but
    under heavy test load old kernels aren't always cleaned up by the pool.

    The fixture uses the default (function) scope, so its teardown runs after
    EVERY test. It only acts for tests that live in one of the Jupyter test
    classes (TestJupyterExecuteViaHub, TestJupyterExecuteViaProject,
    TestJupyterKernelManagement), and only once more than 3 kernels are
    listed, so the pool's reuse strategy is normally left alone.

    Cleanup is best-effort: failures are reported as warnings and never fail
    the test.
    """
    yield  # Allow test to run

    jupyter_test_classes = {
        'TestJupyterExecuteViaHub',
        'TestJupyterExecuteViaProject',
        'TestJupyterKernelManagement',
    }

    # Only clean up for tests defined in a Jupyter-related class.
    test_class = request.cls
    if test_class is None or test_class.__name__ not in jupyter_test_classes:
        return

    import time

    keep_newest = 2  # pool size is 2
    cleanup_threshold = 3  # buffer above the pool size before we interfere

    try:
        kernels = project_client.system.list_jupyter_kernels()
    except Exception as err:
        # If listing kernels fails, just continue with the test run.
        print(f"Warning: could not list jupyter kernels for cleanup: {err}")
        return

    if len(kernels) <= cleanup_threshold:
        return

    # Entries without a pid cannot be stopped, so leave them out up front.
    stoppable = [k for k in kernels if k.get("pid") is not None]

    # Higher PIDs are treated as more recent (heuristic: PIDs can wrap around).
    stoppable.sort(key=lambda k: k["pid"])

    for kernel in stoppable[:-keep_newest]:
        try:
            project_client.system.stop_jupyter_kernel(pid=kernel["pid"])
            time.sleep(0.1)  # Small delay between kills
        except Exception as err:
            # One kernel failing to stop must not prevent stopping the others.
            print(f"Warning: failed to stop kernel PID {kernel['pid']}: {err}")
221+
222+
128223
def ensure_python3_kernel(project_client: Project):
129224
"""
130225
Ensure the default python3 Jupyter kernel is installed in the project.
131226
132227
If not available, install ipykernel and register the kernelspec.
133228
"""
134229

135-
def has_python_kernel() -> bool:
230+
class _ExecResult(tuple):
    """
    ``(ok, stdout)`` pair whose truthiness follows ``ok``.

    A plain tuple is always truthy, which would make ``if not try_exec(...)``
    a check that can never fire. This subclass still unpacks and compares
    like the 2-tuple it wraps, but evaluates as false when the command failed.
    """

    __slots__ = ()

    def __new__(cls, ok: bool, stdout=None):
        return super().__new__(cls, (ok, stdout))

    def __bool__(self) -> bool:
        return bool(self[0])


def try_exec(command: list[str], timeout: int = 60, capture_stdout: bool = False):
    """
    Run ``command`` in the project and report whether it succeeded.

    Args:
        command: Program followed by its arguments; ``command[0]`` is passed
            as the command and the rest as its args.
        timeout: Timeout forwarded to ``project_client.system.exec``.
        capture_stdout: When true, the result carries the command's stdout.

    Returns:
        An ``(ok, stdout)`` pair. ``stdout`` is ``None`` unless
        ``capture_stdout`` is set and the call succeeded. The pair is falsy
        when ``ok`` is false, so both ``ok, out = try_exec(...)`` and
        ``if not try_exec(...)`` behave as expected.

    Any exception raised by the call is printed as a warning and reported as
    a failed result instead of propagating.
    """
    try:
        result = project_client.system.exec(
            command=command[0],
            args=command[1:],
            timeout=timeout,
        )
        return _ExecResult(True, result["stdout"] if capture_stdout else None)
    except Exception as err:
        print(f"Warning: command {command} failed: {err}")
        return _ExecResult(False)
241+
242+
def has_python_kernel() -> bool:
    """
    Report whether a ``python3`` kernelspec is listed in the project.

    Runs ``python3 -m jupyter kernelspec list --json`` through ``try_exec``
    and looks for a ``python3`` entry under the ``kernelspecs`` key of the
    JSON output. Returns ``False`` when the command fails, yields no stdout,
    or produces output that cannot be parsed (the parse error is printed as a
    warning).
    """
    listing_cmd = ["python3", "-m", "jupyter", "kernelspec", "list", "--json"]
    succeeded, raw_json = try_exec(listing_cmd, capture_stdout=True)
    if not (succeeded and raw_json is not None):
        return False

    try:
        specs = json.loads(raw_json).get("kernelspecs", {})
        return "python3" in specs
    except Exception as err:
        print(f"Warning: Failed to parse kernelspec list: {err}")
        return False
148255

149256
if has_python_kernel():
150257
return
151258

152259
print("Installing python3 kernelspec in project...")
153-
project_client.system.exec(
154-
command="python3",
155-
args=["-m", "pip", "install", "--user", "ipykernel"],
156-
timeout=300,
157-
)
158-
project_client.system.exec(
159-
command="python3",
160-
args=[
260+
# Install pip if needed
261+
try_exec(["python3", "-m", "ensurepip", "--user"], timeout=120)
262+
# Upgrade pip but ignore errors (not fatal)
263+
try_exec(["python3", "-m", "pip", "install", "--user", "--upgrade", "pip"], timeout=120)
264+
265+
if not try_exec(["python3", "-m", "pip", "install", "--user", "ipykernel"], timeout=300):
266+
raise RuntimeError("Failed to install ipykernel via pip")
267+
268+
if not try_exec(
269+
[
270+
"python3",
161271
"-m",
162272
"ipykernel",
163273
"install",
@@ -166,7 +276,8 @@ def has_python_kernel() -> bool:
166276
"--display-name=Python 3",
167277
],
168278
timeout=120,
169-
)
279+
):
280+
raise RuntimeError("Failed to install python3 kernelspec")
170281

171282
if not has_python_kernel():
172283
raise RuntimeError("Failed to ensure python3 kernelspec is installed in project")
@@ -500,3 +611,39 @@ def cleanup():
500611
request.addfinalizer(cleanup)
501612

502613
yield
614+
615+
616+
@pytest.fixture(scope="session", autouse=True)
def cleanup_jupyter_kernels_session(project_client):
    """
    Stop every Jupyter kernel that is still listed when the session ends.

    The fixture is session-scoped and autouse: everything before ``yield`` is
    empty, so tests run undisturbed, and the teardown below asks the project
    for its running kernels and tries to stop each one, printing a per-kernel
    status line. All failures are reported via ``print`` and never raised.

    NOTE(review): a session-scoped fixture can only request fixtures of the
    same scope, so this presumably relies on ``project_client`` being
    session-scoped - confirm against its definition.
    """
    yield  # Hand control to the test session; the rest is teardown

    banner = "=" * 70
    try:
        remaining = project_client.system.list_jupyter_kernels()
        if not remaining:
            return

        print(f"\n{banner}")
        print(f"CLEANING UP {len(remaining)} JUPYTER KERNELS FROM TEST SESSION")
        print(banner)
        for entry in remaining:
            try:
                pid = entry.get("pid")
                outcome = project_client.system.stop_jupyter_kernel(pid=pid)
                if not outcome.get("success"):
                    print(f"✗ Failed to stop kernel PID {pid}")
                else:
                    print(f"✓ Stopped kernel PID {pid}")
            except Exception as e:
                # Keep going: one stubborn kernel must not block the others.
                print(f"✗ Error stopping kernel: {e}")
        print(f"{banner}\n")
    except Exception as e:
        print(f"Warning: Failed to clean up jupyter kernels: {e}")

src/python/cocalc-api/tests/test_jupyter.py

Lines changed: 32 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22
Tests for Jupyter kernel functionality.
33
"""
44

5-
import time
6-
from typing import Optional
5+
import pytest
6+
7+
# Import helper from conftest
8+
from tests.conftest import retry_with_backoff
79

810

911
class TestJupyterKernelSetup:
@@ -68,10 +70,15 @@ class TestJupyterExecuteViaHub:
6870
"""Tests for executing code via hub.jupyter.execute()."""
6971

7072
def test_execute_simple_sum(self, hub, temporary_project):
71-
"""Test executing a simple sum using the python3 kernel."""
73+
"""Test executing a simple sum using the python3 kernel.
74+
75+
Note: First execution may take longer as kernel needs to start up (30+ seconds).
76+
"""
7277
project_id = temporary_project["project_id"]
7378

74-
result = hub.jupyter.execute(input="sum(range(100))", kernel="python3", project_id=project_id)
79+
result = retry_with_backoff(
80+
lambda: hub.jupyter.execute(input="sum(range(100))", kernel="python3", project_id=project_id)
81+
)
7582

7683
# Check the result structure
7784
assert isinstance(result, dict)
@@ -107,10 +114,15 @@ def test_execute_with_history(self, hub, temporary_project):
107114
assert first_output["data"]["text/plain"] == "5050"
108115

109116
def test_execute_print_statement(self, hub, temporary_project):
110-
"""Test executing code that prints output."""
117+
"""Test executing code that prints output.
118+
119+
Note: First execution may take longer as kernel needs to start up (30+ seconds).
120+
"""
111121
project_id = temporary_project["project_id"]
112122

113-
result = hub.jupyter.execute(input='print("Hello from Jupyter")', kernel="python3", project_id=project_id)
123+
result = retry_with_backoff(
124+
lambda: hub.jupyter.execute(input='print("Hello from Jupyter")', kernel="python3", project_id=project_id)
125+
)
114126

115127
# Check that we got output
116128
assert isinstance(result, dict)
@@ -138,21 +150,9 @@ def test_jupyter_execute_simple_sum(self, project_client):
138150
139151
Note: First execution may take longer as kernel needs to start up (30+ seconds).
140152
"""
141-
# Retry logic for first kernel startup
142-
max_retries = 3
143-
retry_delay = 15
144-
result: Optional[list] = None
145-
146-
for attempt in range(max_retries):
147-
try:
148-
result = project_client.system.jupyter_execute(input="sum(range(100))", kernel="python3")
149-
break
150-
except RuntimeError as e:
151-
if "timeout" in str(e).lower() and attempt < max_retries - 1:
152-
print(f"Attempt {attempt + 1} timed out, retrying in {retry_delay}s...")
153-
time.sleep(retry_delay)
154-
else:
155-
raise
153+
result = retry_with_backoff(
154+
lambda: project_client.system.jupyter_execute(input="sum(range(100))", kernel="python3")
155+
)
156156

157157
# Result is a list, not a dict with 'output' key
158158
assert isinstance(result, list)
@@ -169,8 +169,12 @@ def test_jupyter_execute_with_history(self, project_client):
169169
Test executing code with history via project API.
170170
171171
The result is a list of output items directly.
172+
173+
Note: First execution may take longer as kernel needs to start up (30+ seconds).
172174
"""
173-
result = project_client.system.jupyter_execute(history=["b = 50"], input="b * 2", kernel="python3")
175+
result = retry_with_backoff(
176+
lambda: project_client.system.jupyter_execute(history=["b = 50"], input="b * 2", kernel="python3")
177+
)
174178

175179
# Result is a list
176180
assert isinstance(result, list)
@@ -188,21 +192,9 @@ def test_jupyter_execute_list_operation(self, project_client):
188192
189193
The result is a list of output items directly.
190194
"""
191-
# Retry logic for kernel startup
192-
max_retries = 3
193-
retry_delay = 15
194-
result: Optional[list] = None
195-
196-
for attempt in range(max_retries):
197-
try:
198-
result = project_client.system.jupyter_execute(input="[x**2 for x in range(5)]", kernel="python3")
199-
break
200-
except RuntimeError as e:
201-
if "timeout" in str(e).lower() and attempt < max_retries - 1:
202-
print(f"Attempt {attempt + 1} timed out, retrying in {retry_delay}s...")
203-
time.sleep(retry_delay)
204-
else:
205-
raise
195+
result = retry_with_backoff(
196+
lambda: project_client.system.jupyter_execute(input="[x**2 for x in range(5)]", kernel="python3")
197+
)
206198

207199
# Result is a list
208200
assert isinstance(result, list)
@@ -221,20 +213,9 @@ class TestJupyterKernelManagement:
221213
def test_list_jupyter_kernels(self, project_client):
222214
"""Test listing running Jupyter kernels."""
223215
# First execute some code to ensure a kernel is running
224-
# Retry logic for first kernel startup (may take longer in CI)
225-
max_retries = 3
226-
retry_delay = 15
227-
228-
for attempt in range(max_retries):
229-
try:
230-
project_client.system.jupyter_execute(input="1+1", kernel="python3")
231-
break
232-
except RuntimeError as e:
233-
if "timeout" in str(e).lower() and attempt < max_retries - 1:
234-
print(f"Attempt {attempt + 1} timed out, retrying in {retry_delay}s...")
235-
time.sleep(retry_delay)
236-
else:
237-
raise
216+
retry_with_backoff(
217+
lambda: project_client.system.jupyter_execute(input="1+1", kernel="python3")
218+
)
238219

239220
# List kernels
240221
kernels = project_client.system.list_jupyter_kernels()

0 commit comments

Comments
 (0)