Skip to content

Commit 48fed26

Browse files
committed
Fix JSON serialization for checkpoint URIs and Inspect AI imports
1 parent 782d4d0 commit 48fed26

File tree

5 files changed (+42 / -19 lines changed)

5 files changed (+42 / -19 lines changed)

inspect_eval.py

Lines changed: 24 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,21 @@
1010
from inspect_ai.model import GenerateConfig, Model
1111
from inspect_ai.scorer import match, includes
1212
from inspect_ai.solver import generate
13+
14+
QA_SAMPLES = [
15+
Sample(input="What is 2 + 2?", target="4"),
16+
Sample(input="What is the capital of France?", target="Paris"),
17+
Sample(input="What color is grass?", target="green"),
18+
Sample(input="How many days in a week?", target="7"),
19+
Sample(input="What is 10 x 5?", target="50"),
20+
]
21+
1322
INSPECT_AVAILABLE = True
1423
except ImportError:
1524
INSPECT_AVAILABLE = False
25+
Task = None
26+
task = None
27+
QA_SAMPLES = []
1628

1729
try:
1830
from tinker_cookbook.eval.inspect_utils import InspectAPIFromTinkerSampling
@@ -21,31 +33,25 @@
2133
TINKER_INSPECT_AVAILABLE = False
2234

2335

24-
QA_SAMPLES = [
25-
Sample(input="What is 2 + 2?", target="4"),
26-
Sample(input="What is the capital of France?", target="Paris"),
27-
Sample(input="What color is grass?", target="green"),
28-
Sample(input="How many days in a week?", target="7"),
29-
Sample(input="What is 10 x 5?", target="50"),
30-
]
31-
32-
33-
@task
34-
def simple_qa_task() -> Task:
36+
def simple_qa_task():
3537
"""
3638
Simple QA evaluation task for demo purposes.
3739
3840
Tests basic factual knowledge with exact match scoring.
3941
"""
40-
if not INSPECT_AVAILABLE:
42+
if not INSPECT_AVAILABLE or not Task:
4143
raise ImportError("inspect_ai required for this task")
4244

43-
return Task(
44-
name="simple_qa",
45-
dataset=MemoryDataset(name="simple_qa", samples=QA_SAMPLES),
46-
solver=generate(),
47-
scorer=includes(),
48-
)
45+
@task
46+
def _simple_qa() -> Task:
47+
return Task(
48+
name="simple_qa",
49+
dataset=MemoryDataset(name="simple_qa", samples=QA_SAMPLES),
50+
solver=generate(),
51+
scorer=includes(),
52+
)
53+
54+
return _simple_qa()
4955

5056

5157
async def run_inspect_evaluation(

logger.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ def log_checkpoint(self, round_num: int, checkpoint_uri: str):
8585
"""Log a checkpoint save."""
8686
self.log_event("checkpoint", {
8787
"round": round_num,
88-
"checkpoint_uri": checkpoint_uri,
88+
"checkpoint_uri": str(checkpoint_uri),
8989
})
9090

9191
def log_config(self, config: Dict[str, Any]):

runs/20251001_185228/metrics.jsonl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{"timestamp": "2025-10-01T18:52:28.055140", "run_id": "20251001_185228", "event": "config", "base_model": "test-model", "train_file": "/private/var/folders/1s/sm2bn6kd64gdnv757t9qshcm0000gn/T/pytest-of-jonathanhaas/pytest-39/test_early_stopping_on_thresho0/train.jsonl", "eval_tasks": [], "renderer_name": "default", "learning_rate": 0.0001, "eval_threshold": 0.8, "max_rounds": 5, "lr_decay": 0.8, "evalops_enabled": false, "evalops_test_suite_id": null, "evalops_api_url": null, "steps_per_round": 1, "batch_size": 8, "max_seq_length": 2048, "lora_rank": 16, "warmup_steps": 100, "max_steps": 1000, "min_lr": 1e-06, "use_recommended_lr": false}
2+
{"timestamp": "2025-10-01T18:52:28.079142", "run_id": "20251001_185228", "event": "config", "base_model": "test-model", "train_file": "/private/var/folders/1s/sm2bn6kd64gdnv757t9qshcm0000gn/T/pytest-of-jonathanhaas/pytest-39/test_full_rounds_below_thresho0/train.jsonl", "eval_tasks": [], "renderer_name": "default", "learning_rate": 0.0001, "eval_threshold": 0.9, "max_rounds": 3, "lr_decay": 0.5, "evalops_enabled": false, "evalops_test_suite_id": null, "evalops_api_url": null, "steps_per_round": 1, "batch_size": 8, "max_seq_length": 2048, "lora_rank": 16, "warmup_steps": 100, "max_steps": 1000, "min_lr": 1e-06, "use_recommended_lr": false}
3+
{"timestamp": "2025-10-01T18:52:28.090092", "run_id": "20251001_185228", "event": "config", "base_model": "test-model", "train_file": "/private/var/folders/1s/sm2bn6kd64gdnv757t9qshcm0000gn/T/pytest-of-jonathanhaas/pytest-39/test_evalops_integration_calle0/train.jsonl", "eval_tasks": [], "renderer_name": "default", "learning_rate": 0.0001, "eval_threshold": 0.8, "max_rounds": 1, "lr_decay": 0.8, "evalops_enabled": true, "evalops_test_suite_id": "suite-123", "evalops_api_url": null, "steps_per_round": 1, "batch_size": 8, "max_seq_length": 2048, "lora_rank": 16, "warmup_steps": 100, "max_steps": 1000, "min_lr": 1e-06, "use_recommended_lr": false}
4+
{"timestamp": "2025-10-01T18:52:28.101579", "run_id": "20251001_185228", "event": "config", "base_model": "test-model", "train_file": "/private/var/folders/1s/sm2bn6kd64gdnv757t9qshcm0000gn/T/pytest-of-jonathanhaas/pytest-39/test_lr_decay_across_rounds0/train.jsonl", "eval_tasks": [], "renderer_name": "default", "learning_rate": 1.0, "eval_threshold": 0.99, "max_rounds": 3, "lr_decay": 0.5, "evalops_enabled": false, "evalops_test_suite_id": null, "evalops_api_url": null, "steps_per_round": 1, "batch_size": 8, "max_seq_length": 2048, "lora_rank": 16, "warmup_steps": 0, "max_steps": 1000, "min_lr": 1e-06, "use_recommended_lr": false}

runs/20251001_185238/metrics.jsonl

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{"timestamp": "2025-10-01T18:52:38.285251", "run_id": "20251001_185238", "event": "config", "base_model": "test-model", "train_file": "/private/var/folders/1s/sm2bn6kd64gdnv757t9qshcm0000gn/T/pytest-of-jonathanhaas/pytest-40/test_early_stopping_on_thresho0/train.jsonl", "eval_tasks": [], "renderer_name": "default", "learning_rate": 0.0001, "eval_threshold": 0.8, "max_rounds": 5, "lr_decay": 0.8, "evalops_enabled": false, "evalops_test_suite_id": null, "evalops_api_url": null, "steps_per_round": 1, "batch_size": 8, "max_seq_length": 2048, "lora_rank": 16, "warmup_steps": 100, "max_steps": 1000, "min_lr": 1e-06, "use_recommended_lr": false}
2+
{"timestamp": "2025-10-01T18:52:38.285705", "run_id": "20251001_185238", "event": "checkpoint", "round": 1, "checkpoint_uri": "<MagicMock name='mock.create_lora_training_client().save_weights_for_sampler().result().path' id='4462745920'>"}
3+
{"timestamp": "2025-10-01T18:52:38.307962", "run_id": "20251001_185238", "event": "config", "base_model": "test-model", "train_file": "/private/var/folders/1s/sm2bn6kd64gdnv757t9qshcm0000gn/T/pytest-of-jonathanhaas/pytest-40/test_full_rounds_below_thresho0/train.jsonl", "eval_tasks": [], "renderer_name": "default", "learning_rate": 0.0001, "eval_threshold": 0.9, "max_rounds": 3, "lr_decay": 0.5, "evalops_enabled": false, "evalops_test_suite_id": null, "evalops_api_url": null, "steps_per_round": 1, "batch_size": 8, "max_seq_length": 2048, "lora_rank": 16, "warmup_steps": 100, "max_steps": 1000, "min_lr": 1e-06, "use_recommended_lr": false}
4+
{"timestamp": "2025-10-01T18:52:38.308338", "run_id": "20251001_185238", "event": "checkpoint", "round": 1, "checkpoint_uri": "<MagicMock name='mock.create_lora_training_client().save_weights_for_sampler().result().path' id='4462753312'>"}
5+
{"timestamp": "2025-10-01T18:52:38.321016", "run_id": "20251001_185238", "event": "config", "base_model": "test-model", "train_file": "/private/var/folders/1s/sm2bn6kd64gdnv757t9qshcm0000gn/T/pytest-of-jonathanhaas/pytest-40/test_evalops_integration_calle0/train.jsonl", "eval_tasks": [], "renderer_name": "default", "learning_rate": 0.0001, "eval_threshold": 0.8, "max_rounds": 1, "lr_decay": 0.8, "evalops_enabled": true, "evalops_test_suite_id": "suite-123", "evalops_api_url": null, "steps_per_round": 1, "batch_size": 8, "max_seq_length": 2048, "lora_rank": 16, "warmup_steps": 100, "max_steps": 1000, "min_lr": 1e-06, "use_recommended_lr": false}
6+
{"timestamp": "2025-10-01T18:52:38.321513", "run_id": "20251001_185238", "event": "checkpoint", "round": 1, "checkpoint_uri": "<MagicMock name='mock.create_lora_training_client().save_weights_for_sampler().result().path' id='4462744912'>"}
7+
{"timestamp": "2025-10-01T18:52:38.332722", "run_id": "20251001_185238", "event": "config", "base_model": "test-model", "train_file": "/private/var/folders/1s/sm2bn6kd64gdnv757t9qshcm0000gn/T/pytest-of-jonathanhaas/pytest-40/test_lr_decay_across_rounds0/train.jsonl", "eval_tasks": [], "renderer_name": "default", "learning_rate": 1.0, "eval_threshold": 0.99, "max_rounds": 3, "lr_decay": 0.5, "evalops_enabled": false, "evalops_test_suite_id": null, "evalops_api_url": null, "steps_per_round": 1, "batch_size": 8, "max_seq_length": 2048, "lora_rank": 16, "warmup_steps": 0, "max_steps": 1000, "min_lr": 1e-06, "use_recommended_lr": false}
8+
{"timestamp": "2025-10-01T18:52:38.333083", "run_id": "20251001_185238", "event": "checkpoint", "round": 1, "checkpoint_uri": "<MagicMock name='mock.create_lora_training_client().save_weights_for_sampler().result().path' id='4468904960'>"}
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{
2+
"round_idx": 1,
3+
"global_step": 1,
4+
"learning_rate": 1.0,
5+
"checkpoint_uri":

0 commit comments

Comments (0)