Skip to content

Commit 029217e

Browse files
authored
Merge pull request #45 from VectorInstitute/pipeline_schema
Pipeline schema design.
2 parents 3d98e1f + 0daa522 commit 029217e

12 files changed

+1600
-0
lines changed

src/schemas/PIPELINE_SCHEMAS.md

Lines changed: 681 additions & 0 deletions
Large diffs are not rendered by default.

src/schemas/README.md

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# ACE Pipeline Schemas
2+
3+
This directory contains standardized schemas for all ACE pipeline stages, ensuring consistent data formats across different implementations.
4+
5+
## Structure
6+
7+
- **[`PIPELINE_SCHEMAS.md`](PIPELINE_SCHEMAS.md)** - Complete documentation of input/output formats for each stage
8+
- **Python Dataclasses** - Type-safe data structures for each stage:
9+
- [`experiment_schemas.py`](experiment_schemas.py) - Experiment (Stage 0)
10+
- [`domain_schemas.py`](domain_schemas.py) - Domain (Stage 0)
11+
- [`metadata_schemas.py`](metadata_schemas.py) - Common metadata (PipelineMetadata)
12+
- [`area_schemas.py`](area_schemas.py) - Area generation (Stage 1)
13+
- [`capability_schemas.py`](capability_schemas.py) - Capability generation (Stage 2)
14+
- [`task_schemas.py`](task_schemas.py) - Task generation (Stage 3)
15+
- [`solution_schemas.py`](solution_schemas.py) - Solution generation (Stage 4)
16+
- [`validation_schemas.py`](validation_schemas.py) - Validation (Stage 5)
17+
- **I/O Utilities** - Save and load functions:
18+
- [`io_utils.py`](io_utils.py) - Save/load functions for all 7 stage outputs (experiment, domain, areas, capabilities, tasks, solutions, validation)
19+
20+
## Usage
21+
22+
### Using Python Dataclasses
23+
24+
```python
25+
from src.schemas import (
26+
Domain,
27+
Experiment,
28+
PipelineMetadata,
29+
Area,
30+
Capability,
31+
Task,
32+
TaskSolution,
33+
ValidationResult,
34+
)
35+
36+
# Create a domain and an area within it
37+
domain = Domain(name="Personal Finance", domain_id="domain_000")
38+
area = Area(
39+
name="Cash Flow & Budget Management",
40+
area_id="area_000",
41+
description="Design and monitor budgets...",
42+
domain=domain,
43+
# generation_metadata is optional
44+
)
45+
46+
# Convert to dict for JSON serialization
47+
data = area.to_dict()
48+
49+
# Load from dict
50+
area = Area.from_dict(data)
51+
```
52+
53+
### Using Save/Load Functions
54+
55+
```python
56+
from pathlib import Path
57+
from src.schemas import (
58+
save_areas,
59+
load_areas,
60+
PipelineMetadata,
61+
Area,
62+
)
63+
64+
# Save areas
65+
areas = [Area(...), Area(...)]
66+
metadata = PipelineMetadata(
67+
experiment_id="r0_10x10",
68+
output_base_dir="agentic_outputs",
69+
timestamp="2025-11-06T12:00:00Z",
70+
output_stage_tag="_20251009_122040"
71+
)
72+
save_areas(areas, metadata, Path("output/areas.json"))
73+
74+
# Load areas
75+
areas, metadata = load_areas(Path("output/areas.json"))
76+
```
77+
78+
## Pipeline Stages
79+
80+
0. **Experiment Setup** → `Experiment`, `Domain`
81+
1. **Area Generation** → `Area`
82+
2. **Capability Generation** → `Capability`
83+
3. **Task Generation** → `Task`
84+
4. **Solution Generation** → `TaskSolution`
85+
5. **Validation** → `ValidationResult`
86+
87+
See [`PIPELINE_SCHEMAS.md`](PIPELINE_SCHEMAS.md) for detailed specifications.

src/schemas/__init__.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
"""Standardized schemas for ACE pipeline stages.
2+
3+
This module provides standardized data structures for all pipeline stages,
4+
ensuring consistent input/output formats regardless of internal implementation.
5+
"""
6+
7+
from src.schemas.area_schemas import Area
8+
from src.schemas.capability_schemas import Capability
9+
from src.schemas.domain_schemas import Domain
10+
from src.schemas.experiment_schemas import Experiment
11+
from src.schemas.io_utils import (
12+
load_areas,
13+
load_capabilities,
14+
load_domain,
15+
load_experiment,
16+
load_solution,
17+
load_tasks,
18+
load_validation,
19+
save_areas,
20+
save_capabilities,
21+
save_domain,
22+
save_experiment,
23+
save_solution,
24+
save_tasks,
25+
save_validation,
26+
)
27+
from src.schemas.metadata_schemas import PipelineMetadata
28+
from src.schemas.solution_schemas import TaskSolution
29+
from src.schemas.task_schemas import Task
30+
from src.schemas.validation_schemas import ValidationResult
31+
32+
33+
# Public API of the schemas package: one metadata class, the per-stage
# dataclasses, and a save/load function pair for each stage output.
__all__ = [
    # Common metadata shared by stage outputs
    "PipelineMetadata",
    # Stage 0: experiment setup
    "Experiment",
    "Domain",
    # Stage 1: areas
    "Area",
    # Stage 2: capabilities
    "Capability",
    # Stage 3: tasks
    "Task",
    # Stage 4: solutions
    "TaskSolution",
    # Stage 5: validation
    "ValidationResult",
    # Writers (I/O)
    "save_experiment",
    "save_domain",
    "save_areas",
    "save_capabilities",
    "save_tasks",
    "save_solution",
    "save_validation",
    # Readers (I/O)
    "load_experiment",
    "load_domain",
    "load_areas",
    "load_capabilities",
    "load_tasks",
    "load_solution",
    "load_validation",
]

src/schemas/area_schemas.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
"""Schemas for area generation stage (Stage 1).
2+
3+
Defines Area dataclass for domain area. Areas are high-level categories
4+
within a domain.
5+
"""
6+
7+
from dataclasses import dataclass, field
8+
from typing import Dict, Optional
9+
10+
from src.schemas.domain_schemas import Domain
11+
12+
13+
@dataclass
class Area:
    """Dataclass for a domain area (Stage 1).

    The owning domain is serialized flat, as ``domain`` / ``domain_id``
    (plus ``domain_description`` when the domain has a description),
    rather than as a nested object.

    Attributes:
        name: Area name.
        area_id: Area identifier.
        domain: The ``Domain`` this area belongs to.
        description: Area description.
        generation_metadata: Optional metadata dict; defaults to an empty dict.
    """

    name: str
    area_id: str
    domain: Domain
    description: str
    generation_metadata: Optional[Dict] = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Convert to a flat dictionary.

        Returns:
            A dict with ``name``, ``area_id``, ``domain``, ``domain_id`` and
            ``description``. ``domain_description`` is added when the domain
            has a description, and ``generation_metadata`` when it is
            non-empty.
        """
        result = {
            "name": self.name,
            "area_id": self.area_id,
            "domain": self.domain.name,
            "domain_id": self.domain.domain_id,
            "description": self.description,
        }
        # from_dict() reads "domain_description"; emit it so the domain's
        # description survives a to_dict()/from_dict() round trip.
        if self.domain.description is not None:
            result["domain_description"] = self.domain.description
        if self.generation_metadata:
            result["generation_metadata"] = self.generation_metadata
        return result

    @classmethod
    def from_dict(cls, data: dict) -> "Area":
        """Create an ``Area`` from a flat dictionary.

        Args:
            data: Dict with required keys ``name``, ``area_id``, ``domain``,
                ``domain_id`` and ``description``; ``domain_description``
                and ``generation_metadata`` are optional.

        Raises:
            KeyError: If a required key is missing.
        """
        domain = Domain(
            name=data["domain"],
            domain_id=data["domain_id"],
            description=data.get("domain_description"),
        )
        return cls(
            name=data["name"],
            area_id=data["area_id"],
            domain=domain,
            description=data["description"],
            generation_metadata=data.get("generation_metadata", {}),
        )

src/schemas/capability_schemas.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
"""Schemas for capability generation stage (Stage 2).
2+
3+
Defines Capability dataclass for capability within an area. Capabilities
4+
are specific skills or abilities.
5+
"""
6+
7+
from dataclasses import dataclass, field
8+
from typing import Dict, Optional
9+
10+
from src.schemas.area_schemas import Area
11+
from src.schemas.domain_schemas import Domain
12+
13+
14+
@dataclass
class Capability:
    """Dataclass for a capability within an area (Stage 2).

    The owning area and its domain are serialized flat (``area``,
    ``area_id``, ``area_description``, ``domain``, ``domain_id`` and,
    when set, ``domain_description``) rather than as nested objects.

    Attributes:
        name: Capability name.
        capability_id: Capability identifier.
        area: The ``Area`` this capability belongs to.
        description: Capability description.
        generation_metadata: Optional metadata dict; defaults to an empty dict.
    """

    name: str
    capability_id: str
    area: Area
    description: str
    generation_metadata: Optional[Dict] = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Convert to a flat dictionary.

        Returns:
            A dict with the capability fields plus the flattened area and
            domain fields. ``domain_description`` is added when the domain
            has a description, and ``generation_metadata`` when it is
            non-empty.
        """
        domain = self.area.domain
        result = {
            "name": self.name,
            "capability_id": self.capability_id,
            "area": self.area.name,
            "area_id": self.area.area_id,
            "area_description": self.area.description,
            "domain": domain.name,
            "domain_id": domain.domain_id,
            "description": self.description,
        }
        # from_dict() reads "domain_description"; emit it so the domain's
        # description survives a to_dict()/from_dict() round trip.
        if domain.description is not None:
            result["domain_description"] = domain.description
        if self.generation_metadata:
            result["generation_metadata"] = self.generation_metadata
        return result

    @classmethod
    def from_dict(cls, data: dict) -> "Capability":
        """Create a ``Capability`` from a flat dictionary.

        The area is rebuilt from the ``area*`` keys only; its own
        ``generation_metadata`` is not part of this format and is left at
        the default.

        Args:
            data: Dict with required keys ``name``, ``capability_id``,
                ``area``, ``area_id``, ``area_description``, ``domain``,
                ``domain_id`` and ``description``; ``domain_description``
                and ``generation_metadata`` are optional.

        Raises:
            KeyError: If a required key is missing.
        """
        domain = Domain(
            name=data["domain"],
            domain_id=data["domain_id"],
            description=data.get("domain_description"),
        )
        area = Area(
            name=data["area"],
            area_id=data["area_id"],
            domain=domain,
            description=data["area_description"],
        )
        return cls(
            name=data["name"],
            capability_id=data["capability_id"],
            area=area,
            description=data["description"],
            generation_metadata=data.get("generation_metadata", {}),
        )

src/schemas/domain_schemas.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
"""Schemas for domain (Stage 0).
2+
3+
Defines Domain dataclass for domain.
4+
"""
5+
6+
from dataclasses import dataclass
7+
from typing import Optional
8+
9+
10+
@dataclass
class Domain:
    """Dataclass describing a domain (Stage 0).

    Attributes:
        name: Domain name.
        domain_id: Domain identifier.
        description: Optional description; ``None`` when not provided.
    """

    name: str
    domain_id: str
    description: Optional[str] = None

    def to_dict(self):
        """Serialize to a dict, leaving out ``description`` when it is ``None``."""
        payload = {"name": self.name, "domain_id": self.domain_id}
        if self.description is None:
            return payload
        payload["description"] = self.description
        return payload

    @classmethod
    def from_dict(cls, data: dict):
        """Build a ``Domain`` from a dict; a missing ``description`` becomes ``None``."""
        fields = {
            "name": data["name"],
            "domain_id": data["domain_id"],
            "description": data.get("description"),
        }
        return cls(**fields)

src/schemas/experiment_schemas.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
"""Schemas for experiment setup stage (Stage 0).
2+
3+
Defines Experiment dataclass containing experiment configuration and metadata.
4+
"""
5+
6+
from dataclasses import dataclass, field
7+
from typing import Any, Dict, Optional
8+
9+
10+
@dataclass
class Experiment:
    """Dataclass for experiment metadata and configuration (Stage 0).

    Attributes:
        experiment_id: Experiment identifier.
        domain: Domain name.
        domain_id: Domain identifier.
        pipeline_type: Optional pipeline type; omitted from ``to_dict()``
            output when ``None``.
        configuration: Configuration mapping; defaults to an empty dict.
    """

    experiment_id: str
    domain: str
    domain_id: str
    pipeline_type: Optional[str] = None
    configuration: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a dictionary.

        Returns:
            A dict with ``experiment_id``, ``domain``, ``domain_id`` and
            ``configuration``; ``pipeline_type`` is added only when set.
        """
        result: Dict[str, Any] = {
            "experiment_id": self.experiment_id,
            "domain": self.domain,
            "domain_id": self.domain_id,
            "configuration": self.configuration,
        }
        if self.pipeline_type is not None:
            result["pipeline_type"] = self.pipeline_type
        return result

    @classmethod
    def from_dict(cls, data: dict) -> "Experiment":
        """Create an ``Experiment`` from a dictionary.

        Args:
            data: Dict with required keys ``experiment_id``, ``domain`` and
                ``domain_id``; ``pipeline_type`` and ``configuration`` are
                optional.

        Raises:
            KeyError: If a required key is missing.
        """
        configuration = data.get("configuration")
        # An explicit null (e.g. from JSON) must not leak through as None:
        # the field is declared as a dict, so fall back to an empty one.
        if configuration is None:
            configuration = {}
        return cls(
            experiment_id=data["experiment_id"],
            domain=data["domain"],
            domain_id=data["domain_id"],
            pipeline_type=data.get("pipeline_type"),
            configuration=configuration,
        )

0 commit comments

Comments
 (0)