Introduce StructuredRunOpt for more complex runopts (#1154)

AbishekS · facebook-github-bot · commit f8f251b337b9 · 2025-10-28T18:43:02.000-07:00
Summary: StructuredRunOpt can be used to create new, more complex runopts. For example: ``` @dataclass class UlimitTest(StructuredRunOpt): name: str hard: int soft: int def template(self) -> str: return "{name},{soft:d},{hard:d}" ``` It comes with 1. template(), which from_repr() uses to map a string representation onto the typed fields, and 2. __eq__ to check equality between two instances. This change adds the new type to CfgVal's acceptable types. It also modifies a piece of code that could use CfgVal instead of typing out the entire list of types: https://www.internalfb.com/code/fbsource/[37f968940832a633afa761e829e81184858cf6b8]/fbcode/msl/experimental/training_execution_environment/monarch_backend/api/launch_cluster.py?lines=196-199 Reviewed By: kiukchung Differential Revision: D85159071
diff --git a/docs/source/specs.rst b/docs/source/specs.rst
@@ -52,6 +52,9 @@ Run Configs
 .. autoclass:: runopts
    :members:
 
+.. autoclass:: StructuredRunOpt
+   :members:
+
 Run Status
 --------------
 .. autoclass:: AppStatus
diff --git a/requirements.txt b/requirements.txt
@@ -4,3 +4,4 @@ docker
 filelock
 fsspec>=2023.10.0
 tabulate
+parse
diff --git a/torchx/specs/__init__.py b/torchx/specs/__init__.py
@@ -43,6 +43,7 @@
     RoleStatus,
     runopt,
     runopts,
+    StructuredRunOpt,
     TORCHX_HOME,
     UnknownAppException,
     UnknownSchedulerException,
@@ -226,6 +227,7 @@ def gpu_x_1() -> Dict[str, Resource]:
     "RoleStatus",
     "runopt",
     "runopts",
+    "StructuredRunOpt",
     "UnknownAppException",
     "UnknownSchedulerException",
     "InvalidRunConfigException",
diff --git a/torchx/specs/api.py b/torchx/specs/api.py
@@ -6,6 +6,7 @@
 
 # pyre-strict
 
+import abc
 import asyncio
 import copy
 import inspect
@@ -17,6 +18,7 @@
 import shutil
 import typing
 import warnings
+from abc import abstractmethod
 from dataclasses import asdict, dataclass, field
 from datetime import datetime
 from enum import Enum, IntEnum
@@ -36,10 +38,12 @@
     Tuple,
     Type,
     TypeVar,
-    Union,
 )
 
+import parse
+
 from torchx.util.types import to_dict
+from typing_extensions import Self
 
 _APP_STATUS_FORMAT_TEMPLATE = """AppStatus:
     State: ${state}
@@ -877,11 +881,72 @@ def __init__(self, status: AppStatus, *args: object) -> None:
         self.status = status
 
 
-# valid run cfg values; only support primitives (str, int, float, bool, List[str], Dict[str, str])
+U = TypeVar("U", bound="StructuredRunOpt")
+
+
+class StructuredRunOpt(abc.ABC):
+    """
+    StructuredRunOpt is a class that represents a structured run option.
+    This is to allow for more complex types than currently supported.
+
+    Usage
+
+    .. doctest::
+        @dataclass
+        class Ulimit(StructuredRunOpt):
+            name: str
+            hard: int
+            soft: int
+
+            def template(self) -> str:
+                # The template string should contain the field names of the Ulimit object.
+                # A placeholder may also carry a type specifier, as below, where `:d` marks an integer field.
+                return "{name},{soft:d},{hard:d}"
+
+        opts = runopts()
+        opts.add("ulimit", type_=Ulimit, help="ulimits for the container")
+
+        # .from_repr() creates a Ulimit object from a string that matches the template.
+        cfg = opts.resolve(
+            {
+                "ulimit": Ulimit.from_repr(
+                    "test,50,100",
+                )
+            }
+        )
+
+    """
+
+    @abstractmethod
+    def template(self) -> str:
+        """
+        Returns the template string for the StructuredRunOpt.
+        The placeholders in the template map to the field names of the StructuredRunOpt object.
+        """
+        ...
+
+    def __repr__(self) -> str:
+        return self.template().format(**asdict(self))
+
+    def __eq__(self, other: object) -> bool:
+        return isinstance(other, type(self)) and asdict(self) == asdict(other)
+
+    @classmethod
+    def from_repr(cls, repr: str) -> Self:
+        """
+        Parses the repr string and returns a StructuredRunOpt object
+        """
+        tmpl = cls.__new__(cls).template()
+        result = parse.parse(tmpl, repr)
+        return cls(**result.named)
+
+
+# valid run cfg values: primitives (str, int, float, bool, List[str], Dict[str, str])
+# and StructuredRunOpt for more complex types.
 # TODO(wilsonhong): python 3.9+ supports list[T] in typing, which can be used directly
 # in isinstance(). Should replace with that.
 # see: https://docs.python.org/3/library/stdtypes.html#generic-alias-type
-CfgVal = Union[str, int, float, bool, List[str], Dict[str, str], None]
+CfgVal = str | int | float | bool | List[str] | Dict[str, str] | StructuredRunOpt | None
 
 
 T = TypeVar("T")
diff --git a/torchx/specs/test/api_test.py b/torchx/specs/test/api_test.py
@@ -14,7 +14,7 @@
 import time
 import unittest
 import warnings
-from dataclasses import asdict
+from dataclasses import asdict, dataclass
 from pathlib import Path
 from typing import Dict, List, Mapping, Tuple, Union
 from unittest import mock
@@ -43,6 +43,7 @@
     RoleStatus,
     runopt,
     runopts,
+    StructuredRunOpt,
     TORCHX_HOME,
     Workspace,
 )
@@ -550,6 +551,56 @@ def test_getset_metadata(self) -> None:
         self.assertEqual(None, app.metadata.get("non_existent"))
 
 
+class StructuredRunOptTest(unittest.TestCase):
+
+    @dataclass
+    class UlimitTest(StructuredRunOpt):
+        name: str
+        hard: int
+        soft: int
+
+        def template(self) -> str:
+            return "{name},{soft:d},{hard:d}"
+
+    def test_structured_runopt(self) -> None:
+        opt = self.UlimitTest(name="test", hard=100, soft=50)
+
+        # Test class from_repr
+        self.assertEqual(
+            opt,
+            self.UlimitTest.from_repr(
+                "test,50,100",
+            ),
+        )
+
+        # Test repr
+        self.assertEqual(
+            "StructuredRunOptTest.UlimitTest(name='test', hard=100, soft=50)", repr(opt)
+        )
+
+        # Test equality
+        opt_other = self.UlimitTest(name="test", hard=100, soft=50)
+        self.assertEqual(opt, opt_other)
+        opt_other = self.UlimitTest(name="test", hard=100, soft=70)
+        self.assertNotEqual(opt, opt_other)
+
+        # Test with runopts
+
+        opts = runopts()
+        opts.add("ulimit", type_=self.UlimitTest, help="test ulimit")
+        cfg = opts.resolve(
+            {
+                "ulimit": self.UlimitTest.from_repr(
+                    "test,50,100",
+                )
+            }
+        )
+        self.assertEqual(
+            cfg.get("ulimit"),
+            self.UlimitTest(name="test", hard=100, soft=50),
+        )
+
+
 class RunConfigTest(unittest.TestCase):
     def get_cfg(self) -> Mapping[str, CfgVal]:
         return {