Support saving sharded optimizer variables to checkpoints and restoring them across arbitrary sharding configurations.

jamesmullenbach · tensorflower-gardener · commit 39ac369ddd6f · 2023-12-12T13:44:21.000-08:00
Because optimizer variables are initialized from the model's individual shard variables using `colocate_with`, they are not aggregated into a `ShardedVariable` upon creation. This change accumulates the optimizer variables as they are created from model variables that are shards, and automatically builds a `ShardedVariable` from them once every shard of the corresponding model `ShardedVariable` has been seen.

Optimizer attribute variables (e.g. Adam's `self._momentums`) are kept as regular Variables to maintain existing functionality. In order to save and restore optimizer attribute variables correctly across varying shard configurations, they are replaced with their `ShardedVariable` containers in the checkpointing object graph.

PiperOrigin-RevId: 590322532
diff --git a/tf_keras/optimizers/BUILD b/tf_keras/optimizers/BUILD
@@ -104,7 +104,7 @@ distribute_py_test(
     name = "optimizer_pss_test",
     size = "medium",
     srcs = ["optimizer_pss_test.py"],
-    shard_count = 32,
+    shard_count = 50,
     tags = [
         "multi_gpu",
         "no_oss",
diff --git a/tf_keras/optimizers/optimizer.py b/tf_keras/optimizers/optimizer.py
@@ -102,6 +102,10 @@ def __init__(
             )
 
         self._variables = []
+        # A dict mapping a (model ShardedVariable unique id, variable name)
+        # pair to an object that builds a ShardedVariable from the
+        # corresponding optimizer variables. See `add_variable_from_reference`.
+        self._sharded_variable_builders = self._no_dependency({})
         self._create_iteration_variable()
         self._process_kwargs(kwargs)
 
@@ -516,9 +520,76 @@ def add_variable_from_reference(
             dtype=model_variable.dtype,
             trainable=False,
         )
-        self._variables.append(variable)
+        # If model_variable is a shard of a ShardedVariable, we should add a
+        # ShardedVariable for all related optimizer variables so that
+        # checkpointing is robust to different partitionings. Use unique_id to
+        # dedup ShardedVariables.
+        if hasattr(model_variable, "_sharded_container"):
+            sharded_variable = model_variable._sharded_container()
+            # Get or create builder object
+            sv_builder = self._sharded_variable_builders.setdefault(
+                (sharded_variable._unique_id, variable_name),
+                _ShardedVariableBuilder(len(sharded_variable.variables)),
+            )
+            sv_builder.add_shard(variable)
+            if sv_builder.has_all_shards():
+                self._variables.append(sv_builder.build())
+        else:
+            self._variables.append(variable)
         return variable
 
+    def _trackable_children(self, save_type="checkpoint", **kwargs):
+        """Override in order to coalesce and track `ShardedVariable`s.
+
+        If an optimizer variable's corresponding model variable is a shard of a
+        larger `ShardedVariable`, then we track the optimizer variable in
+        `self._variables` as a `ShardedVariable` via the logic in
+        `add_variable_from_reference`. However, most optimizer implementations
+        additionally keep their variables as attributes, which will be tracked
+        via `AutoTrackable` functionality and not accumulated into
+        `ShardedVariable`s.
+
+        So, to enable restoration of these attributes in possibly different
+        sharding configurations, we should save them as `ShardedVariable`s.
+        In list-valued optimizer attributes, any elements that are shards of a
+        larger `ShardedVariable` are replaced (once) by the `ShardedVariable`
+        itself, which was created in `add_variable_from_reference`.
+
+        All non-sharded variables are kept as-is. If none of the model variables
+        are sharded, this reduces to `AutoTrackable._trackable_children()`.
+        """
+        # Due to object-identity based matching logic in checkpointing, new
+        # python objects should not be created on each call to
+        # `_trackable_children`. So coalesce only once and reuse the cache.
+        if not hasattr(self, "_coalesced_children"):
+            # This new attribute should not be tracked to avoid infinite
+            # recursion, so wrap in NoDependency
+            self._coalesced_children = self._no_dependency({})
+            children = super()._trackable_children(save_type, **kwargs)
+            for key, val in children.items():
+                if key not in [
+                    "_variables",
+                    "_index_dict",
+                    "_learning_rate",
+                    "_iterations",
+                ]:
+                    new_val = val
+                    if isinstance(val, list):
+                        # TODO(jmullenbach): handle arbitrary nesting
+                        sv_vals = []
+                        for var in val:
+                            if hasattr(var, "_sharded_container"):
+                                sv = var._sharded_container()
+                                if sv not in sv_vals:
+                                    sv_vals.append(sv)
+                            else:
+                                sv_vals.append(var)
+                        new_val = tf.__internal__.tracking.wrap(sv_vals)
+                    self._coalesced_children[key] = new_val
+                else:
+                    self._coalesced_children[key] = val
+        return self._coalesced_children
+
     def minimize(self, loss, var_list, tape=None):
         """Minimize `loss` by updating `var_list`.
 
@@ -1384,6 +1455,30 @@ def __call__(self):
         return self
 
 
+class _ShardedVariableBuilder:
+    """Collect variable shards by index and build a `ShardedVariable`."""
+
+    def __init__(self, num_shards):
+        self.shards = [None] * num_shards
+
+    def add_shard(self, shard):
+        # Shard index = text after the last "part_", up to ":", in the name
+        shard_idx = int(shard.name.split("part_")[-1].split(":")[0])
+        if self.shards[shard_idx] is None:
+            self.shards[shard_idx] = shard
+        else:
+            raise ValueError(
+                "Cannot add duplicate optimizer variable from "
+                f"shard variable {shard.name}"
+            )
+
+    def has_all_shards(self):
+        return all([shard is not None for shard in self.shards])
+
+    def build(self):
+        return tf.__internal__.distribute.ShardedVariable(self.shards)
+
+
 # Register the optimizer for loading from saved_model purpose.
 # When `keras_2` is installed in same env, it raises assertion for duplicate
 # registration with same name. Rename the symbol in this case.
diff --git a/tf_keras/optimizers/optimizer_pss_test.py b/tf_keras/optimizers/optimizer_pss_test.py
@@ -1,5 +1,7 @@
 """Tests for calling optimizer on ParameterServerStrategy."""
 
+import os
+
 import tensorflow.compat.v2 as tf
 from absl.testing import parameterized
 
@@ -96,7 +98,11 @@ def _verify_accumulators_updated(self, optimizer):
             if "iteration" not in var.name and "learning_rate" not in var.name:
                 # Find a variable not iteration or learning_rate, and verify its
                 # value is updated (not 0).
-                self.assertNotAllEqual(var, 0)
+                if isinstance(var, tf.__internal__.distribute.ShardedVariable):
+                    for shard in var.variables:
+                        self.assertNotAllEqual(shard, 0)
+                else:
+                    self.assertNotAllEqual(var, 0)
 
     @ds_combinations.generate(
         tf.__internal__.test.combinations.combine(
@@ -160,6 +166,112 @@ def replica_fn(data):
             self.assertEqual(self.evaluate(optimizer.iterations), 3)
             self._verify_accumulators_updated(optimizer)
 
+    @ds_combinations.generate(
+        tf.__internal__.test.combinations.combine(
+            strategy=STRATEGIES,
+            shard_config=[
+                [2, 2],
+                [2, 3],
+                [3, 2],
+                [2, 1],
+                [1, 1],
+                [1, 2],
+                [1, 3],
+            ],
+        )
+    )
+    def testCheckpointShardedVariable(self, strategy, shard_config):
+        # Data are embedding indices near shard boundaries for 2 or 3 shards
+        test_indices = [33, 34, 49, 50, 66, 67]
+
+        def dataset_fn(_):
+            x, y = [[index] for index in test_indices], [1, 1, 1, 0, 0, 0]
+            ds = tf.data.Dataset.from_tensor_slices((x, y))
+            ds = ds.repeat().batch(6)
+            return ds
+
+        vocab_size = 100
+        embed_dim = 32
+
+        def get_model():
+            return keras.Sequential(
+                [
+                    keras.layers.Embedding(vocab_size, embed_dim),
+                    keras.layers.Dense(1, activation="sigmoid"),
+                ]
+            )
+
+        # Override partitioning for the saved model: shard_config[0] shards
+        if shard_config[0] == 1:
+            strategy._extended._variable_partitioner = None
+        else:
+            strategy._extended._variable_partitioner = (
+                tf.distribute.experimental.partitioners.FixedShardsPartitioner(
+                    shard_config[0]
+                )
+            )
+
+        # Create model and optimizer
+        with strategy.scope():
+            model = get_model()
+            optimizer = adam.Adam(0.002)
+
+            model.compile(loss="mse", optimizer=optimizer)
+
+            model.build(input_shape=(None, 1))
+            model.optimizer.build(model.trainable_variables)
+
+        ds = dataset_creator.DatasetCreator(dataset_fn)
+        # Train a bit to update optimizer variables
+        model.fit(ds, epochs=1, steps_per_epoch=5)
+
+        self._verify_accumulators_updated(optimizer)
+
+        # Extract optimizer variables to later check they restore properly
+        pre_ckpt_optimizer_values = []
+        for var in model.optimizer.variables:
+            # Just check the embedding variables
+            if var.shape == [vocab_size, embed_dim]:
+                for index in test_indices:
+                    pre_ckpt_optimizer_values.append(var[index])
+        # Adam has 2 slot variables, momentum and velocity
+        self.assertLen(pre_ckpt_optimizer_values, 2 * len(test_indices))
+
+        checkpoint_path = os.path.join(self.get_temp_dir(), "model_weights")
+        model.save_weights(checkpoint_path)
+
+        # Create new model with shard_config[1] shards and load checkpoint
+        if shard_config[1] == 1:
+            strategy._extended._variable_partitioner = None
+        else:
+            strategy._extended._variable_partitioner = (
+                tf.distribute.experimental.partitioners.FixedShardsPartitioner(
+                    shard_config[1]
+                )
+            )
+        with strategy.scope():
+            model_2 = get_model()
+            optimizer_2 = adam.Adam(0.002)
+            model_2.compile(loss="mse", optimizer=optimizer_2)
+            model_2.build(input_shape=(None, 1))
+            model_2.optimizer.build(model_2.trainable_variables)
+            model_2.load_weights(checkpoint_path)
+
+        post_ckpt_optimizer_values = []
+        for var in model_2.optimizer.variables:
+            if var.shape == [vocab_size, embed_dim]:
+                for index in test_indices:
+                    post_ckpt_optimizer_values.append(var[index])
+        self.assertLen(post_ckpt_optimizer_values, 2 * len(test_indices))
+        for pre_val, post_val in zip(
+            pre_ckpt_optimizer_values, post_ckpt_optimizer_values
+        ):
+            self.assertAllEqual(pre_val, post_val)
+
+        # Confirm training still functional
+        ds = dataset_creator.DatasetCreator(dataset_fn)
+        model_2.fit(ds, epochs=1, steps_per_epoch=5)
+
 
 if __name__ == "__main__":
     tf.__internal__.distribute.multi_process_runner.test_main()