fix corner case where train_pipeline PostProc can't trace through a fx wrap module (meta-pytorch#3377)

TroyGarden · facebook-github-bot · commit 9d6ce014a1d6 · 2025-09-15T21:44:43.000-07:00
Summary: Pull Request resolved: meta-pytorch#3377 # context * when working on UDD LSR MC5, we observed 5% QPS regression due to disabled SDD pipeline ([mast log](https://fburl.com/mlhub/rvuibmcp)) * the code change is mainly due to the `torch.fx.wrap` of a [postproc function](https://fburl.com/code/spvnz047) * although the `pipeline_postproc` flag is `True`, the issue is that it's actually a fx-wrapped function. * currently postproc trace only works for a torch.nn.Module, not a function. * added test case to verify this, and future work is needed to support better tracing for the fx-wrapped function. Reviewed By: spmex Differential Revision: D82501416 fbshipit-source-id: 9cadbafaaee6536c95fe3050b60cb0a2ad7a91b1
diff --git a/torchrec/distributed/train_pipeline/tests/test_train_pipelines_utils.py b/torchrec/distributed/train_pipeline/tests/test_train_pipelines_utils.py
@@ -10,25 +10,23 @@
 import copy
 import enum
 import unittest
+from typing import Tuple, Union
 from unittest.mock import MagicMock
 
 import torch
 
 from torchrec.distributed.embedding_types import EmbeddingComputeKernel
-from torchrec.distributed.test_utils.test_model import ModelInput, TestNegSamplingModule
+from torchrec.distributed.test_utils.test_model import (
+    ModelInput,
+    TestNegSamplingModule,
+    TestSparseNN,
+)
 from torchrec.distributed.train_pipeline.pipeline_context import TrainPipelineContext
 from torchrec.distributed.train_pipeline.runtime_forwards import PipelinedForward
-
 from torchrec.distributed.train_pipeline.tests.test_train_pipelines_base import (
     TrainPipelineSparseDistTestBase,
 )
-from torchrec.distributed.train_pipeline.tracing import (
-    ArgInfo,
-    ArgInfoStepFactory,
-    CallArgs,
-    NodeArgsHelper,
-    PipelinedPostproc,
-)
+from torchrec.distributed.train_pipeline.tracing import CallArgs, PipelinedPostproc
 from torchrec.distributed.train_pipeline.utils import _rewrite_model
 from torchrec.distributed.types import ShardingType
 from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
@@ -40,6 +38,15 @@ class ModelType(enum.Enum):
     PIPELINED = "pipelined"
 
 
+@torch.fx.wrap
+def enrich_hstu_features(
+    kjt: KeyedJaggedTensor, hstu_factor: float
+) -> KeyedJaggedTensor:
+    if kjt._weights is not None:
+        kjt._weights *= hstu_factor
+    return kjt
+
+
 class TrainPipelineUtilsTest(TrainPipelineSparseDistTestBase):
     # pyre-fixme[56]: Pyre was not able to infer the type of argument
     @unittest.skipIf(
@@ -257,3 +264,115 @@ def test_restore_from_snapshot(self) -> None:
         ]
         for source_model_type, recipient_model_type in variants:
             self._test_restore_from_snapshot(source_model_type, recipient_model_type)
+
+    # pyre-fixme[56]: Pyre was not able to infer the type of argument
+    @unittest.skipIf(
+        not torch.cuda.is_available(),
+        "Not enough GPUs, this test requires at least one GPU",
+    )
+    def test_rewrite_model_with_fx_wrap(self) -> None:
+        sharding_type = ShardingType.TABLE_WISE.value
+        kernel_type = EmbeddingComputeKernel.FUSED.value
+        fused_params = {}
+
+        class TestPostProcModule(torch.nn.Module):
+            def __init__(self, f: float):
+                super().__init__()
+                self.f = f
+
+            def forward(self, x: KeyedJaggedTensor) -> KeyedJaggedTensor:
+                return enrich_hstu_features(x, self.f)
+
+        postproc_module = TestPostProcModule(0.3)
+
+        class TestModel(TestSparseNN):
+            use_postproc_module: bool = False
+
+            def forward(
+                self,
+                input: ModelInput,
+            ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+                if (type(self)).use_postproc_module:
+                    input = self.postproc_module(input)
+                else:
+                    input = enrich_hstu_features(input, 0.3)
+                return self.dense_forward(input, self.sparse_forward(input))
+
+        model = TestModel(
+            tables=self.tables,
+            weighted_tables=self.weighted_tables,
+            dense_device=self.device,
+            sparse_device=torch.device("meta"),
+            postproc_module=postproc_module,
+        )
+
+        sharded_model, optim = self._generate_sharded_model_and_optimizer(
+            model, sharding_type, kernel_type, fused_params
+        )
+
+        # Try to rewrite model using a function for postproc
+        # EBC forwards not overwritten to PipelinedForward due to KJT modification
+        self.assertFalse(model.use_postproc_module)
+        _rewrite_model(
+            model=sharded_model,
+            batch=None,
+            context=TrainPipelineContext(),
+            dist_stream=None,
+            pipeline_postproc=True,
+        )
+        self.assertNotIsInstance(
+            # pyre-fixme[16]: Item `Tensor` of `Tensor | Module` has no attribute
+            #  `sparse`.
+            sharded_model.module.sparse.ebc.forward,
+            PipelinedForward,
+        )
+        self.assertNotIsInstance(
+            # pyre-fixme[16]: Item `Tensor` of `Tensor | Module` has no attribute
+            #  `sparse`.
+            sharded_model.module.sparse.weighted_ebc.forward,
+            PipelinedForward,
+        )
+
+        # Now use postproc module
+        TestModel.use_postproc_module = True
+        self.assertTrue(model.use_postproc_module)
+        _rewrite_model(
+            model=sharded_model,
+            batch=None,
+            context=TrainPipelineContext(),
+            dist_stream=None,
+            pipeline_postproc=True,
+        )
+
+        # pyre-fixme[16]: Item `Tensor` of `Tensor | Module` has no attribute `sparse`.
+        self.assertIsInstance(sharded_model.module.sparse.ebc.forward, PipelinedForward)
+        self.assertIsInstance(
+            # pyre-fixme[16]: Item `Tensor` of `Tensor | Module` has no attribute
+            #  `sparse`.
+            sharded_model.module.sparse.weighted_ebc.forward,
+            PipelinedForward,
+        )
+        self.assertEqual(
+            # pyre-fixme[16]: Item `Tensor` of `Tensor | Module` has no attribute
+            #  `sparse`.
+            sharded_model.module.sparse.ebc.forward._args.args[0]
+            .steps[0]
+            .postproc_module,
+            # pyre-fixme[16]: Item `Tensor` of `Tensor | Module` has no attribute
+            #  `postproc_module`.
+            sharded_model.module.postproc_module,
+        )
+        self.assertEqual(
+            # pyre-fixme[16]: Item `Tensor` of `Tensor | Module` has no attribute
+            #  `sparse`.
+            sharded_model.module.sparse.weighted_ebc.forward._args.args[0]
+            .steps[0]
+            .postproc_module,
+            # pyre-fixme[16]: Item `Tensor` of `Tensor | Module` has no attribute
+            #  `postproc_module`.
+            sharded_model.module.postproc_module,
+        )
+        state_dict = sharded_model.state_dict()
+        missing_keys, unexpected_keys = sharded_model.load_state_dict(state_dict)
+        self.assertEqual(missing_keys, [])
+        self.assertEqual(unexpected_keys, [])
diff --git a/torchrec/distributed/train_pipeline/utils.py b/torchrec/distributed/train_pipeline/utils.py
@@ -361,6 +361,9 @@ def _rewrite_model(  # noqa C901
         model, context, pipeline_postproc, default_stream, dist_stream
     )
 
+    logger.info(
+        f"pipeline_postproc is {'enabled' if pipeline_postproc else 'disabled'}"
+    )
     for node in graph.nodes:
         # only work on the call_module node which is also a sharded module
         if node.op != "call_module" or node.target not in sharded_modules:

Original file line number	Diff line number	Diff line change
`@@ -361,6 +361,9 @@ def _rewrite_model( # noqa C901`
`361`	`361`	`model, context, pipeline_postproc, default_stream, dist_stream`
`362`	`362`	`)`
`363`	`363`
	`364`	`+ logger.info(`
	`365`	`+ f"pipeline_postproc is {'enabled' if pipeline_postproc else 'disabled'}"`
	`366`	`+ )`
`364`	`367`	`for node in graph.nodes:`
`365`	`368`	`# only work on the call_module node which is also a sharded module`
`366`	`369`	`if node.op != "call_module" or node.target not in sharded_modules:`