Skip to content

Commit f4e5130

Browse files
authored
Avoid for-loop in EmbeddingEncoder (#366)
1 parent b934e93 commit f4e5130

File tree

2 files changed

+27
-20
lines changed

2 files changed

+27
-20
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
88

99
### Added
1010

11+
- Avoided for-loop in `EmbeddingEncoder` ([#366](https://github.com/pyg-team/pytorch-frame/pull/366))
1112
- Added `image_embedded` and one tabular image dataset ([#344](https://github.com/pyg-team/pytorch-frame/pull/344))
1213
- Added benchmarking suite for encoders ([#360](https://github.com/pyg-team/pytorch-frame/pull/360))
1314
- Added dataframe text benchmark script ([#354](https://github.com/pyg-team/pytorch-frame/pull/354))

torch_frame/nn/encoder/stype_encoder.py

Lines changed: 26 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -284,39 +284,45 @@ def __init__(
284284

285285
def init_modules(self):
    """Build one shared embedding table covering every categorical column.

    Rather than keeping a separate ``Embedding`` per column, all columns'
    categories are packed into a single table. A per-column ``offset``
    buffer maps each column's local category index into the shared index
    space, and row 0 is reserved as the padding (NaN) slot.
    """
    super().init_modules()
    # Per-column category counts; the leading 0 makes the cumulative sum
    # below yield each column's starting offset into the shared table.
    counts = [0]
    for stats in self.stats_list:
        counts.append(len(stats[StatType.COUNT][0]))
    # Single embedding module that stores embeddings of all categories
    # across all categorical columns; +1 row reserved for NaN at index 0.
    self.emb = Embedding(
        sum(counts) + 1,
        self.out_channels,
        padding_idx=0,
    )
    # offset: [num_cols] — start index of each column's block in the table.
    self.register_buffer(
        "offset",
        torch.cumsum(torch.tensor(counts[:-1], dtype=torch.long), dim=0),
    )
    self.reset_parameters()
298306

299307
def reset_parameters(self):
    """Re-initialize the shared embedding table's weights."""
    super().reset_parameters()
    self.emb.reset_parameters()
303310

304311
def encode_forward(
    self,
    feat: Tensor,
    col_names: list[str] | None = None,
) -> Tensor:
    """Embed a categorical feature matrix with one vectorized lookup.

    Args:
        feat: [batch_size, num_cols] integer category indices, where a
            negative value (-1) marks a missing (NaN) cell.
        col_names: Unused here; kept for the ``StypeEncoder`` interface.

    Returns:
        [batch_size, num_cols, channels] embeddings; NaN cells map to the
        all-zero padding embedding at index 0.
    """
    # Remember which cells are NaN (encoded as negative indices).
    na_mask = feat < 0
    # Shift by +1 past the padding slot and by each column's offset so
    # that distinct columns never share embedding rows.
    shifted = feat + self.offset + 1
    # Route NaN cells to padding_idx=0 (a zero-initialized, frozen row).
    shifted[na_mask] = 0
    # [batch_size, num_cols, channels]
    return self.emb(shifted)
320326

321327

322328
class MultiCategoricalEmbeddingEncoder(StypeEncoder):

0 commit comments

Comments
 (0)