Skip to content

Commit 493bb33

Browse files
authored
[TorchToLinalg] simplify non-broadcast unit dim indexing maps in elementwise generics (#4107)
This change is made to reduce the pattern-matching load for fusing elementwise generic ops with non-broadcasting unit dims. For example, adding tensors with shapes `[6,1]` and `[1]`, the output shape will be `[6,1]`. Before this change, the indexing maps were inconsistent between the inputs and outputs for the unit-dim (constant 0 for inputs, and a dim expression for the output). --------- Signed-off-by: zjgarvey <zjgarvey@gmail.com>
1 parent 692b2c0 commit 493bb33

File tree

2 files changed

+43
-9
lines changed

2 files changed

+43
-9
lines changed

lib/Conversion/TorchToLinalg/Utils.cpp

Lines changed: 30 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -255,26 +255,47 @@ Value torch_to_linalg::createElementwiseLinalgGeneric(
255255
// all sizes along that result dimension are statically 1.
256256
auto c1 = b.create<arith::ConstantIndexOp>(loc, /*value=*/1);
257257
SmallVector<Value> resultShape(resultRank, c1);
258+
259+
// Record whether or not all corresponding input dims are statically 1.
260+
// We don't want to use a constant 0 expression for the input indexing maps in
261+
// this case, since there is no broadcasting. Using the constant 0 expressions
262+
// for the inputs, when they actually do correspond to an output dim, makes
263+
// subsequent optimizations (e.g. fusions) more difficult.
264+
DenseSet<int64_t> nonStaticOneResultDims;
265+
for (int64_t i = 0; i < resultRank; i++) {
266+
for (Value tensorOperand : tensorOperands) {
267+
auto type = cast<RankedTensorType>(tensorOperand.getType());
268+
auto index = i - (resultRank - type.getRank());
269+
if (index < 0)
270+
continue;
271+
int64_t dimSize = makeShapeTorchCompatible(type.getShape())[index];
272+
if (dimSize != 1) {
273+
nonStaticOneResultDims.insert(i);
274+
break;
275+
}
276+
}
277+
}
278+
258279
SmallVector<AffineMap> indexingMaps;
259280
bool elideDynamicBroadcastCheck = isAssumingStrictSymbolicShapes(b);
281+
260282
for (Value tensorOperand : tensorOperands) {
261283
SmallVector<AffineExpr> exprs;
262284
auto type = cast<RankedTensorType>(tensorOperand.getType());
263285
for (auto size :
264286
llvm::enumerate(makeShapeTorchCompatible(type.getShape()))) {
265-
// If the size is statically known to be 1, we don't want any
266-
// error guards to be spuriously emitted, since we are specifically
267-
// allowing size-1 broadcasts in this case, as they correspond to a
268-
// constant-0 indexing map.
269-
if (size.value() == 1) {
270-
exprs.push_back(b.getAffineConstantExpr(0));
271-
continue;
272-
}
273287

274288
// The rank of this operand might be smaller than the overall rank of
275289
// the broadcast. Add an offset to correlate it to the correct
276290
// dimension of the result.
277-
auto resultDim = size.index() + (resultRank - type.getRank());
291+
int64_t resultDim = size.index() + (resultRank - type.getRank());
292+
293+
// If the size is statically 1 and we don't know that the result dim is
294+
// statically 1, use an affine constant expression to broadcast.
295+
if (size.value() == 1 && nonStaticOneResultDims.contains(resultDim)) {
296+
exprs.push_back(b.getAffineConstantExpr(0));
297+
continue;
298+
}
278299

279300
// The generated linalg op will now be iterating along the full size
280301
// of this dimension. Record that fact.

test/Conversion/TorchToLinalg/elementwise.mlir

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,3 +118,16 @@ func.func @elementwise_todtype_bf162f16(%arg0: !torch.vtensor<[1,?,32,128],bf16>
118118
%0 = torch.aten.to.dtype %arg0, %int5, %false, %false, %none : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[1,?,32,128],f16>
119119
return %0 : !torch.vtensor<[1,?,32,128],f16>
120120
}
121+
122+
// -----
123+
124+
// CHECK-LABEL: func.func @elementwise_add_non_broadcast_unit_dims(
125+
// CHECK: linalg.generic {indexing_maps = [
126+
// CHECK-SAME: affine_map<(d0, d1) -> (d0, d1)>,
127+
// CHECK-SAME: affine_map<(d0, d1) -> (d1)>,
128+
// CHECK-SAME: affine_map<(d0, d1) -> (d0, d1)>]
129+
func.func @elementwise_add_non_broadcast_unit_dims(%arg0: !torch.vtensor<[6,1],bf16>, %arg1 : !torch.vtensor<[1],bf16>) -> !torch.vtensor<[6,1],bf16> {
130+
%int1_13 = torch.constant.int 1
131+
%11 = torch.aten.add.Tensor %arg0, %arg1, %int1_13 : !torch.vtensor<[6,1],bf16>, !torch.vtensor<[1],bf16>, !torch.int -> !torch.vtensor<[6,1],bf16>
132+
return %11 : !torch.vtensor<[6,1],bf16>
133+
}

0 commit comments

Comments
 (0)