[Tosa] : Use output type for bias for creating tosa.conv (#4252)

sahas3 · web-flow · commit 386bba4a9289 · 2025-07-14T15:57:45.000Z
For a convolution layer initialized without a bias, a zero bias tensor is
created when converting to `tosa.conv2d`, because the op always expects a
bias tensor. This zero tensor was always created as `fp32`, irrespective
of the input/weight element types. This led to a validation error when the
input and weights were `fp16`, since the `bias` type (fp32) did not match
the conv output type (fp16). The zero bias now uses the output element type.
diff --git a/include/torch-mlir/Conversion/TorchToTosa/TosaLegalizeUtils.h b/include/torch-mlir/Conversion/TorchToTosa/TosaLegalizeUtils.h
@@ -101,6 +101,11 @@ LogicalResult getConvOpsAccType(PatternRewriter &rewriter,
                                 RankedTensorType weightTy,
                                 RankedTensorType outputTy, TypeAttr &accType);
 
+FailureOr<Value> getConvBiasForNoneType(Operation *op,
+                                        PatternRewriter &rewriter,
+                                        Type inputElemTy, Type outputElemTy,
+                                        ArrayRef<int64_t> weightShape);
+
 } // namespace tosa
 } // namespace mlir
 
diff --git a/lib/Conversion/TorchToTosa/TorchToTosa.cpp b/lib/Conversion/TorchToTosa/TorchToTosa.cpp
@@ -29,6 +29,8 @@
 #include <optional>
 #include <random>
 
+#include "mlir/Dialect/Tosa/Utils/QuantUtils.h"
+
 using namespace mlir;
 using namespace mlir::torch;
 using namespace mlir::torch::Torch;
@@ -2295,7 +2297,6 @@ LogicalResult ConvertAtenOp<AtenConvolutionOp>::matchAndRewrite(
   auto weightTy = cast<RankedTensorType>(weight.getType());
   auto outputTy =
       cast<RankedTensorType>(getTypeConverter()->convertType(op.getType()));
-
   if (!inputTy || !weightTy || !outputTy)
     return rewriter.notifyMatchFailure(
         op, "Input, weight and output to Convolution must be ranked tensors");
@@ -2304,6 +2305,7 @@ LogicalResult ConvertAtenOp<AtenConvolutionOp>::matchAndRewrite(
   auto weightElemTy = weightTy.getElementType();
   auto inputShape = makeShapeTorchCompatible(inputTy.getShape());
   auto weightShape = makeShapeTorchCompatible(weightTy.getShape());
+  auto outputElemTy = outputTy.getElementType();
 
   if (inputTy.getRank() != 4)
     return rewriter.notifyMatchFailure(
@@ -2316,28 +2318,21 @@ LogicalResult ConvertAtenOp<AtenConvolutionOp>::matchAndRewrite(
   // Bias is optional. TOSA mandates a zero tensor here, so construct one if
   // required.
   auto bias = adaptor.getBias();
-  if (isa<Torch::NoneType>(adaptor.getBias().getType())) {
-    // TBD: This is only valid for quantized 8-bit. For 16-bit, the bias (and
-    // accumulator) are 48-bit and not 32-bit, and requires the use of APInt to
-    // define a 48-bit int.
-    if (isa<quant::QuantizedType>(inputElemTy)) {
-      SmallVector<int32_t> zeroVec(weightShape[0], 0);
-      bias = tosa::getConstTensor<int32_t>(
-                 rewriter, op, zeroVec, {static_cast<int32_t>(weightShape[0])})
-                 .value();
-    } else {
-      SmallVector<float> zeroVec(weightShape[0], 0);
-      bias = tosa::getConstTensor<float>(rewriter, op, zeroVec,
-                                         {static_cast<int32_t>(weightShape[0])})
-                 .value();
-    }
+
+  if (isa<Torch::NoneType>(bias.getType())) {
+    auto bias_result = tosa::getConvBiasForNoneType(op, rewriter, inputElemTy,
+                                                    outputElemTy, weightShape);
+    if (failed(bias_result))
+      return rewriter.notifyMatchFailure(
+          op, "Failed to create bias tensor for none type.");
+    bias = bias_result.value();
   } else {
-    if (!cast<RankedTensorType>(bias.getType()))
+    if (!isa<RankedTensorType>(bias.getType()))
       return rewriter.notifyMatchFailure(
           op, "Bias provided but not a ranked tensor");
   }
-  auto biasElemTy =
-      isa<mlir::FloatType>(inputElemTy) ? inputElemTy : rewriter.getI32Type();
+
+  Type biasElemTy = cast<RankedTensorType>(bias.getType()).getElementType();
 
   int64_t groups;
   if (!matchPattern(op.getGroups(), m_TorchConstantInt(&groups))) {
@@ -2528,14 +2523,29 @@ LogicalResult ConvertAtenOp<AtenConvolutionOp>::matchAndRewrite(
   auto convOpTy =
       RankedTensorType::get(makeShapeLLVMCompatible(outputShape), biasElemTy);
 
+  // create zero-point tensors for input and weight
+  auto zps = tosa::createZPsAsConst(rewriter, input, weight);
+  // for i8 input/weight, zero-points are returned as un-initialized
+  Value inputZp =
+      zps.first
+          ? zps.first
+          : tosa::createZeroPointTensor(rewriter, op->getLoc(), inputElemTy, 0)
+                .value();
+
+  Value weightZp =
+      zps.second
+          ? zps.second
+          : tosa::createZeroPointTensor(rewriter, op->getLoc(), weightElemTy, 0)
+                .value();
+
   Value convOpResult;
   if (groups == 1) {
     // full convolution
     convOpResult =
         rewriter
             .create<tosa::Conv2DOp>(
                 op->getLoc(), getTypeConverter()->convertType(convOpTy),
-                transposedInput, transformedWeight, bias,
+                transposedInput, transformedWeight, bias, inputZp, weightZp,
                 rewriter.getDenseI64ArrayAttr(padding),
                 rewriter.getDenseI64ArrayAttr(stride),
                 rewriter.getDenseI64ArrayAttr(dilation), accType)
@@ -2546,7 +2556,7 @@ LogicalResult ConvertAtenOp<AtenConvolutionOp>::matchAndRewrite(
         rewriter
             .create<tosa::DepthwiseConv2DOp>(
                 op->getLoc(), getTypeConverter()->convertType(convOpTy),
-                transposedInput, transformedWeight, bias,
+                transposedInput, transformedWeight, bias, inputZp, weightZp,
                 rewriter.getDenseI64ArrayAttr(padding),
                 rewriter.getDenseI64ArrayAttr(stride),
                 rewriter.getDenseI64ArrayAttr(dilation), accType)
@@ -2574,8 +2584,12 @@ LogicalResult ConvertAtenOp<AtenConvolutionOp>::matchAndRewrite(
         rewriter, op, transposedOutput, inputTy, weightTy, outputTy);
   }
 
-  rewriter.replaceOpWithNewOp<tensor::CastOp>(
-      op, getTypeConverter()->convertType(op.getType()), rescaledResult);
+  // cast to outputTy is required if convOpTy is not same as outputTy
+  // the difference is not in the shape information, rather the element-type
+  // itself
+  rewriter.replaceOp(
+      op,
+      {tosa::tosaCastTensorToType(rewriter, rescaledResult, outputTy).value()});
 
   return success();
 }
diff --git a/lib/Conversion/TorchToTosa/TosaLegalizeUtils.cpp b/lib/Conversion/TorchToTosa/TosaLegalizeUtils.cpp
@@ -11,6 +11,7 @@
 #include "mlir/Dialect/Tosa/IR/TosaOps.h" // from @llvm-project
 #include "mlir/Dialect/Tosa/Utils/ConversionUtils.h"
 #include "mlir/Dialect/Tosa/Utils/QuantUtils.h" // from @llvm-project
+#include "llvm/ADT/ArrayRef.h"
 
 namespace mlir {
 namespace tosa {
@@ -551,5 +552,48 @@ LogicalResult getConvOpsAccType(PatternRewriter &rewriter,
   return success();
 }
 
+/// Builds the zero bias (one element per output channel, weightShape[0]) that
+/// tosa.conv ops require when torch passes none: i32 for 8-bit quantized convs,
+/// outputElemTy otherwise. Returns failure if it cannot be built.
+FailureOr<Value> getConvBiasForNoneType(Operation *op,
+                                        PatternRewriter &rewriter,
+                                        Type inputElemTy, Type outputElemTy,
+                                        ArrayRef<int64_t> weightShape) {
+  // A dynamic or missing channel count cannot size a constant tensor.
+  if (weightShape.empty() || weightShape[0] < 0)
+    return rewriter.notifyMatchFailure(
+        op, "Output channels must be static to create a zero bias.");
+  const int64_t numChannels = weightShape[0];
+
+  Type biasElemTy = outputElemTy;
+  if (isa<quant::QuantizedType>(outputElemTy)) {
+    auto inputQTy = dyn_cast<mlir::quant::QuantizedType>(inputElemTy);
+    if (!inputQTy)
+      return rewriter.notifyMatchFailure(op,
+                                         "output is qtype but input is not");
+    // TBD: This is only valid for quantized 8-bit. For 16-bit, the bias (and
+    // accumulator) are 48-bit and not 32-bit, and requires the use of APInt
+    // to define a 48-bit int.
+    if (inputQTy.getStorageTypeIntegralWidth() != 8)
+      return rewriter.notifyMatchFailure(
+          op, "Only int8 input tensor to conv2d is supported.");
+    biasElemTy = rewriter.getIntegerType(32);
+  }
+
+  // getConstTensor returns an optional; check it rather than unwrap blindly.
+  std::optional<Value> bias;
+  if (biasElemTy.isInteger()) {
+    SmallVector<int32_t> zeros(numChannels, 0);
+    bias = tosa::getConstTensor<int32_t>(rewriter, op, zeros, {numChannels});
+  } else {
+    SmallVector<float> zeros(numChannels, 0);
+    bias = tosa::getConstTensor<float>(rewriter, op, zeros, {numChannels},
+                                       biasElemTy);
+  }
+  if (!bias)
+    return rewriter.notifyMatchFailure(op, "Failed to create zero bias.");
+  return *bias;
+}
+
 } // namespace tosa
 } // namespace mlir
diff --git a/projects/pt1/e2e_testing/xfail_sets.py b/projects/pt1/e2e_testing/xfail_sets.py
@@ -681,6 +681,7 @@
     "ConstantBoolParameterModule_basic",
     "ContainsIntList_False",
     "ContainsIntList_True",
+    "Conv2dFP16NoBiasModule_basic",
     "Conv2dQInt8Module_basic",
     "Conv2dQInt8Module_depthwise",
     "Conv2dQInt8Module_grouped",
@@ -2874,6 +2875,7 @@
     "Conv2dBiasNoPaddingModule_basic",
     "Conv2dModule_basic",
     "Conv2dNoPaddingModule_basic",
+    "Conv2dFP16NoBiasModule_basic",
     "Conv2dQInt8Module_basic",
     "Conv2dQInt8Module_depthwise",
     "Conv2dQInt8Module_grouped",
diff --git a/projects/pt1/python/torch_mlir_e2e_test/test_suite/conv.py b/projects/pt1/python/torch_mlir_e2e_test/test_suite/conv.py
@@ -1259,6 +1259,36 @@ def Conv2dModule_basic(module, tu: TestUtils):
     module.forward(inputVec, weight, bias)
 
 
+class Conv2dFP16NoBiasModule(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    @export
+    @annotate_args(
+        [
+            None,
+            ([-1, -1, -1, -1], torch.float16, True),
+            ([-1, -1, -1, -1], torch.float16, True),
+        ]
+    )
+    def forward(self, inputVec, weight):
+        return torch.ops.aten.conv2d(
+            inputVec,
+            weight,
+            stride=[1, 1],
+            padding=[0, 0],
+            dilation=[1, 1],
+            groups=1,
+        )
+
+
+@register_test_case(module_factory=lambda: Conv2dFP16NoBiasModule())
+def Conv2dFP16NoBiasModule_basic(module, tu: TestUtils):
+    inputVec = tu.rand(2, 2, 6, 6).to(torch.float16)
+    weight = torch.randn(8, 2, 3, 3).to(torch.float16)
+    module.forward(inputVec, weight)
+
+
 class Conv3dModule(torch.nn.Module):
     def __init__(self):
         super().__init__()
diff --git a/test/Conversion/TorchToTosa/basic.mlir b/test/Conversion/TorchToTosa/basic.mlir