thecppzoo · jamierpond · Jan 13, 2025 · Jan 13, 2025 · Jan 13, 2025 · Jan 13, 2025
diff --git a/inc/zoo/swar/SWAR.h b/inc/zoo/swar/SWAR.h
@@ -70,6 +70,8 @@ constexpr __uint128_t lsbIndex(__uint128_t v) noexcept {
 }
 #endif
 
+
+
 /// Core abstraction around SIMD Within A Register (SWAR).  Specifies 'lanes'
 /// of NBits width against a type T, and provides an abstraction for performing
 /// SIMD operations against that primitive type T treated as a SIMD register.
@@ -108,6 +110,17 @@ struct SWAR {
         return result;
     }
 
+    /// Mask of the even-indexed lanes, lane 0 being the least significant.
+    constexpr static auto evenLaneMask() noexcept {
+        static_assert(0 == Lanes % 2, "Only even number of elements supported");
+        using D = SWAR<NBits * 2, T>; // one lane of D spans two lanes of this SWAR
+        // (x << NBits) - x widens the low bit of each D lane into NBits ones
+        return SWAR{(D::LeastSignificantBit << NBits) - D::LeastSignificantBit};
+    }
+
+    constexpr static auto oddLaneMask() noexcept { // complement of evenLaneMask()
+        return SWAR{static_cast<T>(~evenLaneMask().value())}; // cast: back to T after ~
+    }
+
     template <typename Range>
     constexpr static auto from(const Range &values) noexcept {
         using std::begin; using std::end;
@@ -595,4 +608,7 @@ static_assert(
     0x0706050403020100ull
 );
 
+static_assert(0x0F0F == SWAR<4, u16>::evenLaneMask().value()); // lanes 0 and 2
+static_assert(0xF0F0 == SWAR<4, u16>::oddLaneMask().value());  // lanes 1 and 3
+
 }}
diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h
@@ -475,14 +475,6 @@ struct SWAR_Pair{
     SWAR<NB, T> even, odd;
 };
 
-template<int NB, typename T>
-constexpr SWAR<NB, T> doublingMask() {
-    using S = SWAR<NB, T>;
-    static_assert(0 == S::Lanes % 2, "Only even number of elements supported");
-    using D = SWAR<NB * 2, T>;
-    return S{(D::LeastSignificantBit << NB) - D::LeastSignificantBit};
-}
-
 template<int NB, typename T>
 constexpr auto doublePrecision(SWAR<NB, T> input) {
     using S = SWAR<NB, T>;
@@ -491,7 +483,7 @@ constexpr auto doublePrecision(SWAR<NB, T> input) {
         "Precision can only be doubled for SWARs of even element count"
     );
     using RV = SWAR<NB * 2, T>;
-    constexpr auto DM = doublingMask<NB, T>();
+    constexpr auto DM = SWAR<NB, T>::evenLaneMask();
     return SWAR_Pair<NB * 2, T>{
         RV{(input & DM).value()},
         RV{(input.value() >> NB) & DM.value()}
@@ -503,13 +495,66 @@ constexpr auto halvePrecision(SWAR<NB, T> even, SWAR<NB, T> odd) {
     using S = SWAR<NB, T>;
     static_assert(0 == NB % 2, "Only even lane-bitcounts supported");
     using RV = SWAR<NB/2, T>;
-    constexpr auto HalvingMask = doublingMask<NB/2, T>();
+    constexpr auto HalvingMask = SWAR<NB/2, T>::evenLaneMask();
     auto
         evenHalf = RV{even.value()} & HalvingMask,
         oddHalf = RV{(RV{odd.value()} & HalvingMask).value() << NB/2};
     return evenHalf | oddHalf;
 }
 
+
+/// Lane-wise product split into its low NB bits and the bits that overflowed.
+template <int NB, typename T> struct MultiplicationResult {
+    SWAR<NB, T> result, overflow;
+};
+
+/// Multiplies lane-wise without losing bits: each NB-bit lane of the inputs
+/// yields a 2*NB-bit product whose low half goes to .result and whose high
+/// half goes to .overflow.  Needs an even lane count (see doublePrecision).
+template <int NB, typename T>
+constexpr MultiplicationResult<NB, T>
+fullMultiplication(SWAR<NB, T> multiplicand, SWAR<NB, T> multiplier) {
+    using S = SWAR<NB, T>; using D = SWAR<NB * 2, T>;
+    // Widen first, so that every product has a double-width lane of its own
+    auto [lhsEven, lhsOdd] = doublePrecision(multiplicand);
+    auto [rhsEven, rhsOdd] = doublePrecision(multiplier);
+    auto productEven = multiplication_OverflowUnsafe(lhsEven, rhsEven);
+    auto productOdd = multiplication_OverflowUnsafe(lhsOdd, rhsOdd);
+    // The odd lanes of S are exactly the upper halves of the lanes of D
+    constexpr auto UpperHalves = S::oddLaneMask().value();
+    auto highEven = D{(productEven.value() & UpperHalves) >> NB};
+    auto highOdd = D{(productOdd.value() & UpperHalves) >> NB};
+    return {
+        halvePrecision(productEven, productOdd), // low halves, re-interleaved
+        halvePrecision(highEven, highOdd)        // high halves, re-interleaved
+    };
+}
+
+// Compile-time checks of fullMultiplication on 4-bit lanes, kept in their own
+// namespace so that the short alias S does not leak into every includer.
+namespace fullMultiplicationChecks {
+using S = SWAR<4, u32>;
+static_assert(S::oddLaneMask().value() == 0xF0F0'F0F0);
+static_assert(S::evenLaneMask().value() == 0x0F0F'0F0F);
+// Low halves: 9 * 9 == 0x51, 3 * 7 == 0x15
+static_assert(fullMultiplication(S{0x0009'0000}, S{0x0009'0000})
+                  .result.value() == 0x0001'0000);
+static_assert(fullMultiplication(S{0x0003'0000}, S{0x0007'0000})
+                  .result.value() == 0x0005'0000);
+// High halves: 2 * 8 == 0x10, 8 * 8 == 0x40, 1 * 8 == 0x08
+static_assert(fullMultiplication(S{0x0002'0000}, S{0x0008'0000})
+                  .overflow.value() == 0x0001'0000);
+static_assert(fullMultiplication(S{0x0008'0000}, S{0x0008'0000})
+                  .overflow.value() == 0x0004'0000);
+static_assert(fullMultiplication(S{0x0001'0000}, S{0x0008'0000})
+                  .overflow.value() == 0x0000'0000);
+// Lanes are independent: 8 * 7 == 0x38, 1 * 3 == 0x03, 2 * 2 == 0x04
+static_assert(fullMultiplication(S{0x0008'0012}, S{0x0007'0032})
+                  .result.value() == 0x0008'0034);
+static_assert(fullMultiplication(S{0x0008'0012}, S{0x0007'0032})
+                  .overflow.value() == 0x0003'0000);
+}
+
 }
 
 #endif
diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp
@@ -211,7 +211,7 @@ static_assert(BooleanSWAR{Literals<4, u16>,
 namespace Multiplication {
 
 static_assert(~int64_t(0) == negate(S4_64{S4_64::LeastSignificantBit}).value());
-static_assert(0x0F0F0F0F == doublingMask<4, uint32_t>().value());
+static_assert(0x0F0F0F0F == SWAR<4, uint32_t>::evenLaneMask().value());
 
 constexpr auto PrecisionFixtureTest = 0x89ABCDEF;
 constexpr auto Doubled =