namespace zoo::swar {

/// \brief Base-2 logarithm of a power of two, computed as its count of trailing zero bits
///
/// \param power_of_two value with exactly one bit set; for any other value the
/// result is the position of the lowest set bit, not a logarithm
/// \return what the GCC/Clang builtin `__builtin_ctz` returns for the argument
/// \note `__builtin_ctz` takes an `unsigned int` and its result is undefined for 0
/// \note declared `inline` so that this header-level variable is one entity in
/// every translation unit that includes the header
/// \note parallelSuffix does not depend on this helper
inline constexpr auto log2_of_power_of_two = [](auto power_of_two) {
    return __builtin_ctz(power_of_two);
};

/// \brief Lane-wise inclusive XOR scan that starts at the least significant bit of each lane
///
/// Bit k of every lane of the result is the XOR (the parity) of bits 0..k of
/// the same lane of \c input.  Examples with 8-bit lanes:
/// \code
///     0b00000011 -> 0b00000001
///     0b00000111 -> 0b11111101
///     0b00111000 -> 0b11101000
/// \endcode
/// and with 4-bit lanes: 0b0110 -> 0b0010, 0b0011 -> 0b0001.
///
/// The scan is computed by doubling: after the step that shifts by \c power
/// every bit holds the XOR of the (2 * power) input bits at and below it, so
/// the loop runs until the covered width reaches the lane width.  Because the
/// loop is bounded by the lane width itself, lane widths that are not a power
/// of two are handled, and 1-bit lanes return \c input unchanged.
///
/// \tparam S a SWAR-like type that provides `NBits`, `MostSignificantBit`,
/// `value()`, `shiftIntraLaneLeft(int, S)`, `operator^`, `operator&` and
/// brace-construction from the type returned by `value()`
/// \param input the lanes to scan
/// \return an \c S with the scanned lanes
template<typename S>
constexpr auto parallelSuffix(S input) {
    // Integer type behind S.  The casts below undo integral promotion, which
    // would otherwise make the brace-initializations narrowing conversions
    // when that type is smaller than int.
    using Base = decltype(input.value());

    auto result = input;
    // Handed to shiftIntraLaneLeft with every shift: it has the top `power`
    // bits of each lane cleared (01...1, then 001...1, then 00001...1, ...),
    // presumably so that shifted bits can not spill into the neighboring
    // lane -- confirm against SWAR::shiftIntraLaneLeft.
    auto shiftMask = S{static_cast<Base>(~S::MostSignificantBit)};

    for (int power = 1; power < static_cast<int>(S::NBits); power <<= 1) {
        result = result ^ result.shiftIntraLaneLeft(power, shiftMask);
        // The next step shifts by twice as much, clear twice as many top bits
        shiftMask =
            shiftMask & S{static_cast<Base>(shiftMask.value() >> power)};
    }
    return result;
}

} // namespace zoo::swar