From f5f149d0afb348ba7110f738259b70f6d2681f8b Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Thu, 12 Sep 2024 00:14:32 -0700 Subject: [PATCH 01/15] wip --- inc/zoo/swar/associative_iteration.h | 111 +++++++++++++++++++++++---- 1 file changed, 96 insertions(+), 15 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index 2a0f7e9..d7d9ab9 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -2,6 +2,7 @@ #define ZOO_SWAR_ASSOCIATIVE_ITERATION_H #include "zoo/swar/SWAR.h" +#include //#define ZOO_DEVELOPMENT_DEBUGGING #ifdef ZOO_DEVELOPMENT_DEBUGGING @@ -46,31 +47,27 @@ namespace zoo::swar { template constexpr SWAR parallelSuffix(SWAR input) { using S = SWAR; + auto shiftClearingMask = S{~S::MostSignificantBit}; + auto - shiftClearingMask = S{static_cast(~S::MostSignificantBit)}, doubling = input, result = S{0}; + auto bitsToXOR = NB, power = 1; - #define ZTE(...) - // ZOO_TRACEABLE_EXPRESSION(__VA_ARGS__) for(;;) { - ZTE(doubling); // From the perspective of "associative iteration", this is when we ask whether to "add" - if(1 & bitsToXOR) { - ZTE(result ^ doubling); + if (1 & bitsToXOR) { result = result ^ doubling; - ZTE(doubling.shiftIntraLaneLeft(power, shiftClearingMask)); doubling = doubling.shiftIntraLaneLeft(power, shiftClearingMask); } - ZTE(bitsToXOR >> 1); bitsToXOR >>= 1; - if(!bitsToXOR) { break; } + + if (!bitsToXOR) { break; } auto shifted = doubling.shiftIntraLaneLeft(power, shiftClearingMask); - ZTE(shifted); - ZTE(doubling ^ shifted); + // This is part of the "doubling" step in A. I. // Doubling has several parts, though, the shifting, masking and XOR doubling = doubling ^ shifted; @@ -80,15 +77,31 @@ constexpr SWAR parallelSuffix(SWAR input) { // 000000001...1 shiftClearingMask = shiftClearingMask & - S{static_cast(shiftClearingMask.value() >> power)}; - ZTE(power << 1); + S{shiftClearingMask.value() >> power}; power <<= 1; } - ZTE(input); - #undef ZTE return S{result}; } +static_assert( + parallelSuffix(SWAR<8, u32>{ + 0b00000000'00000000'00000000'00000000}).value() + == 0b00000000'00000000'00000000'00000000 +); + +static_assert( + parallelSuffix(SWAR<8, u32>{ + 0b00000011'00000011'00000111'00000011}).value() + == 0b00000001'00000001'11111101'00000001 +); + +static_assert( + parallelSuffix(SWAR<8, u32>{ + 0b00000000'00000011'00000000'00000011}).value() + == 0b00000000'00000001'00000000'00000001 +); + + /* Binary compress: A fascinating algorithm. @@ -409,6 +422,74 @@ constexpr auto associativeOperatorIterated_regressive( return result; } +constexpr auto log2_of_power_of_two = [](auto power_of_two) { + if (power_of_two == 0) { + return 0; + } + if (power_of_two == 1) { + return 1; + } + return __builtin_ctz(power_of_two); +}; + +static_assert(log2_of_power_of_two(1) == 1); +static_assert(log2_of_power_of_two(32) == 5); +static_assert(log2_of_power_of_two(64) == 6); + +template +constexpr T ps(T x) { + constexpr auto sizeOfType = sizeof(T) * 8; + constexpr auto log2Count = log2_of_power_of_two(sizeOfType); + + auto operation = [](auto input, auto num_shifts) { + auto shifted = input << num_shifts; + return input ^ shifted; + }; + + for (auto i = 0; i < log2Count; ++i) { + x = operation(x, 1 << i); + } + + return x; +} + +static_assert(ps(0b00000000'00000001) == 0b11111111'11111111'11111111'11111111); +static_assert(ps(0b00000000'00000011) == 0b00000000'00000000'00000000'00000001); +static_assert(ps(0b00000000'00000111) == 0b11111111'11111111'11111111'11111101); +static_assert(ps(0b00000000'00001111) == 0b00000000'00000000'00000000'00000101); +static_assert(ps(0b00000000'00011111) == 0b11111111'11111111'11111111'11110101); +static_assert(ps(0b00000000'00111111) == 0b00000000'00000000'00000000'00010101); +static_assert(ps(0b00000000'01111111) == 0b11111111'11111111'11111111'11010101); +static_assert(ps(0b00000000'11111111) == 0b00000000'00000000'00000000'01010101); + +template +constexpr auto parallel_suffix(S input) { + constexpr auto log2Count = S::Lanes; + constexpr auto neutral = S{0}; + + auto operation = [](auto left, auto right, auto counts) { + return left + (right & counts); + }; + + auto halver = [](auto counts) { + return counts >> 1; + }; + + auto count = S{S::MostSignificantBit}; + auto forSquaring = S{S::LeastSignificantBit}; + + + return associativeOperatorIterated_regressive( + input, + neutral, + count, + forSquaring, + operation, + log2Count, halver + ); + +} + template constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( SWAR multiplicand, SWAR multiplier From e52122b84959158dcb1b46b8d9bd8e61adc78314 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Thu, 12 Sep 2024 01:26:28 -0700 Subject: [PATCH 02/15] think this is the best --- inc/zoo/swar/associative_iteration.h | 61 ++++++++++++---------------- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index d7d9ab9..9c708cb 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -42,42 +42,36 @@ std::ostream &operator<<(std::ostream &out, zoo::swar::SWAR s) { namespace zoo::swar { +constexpr auto log2_of_power_of_two = [](auto power_of_two) { + if (power_of_two == 0) { + return 0; + } + if (power_of_two == 1) { + return 1; + } + return __builtin_ctz(power_of_two); +}; + /// \note This code should be substituted by an application of "progressive" algebraic iteration /// \note There is also parallelPrefix (to be implemented) -template -constexpr SWAR parallelSuffix(SWAR input) { - using S = SWAR; +template +constexpr auto parallelSuffix(S input) { + constexpr auto log2Count = log2_of_power_of_two(S::NBits); + constexpr auto operation = [] (auto doubling, auto power, auto mask) { + auto shifted = doubling.shiftIntraLaneLeft(power, mask); + doubling = doubling ^ shifted; + return doubling; + }; + auto result = input; auto shiftClearingMask = S{~S::MostSignificantBit}; - - auto - doubling = input, - result = S{0}; - - auto - bitsToXOR = NB, - power = 1; - + auto power = 1; for(;;) { - // From the perspective of "associative iteration", this is when we ask whether to "add" - if (1 & bitsToXOR) { - result = result ^ doubling; - doubling = doubling.shiftIntraLaneLeft(power, shiftClearingMask); + result = operation(result, power, shiftClearingMask); + if (power >= log2Count) { // this is log2Count only + break; } - bitsToXOR >>= 1; - - if (!bitsToXOR) { break; } - auto shifted = doubling.shiftIntraLaneLeft(power, shiftClearingMask); - - // This is part of the "doubling" step in A. I. - // Doubling has several parts, though, the shifting, masking and XOR - doubling = doubling ^ shifted; - // 01...1 - // 001...1 - // 00001...1 - // 000000001...1 - shiftClearingMask = - shiftClearingMask & - S{shiftClearingMask.value() >> power}; + // I'm pretty sure we need to keep track of both of these... + shiftClearingMask = shiftClearingMask & S{shiftClearingMask.value() >> power}; power <<= 1; } return S{result}; @@ -97,8 +91,8 @@ static_assert( static_assert( parallelSuffix(SWAR<8, u32>{ - 0b00000000'00000011'00000000'00000011}).value() - == 0b00000000'00000001'00000000'00000001 + 0b00011000'00000011'00111000'00000011}).value() + == 0b00001000'00000001'11101000'00000001 ); @@ -478,7 +472,6 @@ constexpr auto parallel_suffix(S input) { auto count = S{S::MostSignificantBit}; auto forSquaring = S{S::LeastSignificantBit}; - return associativeOperatorIterated_regressive( input, neutral, From 79adb65e8e77e18962d93ff6946df0adc7439ebe Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Thu, 12 Sep 2024 01:27:41 -0700 Subject: [PATCH 03/15] clean --- inc/zoo/swar/associative_iteration.h | 68 ---------------------------- 1 file changed, 68 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index 9c708cb..fe0aed7 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -2,7 +2,6 @@ #define ZOO_SWAR_ASSOCIATIVE_ITERATION_H #include "zoo/swar/SWAR.h" -#include //#define ZOO_DEVELOPMENT_DEBUGGING #ifdef ZOO_DEVELOPMENT_DEBUGGING @@ -416,73 +415,6 @@ constexpr auto associativeOperatorIterated_regressive( return result; } -constexpr auto log2_of_power_of_two = [](auto power_of_two) { - if (power_of_two == 0) { - return 0; - } - if (power_of_two == 1) { - return 1; - } - return __builtin_ctz(power_of_two); -}; - -static_assert(log2_of_power_of_two(1) == 1); -static_assert(log2_of_power_of_two(32) == 5); -static_assert(log2_of_power_of_two(64) == 6); - -template -constexpr T ps(T x) { - constexpr auto sizeOfType = sizeof(T) * 8; - constexpr auto log2Count = log2_of_power_of_two(sizeOfType); - - auto operation = [](auto input, auto num_shifts) { - auto shifted = input << num_shifts; - return input ^ shifted; - }; - - for (auto i = 0; i < log2Count; ++i) { - x = operation(x, 1 << i); - } - - return x; -} - -static_assert(ps(0b00000000'00000001) == 0b11111111'11111111'11111111'11111111); -static_assert(ps(0b00000000'00000011) == 0b00000000'00000000'00000000'00000001); -static_assert(ps(0b00000000'00000111) == 0b11111111'11111111'11111111'11111101); -static_assert(ps(0b00000000'00001111) == 0b00000000'00000000'00000000'00000101); -static_assert(ps(0b00000000'00011111) == 0b11111111'11111111'11111111'11110101); -static_assert(ps(0b00000000'00111111) == 0b00000000'00000000'00000000'00010101); -static_assert(ps(0b00000000'01111111) == 0b11111111'11111111'11111111'11010101); -static_assert(ps(0b00000000'11111111) == 0b00000000'00000000'00000000'01010101); - -template -constexpr auto parallel_suffix(S input) { - constexpr auto log2Count = S::Lanes; - constexpr auto neutral = S{0}; - - auto operation = [](auto left, auto right, auto counts) { - return left + (right & counts); - }; - - auto halver = [](auto counts) { - return counts >> 1; - }; - - auto count = S{S::MostSignificantBit}; - auto forSquaring = S{S::LeastSignificantBit}; - - return associativeOperatorIterated_regressive( - input, - neutral, - count, - forSquaring, - operation, - log2Count, halver - ); - -} - template constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( SWAR multiplicand, SWAR multiplier From e17f30443fae1265979303d3f51f0f673264e7ae Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Thu, 12 Sep 2024 01:42:30 -0700 Subject: [PATCH 04/15] fine. --- inc/zoo/swar/associative_iteration.h | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index fe0aed7..3cef3be 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -55,33 +55,46 @@ constexpr auto log2_of_power_of_two = [](auto power_of_two) { /// \note There is also parallelPrefix (to be implemented) template constexpr auto parallelSuffix(S input) { - constexpr auto log2Count = log2_of_power_of_two(S::NBits); constexpr auto operation = [] (auto doubling, auto power, auto mask) { auto shifted = doubling.shiftIntraLaneLeft(power, mask); doubling = doubling ^ shifted; return doubling; }; + auto result = input; auto shiftClearingMask = S{~S::MostSignificantBit}; auto power = 1; + auto log2Count = log2_of_power_of_two(S::NBits) + 1; + for(;;) { result = operation(result, power, shiftClearingMask); - if (power >= log2Count) { // this is log2Count only + if (!--log2Count) { break; } - // I'm pretty sure we need to keep track of both of these... shiftClearingMask = shiftClearingMask & S{shiftClearingMask.value() >> power}; power <<= 1; } return S{result}; } +static_assert( + parallelSuffix(SWAR<16, u32>{ + 0b0000000000000011'0000000000000011}).value() + == 0b0000000000000001'0000000000000001 +); + static_assert( parallelSuffix(SWAR<8, u32>{ 0b00000000'00000000'00000000'00000000}).value() == 0b00000000'00000000'00000000'00000000 ); +static_assert( + parallelSuffix(SWAR<4, u32> { + 0b0011'0110'0011'0000'0110'0011'0011'0011}).value() + == 0b0001'0010'0001'0000'0010'0001'0001'0001 +); + static_assert( parallelSuffix(SWAR<8, u32>{ 0b00000011'00000011'00000111'00000011}).value() @@ -95,6 +108,7 @@ static_assert( ); + /* Binary compress: A fascinating algorithm. From a64f2b52343d1953bdf8ef1a2252f898662e9be9 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Thu, 12 Sep 2024 01:43:22 -0700 Subject: [PATCH 05/15] oops --- inc/zoo/swar/associative_iteration.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index 3cef3be..4a1a91d 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -64,7 +64,7 @@ constexpr auto parallelSuffix(S input) { auto result = input; auto shiftClearingMask = S{~S::MostSignificantBit}; auto power = 1; - auto log2Count = log2_of_power_of_two(S::NBits) + 1; + auto log2Count = log2_of_power_of_two(S::NBits); for(;;) { result = operation(result, power, shiftClearingMask); From 0b5065983b88215b95ebd29e53eed692a89e77dc Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Thu, 12 Sep 2024 01:49:00 -0700 Subject: [PATCH 06/15] tidy up a little? --- inc/zoo/swar/associative_iteration.h | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index 4a1a91d..0e8038d 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -51,8 +51,6 @@ constexpr auto log2_of_power_of_two = [](auto power_of_two) { return __builtin_ctz(power_of_two); }; -/// \note This code should be substituted by an application of "progressive" algebraic iteration -/// \note There is also parallelPrefix (to be implemented) template constexpr auto parallelSuffix(S input) { constexpr auto operation = [] (auto doubling, auto power, auto mask) { @@ -60,18 +58,16 @@ constexpr auto parallelSuffix(S input) { doubling = doubling ^ shifted; return doubling; }; - - auto result = input; - auto shiftClearingMask = S{~S::MostSignificantBit}; - auto power = 1; - auto log2Count = log2_of_power_of_two(S::NBits); - + auto log2Count = log2_of_power_of_two(S::NBits), + power = 1; + auto result = input, + mask = S{~S::MostSignificantBit}; for(;;) { - result = operation(result, power, shiftClearingMask); + result = operation(result, power, mask); if (!--log2Count) { break; } - shiftClearingMask = shiftClearingMask & S{shiftClearingMask.value() >> power}; + mask = mask & S{mask.value() >> power}; power <<= 1; } return S{result}; From d3f989d97d4607838f86e56e2fb46fe6fc128e7e Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Thu, 12 Sep 2024 01:58:45 -0700 Subject: [PATCH 07/15] generalise attempt --- inc/zoo/swar/associative_iteration.h | 47 ++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index 0e8038d..d4751c4 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -51,10 +51,20 @@ constexpr auto log2_of_power_of_two = [](auto power_of_two) { return __builtin_ctz(power_of_two); }; -template -constexpr auto parallelSuffix(S input) { +enum class ParallelXixOperation { + Suffix, + Prefix +}; + +template +constexpr auto parallelXix(S input) { constexpr auto operation = [] (auto doubling, auto power, auto mask) { - auto shifted = doubling.shiftIntraLaneLeft(power, mask); + auto shifted = [&] { + if constexpr(XixType == ParallelXixOperation::Suffix) { + return doubling.shiftIntraLaneLeft(power, mask); + } + return doubling.shiftIntraLaneRight(power, mask); + }(); doubling = doubling ^ shifted; return doubling; }; @@ -73,6 +83,25 @@ constexpr auto parallelSuffix(S input) { return S{result}; } +// not 100% sure this one works yet, need to test +// not sure that the power needs to shift the mask which way, and if the +// mask needs to start with the least significant bit etc. +template +constexpr auto parallelPrefix(S input) { + return parallelXix(input); +} + +template +constexpr auto parallelSuffix(S input) { + return parallelXix(input); +} + +static_assert( + parallelSuffix(SWAR<32, u64>{ + 0b00000000000000110000000000000011'00000000000000000000000000000111}).value() + == 0b00000000000000010000000000000001'11111111111111111111111111111101 +); + static_assert( parallelSuffix(SWAR<16, u32>{ 0b0000000000000011'0000000000000011}).value() @@ -85,12 +114,6 @@ static_assert( == 0b00000000'00000000'00000000'00000000 ); -static_assert( - parallelSuffix(SWAR<4, u32> { - 0b0011'0110'0011'0000'0110'0011'0011'0011}).value() - == 0b0001'0010'0001'0000'0010'0001'0001'0001 -); - static_assert( parallelSuffix(SWAR<8, u32>{ 0b00000011'00000011'00000111'00000011}).value() @@ -103,6 +126,12 @@ static_assert( == 0b00001000'00000001'11101000'00000001 ); +static_assert( + parallelSuffix(SWAR<4, u32> { + 0b0011'0110'0011'0000'0110'0011'0011'0011}).value() + == 0b0001'0010'0001'0000'0010'0001'0001'0001 +); + /* From cc9b7b00980258bf7c5c3ac477a8ee0e82c56051 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Thu, 12 Sep 2024 01:59:41 -0700 Subject: [PATCH 08/15] comment --- inc/zoo/swar/associative_iteration.h | 1 + 1 file changed, 1 insertion(+) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index d4751c4..071fa99 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -41,6 +41,7 @@ std::ostream &operator<<(std::ostream &out, zoo::swar::SWAR s) { namespace zoo::swar { +// i'm sure we must have one of these elsewhere lol constexpr auto log2_of_power_of_two = [](auto power_of_two) { if (power_of_two == 0) { return 0; From 7b6eb0fb02944a629853d8a47c848f7827166a88 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Thu, 12 Sep 2024 02:00:51 -0700 Subject: [PATCH 09/15] nah, that's undefined --- inc/zoo/swar/associative_iteration.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index 071fa99..de6b0e8 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -43,9 +43,6 @@ namespace zoo::swar { // i'm sure we must have one of these elsewhere lol constexpr auto log2_of_power_of_two = [](auto power_of_two) { - if (power_of_two == 0) { - return 0; - } if (power_of_two == 1) { return 1; } From 13a51aeadb2dca316a945cfe803086ba3ee0236d Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Thu, 12 Sep 2024 02:22:58 -0700 Subject: [PATCH 10/15] ok simplify again --- inc/zoo/swar/associative_iteration.h | 51 ++++++++-------------------- 1 file changed, 14 insertions(+), 37 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index de6b0e8..b144fa8 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -41,57 +41,33 @@ std::ostream &operator<<(std::ostream &out, zoo::swar::SWAR s) { namespace zoo::swar { -// i'm sure we must have one of these elsewhere lol constexpr auto log2_of_power_of_two = [](auto power_of_two) { + if (power_of_two == 0) { + return 0; + } if (power_of_two == 1) { return 1; } return __builtin_ctz(power_of_two); }; -enum class ParallelXixOperation { - Suffix, - Prefix -}; -template -constexpr auto parallelXix(S input) { - constexpr auto operation = [] (auto doubling, auto power, auto mask) { - auto shifted = [&] { - if constexpr(XixType == ParallelXixOperation::Suffix) { - return doubling.shiftIntraLaneLeft(power, mask); - } - return doubling.shiftIntraLaneRight(power, mask); - }(); - doubling = doubling ^ shifted; - return doubling; - }; +template +constexpr auto parallelSuffix(S input) { auto log2Count = log2_of_power_of_two(S::NBits), power = 1; + auto result = input, - mask = S{~S::MostSignificantBit}; - for(;;) { - result = operation(result, power, mask); - if (!--log2Count) { - break; - } - mask = mask & S{mask.value() >> power}; + shiftMask = S{~S::MostSignificantBit}; + + for (;;) { + result = result ^ result.shiftIntraLaneLeft(power, shiftMask); + if (!--log2Count) { break; } + shiftMask = shiftMask & S{shiftMask.value() >> power}; power <<= 1; } - return S{result}; -} - -// not 100% sure this one works yet, need to test -// not sure that the power needs to shift the mask which way, and if the -// mask needs to start with the least significant bit etc. -template -constexpr auto parallelPrefix(S input) { - return parallelXix(input); -} -template -constexpr auto parallelSuffix(S input) { - return parallelXix(input); + return S{result}; } static_assert( @@ -100,6 +76,7 @@ static_assert( == 0b00000000000000010000000000000001'11111111111111111111111111111101 ); + static_assert( parallelSuffix(SWAR<16, u32>{ 0b0000000000000011'0000000000000011}).value() From 6bc2bea149d6947ecb6bc447af8a36c877f627a9 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Thu, 12 Sep 2024 02:25:00 -0700 Subject: [PATCH 11/15] lean --- inc/zoo/swar/associative_iteration.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index b144fa8..cf9b533 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -54,11 +54,12 @@ constexpr auto log2_of_power_of_two = [](auto power_of_two) { template constexpr auto parallelSuffix(S input) { - auto log2Count = log2_of_power_of_two(S::NBits), - power = 1; - - auto result = input, - shiftMask = S{~S::MostSignificantBit}; + auto + log2Count = log2_of_power_of_two(S::NBits), + power = 1; + auto + result = input, + shiftMask = S{~S::MostSignificantBit}; for (;;) { result = result ^ result.shiftIntraLaneLeft(power, shiftMask); From fe5b320d6ab782acb7e0d4c5c7f5499e7ad15d79 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Thu, 12 Sep 2024 02:30:21 -0700 Subject: [PATCH 12/15] still undefined --- inc/zoo/swar/associative_iteration.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index cf9b533..d1d16b6 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -42,9 +42,6 @@ std::ostream &operator<<(std::ostream &out, zoo::swar::SWAR s) { namespace zoo::swar { constexpr auto log2_of_power_of_two = [](auto power_of_two) { - if (power_of_two == 0) { - return 0; - } if (power_of_two == 1) { return 1; } From 899bb08baa470b3c6f0e8c3778bc9070e69bed6c Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Thu, 12 Sep 2024 02:31:47 -0700 Subject: [PATCH 13/15] oops --- inc/zoo/swar/associative_iteration.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index d1d16b6..866c494 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -42,13 +42,9 @@ std::ostream &operator<<(std::ostream &out, zoo::swar::SWAR s) { namespace zoo::swar { constexpr auto log2_of_power_of_two = [](auto power_of_two) { - if (power_of_two == 1) { - return 1; - } return __builtin_ctz(power_of_two); }; - template constexpr auto parallelSuffix(S input) { auto From 1dbdd25c55924c3f151c7bf6cde4c1d2527e0e7e Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Thu, 12 Sep 2024 02:34:44 -0700 Subject: [PATCH 14/15] mv asserts --- test/swar/BasicOperations.cpp | 36 +++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index 602384a..b4edd05 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -266,6 +266,42 @@ TEST_CASE("Old multiply version", "[deprecated][swar]") { CHECK(Expected == result.value()); } +static_assert( + parallelSuffix(SWAR<32, u64>{ + 0b00000000000000110000000000000011'00000000000000000000000000000111}).value() + == 0b00000000000000010000000000000001'11111111111111111111111111111101 +); + +static_assert( + parallelSuffix(SWAR<16, u32>{ + 0b0000000000000011'0000000000000011}).value() + == 0b0000000000000001'0000000000000001 +); + +static_assert( + parallelSuffix(SWAR<8, u32>{ + 0b00000000'00000000'00000000'00000000}).value() + == 0b00000000'00000000'00000000'00000000 +); + +static_assert( + parallelSuffix(SWAR<8, u32>{ + 0b00000011'00000011'00000111'00000011}).value() + == 0b00000001'00000001'11111101'00000001 +); + +static_assert( + parallelSuffix(SWAR<8, u32>{ + 0b00011000'00000011'00111000'00000011}).value() + == 0b00001000'00000001'11101000'00000001 +); + +static_assert( + parallelSuffix(SWAR<4, u32> { + 0b0011'0110'0011'0000'0110'0011'0011'0011}).value() + == 0b0001'0010'0001'0000'0010'0001'0001'0001 +); + TEST_CASE("Parity", "[swar]") { // For each nibble, E indicates (E)ven and O (O)dd parities // EEOEEOOO From 505f9dc9e9d123e633e10658f1b90406aa40b605 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Thu, 12 Sep 2024 02:35:03 -0700 Subject: [PATCH 15/15] rm asserts --- inc/zoo/swar/associative_iteration.h | 39 ---------------------------- 1 file changed, 39 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index 866c494..e67bd09 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -64,45 +64,6 @@ constexpr auto parallelSuffix(S input) { return S{result}; } -static_assert( - parallelSuffix(SWAR<32, u64>{ - 0b00000000000000110000000000000011'00000000000000000000000000000111}).value() - == 0b00000000000000010000000000000001'11111111111111111111111111111101 -); - - -static_assert( - parallelSuffix(SWAR<16, u32>{ - 0b0000000000000011'0000000000000011}).value() - == 0b0000000000000001'0000000000000001 -); - -static_assert( - parallelSuffix(SWAR<8, u32>{ - 0b00000000'00000000'00000000'00000000}).value() - == 0b00000000'00000000'00000000'00000000 -); - -static_assert( - parallelSuffix(SWAR<8, u32>{ - 0b00000011'00000011'00000111'00000011}).value() - == 0b00000001'00000001'11111101'00000001 -); - -static_assert( - parallelSuffix(SWAR<8, u32>{ - 0b00011000'00000011'00111000'00000011}).value() - == 0b00001000'00000001'11101000'00000001 -); - -static_assert( - parallelSuffix(SWAR<4, u32> { - 0b0011'0110'0011'0000'0110'0011'0011'0011}).value() - == 0b0001'0010'0001'0000'0010'0001'0001'0001 -); - - - /* Binary compress: A fascinating algorithm.