From 484e443b73854abeb52038c7fca882f7ce0fdea4 Mon Sep 17 00:00:00 2001 From: evoskuil Date: Wed, 4 Dec 2024 20:23:55 -0500 Subject: [PATCH 1/9] Style. --- include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp index ee9abbe756..a174a1064b 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp @@ -228,7 +228,7 @@ iterate_vector(state_t& state, const ablocks_t& blocks) NOEXCEPT { if (blocks.size() >= min_lanes) { - auto iblocks = iblocks_t{ array_cast(blocks) }; + iblocks_t iblocks{ array_cast(blocks) }; iterate_vector(state, iblocks); } else From 80451c7c23ebabce23d9e9829422d4bedce82551 Mon Sep 17 00:00:00 2001 From: evoskuil Date: Wed, 4 Dec 2024 20:33:25 -0500 Subject: [PATCH 2/9] Add default element count (1) to iterable template methods. --- include/bitcoin/system/data/iterable.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/bitcoin/system/data/iterable.hpp b/include/bitcoin/system/data/iterable.hpp index 3a2daeaace..08c1b2d924 100644 --- a/include/bitcoin/system/data/iterable.hpp +++ b/include/bitcoin/system/data/iterable.hpp @@ -175,7 +175,7 @@ class iterable return begin_; } - template + template inline iterable& advance() NOEXCEPT { // This is safe for overflow, will advance to end. @@ -185,7 +185,7 @@ class iterable return *this; } - template + template inline const std_array& to_array() const NOEXCEPT { return unsafe_array_cast(begin_); From 7fb2b154c203b964b225d8350ad643ad00af85c6 Mon Sep 17 00:00:00 2001 From: evoskuil Date: Thu, 5 Dec 2024 09:55:56 -0500 Subject: [PATCH 3/9] Replace buffered shani with rotating. 
--- include/bitcoin/system/hash/sha/algorithm.hpp | 34 +++- .../impl/hash/sha/algorithm_compress.ipp | 10 +- .../impl/hash/sha/algorithm_iterate.ipp | 37 ++++- .../system/impl/hash/sha/algorithm_native.ipp | 157 +++++++++++++++++- .../impl/hash/sha/algorithm_schedule.ipp | 10 +- .../system/impl/hash/sha/algorithm_sigma.ipp | 36 ++-- 6 files changed, 242 insertions(+), 42 deletions(-) diff --git a/include/bitcoin/system/hash/sha/algorithm.hpp b/include/bitcoin/system/hash/sha/algorithm.hpp index 6201230bcf..b8683277d0 100644 --- a/include/bitcoin/system/hash/sha/algorithm.hpp +++ b/include/bitcoin/system/hash/sha/algorithm.hpp @@ -281,6 +281,11 @@ class algorithm INLINE static void iterate_vector(state_t& state, iblocks_t& blocks) NOEXCEPT; + template + INLINE static void iterate_native(state_t& state, + const ablocks_t& blocks) NOEXCEPT; + INLINE static void iterate_native(state_t& state, iblocks_t& blocks) NOEXCEPT; + template INLINE static constexpr void iterate_(state_t& state, const ablocks_t& blocks) NOEXCEPT; @@ -317,7 +322,8 @@ class algorithm const xstate_t& xstate) NOEXCEPT; template = true> - INLINE static void merkle_hash_vector(idigests_t& digests, iblocks_t& blocks) NOEXCEPT; + INLINE static void merkle_hash_vector(idigests_t& digests, + iblocks_t& blocks) NOEXCEPT; INLINE static void merkle_hash_vector(digests_t& digests) NOEXCEPT; VCONSTEXPR static void merkle_hash_(digests_t& digests, size_t offset=zero) NOEXCEPT; @@ -330,10 +336,10 @@ class algorithm auto x6, auto x7, auto x8) NOEXCEPT; template - INLINE static void prepare1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT; + INLINE static void prepare_1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT; template - INLINE static void prepare8(buffer_t& buffer) NOEXCEPT; + INLINE static void prepare_8(buffer_t& buffer) NOEXCEPT; template INLINE static void schedule_sigma(xbuffer_t& xbuffer) NOEXCEPT; @@ -357,6 +363,8 @@ class algorithm /// Native SHA optimizations (single blocks). 
/// ----------------------------------------------------------------------- +#if defined(DISABLED) + template INLINE static void prepare_native(wbuffer_t& wbuffer) NOEXCEPT; static void schedule_native(wbuffer_t& wbuffer) NOEXCEPT; @@ -390,12 +398,28 @@ class algorithm INLINE static void compress_native(state_t& state, const buffer_t& buffer) NOEXCEPT; +#else // DISABLED + + INLINE static void shuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT; + INLINE static void unshuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT; + INLINE static void prepare(xint128_t& message0, xint128_t message1) NOEXCEPT; + INLINE static void prepare(xint128_t& message0, xint128_t message1, + xint128_t& message2) NOEXCEPT; + + template + INLINE static void round_4(xint128_t& state0, xint128_t& state1, + xint128_t message) NOEXCEPT; + + static void native_rounds(state_t& state, iblocks_t& blocks) NOEXCEPT; + +#endif // DISABLED + public: /// Summary public values. /// ----------------------------------------------------------------------- static constexpr auto caching = Cached; - static constexpr auto native = (use_shani || use_neon) && - !is_same_size; + static constexpr auto native = (use_shani || use_neon) + && (SHA::strength == 256 || SHA::strength == 160); static constexpr auto vector = (use_x128 || use_x256 || use_x512) && !(build_x32 && is_same_size); }; diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp index f9da36fd0f..32093848b3 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp @@ -276,11 +276,11 @@ compress(state_t& state, const buffer_t& buffer) NOEXCEPT { compress_(state, buffer); } - else if constexpr (native) - { - // Single block shani compression optimization. - compress_native(state, buffer); - } + ////else if constexpr (native) + ////{ + //// // Single block shani compression optimization. 
+ //// compress_native(state, buffer); + ////} ////else if constexpr (vector) ////{ //// // Compression is not vectorized within a block, however this is diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp index a174a1064b..bb51e8e9ce 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp @@ -237,6 +237,31 @@ iterate_vector(state_t& state, const ablocks_t& blocks) NOEXCEPT } } +// Native SHA +// ============================================================================ +// www.intel.com/content/dam/develop/external/us/en/documents/ +// intel-sha-extensions-white-paper-402097.pdf + +TEMPLATE +INLINE void CLASS:: +iterate_native(state_t& state, iblocks_t& blocks) NOEXCEPT +{ + native_rounds(state, blocks); +} + +TEMPLATE +template +INLINE void CLASS:: +iterate_native(state_t& state, const ablocks_t& blocks) NOEXCEPT +{ + iblocks_t iblocks{ array_cast(blocks) }; + native_rounds(state, iblocks); +} + +// Dispatch and normal forms. +// ============================================================================ +// protected + TEMPLATE template INLINE constexpr void CLASS:: @@ -273,11 +298,9 @@ iterate(state_t& state, const ablocks_t& blocks) NOEXCEPT { iterate_(state, blocks); } - else if constexpr (native) + else if constexpr (native && SHA::strength == 256) { - // TODO: evaluate 4/8/16 lane message scheduling vs. shani scheduling. - // Multiple block shani message schduling and compression optimization. - iterate_(state, blocks); + iterate_native(state, blocks); } else if constexpr (vector) { @@ -294,11 +317,9 @@ TEMPLATE INLINE void CLASS:: iterate(state_t& state, iblocks_t& blocks) NOEXCEPT { - if constexpr (native) + if constexpr (native && SHA::strength == 256) { - // TODO: evaluate 4/8/16 lane message scheduling vs. shani scheduling. 
- // Multiple block shani message schduling and compression optimization. - iterate_(state, blocks); + iterate_native(state, blocks); } else if constexpr (vector) { diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp index b09941ea35..0d5232c166 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp @@ -35,7 +35,9 @@ namespace libbitcoin { namespace system { namespace sha { - + +#if defined(DISABLED) + // schedule // ---------------------------------------------------------------------------- // protected @@ -302,6 +304,159 @@ compress_native(state_t& state, const buffer_t& buffer) NOEXCEPT } } +#else // DISABLED + +// rotating variables (no buffer) +// ---------------------------------------------------------------------------- +// protected + +// The iterative method is used for sha native as it is an order of magnitude +// more efficient and cannot benefit from vectorization. 
+ +TEMPLATE +INLINE void CLASS:: +shuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT +{ + const auto shuffle0 = mm_shuffle_epi32(state0, 0xb1); + const auto shuffle1 = mm_shuffle_epi32(state1, 0x1b); + state0 = mm_alignr_epi8(shuffle0, shuffle1, 0x08); + state1 = mm_blend_epi16(shuffle1, shuffle0, 0xf0); +} + +TEMPLATE +INLINE void CLASS:: +unshuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT +{ + const auto shuffle0 = mm_shuffle_epi32(state0, 0x1b); + const auto shuffle1 = mm_shuffle_epi32(state1, 0xb1); + state0 = mm_blend_epi16(shuffle0, shuffle1, 0xf0); + state1 = mm_alignr_epi8(shuffle1, shuffle0, 0x08); +} + +TEMPLATE +INLINE void CLASS:: +prepare(xint128_t& message0, xint128_t message1) NOEXCEPT +{ + message0 = mm_sha256msg1_epu32(message0, message1); +} + +TEMPLATE +INLINE void CLASS:: +prepare(xint128_t& message0, xint128_t message1, xint128_t& message2) NOEXCEPT +{ + message2 = mm_sha256msg2_epu32(f::add(message2, + mm_alignr_epi8(message1, message0, 4)), message1); +} + +TEMPLATE +template +INLINE void CLASS:: +round_4(xint128_t& state0, xint128_t& state1, xint128_t message) NOEXCEPT +{ + // TODO: evaluate static setter (local to Round). + ////static const auto k = set( + //// K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3]); + + constexpr auto r = Round * 4; + const auto wk = f::add(message, set( + K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3])); + + state1 = mm_sha256rnds2_epu32(state1, state0, wk); + state0 = mm_sha256rnds2_epu32(state0, state1, mm_shuffle_epi32(wk, 0x0e)); +} + +TEMPLATE +void CLASS:: +native_rounds(state_t& state, iblocks_t& blocks) NOEXCEPT +{ + // Individual state vars are used vs. array to ensure register persistence. + auto& wstate = array_cast(state); + auto lo = load(wstate[0]); + auto hi = load(wstate[1]); + + // shufle organizes state as expected by sha256rnds2. 
+ shuffle(lo, hi); + + while (!blocks.empty()) + { + const auto start_lo = lo; + const auto start_hi = hi; + auto& block = array_cast(blocks.to_array()); + + auto message0 = byteswap(block[0]); + round_4<0>(lo, hi, message0); + + auto message1 = byteswap(block[1]); + round_4<1>(lo, hi, message1); + + prepare(message0, message1); + auto message2 = byteswap(block[2]); + round_4<2>(lo, hi, message2); + + prepare(message1, message2); + auto message3 = byteswap(block[3]); + round_4<3>(lo, hi, message3); + + prepare(message2, message3, message0); + prepare(message2, message3); + round_4<4>(lo, hi, message0); + + prepare(message3, message0, message1); + prepare(message3, message0); + round_4<5>(lo, hi, message1); + + prepare(message0, message1, message2); + prepare(message0, message1); + round_4<6>(lo, hi, message2); + + prepare(message1, message2, message3); + prepare(message1, message2); + round_4<7>(lo, hi, message3); + + prepare(message2, message3, message0); + prepare(message2, message3); + round_4<8>(lo, hi, message0); + + prepare(message3, message0, message1); + prepare(message3, message0); + round_4<9>(lo, hi, message1); + + prepare(message0, message1, message2); + prepare(message0, message1); + round_4<10>(lo, hi, message2); + + prepare(message1, message2, message3); + prepare(message1, message2); + round_4<11>(lo, hi, message3); + + prepare(message2, message3, message0); + prepare(message2, message3); + round_4<12>(lo, hi, message0); + + prepare(message3, message0, message1); + prepare(message3, message0); + round_4<13>(lo, hi, message1); + + prepare(message0, message1, message2); + round_4<14>(lo, hi, message2); + + prepare(message1, message2, message3); + round_4<15>(lo, hi, message3); + + lo = f::add(lo, start_lo); + hi = f::add(hi, start_hi); + blocks.advance(); + } + + // unshuffle restores state to normal form. 
+ unshuffle(lo, hi); + + store(wstate[0], lo); + store(wstate[1], hi); +} + +#endif // DISABLED + } // namespace sha } // namespace system } // namespace libbitcoin diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp index f2796d4b5c..b65704e27b 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp @@ -138,11 +138,11 @@ schedule(buffer_t& buffer) NOEXCEPT { schedule_(buffer); } - else if constexpr (native) - { - // Single block (with shani) message scheduling optimization. - schedule_native(buffer); - } + ////else if constexpr (native) + ////{ + //// // Single block (with shani) message scheduling optimization. + //// schedule_native(buffer); + ////} else if constexpr (vector) { // Single block (without shani) message scheduling optimization. diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp index 866fac88b8..9e95f50323 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp @@ -41,7 +41,7 @@ sigma0_8(auto x1, auto x2, auto x3, auto x4, auto x5, auto x6, auto x7, TEMPLATE template INLINE void CLASS:: -prepare1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT +prepare_1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT { static_assert(Round >= 16); constexpr auto r02 = Round - 2; @@ -58,7 +58,7 @@ prepare1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT TEMPLATE template INLINE void CLASS:: -prepare8(buffer_t& buffer) NOEXCEPT +prepare_8(buffer_t& buffer) NOEXCEPT { // Requires avx512 for sha512 and avx2 for sha256. // The simplicity of sha160 message prepare precludes this optimization. 
@@ -73,14 +73,14 @@ prepare8(buffer_t& buffer) NOEXCEPT buffer[r15 + 0], buffer[r15 + 1], buffer[r15 + 2], buffer[r15 + 3], buffer[r15 + 4], buffer[r15 + 5], buffer[r15 + 6], buffer[r15 + 7]); - prepare1(buffer, xsigma0); - prepare1(buffer, xsigma0); - prepare1(buffer, xsigma0); - prepare1(buffer, xsigma0); - prepare1(buffer, xsigma0); - prepare1(buffer, xsigma0); - prepare1(buffer, xsigma0); - prepare1(buffer, xsigma0); + prepare_1(buffer, xsigma0); + prepare_1(buffer, xsigma0); + prepare_1(buffer, xsigma0); + prepare_1(buffer, xsigma0); + prepare_1(buffer, xsigma0); + prepare_1(buffer, xsigma0); + prepare_1(buffer, xsigma0); + prepare_1(buffer, xsigma0); } TEMPLATE @@ -98,17 +98,17 @@ schedule_sigma(buffer_t& buffer) NOEXCEPT { if constexpr (SHA::strength != 160 && have_lanes) { - prepare8<16>(buffer); - prepare8<24>(buffer); - prepare8<32>(buffer); - prepare8<40>(buffer); - prepare8<48>(buffer); - prepare8<56>(buffer); + prepare_8<16>(buffer); + prepare_8<24>(buffer); + prepare_8<32>(buffer); + prepare_8<40>(buffer); + prepare_8<48>(buffer); + prepare_8<56>(buffer); if constexpr (SHA::rounds == 80) { - prepare8<64>(buffer); - prepare8<72>(buffer); + prepare_8<64>(buffer); + prepare_8<72>(buffer); } konstant(buffer); From 0030b7264045e98ad3b95c663fd572e5092cc9fd Mon Sep 17 00:00:00 2001 From: evoskuil Date: Thu, 5 Dec 2024 09:56:03 -0500 Subject: [PATCH 4/9] Comments. 
--- include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp index d0ab65e166..72f3853beb 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp @@ -421,6 +421,10 @@ merkle_hash_vector(digests_t& digests) NOEXCEPT // ---------------------------------------------------------------------------- // public +// TODO: consider eliminating endianness conversions internal to the root +// computation, instead converting on way in and way out only, and using non +// converting input/output (nop) functions. + TEMPLATE VCONSTEXPR typename CLASS::digest_t CLASS:: merkle_root(digests_t&& digests) NOEXCEPT From 2dc7bf22c4e37799af063bd75f22167016d26788 Mon Sep 17 00:00:00 2001 From: evoskuil Date: Thu, 5 Dec 2024 11:53:04 -0500 Subject: [PATCH 5/9] Style, delint, use add() for vector only adds. --- include/bitcoin/system/hash/sha/algorithm.hpp | 39 --- .../impl/hash/sha/algorithm_konstant.ipp | 7 +- .../system/impl/hash/sha/algorithm_native.ipp | 313 +----------------- .../system/impl/hash/sha/algorithm_sigma.ipp | 7 +- .../system/intrinsics/xcpu/defines.hpp | 53 +-- .../system/intrinsics/xcpu/functional_128.hpp | 90 +++-- .../system/intrinsics/xcpu/functional_256.hpp | 63 ++-- .../system/intrinsics/xcpu/functional_512.hpp | 55 +-- 8 files changed, 189 insertions(+), 438 deletions(-) diff --git a/include/bitcoin/system/hash/sha/algorithm.hpp b/include/bitcoin/system/hash/sha/algorithm.hpp index b8683277d0..5b25bfac1c 100644 --- a/include/bitcoin/system/hash/sha/algorithm.hpp +++ b/include/bitcoin/system/hash/sha/algorithm.hpp @@ -363,43 +363,6 @@ class algorithm /// Native SHA optimizations (single blocks). 
/// ----------------------------------------------------------------------- -#if defined(DISABLED) - - template - INLINE static void prepare_native(wbuffer_t& wbuffer) NOEXCEPT; - static void schedule_native(wbuffer_t& wbuffer) NOEXCEPT; - - template - INLINE static void schedule_native(xbuffer_t& xbuffer) NOEXCEPT; - INLINE static void schedule_native(buffer_t& buffer) NOEXCEPT; - - template - INLINE static void round_native(wstate_t& state, - const wbuffer_t& wk) NOEXCEPT; - - INLINE static void shuffle(wstate_t& wstate) NOEXCEPT; - INLINE static void unshuffle(wstate_t& wstate) NOEXCEPT; - INLINE static void summarize_native(wstate_t& out, - const wstate_t& in) NOEXCEPT; - - template - static void compress_native(wstate_t& state, - const wbuffer_t& wbuffer) NOEXCEPT; - - template - INLINE static void compress_native(xstate_t& xstate, - const xbuffer_t& xbuffer) NOEXCEPT; - - template - INLINE static void compress_native(state_t& state, - const xbuffer_t& xbuffer) NOEXCEPT; - - template - INLINE static void compress_native(state_t& state, - const buffer_t& buffer) NOEXCEPT; - -#else // DISABLED - INLINE static void shuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT; INLINE static void unshuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT; INLINE static void prepare(xint128_t& message0, xint128_t message1) NOEXCEPT; @@ -412,8 +375,6 @@ class algorithm static void native_rounds(state_t& state, iblocks_t& blocks) NOEXCEPT; -#endif // DISABLED - public: /// Summary public values. 
/// ----------------------------------------------------------------------- diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp index e62127a10c..26cf225d14 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp @@ -50,13 +50,12 @@ template INLINE void CLASS:: vector_konstant(wbuffer_t& wbuffer) NOEXCEPT { - constexpr auto s = SHA::word_bits; constexpr auto lanes = capacity; constexpr auto r = Round * lanes; if constexpr (lanes == 16) { - wbuffer[Round] = f::add(wbuffer[Round], set( + wbuffer[Round] = add(wbuffer[Round], set( K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3], K::get[r + 4], K::get[r + 5], K::get[r + 6], K::get[r + 7], K::get[r + 8], K::get[r + 9], K::get[r + 10], K::get[r + 11], @@ -64,13 +63,13 @@ vector_konstant(wbuffer_t& wbuffer) NOEXCEPT } else if constexpr (lanes == 8) { - wbuffer[Round] = f::add(wbuffer[Round], set( + wbuffer[Round] = add(wbuffer[Round], set( K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3], K::get[r + 4], K::get[r + 5], K::get[r + 6], K::get[r + 7])); } else if constexpr (lanes == 4) { - wbuffer[Round] = f::add(wbuffer[Round], set( + wbuffer[Round] = add(wbuffer[Round], set( K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3])); } } diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp index 0d5232c166..1c0f424001 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp @@ -21,298 +21,19 @@ // Native (SHA-NI or NEON) // ============================================================================ -// Native does not change the buffer size (not expanded), just its "shape". -// Four words are buffered into one xint128_t, resulting in 1/4 the buffer -// array size and number of rounds. 
Four state words are packed into each of -// two state variables. This applies to sha160 and sha256, but sha512 native -// is not supported. - -// The base buffer is already populated with proper endianness. -// Input could be optimized using intrinsics (see comments in parse). -// The unextended state vector is already output with proper endianness. -// Output could also be optimized using intrinsics (see comments in parse). +// The iterative method is used for sha native as it is an order of magnitude +// more efficient and cannot benefit from vectorization. namespace libbitcoin { namespace system { namespace sha { -#if defined(DISABLED) +// TODO: intel sha160, arm sha160, arm sha256 -// schedule +// intel sha256 // ---------------------------------------------------------------------------- // protected -TEMPLATE -template -INLINE void CLASS:: -prepare_native(wbuffer_t& wbuffer) NOEXCEPT -{ - if constexpr (SHA::strength == 160) - { - if constexpr (use_neon) - { - } - else if constexpr (use_shani) - { - } - } - else if constexpr (SHA::strength == 256) - { - if constexpr (use_neon) - { - } - else if constexpr (use_shani) - { - wbuffer[Round] = mm_sha256msg2_epu32 - ( - mm_add_epi32 - ( - mm_alignr_epi8 - ( - wbuffer[Round - 1], wbuffer[Round - 2], SHA::word_bytes - ), - mm_sha256msg1_epu32 - ( - wbuffer[Round - 4], wbuffer[Round - 3] - ) - ), - wbuffer[Round - 1] - ); - } - } -} - -TEMPLATE -void CLASS:: -schedule_native(wbuffer_t& wbuffer) NOEXCEPT -{ - prepare_native<4>(wbuffer); - prepare_native<5>(wbuffer); - prepare_native<6>(wbuffer); - prepare_native<7>(wbuffer); - prepare_native<8>(wbuffer); - prepare_native<9>(wbuffer); - prepare_native<10>(wbuffer); - prepare_native<11>(wbuffer); - prepare_native<12>(wbuffer); - prepare_native<13>(wbuffer); - prepare_native<14>(wbuffer); - prepare_native<15>(wbuffer); - - if constexpr (SHA::rounds == 80) - { - prepare_native<16>(wbuffer); - prepare_native<17>(wbuffer); - prepare_native<18>(wbuffer); - 
prepare_native<19>(wbuffer); - } - - konstant(array_cast(wbuffer)); -} - -TEMPLATE -INLINE void CLASS:: -schedule_native(buffer_t& buffer) NOEXCEPT -{ - // neon and sha160 not yet implemented, sha512 is not native. - if constexpr (SHA::strength == 256 && !use_neon) - { - schedule_native(array_cast(buffer)); - } - else - { - schedule_(buffer); - } -} - -TEMPLATE -template -INLINE void CLASS:: -schedule_native(xbuffer_t& xbuffer) NOEXCEPT -{ - // Merkle extended buffer is not native dispatched. - schedule_(xbuffer); -} - -// compression -// ---------------------------------------------------------------------------- -// protected - -TEMPLATE -template -INLINE void CLASS:: -round_native(wstate_t& state, - const wbuffer_t& wk) NOEXCEPT -{ - if constexpr (SHA::strength == 160) - { - if constexpr (use_neon) - { - } - else if constexpr (use_shani) - { - } - } - else if constexpr (SHA::strength == 256) - { - if constexpr (use_neon) - { - } - else if constexpr (use_shani) - { - // Process wk[Round][0..1], [HGDC][FEBA] (initial state) - state[1] = mm_sha256rnds2_epu32(state[1], state[0], wk[Round]); - - // Process wk[Round][2..3] (shifted down) - state[0] = mm_sha256rnds2_epu32(state[0], state[1], - mm_shuffle_epi32(wk[Round], 0x0e)); - } - } -} - -TEMPLATE -INLINE void CLASS:: -summarize_native(wstate_t& out, - const wstate_t& in) NOEXCEPT -{ - if constexpr (SHA::strength == 160) - { - if constexpr (use_neon) - { - } - else if constexpr (use_shani) - { - } - } - else if constexpr (SHA::strength == 256) - { - if constexpr (use_neon) - { - } - else if constexpr (use_shani) - { - out[0] = mm_add_epi32(out[0], in[0]); - out[1] = mm_add_epi32(out[1], in[1]); - } - } -} - -TEMPLATE -INLINE void CLASS:: -shuffle(wstate_t& wstate) NOEXCEPT -{ - // Change wstate to mm_sha256rnds2_epu32 expected form: - // [ABCD][EFGH] -> [FEBA][HGDC] (ordered low to high). 
- const auto t1 = mm_shuffle_epi32(wstate[0], 0xb1); - const auto t2 = mm_shuffle_epi32(wstate[1], 0x1b); - wstate[0] = mm_alignr_epi8(t1, t2, 0x08); - wstate[1] = mm_blend_epi16(t2, t1, 0xf0); -} - -TEMPLATE -INLINE void CLASS:: -unshuffle(wstate_t& wstate) NOEXCEPT -{ - // Restore wstate to normal form: - // [FEBA][HGDC] -> [ABCD][EFGH] (ordered low to high). - const auto t1 = mm_shuffle_epi32(wstate[0], 0x1b); - const auto t2 = mm_shuffle_epi32(wstate[1], 0xb1); - wstate[0] = mm_blend_epi16(t1, t2, 0xf0); - wstate[1] = mm_alignr_epi8(t2, t1, 0x08); -} - -TEMPLATE -template -void CLASS:: -compress_native(wstate_t& wstate, - const wbuffer_t& wbuffer) NOEXCEPT -{ - // Shuffle and unshuffle can be done outside of all blocks, but this would - // leave state in a non-normal form, so presently absorbing that cost. - shuffle(wstate); - - // This is a copy. - const auto start = wstate; - - round_native< 0, Lane>(wstate, wbuffer); - round_native< 1, Lane>(wstate, wbuffer); - round_native< 2, Lane>(wstate, wbuffer); - round_native< 3, Lane>(wstate, wbuffer); - round_native< 4, Lane>(wstate, wbuffer); - round_native< 5, Lane>(wstate, wbuffer); - round_native< 6, Lane>(wstate, wbuffer); - round_native< 7, Lane>(wstate, wbuffer); - round_native< 8, Lane>(wstate, wbuffer); - round_native< 9, Lane>(wstate, wbuffer); - round_native<10, Lane>(wstate, wbuffer); - round_native<11, Lane>(wstate, wbuffer); - round_native<12, Lane>(wstate, wbuffer); - round_native<13, Lane>(wstate, wbuffer); - round_native<14, Lane>(wstate, wbuffer); - round_native<15, Lane>(wstate, wbuffer); - - if constexpr (SHA::rounds == 80) - { - round_native<16, Lane>(wstate, wbuffer); - round_native<17, Lane>(wstate, wbuffer); - round_native<18, Lane>(wstate, wbuffer); - round_native<19, Lane>(wstate, wbuffer); - } - - // This is just a vectorized version of summarize(). - summarize_native(wstate, start); - - // See above comments on shuffle(). 
- unshuffle(wstate); -} - -TEMPLATE -template -INLINE void CLASS:: -compress_native(xstate_t& xstate, - const xbuffer_t& xbuffer) NOEXCEPT -{ - // Merkle extended state/buffer is not native dispatched. - compress_(xstate, xbuffer); -} - -TEMPLATE -template -INLINE void CLASS:: -compress_native(state_t& state, const xbuffer_t& xbuffer) NOEXCEPT -{ - // Iterate extended buffer is not native dispatched. - compress_(state, xbuffer); -} - -TEMPLATE -template -INLINE void CLASS:: -compress_native(state_t& state, const buffer_t& buffer) NOEXCEPT -{ - // TODO: debug. - // TODO: sha160 state is too small to array cast into two xwords. - // neon and sha160 not yet implemented, sha512 is not native. - if constexpr (SHA::strength == 256 && !use_neon) - { - compress_native(array_cast(state), - array_cast(buffer)); - } - else - { - compress_(state, buffer); - } -} - -#else // DISABLED - -// rotating variables (no buffer) -// ---------------------------------------------------------------------------- -// protected - -// The iterative method is used for sha native as it is an order of magnitude -// more efficient and cannot benefit from vectorization. - TEMPLATE INLINE void CLASS:: shuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT @@ -344,7 +65,7 @@ TEMPLATE INLINE void CLASS:: prepare(xint128_t& message0, xint128_t message1, xint128_t& message2) NOEXCEPT { - message2 = mm_sha256msg2_epu32(f::add(message2, + message2 = mm_sha256msg2_epu32(add(message2, mm_alignr_epi8(message1, message0, 4)), message1); } @@ -353,12 +74,8 @@ template INLINE void CLASS:: round_4(xint128_t& state0, xint128_t& state1, xint128_t message) NOEXCEPT { - // TODO: evaluate static setter (local to Round). 
- ////static const auto k = set( - //// K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3]); - constexpr auto r = Round * 4; - const auto wk = f::add(message, set( + const auto wk = add(message, set( K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3])); state1 = mm_sha256rnds2_epu32(state1, state0, wk); @@ -374,27 +91,27 @@ native_rounds(state_t& state, iblocks_t& blocks) NOEXCEPT auto lo = load(wstate[0]); auto hi = load(wstate[1]); - // shufle organizes state as expected by sha256rnds2. + // shuffle organizes state as expected by sha256rnds2. shuffle(lo, hi); while (!blocks.empty()) { const auto start_lo = lo; const auto start_hi = hi; - auto& block = array_cast(blocks.to_array()); + const auto& wblock = array_cast(blocks.to_array()); - auto message0 = byteswap(block[0]); + auto message0 = byteswap(load(wblock[0])); round_4<0>(lo, hi, message0); - auto message1 = byteswap(block[1]); + auto message1 = byteswap(load(wblock[1])); round_4<1>(lo, hi, message1); prepare(message0, message1); - auto message2 = byteswap(block[2]); + auto message2 = byteswap(load(wblock[2])); round_4<2>(lo, hi, message2); prepare(message1, message2); - auto message3 = byteswap(block[3]); + auto message3 = byteswap(load(wblock[3])); round_4<3>(lo, hi, message3); prepare(message2, message3, message0); @@ -443,8 +160,8 @@ native_rounds(state_t& state, iblocks_t& blocks) NOEXCEPT prepare(message1, message2, message3); round_4<15>(lo, hi, message3); - lo = f::add(lo, start_lo); - hi = f::add(hi, start_hi); + lo = add(lo, start_lo); + hi = add(hi, start_hi); blocks.advance(); } @@ -455,8 +172,6 @@ native_rounds(state_t& state, iblocks_t& blocks) NOEXCEPT store(wstate[1], hi); } -#endif // DISABLED - } // namespace sha } // namespace system } // namespace libbitcoin diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp index 9e95f50323..4425eea8eb 100644 --- 
a/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp @@ -47,12 +47,11 @@ prepare_1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT constexpr auto r02 = Round - 2; constexpr auto r07 = Round - 7; constexpr auto r16 = Round - 16; - constexpr auto s = SHA::word_bits; // buffer[r07 + 7] is buffer[Round + 0], so sigma0 is limited to 8 lanes. - buffer[Round + Offset] = f::add( - f::add(buffer[r16 + Offset], get(xsigma0)), - f::add(buffer[r07 + Offset], sigma1(buffer[r02 + Offset]))); + buffer[Round + Offset] = add( + add(buffer[r16 + Offset], get(xsigma0)), + add(buffer[r07 + Offset], sigma1(buffer[r02 + Offset]))); } TEMPLATE diff --git a/include/bitcoin/system/intrinsics/xcpu/defines.hpp b/include/bitcoin/system/intrinsics/xcpu/defines.hpp index 000f655459..5a1f03b908 100644 --- a/include/bitcoin/system/intrinsics/xcpu/defines.hpp +++ b/include/bitcoin/system/intrinsics/xcpu/defines.hpp @@ -104,7 +104,6 @@ BC_POP_WARNING() #endif #if !defined(HAVE_SSE41) - #define mm_alignr_epi8(a, b, c) {} #define mm_and_si128(a, b) (a) #define mm_or_si128(a, b) (a) #define mm_xor_si128(a, b) (a) @@ -126,9 +125,8 @@ BC_POP_WARNING() #define mm_extract_epi64(a, Lane) {} #define mm_shuffle_epi8(a, mask) (a) #define mm_shuffle_epi32(a, mask) (a) - #define mm_blend_epi16(a, b, mask) (a) - #define mm_load_si128(a) {} - #define mm_loadu_si128(a) {} + #define mm_load_si128(a) (a) + #define mm_loadu_si128(a) (a) #define mm_store_si128(memory, a) #define mm_storeu_si128(memory, a) #define mm_set1_epi8(K) @@ -140,7 +138,6 @@ BC_POP_WARNING() #define mm_set_epi16(x08, x07, x06, x05, x04, x03, x02, x01) #define mm_set_epi8(x16, x15, x14, x13, x12, x11, x10, x09, x08, x07, x06, x05, x04, x03, x02, x01) #else - #define mm_alignr_epi8(a, b, c) _mm_alignr_epi8(a, b, c) // for native sha (128 only) #define mm_and_si128(a, b) _mm_and_si128(a, b) #define mm_or_si128(a, b) _mm_or_si128(a, b) #define mm_xor_si128(a, b) _mm_xor_si128(a, 
b) @@ -162,7 +159,6 @@ BC_POP_WARNING() #define mm_extract_epi64(a, Lane) _mm_extract_epi64(a, Lane) // undefined for X32 #define mm_shuffle_epi8(a, mask) _mm_shuffle_epi8(a, mask) #define mm_shuffle_epi32(a, mask) _mm_shuffle_epi32(a, mask) - #define mm_blend_epi16(a, b, mask) _mm_blend_epi16(a, b, mask) #define mm_load_si128(a) _mm_load_si128(a) #define mm_loadu_si128(a) _mm_loadu_si128(a) #define mm_store_si128(memory, a) _mm_store_si128(memory, a) @@ -201,8 +197,8 @@ BC_POP_WARNING() #define mm256_extract_epi32(a, Lane) {} #define mm256_extract_epi64(a, Lane) {} #define mm256_shuffle_epi8(a, mask) (a) - #define mm256_load_si256(a) {} - #define mm256_loadu_si256(a) {} + #define mm256_load_si256(a) (a) + #define mm256_loadu_si256(a) (a) #define mm256_store_si256(memory, a) {} #define mm256_storeu_si256(memory, a) {} #define mm256_set1_epi8(K) @@ -273,8 +269,8 @@ BC_POP_WARNING() #define mm512_extract_epi32(a, Lane) {} #define mm512_extract_epi64(a, Lane) {} #define mm512_shuffle_epi8(a, mask) (a) - #define mm512_load_si512(a) {} - #define mm512_loadu_si512(a) {} + #define mm512_load_si512(a) (a) + #define mm512_loadu_si512(a) (a) #define mm512_store_si512(memory, a) #define mm512_storeu_si512(memory, a) #define mm512_set1_epi8(K) @@ -325,21 +321,30 @@ BC_POP_WARNING() #endif #if !defined(HAVE_SHANI) - #define mm_sha1msg1_epu32(a, b) {} - #define mm_sha1msg2_epu32(a, b) {} - #define mm_sha1rnds4_epu32(a, b, functor) {} - #define mm_sha1nexte_epu32(a, b) {} - #define mm_sha256msg1_epu32(a, b) {} - #define mm_sha256msg2_epu32(a, b) {} - #define mm_sha256rnds2_epu32(a, b, k) (k) + #define mm_sha1msg1_epu32(a, b) {} + #define mm_sha1msg2_epu32(a, b) {} + #define mm_sha1rnds4_epu32(a, b, f) {} + #define mm_sha1nexte_epu32(a, b) {} + #define mm_sha256msg1_epu32(a, b) {} + #define mm_sha256msg2_epu32(a, b) {} + #define mm_sha256rnds2_epu32(a, b, k) (k) + + // supporting + #define mm_alignr_epi8(a, b, c) (a) + #define mm_blend_epi16(a, b, mask) (a) #else - #define 
mm_sha1msg1_epu32(a, b) _mm_sha1msg1_epu32(a, b) - #define mm_sha1msg2_epu32(a, b) _mm_sha1msg2_epu32(a, b) - #define mm_sha1rnds4_epu32(a, b, functor) _mm_sha1rnds4_epu32(a, b, functor) - #define mm_sha1nexte_epu32(a, b) _mm_sha1nexte_epu32(a, b) - #define mm_sha256msg1_epu32(a, b) _mm_sha256msg1_epu32(a, b) - #define mm_sha256msg2_epu32(a, b) _mm_sha256msg2_epu32(a, b) - #define mm_sha256rnds2_epu32(a, b, k) _mm_sha256rnds2_epu32(a, b, k) + #define mm_sha1msg1_epu32(a, b) _mm_sha1msg1_epu32(a, b) + #define mm_sha1msg2_epu32(a, b) _mm_sha1msg2_epu32(a, b) + #define mm_sha1rnds4_epu32(a, b, f) _mm_sha1rnds4_epu32(a, b, f) + #define mm_sha1nexte_epu32(a, b) _mm_sha1nexte_epu32(a, b) + #define mm_sha256msg1_epu32(a, b) _mm_sha256msg1_epu32(a, b) + #define mm_sha256msg2_epu32(a, b) _mm_sha256msg2_epu32(a, b) + #define mm_sha256rnds2_epu32(a, b, k) _mm_sha256rnds2_epu32(a, b, k) + + // supporting + #define mm_alignr_epi8(a, b, c) _mm_alignr_epi8(a, b, c) + #define mm_blend_epi16(a, b, mask) _mm_blend_epi16(a, b, mask) + #endif #endif diff --git a/include/bitcoin/system/intrinsics/xcpu/functional_128.hpp b/include/bitcoin/system/intrinsics/xcpu/functional_128.hpp index 68a227ceb2..43dd6d4900 100644 --- a/include/bitcoin/system/intrinsics/xcpu/functional_128.hpp +++ b/include/bitcoin/system/intrinsics/xcpu/functional_128.hpp @@ -76,9 +76,9 @@ INLINE xint128_t shr(xint128_t a) NOEXCEPT if constexpr (S == bits) return mm_srli_epi16(a, B); - else if constexpr (S == bits) + if constexpr (S == bits) return mm_srli_epi32(a, B); - else if constexpr (S == bits) + if constexpr (S == bits) return mm_srli_epi64(a, B); } @@ -93,9 +93,9 @@ INLINE xint128_t shl(xint128_t a) NOEXCEPT if constexpr (S == bits) return mm_slli_epi16(a, B); - else if constexpr (S == bits) + if constexpr (S == bits) return mm_slli_epi32(a, B); - else if constexpr (S == bits) + if constexpr (S == bits) return mm_slli_epi64(a, B); } @@ -117,11 +117,11 @@ INLINE xint128_t add(xint128_t a, xint128_t b) NOEXCEPT 
{ if constexpr (S == bits) return mm_add_epi8(a, b); - else if constexpr (S == bits) + if constexpr (S == bits) return mm_add_epi16(a, b); - else if constexpr (S == bits) + if constexpr (S == bits) return mm_add_epi32(a, b); - else if constexpr (S == bits) + if constexpr (S == bits) return mm_add_epi64(a, b); } @@ -131,22 +131,36 @@ INLINE xint128_t addc(xint128_t a) NOEXCEPT { if constexpr (S == bits) return add(a, mm_set1_epi8(K)); - else if constexpr (S == bits) + if constexpr (S == bits) return add(a, mm_set1_epi16(K)); - else if constexpr (S == bits) + if constexpr (S == bits) return add(a, mm_set1_epi32(K)); - else if constexpr (S == bits) + if constexpr (S == bits) return add(a, mm_set1_epi64x(K)); } } // namespace f -/// broadcast/get/set +/// add/broadcast/gadd/get/set /// --------------------------------------------------------------------------- // SSE2 -template = true, if_integral_integer = true> +template = true> +INLINE xint128_t add(xint128_t a, xint128_t b) NOEXCEPT +{ + if constexpr (is_same_type) + return mm_add_epi8(a, b); + if constexpr (is_same_type) + return mm_add_epi16(a, b); + if constexpr (is_same_type) + return mm_add_epi32(a, b); + if constexpr (is_same_type) + return mm_add_epi64(a, b); +} + +// SSE2 +template = true, + if_same = true> INLINE xint128_t broadcast(Word a) NOEXCEPT { // set1 broadcasts integer to all elements. 
@@ -172,13 +186,13 @@ INLINE Word get(xint128_t a) NOEXCEPT return mm_extract_epi8(a, Lane); // SSE2 - else if constexpr (is_same_type) + if constexpr (is_same_type) return mm_extract_epi16(a, Lane); // SSE4.1 - else if constexpr (is_same_type) + if constexpr (is_same_type) return mm_extract_epi32(a, Lane); - else if constexpr (is_same_type) + if constexpr (is_same_type) return mm_extract_epi64(a, Lane); } @@ -257,25 +271,26 @@ INLINE xint128_t byteswap(xint128_t a) NOEXCEPT /// load/store (from casted to loaded/stored) /// --------------------------------------------------------------------------- +/// These have defined overrides for !HAVE_SSE41 -INLINE xint128_t load_aligned(const xint128_t& bytes) NOEXCEPT +INLINE xint128_t load(const xint128_t& bytes) NOEXCEPT { - return mm_load_si128(&bytes); + return mm_loadu_si128(&bytes); } -INLINE xint128_t load(const xint128_t& bytes) NOEXCEPT +INLINE void store(xint128_t& bytes, xint128_t a) NOEXCEPT { - return mm_loadu_si128(&bytes); + mm_storeu_si128(&bytes, a); } -INLINE void store_aligned(xint128_t& bytes, xint128_t a) NOEXCEPT +INLINE xint128_t load_aligned(const xint128_t& bytes) NOEXCEPT { - mm_store_si128(&bytes, a); + return mm_load_si128(&bytes); } -INLINE void store(xint128_t& bytes, xint128_t a) NOEXCEPT +INLINE void store_aligned(xint128_t& bytes, xint128_t a) NOEXCEPT { - mm_storeu_si128(&bytes, a); + mm_store_si128(&bytes, a); } #else @@ -283,6 +298,33 @@ INLINE void store(xint128_t& bytes, xint128_t a) NOEXCEPT // Symbol is defined but not usable as an integer. 
using xint128_t = std_array>; +template = true> +INLINE xint128_t add(xint128_t, xint128_t) NOEXCEPT +{ + return {}; +} + +template = true> +INLINE xint128_t set(uint32_t, uint32_t, uint32_t, uint32_t) NOEXCEPT +{ + return {}; +} + +template = true> +INLINE xint128_t byteswap(xint128_t) NOEXCEPT +{ + return {}; +} + +INLINE xint128_t load(const xint128_t&) NOEXCEPT +{ + return {}; +} + +INLINE void store(xint128_t&, xint128_t) NOEXCEPT +{ +} + #endif // HAVE_SSE41 } // namespace system diff --git a/include/bitcoin/system/intrinsics/xcpu/functional_256.hpp b/include/bitcoin/system/intrinsics/xcpu/functional_256.hpp index e2c3711cb4..f385863ac7 100644 --- a/include/bitcoin/system/intrinsics/xcpu/functional_256.hpp +++ b/include/bitcoin/system/intrinsics/xcpu/functional_256.hpp @@ -75,9 +75,9 @@ INLINE xint256_t shr(xint256_t a) NOEXCEPT // AVX2 if constexpr (S == bits) return mm256_srli_epi16(a, B); - else if constexpr (S == bits) + if constexpr (S == bits) return mm256_srli_epi32(a, B); - else if constexpr (S == bits) + if constexpr (S == bits) return mm256_srli_epi64(a, B); } @@ -92,9 +92,9 @@ INLINE xint256_t shl(xint256_t a) NOEXCEPT // AVX2 if constexpr (S == bits) return mm256_slli_epi16(a, B); - else if constexpr (S == bits) + if constexpr (S == bits) return mm256_slli_epi32(a, B); - else if constexpr (S == bits) + if constexpr (S == bits) return mm256_slli_epi64(a, B); } @@ -116,11 +116,11 @@ INLINE xint256_t add(xint256_t a, xint256_t b) NOEXCEPT { if constexpr (S == bits) return mm256_add_epi8(a, b); - else if constexpr (S == bits) + if constexpr (S == bits) return mm256_add_epi16(a, b); - else if constexpr (S == bits) + if constexpr (S == bits) return mm256_add_epi32(a, b); - else if constexpr (S == bits) + if constexpr (S == bits) return mm256_add_epi64(a, b); } @@ -130,22 +130,36 @@ INLINE xint256_t addc(xint256_t a) NOEXCEPT { if constexpr (S == bits) return add(a, mm256_set1_epi8(K)); - else if constexpr (S == bits) + if constexpr (S == bits) return 
add(a, mm256_set1_epi16(K)); - else if constexpr (S == bits) + if constexpr (S == bits) return add(a, mm256_set1_epi32(K)); - else if constexpr (S == bits) + if constexpr (S == bits) return add(a, mm256_set1_epi64x(K)); } } // namespace f -/// broadcast/get/set +/// add/broadcast/get/set /// --------------------------------------------------------------------------- // AVX -template = true, if_integral_integer = true> +template = true> +INLINE xint256_t add(xint256_t a, xint256_t b) NOEXCEPT +{ + if constexpr (is_same_type) + return mm256_add_epi8(a, b); + if constexpr (is_same_type) + return mm256_add_epi16(a, b); + if constexpr (is_same_type) + return mm256_add_epi32(a, b); + if constexpr (is_same_type) + return mm256_add_epi64(a, b); +} + +// AVX +template = true, + if_same = true> INLINE xint256_t broadcast(Word a) NOEXCEPT { // set1 broadcasts integer to all elements. @@ -169,13 +183,13 @@ INLINE Word get(xint256_t a) NOEXCEPT // AVX2 if constexpr (is_same_type) return mm256_extract_epi8(a, Lane); - else if constexpr (is_same_type) + if constexpr (is_same_type) return mm256_extract_epi16(a, Lane); // AVX - else if constexpr (is_same_type) + if constexpr (is_same_type) return mm256_extract_epi32(a, Lane); - else if constexpr (is_same_type) + if constexpr (is_same_type) return mm256_extract_epi64(a, Lane); } @@ -270,25 +284,26 @@ INLINE xint256_t byteswap(xint256_t a) NOEXCEPT /// load/store (from casted to loaded/stored) /// --------------------------------------------------------------------------- +/// These have defined overrides for !HAVE_AVX2 -INLINE xint256_t load_aligned(const xint256_t& bytes) NOEXCEPT +INLINE xint256_t load(const xint256_t& bytes) NOEXCEPT { - return mm256_load_si256(&bytes); + return mm256_loadu_si256(&bytes); } -INLINE xint256_t load(const xint256_t& bytes) NOEXCEPT +INLINE void store(xint256_t& bytes, xint256_t a) NOEXCEPT { - return mm256_loadu_si256(&bytes); + mm256_storeu_si256(&bytes, a); } -INLINE void store_aligned(xint256_t& 
bytes, xint256_t a) NOEXCEPT +INLINE xint256_t load_aligned(const xint256_t& bytes) NOEXCEPT { - mm256_store_si256(&bytes, a); + return mm256_load_si256(&bytes); } -INLINE void store(xint256_t& bytes, xint256_t a) NOEXCEPT +INLINE void store_aligned(xint256_t& bytes, xint256_t a) NOEXCEPT { - mm256_storeu_si256(&bytes, a); + mm256_store_si256(&bytes, a); } #else diff --git a/include/bitcoin/system/intrinsics/xcpu/functional_512.hpp b/include/bitcoin/system/intrinsics/xcpu/functional_512.hpp index 3308bd0362..53724cdbbd 100644 --- a/include/bitcoin/system/intrinsics/xcpu/functional_512.hpp +++ b/include/bitcoin/system/intrinsics/xcpu/functional_512.hpp @@ -83,9 +83,9 @@ INLINE xint512_t shr(xint512_t a) NOEXCEPT return mm512_srli_epi16(a, B); // AVX512F - else if constexpr (S == bits) + if constexpr (S == bits) return mm512_srli_epi32(a, B); - else if constexpr (S == bits) + if constexpr (S == bits) return mm512_srli_epi64(a, B); } @@ -102,9 +102,9 @@ INLINE xint512_t shl(xint512_t a) NOEXCEPT return mm512_slli_epi16(a, B); // AVX512F - else if constexpr (S == bits) + if constexpr (S == bits) return mm512_slli_epi32(a, B); - else if constexpr (S == bits) + if constexpr (S == bits) return mm512_slli_epi64(a, B); } @@ -126,13 +126,13 @@ INLINE xint512_t add(xint512_t a, xint512_t b) NOEXCEPT // AVX512BW if constexpr (S == bits) return mm512_add_epi8(a, b); - else if constexpr (S == bits) + if constexpr (S == bits) return mm512_add_epi16(a, b); // AVX512F - else if constexpr (S == bits) + if constexpr (S == bits) return mm512_add_epi32(a, b); - else if constexpr (S == bits) + if constexpr (S == bits) return mm512_add_epi64(a, b); } @@ -142,11 +142,11 @@ INLINE xint512_t addc(xint512_t a) NOEXCEPT { if constexpr (S == bits) return add(a, mm512_set1_epi8(K)); - else if constexpr (S == bits) + if constexpr (S == bits) return add(a, mm512_set1_epi16(K)); - else if constexpr (S == bits) + if constexpr (S == bits) return add(a, mm512_set1_epi32(K)); - else if constexpr (S == 
bits) + if constexpr (S == bits) return add(a, mm512_set1_epi64(K)); } @@ -156,8 +156,22 @@ INLINE xint512_t addc(xint512_t a) NOEXCEPT /// --------------------------------------------------------------------------- // AVX512F -template = true, if_integral_integer = true> +template = true> +INLINE xint512_t add(xint512_t a, xint512_t b) NOEXCEPT +{ + if constexpr (is_same_type) + return mm512_add_epi8(a, b); + if constexpr (is_same_type) + return mm512_add_epi16(a, b); + if constexpr (is_same_type) + return mm512_add_epi32(a, b); + if constexpr (is_same_type) + return mm512_add_epi64(a, b); +} + +// AVX512F +template = true, + if_same = true> INLINE xint512_t broadcast(Word a) NOEXCEPT { // set1 broadcasts integer to all elements. @@ -310,25 +324,26 @@ INLINE xint512_t byteswap(xint512_t a) NOEXCEPT /// load/store (from casted to loaded/stored) /// --------------------------------------------------------------------------- +/// These have defined overrides for !HAVE_AVX2 -INLINE xint512_t load_aligned(const xint512_t& bytes) NOEXCEPT +INLINE xint512_t load(const xint512_t& bytes) NOEXCEPT { - return mm512_load_si512(&bytes); + return mm512_loadu_si512(&bytes); } -INLINE xint512_t load(const xint512_t& bytes) NOEXCEPT +INLINE void store(xint512_t& bytes, xint512_t a) NOEXCEPT { - return mm512_loadu_si512(&bytes); + mm512_storeu_si512(&bytes, a); } -INLINE void store_aligned(xint512_t& bytes, xint512_t a) NOEXCEPT +INLINE xint512_t load_aligned(const xint512_t& bytes) NOEXCEPT { - mm512_store_si512(&bytes, a); + return mm512_load_si512(&bytes); } -INLINE void store(xint512_t& bytes, xint512_t a) NOEXCEPT +INLINE void store_aligned(xint512_t& bytes, xint512_t a) NOEXCEPT { - mm512_storeu_si512(&bytes, a); + mm512_store_si512(&bytes, a); } #else From bb1adcdfb9ccf8789d07bfb4aea9dcdea0f543b5 Mon Sep 17 00:00:00 2001 From: evoskuil Date: Thu, 5 Dec 2024 12:12:21 -0500 Subject: [PATCH 6/9] Delint.
--- include/bitcoin/system/intrinsics/xcpu/defines.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/bitcoin/system/intrinsics/xcpu/defines.hpp b/include/bitcoin/system/intrinsics/xcpu/defines.hpp index 5a1f03b908..be56f019f6 100644 --- a/include/bitcoin/system/intrinsics/xcpu/defines.hpp +++ b/include/bitcoin/system/intrinsics/xcpu/defines.hpp @@ -325,13 +325,13 @@ BC_POP_WARNING() #define mm_sha1msg2_epu32(a, b) {} #define mm_sha1rnds4_epu32(a, b, f) {} #define mm_sha1nexte_epu32(a, b) {} - #define mm_sha256msg1_epu32(a, b) {} - #define mm_sha256msg2_epu32(a, b) {} + #define mm_sha256msg1_epu32(a, b) (b) + #define mm_sha256msg2_epu32(a, b) (b) #define mm_sha256rnds2_epu32(a, b, k) (k) // supporting - #define mm_alignr_epi8(a, b, c) (a) - #define mm_blend_epi16(a, b, mask) (a) + #define mm_alignr_epi8(a, b, c) (b) + #define mm_blend_epi16(a, b, mask) (b) #else #define mm_sha1msg1_epu32(a, b) _mm_sha1msg1_epu32(a, b) #define mm_sha1msg2_epu32(a, b) _mm_sha1msg2_epu32(a, b) From 2b4691637c5df20d3aed0931e83b41419293faac Mon Sep 17 00:00:00 2001 From: evoskuil Date: Thu, 5 Dec 2024 12:31:58 -0500 Subject: [PATCH 7/9] Disable neon (activating but not impl). --- include/bitcoin/system/have.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/bitcoin/system/have.hpp b/include/bitcoin/system/have.hpp index b9897f0fb9..79da053996 100644 --- a/include/bitcoin/system/have.hpp +++ b/include/bitcoin/system/have.hpp @@ -110,12 +110,13 @@ #define HAVE_XASSEMBLY #endif +/// DISABLED /// ARM Neon intrinsics. #if defined(HAVE_ARM) // -march=armv8-a+crc+crypto [all] // -arch arm64 [apple] (also -isysroot to phone sdk) #if defined(HAVE_GNUC) || defined(__ARM_NEON) || defined(HAVE_MSC) - #define HAVE_NEON + ////#define HAVE_NEON #endif #endif From 7fa9ec0a392db09885bf513c02d2ae6697c479da Mon Sep 17 00:00:00 2001 From: evoskuil Date: Thu, 5 Dec 2024 12:32:09 -0500 Subject: [PATCH 8/9] Comments. 
--- .../impl/hash/sha/algorithm_compress.ipp | 29 ------------------- 1 file changed, 29 deletions(-) diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp index 32093848b3..6b9db05926 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp @@ -71,17 +71,6 @@ round(auto a, auto& b, auto c, auto d, auto& e, auto wk) NOEXCEPT e = /*a =*/ f::add(f::add(f::add(f::rol<5, s>(a), fn(b, c, d)), e), wk); b = /*c =*/ f::rol<30, s>(b); - - // SHA-NI - // Four rounds (total rounds 80/4). - // First round is add(e, w), then sha1nexte(e, w). - // fk is round-based enumeration implying f selection and k value. - // e1 = sha1nexte(e0, w); - // abcd = sha1rnds4(abcd, e0, fk); - // NEON - // f is implied by k in wk. - // e1 = vsha1h(vgetq_lane(abcd, 0); - // vsha1cq(abcd, e0, vaddq(w, k)); } TEMPLATE @@ -97,16 +86,6 @@ round(auto a, auto b, auto c, auto& d, auto e, auto f, auto g, auto& h, const auto t = f::add(f::add(f::add(Sigma1(e), choice(e, f, g)), h), wk); d = /*e =*/ f::add(d, t); h = /*a =*/ f::add(f::add(Sigma0(a), majority(a, b, c)), t); - - // Each call is 2 rounds, s, w and k are 128 (4 words each, s1/s2 is 8 word state). - // SHA-NI - // const auto value = add(w, k); - // abcd = sha256rnds2(abcd, efgh, value); - // efgh = sha256rnds2(efgh, abcd, shuffle(value)); - // NEON - // const auto value = vaddq(w, k); - // abcd = vsha256hq(abcd, efgh, value); - // efgh = vsha256h2q(efgh, abcd, value); } TEMPLATE @@ -125,10 +104,6 @@ round(auto& state, const auto& wk) NOEXCEPT state[(SHA::rounds + 3 - Round) % SHA::state_words], state[(SHA::rounds + 4 - Round) % SHA::state_words], // a->e extract(wk[Round])); - - // SHA-NI/NEON - // State packs in 128 (one state variable), reduces above to 1 out[]. - // Input value is 128 (w). Constants (k) statically initialized as 128. 
} else { @@ -142,10 +117,6 @@ round(auto& state, const auto& wk) NOEXCEPT state[(SHA::rounds + 6 - Round) % SHA::state_words], state[(SHA::rounds + 7 - Round) % SHA::state_words], // a->h extract(wk[Round])); - - // SHA-NI/NEON - // Each element is 128 (vs. 32), reduces above to 2 out[] (s0/s1). - // Input value is 128 (w). Constants (k) statically initialized as 128. } } From 99167b1f7a60d26837385376771b47ef78605a7d Mon Sep 17 00:00:00 2001 From: evoskuil Date: Thu, 5 Dec 2024 13:03:17 -0500 Subject: [PATCH 9/9] Delint. --- .../system/impl/hash/sha/algorithm_native.ipp | 5 +++-- include/bitcoin/system/intrinsics/xcpu/defines.hpp | 11 ++++++----- .../system/intrinsics/xcpu/functional_128.hpp | 12 ++++++------ 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp index 1c0f424001..79f7a34898 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp @@ -63,9 +63,10 @@ prepare(xint128_t& message0, xint128_t message1) NOEXCEPT TEMPLATE INLINE void CLASS:: -prepare(xint128_t& message0, xint128_t message1, xint128_t& message2) NOEXCEPT +prepare(xint128_t& SHANI_ONLY(message0), xint128_t message1, + xint128_t& message2) NOEXCEPT { - message2 = mm_sha256msg2_epu32(add(message2, + message2 = mm_sha256msg2_epu32(mm_add_epi32(message2, mm_alignr_epi8(message1, message0, 4)), message1); } diff --git a/include/bitcoin/system/intrinsics/xcpu/defines.hpp b/include/bitcoin/system/intrinsics/xcpu/defines.hpp index be56f019f6..0d997d0960 100644 --- a/include/bitcoin/system/intrinsics/xcpu/defines.hpp +++ b/include/bitcoin/system/intrinsics/xcpu/defines.hpp @@ -328,10 +328,9 @@ BC_POP_WARNING() #define mm_sha256msg1_epu32(a, b) (b) #define mm_sha256msg2_epu32(a, b) (b) #define mm_sha256rnds2_epu32(a, b, k) (k) - - // supporting - #define mm_alignr_epi8(a, b, c) (b) - #define 
mm_blend_epi16(a, b, mask) (b) + #define mm_alignr_epi8(a, b, c) (a) + #define mm_blend_epi16(a, b, mask) (a) + #define SHANI_ONLY(a) #else #define mm_sha1msg1_epu32(a, b) _mm_sha1msg1_epu32(a, b) #define mm_sha1msg2_epu32(a, b) _mm_sha1msg2_epu32(a, b) @@ -341,10 +340,12 @@ BC_POP_WARNING() #define mm_sha256msg2_epu32(a, b) _mm_sha256msg2_epu32(a, b) #define mm_sha256rnds2_epu32(a, b, k) _mm_sha256rnds2_epu32(a, b, k) - // supporting + // supporting #define mm_alignr_epi8(a, b, c) _mm_alignr_epi8(a, b, c) #define mm_blend_epi16(a, b, mask) _mm_blend_epi16(a, b, mask) + // unused argument suppression + #define SHANI_ONLY(a) a #endif #endif diff --git a/include/bitcoin/system/intrinsics/xcpu/functional_128.hpp b/include/bitcoin/system/intrinsics/xcpu/functional_128.hpp index 43dd6d4900..0f841da915 100644 --- a/include/bitcoin/system/intrinsics/xcpu/functional_128.hpp +++ b/include/bitcoin/system/intrinsics/xcpu/functional_128.hpp @@ -299,9 +299,9 @@ INLINE void store_aligned(xint128_t& bytes, xint128_t a) NOEXCEPT using xint128_t = std_array>; template = true> -INLINE xint128_t add(xint128_t, xint128_t) NOEXCEPT +INLINE xint128_t add(xint128_t, xint128_t b) NOEXCEPT { - return {}; + return b; } template = true> @@ -311,14 +311,14 @@ INLINE xint128_t set(uint32_t, uint32_t, uint32_t, uint32_t) NOEXCEPT } template = true> -INLINE xint128_t byteswap(xint128_t) NOEXCEPT +INLINE xint128_t byteswap(xint128_t a) NOEXCEPT { - return {}; + return a; } -INLINE xint128_t load(const xint128_t&) NOEXCEPT +INLINE xint128_t load(const xint128_t& a) NOEXCEPT { - return {}; + return a; } INLINE void store(xint128_t&, xint128_t) NOEXCEPT