From 484e443b73854abeb52038c7fca882f7ce0fdea4 Mon Sep 17 00:00:00 2001
From: evoskuil <eric@voskuil.org>
Date: Wed, 4 Dec 2024 20:23:55 -0500
Subject: [PATCH 1/9] Style.

---
 include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp
index ee9abbe756..a174a1064b 100644
--- a/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp
+++ b/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp
@@ -228,7 +228,7 @@ iterate_vector(state_t& state, const ablocks_t<Size>& blocks) NOEXCEPT
 {
     if (blocks.size() >= min_lanes)
     {
-        auto iblocks = iblocks_t{ array_cast<byte_t>(blocks) };
+        iblocks_t iblocks{ array_cast<byte_t>(blocks) };
         iterate_vector(state, iblocks);
     }
     else

From 80451c7c23ebabce23d9e9829422d4bedce82551 Mon Sep 17 00:00:00 2001
From: evoskuil <eric@voskuil.org>
Date: Wed, 4 Dec 2024 20:33:25 -0500
Subject: [PATCH 2/9] Add default element count (1) to iterable template
 methods.

---
 include/bitcoin/system/data/iterable.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/bitcoin/system/data/iterable.hpp b/include/bitcoin/system/data/iterable.hpp
index 3a2daeaace..08c1b2d924 100644
--- a/include/bitcoin/system/data/iterable.hpp
+++ b/include/bitcoin/system/data/iterable.hpp
@@ -175,7 +175,7 @@ class iterable
         return begin_;
     }
 
-    template <size_t Elements>
+    template <size_t Elements = one>
     inline iterable& advance() NOEXCEPT
     {
         // This is safe for overflow, will advance to end.
@@ -185,7 +185,7 @@ class iterable
         return *this;
     }
 
-    template <size_t Elements>
+    template <size_t Elements = one>
     inline const std_array<value_t, Elements>& to_array() const NOEXCEPT
     {
         return unsafe_array_cast<value_t, Elements>(begin_);

From 7fb2b154c203b964b225d8350ad643ad00af85c6 Mon Sep 17 00:00:00 2001
From: evoskuil <eric@voskuil.org>
Date: Thu, 5 Dec 2024 09:55:56 -0500
Subject: [PATCH 3/9] Replace buffered shani with rotating.

---
 include/bitcoin/system/hash/sha/algorithm.hpp |  34 +++-
 .../impl/hash/sha/algorithm_compress.ipp      |  10 +-
 .../impl/hash/sha/algorithm_iterate.ipp       |  37 ++++-
 .../system/impl/hash/sha/algorithm_native.ipp | 157 +++++++++++++++++-
 .../impl/hash/sha/algorithm_schedule.ipp      |  10 +-
 .../system/impl/hash/sha/algorithm_sigma.ipp  |  36 ++--
 6 files changed, 242 insertions(+), 42 deletions(-)

diff --git a/include/bitcoin/system/hash/sha/algorithm.hpp b/include/bitcoin/system/hash/sha/algorithm.hpp
index 6201230bcf..b8683277d0 100644
--- a/include/bitcoin/system/hash/sha/algorithm.hpp
+++ b/include/bitcoin/system/hash/sha/algorithm.hpp
@@ -281,6 +281,11 @@ class algorithm
     INLINE static void iterate_vector(state_t& state,
         iblocks_t& blocks) NOEXCEPT;
 
+    template <size_t Size>
+    INLINE static void iterate_native(state_t& state,
+        const ablocks_t<Size>& blocks) NOEXCEPT;
+    INLINE static void iterate_native(state_t& state, iblocks_t& blocks) NOEXCEPT;
+
     template <size_t Size>
     INLINE static constexpr void iterate_(state_t& state,
         const ablocks_t<Size>& blocks) NOEXCEPT;
@@ -317,7 +322,8 @@ class algorithm
         const xstate_t<xWord>& xstate) NOEXCEPT;
 
     template <typename xWord, if_extended<xWord> = true>
-    INLINE static void merkle_hash_vector(idigests_t& digests, iblocks_t& blocks) NOEXCEPT;
+    INLINE static void merkle_hash_vector(idigests_t& digests,
+        iblocks_t& blocks) NOEXCEPT;
     INLINE static void merkle_hash_vector(digests_t& digests) NOEXCEPT;
     VCONSTEXPR static void merkle_hash_(digests_t& digests,
         size_t offset=zero) NOEXCEPT;
@@ -330,10 +336,10 @@ class algorithm
         auto x6, auto x7, auto x8) NOEXCEPT;
 
     template<size_t Round, size_t Offset>
-    INLINE static void prepare1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT;
+    INLINE static void prepare_1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT;
 
     template<size_t Round>
-    INLINE static void prepare8(buffer_t& buffer) NOEXCEPT;
+    INLINE static void prepare_8(buffer_t& buffer) NOEXCEPT;
 
     template <typename xWord>
     INLINE static void schedule_sigma(xbuffer_t<xWord>& xbuffer) NOEXCEPT;
@@ -357,6 +363,8 @@ class algorithm
     /// Native SHA optimizations (single blocks).
     /// -----------------------------------------------------------------------
 
+#if defined(DISABLED)
+
     template<size_t Round>
     INLINE static void prepare_native(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;
     static void schedule_native(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;
@@ -390,12 +398,28 @@ class algorithm
     INLINE static void compress_native(state_t& state,
         const buffer_t& buffer) NOEXCEPT;
 
+#else // DISABLED
+
+    INLINE static void shuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT;
+    INLINE static void unshuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT;
+    INLINE static void prepare(xint128_t& message0, xint128_t message1) NOEXCEPT;
+    INLINE static void prepare(xint128_t& message0, xint128_t message1,
+        xint128_t& message2) NOEXCEPT;
+
+    template <size_t Round>
+    INLINE static void round_4(xint128_t& state0, xint128_t& state1,
+        xint128_t message) NOEXCEPT;
+
+    static void native_rounds(state_t& state, iblocks_t& blocks) NOEXCEPT;
+
+#endif // DISABLED
+
 public:
     /// Summary public values.
     /// -----------------------------------------------------------------------
     static constexpr auto caching = Cached;
-    static constexpr auto native = (use_shani || use_neon) &&
-        !is_same_size<word_t, uint64_t>;
+    static constexpr auto native = (use_shani || use_neon)
+        && (SHA::strength == 256 || SHA::strength == 160);
     static constexpr auto vector = (use_x128 || use_x256 || use_x512)
         && !(build_x32 && is_same_size<word_t, uint64_t>);
 };
diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp
index f9da36fd0f..32093848b3 100644
--- a/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp
+++ b/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp
@@ -276,11 +276,11 @@ compress(state_t& state, const buffer_t& buffer) NOEXCEPT
     {
         compress_<Lane>(state, buffer);
     }
-    else if constexpr (native)
-    {
-        // Single block shani compression optimization.
-        compress_native<Lane>(state, buffer);
-    }
+    ////else if constexpr (native)
+    ////{
+    ////    // Single block shani compression optimization.
+    ////    compress_native<Lane>(state, buffer);
+    ////}
     ////else if constexpr (vector)
     ////{
     ////    // Compression is not vectorized within a block, however this is
diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp
index a174a1064b..bb51e8e9ce 100644
--- a/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp
+++ b/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp
@@ -237,6 +237,31 @@ iterate_vector(state_t& state, const ablocks_t<Size>& blocks) NOEXCEPT
     }
 }
 
+// Native SHA
+// ============================================================================
+// www.intel.com/content/dam/develop/external/us/en/documents/
+// intel-sha-extensions-white-paper-402097.pdf
+
+TEMPLATE
+INLINE void CLASS::
+iterate_native(state_t& state, iblocks_t& blocks) NOEXCEPT
+{
+    native_rounds(state, blocks);
+}
+
+TEMPLATE
+template <size_t Size>
+INLINE void CLASS::
+iterate_native(state_t& state, const ablocks_t<Size>& blocks) NOEXCEPT
+{
+    iblocks_t iblocks{ array_cast<byte_t>(blocks) };
+    native_rounds(state, iblocks);
+}
+
+// Dispatch and normal forms.
+// ============================================================================
+// protected
+
 TEMPLATE
 template <size_t Size>
 INLINE constexpr void CLASS::
@@ -273,11 +298,9 @@ iterate(state_t& state, const ablocks_t<Size>& blocks) NOEXCEPT
     {
         iterate_(state, blocks);
     }
-    else if constexpr (native)
+    else if constexpr (native && SHA::strength == 256)
     {
-        // TODO: evaluate 4/8/16 lane message scheduling vs. shani scheduling.
-        // Multiple block shani message schduling and compression optimization.
-        iterate_(state, blocks);
+        iterate_native(state, blocks);
     }
     else if constexpr (vector)
     {
@@ -294,11 +317,9 @@ TEMPLATE
 INLINE void CLASS::
 iterate(state_t& state, iblocks_t& blocks) NOEXCEPT
 {
-    if constexpr (native)
+    if constexpr (native && SHA::strength == 256)
     {
-        // TODO: evaluate 4/8/16 lane message scheduling vs. shani scheduling.
-        // Multiple block shani message schduling and compression optimization.
-        iterate_(state, blocks);
+        iterate_native(state, blocks);
     }
     else if constexpr (vector)
     {
diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp
index b09941ea35..0d5232c166 100644
--- a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp
+++ b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp
@@ -35,7 +35,9 @@
 namespace libbitcoin {
 namespace system {
 namespace sha {
-    
+
+#if defined(DISABLED)
+
 // schedule
 // ----------------------------------------------------------------------------
 // protected
@@ -302,6 +304,159 @@ compress_native(state_t& state, const buffer_t& buffer) NOEXCEPT
     }
 }
 
+#else // DISABLED
+
+// rotating variables (no buffer)
+// ----------------------------------------------------------------------------
+// protected
+
+// The iterative method is used for sha native as it is an order of magnitude
+// more efficient and cannot benefit from vectorization.
+
+TEMPLATE
+INLINE void CLASS::
+shuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT
+{
+    const auto shuffle0 = mm_shuffle_epi32(state0, 0xb1);
+    const auto shuffle1 = mm_shuffle_epi32(state1, 0x1b);
+    state0 = mm_alignr_epi8(shuffle0, shuffle1, 0x08);
+    state1 = mm_blend_epi16(shuffle1, shuffle0, 0xf0);
+}
+
+TEMPLATE
+INLINE void CLASS::
+unshuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT
+{
+    const auto shuffle0 = mm_shuffle_epi32(state0, 0x1b);
+    const auto shuffle1 = mm_shuffle_epi32(state1, 0xb1);
+    state0 = mm_blend_epi16(shuffle0, shuffle1, 0xf0);
+    state1 = mm_alignr_epi8(shuffle1, shuffle0, 0x08);
+}
+
+TEMPLATE
+INLINE void CLASS::
+prepare(xint128_t& message0, xint128_t message1) NOEXCEPT
+{
+    message0 = mm_sha256msg1_epu32(message0, message1);
+}
+
+TEMPLATE
+INLINE void CLASS::
+prepare(xint128_t& message0, xint128_t message1, xint128_t& message2) NOEXCEPT
+{
+    message2 = mm_sha256msg2_epu32(f::add<SHA::word_bits>(message2,
+        mm_alignr_epi8(message1, message0, 4)), message1);
+}
+
+TEMPLATE
+template <size_t Round>
+INLINE void CLASS::
+round_4(xint128_t& state0, xint128_t& state1, xint128_t message) NOEXCEPT
+{
+    // TODO: evaluate static setter (local to Round).
+    ////static const auto k = set<xint128_t>(
+    ////    K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3]);
+
+    constexpr auto r = Round * 4;
+    const auto wk = f::add<SHA::word_bits>(message, set<xint128_t>(
+        K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3]));
+
+    state1 = mm_sha256rnds2_epu32(state1, state0, wk);
+    state0 = mm_sha256rnds2_epu32(state0, state1, mm_shuffle_epi32(wk, 0x0e));
+}
+
+TEMPLATE
+void CLASS::
+native_rounds(state_t& state, iblocks_t& blocks) NOEXCEPT
+{
+    // Individual state vars are used vs. array to ensure register persistence.
+    auto& wstate = array_cast<xint128_t>(state);
+    auto lo = load(wstate[0]);
+    auto hi = load(wstate[1]);
+
+    // shufle organizes state as expected by sha256rnds2.
+    shuffle(lo, hi);
+
+    while (!blocks.empty())
+    {
+        const auto start_lo = lo;
+        const auto start_hi = hi;
+        auto& block = array_cast<xint128_t>(blocks.to_array());
+
+        auto message0 = byteswap<uint32_t>(block[0]);
+        round_4<0>(lo, hi, message0);
+
+        auto message1 = byteswap<uint32_t>(block[1]);
+        round_4<1>(lo, hi, message1);
+
+        prepare(message0, message1);
+        auto message2 = byteswap<uint32_t>(block[2]);
+        round_4<2>(lo, hi, message2);
+
+        prepare(message1, message2);
+        auto message3 = byteswap<uint32_t>(block[3]);
+        round_4<3>(lo, hi, message3);
+
+        prepare(message2, message3, message0);
+        prepare(message2, message3);
+        round_4<4>(lo, hi, message0);
+
+        prepare(message3, message0, message1);
+        prepare(message3, message0);
+        round_4<5>(lo, hi, message1);
+
+        prepare(message0, message1, message2);
+        prepare(message0, message1);
+        round_4<6>(lo, hi, message2);
+
+        prepare(message1, message2, message3);
+        prepare(message1, message2);
+        round_4<7>(lo, hi, message3);
+
+        prepare(message2, message3, message0);
+        prepare(message2, message3);
+        round_4<8>(lo, hi, message0);
+
+        prepare(message3, message0, message1);
+        prepare(message3, message0);
+        round_4<9>(lo, hi, message1);
+
+        prepare(message0, message1, message2);
+        prepare(message0, message1);
+        round_4<10>(lo, hi, message2);
+
+        prepare(message1, message2, message3);
+        prepare(message1, message2);
+        round_4<11>(lo, hi, message3);
+
+        prepare(message2, message3, message0);
+        prepare(message2, message3);
+        round_4<12>(lo, hi, message0);
+
+        prepare(message3, message0, message1);
+        prepare(message3, message0);
+        round_4<13>(lo, hi, message1);
+
+        prepare(message0, message1, message2);
+        round_4<14>(lo, hi, message2);
+
+        prepare(message1, message2, message3);
+        round_4<15>(lo, hi, message3);
+
+        lo = f::add<SHA::word_bits>(lo, start_lo);
+        hi = f::add<SHA::word_bits>(hi, start_hi);
+        blocks.advance();
+    }
+
+    // unshuffle restores state to normal form.
+    unshuffle(lo, hi);
+
+    store(wstate[0], lo);
+    store(wstate[1], hi);
+}
+
+#endif // DISABLED
+
 } // namespace sha
 } // namespace system
 } // namespace libbitcoin
diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp
index f2796d4b5c..b65704e27b 100644
--- a/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp
+++ b/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp
@@ -138,11 +138,11 @@ schedule(buffer_t& buffer) NOEXCEPT
     {
         schedule_(buffer);
     }
-    else if constexpr (native)
-    {
-        // Single block (with shani) message scheduling optimization.
-        schedule_native(buffer);
-    }
+    ////else if constexpr (native)
+    ////{
+    ////    // Single block (with shani) message scheduling optimization.
+    ////    schedule_native(buffer);
+    ////}
     else if constexpr (vector)
     {
         // Single block (without shani) message scheduling optimization.
diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp
index 866fac88b8..9e95f50323 100644
--- a/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp
+++ b/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp
@@ -41,7 +41,7 @@ sigma0_8(auto x1, auto x2, auto x3, auto x4, auto x5, auto x6, auto x7,
 TEMPLATE
 template<size_t Round, size_t Offset>
 INLINE void CLASS::
-prepare1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT
+prepare_1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT
 {
     static_assert(Round >= 16);
     constexpr auto r02 = Round - 2;
@@ -58,7 +58,7 @@ prepare1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT
 TEMPLATE
 template<size_t Round>
 INLINE void CLASS::
-prepare8(buffer_t& buffer) NOEXCEPT
+prepare_8(buffer_t& buffer) NOEXCEPT
 {
     // Requires avx512 for sha512 and avx2 for sha256.
     // The simplicity of sha160 message prepare precludes this optimization.
@@ -73,14 +73,14 @@ prepare8(buffer_t& buffer) NOEXCEPT
         buffer[r15 + 0], buffer[r15 + 1], buffer[r15 + 2], buffer[r15 + 3],
         buffer[r15 + 4], buffer[r15 + 5], buffer[r15 + 6], buffer[r15 + 7]);
 
-    prepare1<Round, 0>(buffer, xsigma0);
-    prepare1<Round, 1>(buffer, xsigma0);
-    prepare1<Round, 2>(buffer, xsigma0);
-    prepare1<Round, 3>(buffer, xsigma0);
-    prepare1<Round, 4>(buffer, xsigma0);
-    prepare1<Round, 5>(buffer, xsigma0);
-    prepare1<Round, 6>(buffer, xsigma0);
-    prepare1<Round, 7>(buffer, xsigma0);
+    prepare_1<Round, 0>(buffer, xsigma0);
+    prepare_1<Round, 1>(buffer, xsigma0);
+    prepare_1<Round, 2>(buffer, xsigma0);
+    prepare_1<Round, 3>(buffer, xsigma0);
+    prepare_1<Round, 4>(buffer, xsigma0);
+    prepare_1<Round, 5>(buffer, xsigma0);
+    prepare_1<Round, 6>(buffer, xsigma0);
+    prepare_1<Round, 7>(buffer, xsigma0);
 }
 
 TEMPLATE
@@ -98,17 +98,17 @@ schedule_sigma(buffer_t& buffer) NOEXCEPT
 {
     if constexpr (SHA::strength != 160 && have_lanes<word_t, 8>)
     {
-        prepare8<16>(buffer);
-        prepare8<24>(buffer);
-        prepare8<32>(buffer);
-        prepare8<40>(buffer);
-        prepare8<48>(buffer);
-        prepare8<56>(buffer);
+        prepare_8<16>(buffer);
+        prepare_8<24>(buffer);
+        prepare_8<32>(buffer);
+        prepare_8<40>(buffer);
+        prepare_8<48>(buffer);
+        prepare_8<56>(buffer);
 
         if constexpr (SHA::rounds == 80)
         {
-            prepare8<64>(buffer);
-            prepare8<72>(buffer);
+            prepare_8<64>(buffer);
+            prepare_8<72>(buffer);
         }
 
         konstant(buffer);

From 0030b7264045e98ad3b95c663fd572e5092cc9fd Mon Sep 17 00:00:00 2001
From: evoskuil <eric@voskuil.org>
Date: Thu, 5 Dec 2024 09:56:03 -0500
Subject: [PATCH 4/9] Comments.

---
 include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp
index d0ab65e166..72f3853beb 100644
--- a/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp
+++ b/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp
@@ -421,6 +421,10 @@ merkle_hash_vector(digests_t& digests) NOEXCEPT
 // ----------------------------------------------------------------------------
 // public
 
+// TODO: consider eliminating endianness conversions internal to the root
+// computation, instead converting only on the way in and the way out, and
+// using non-converting (nop) input/output functions in between.
+
 TEMPLATE
 VCONSTEXPR typename CLASS::digest_t CLASS::
 merkle_root(digests_t&& digests) NOEXCEPT

From 2dc7bf22c4e37799af063bd75f22167016d26788 Mon Sep 17 00:00:00 2001
From: evoskuil <eric@voskuil.org>
Date: Thu, 5 Dec 2024 11:53:04 -0500
Subject: [PATCH 5/9] Style, delint, use add<word>() for vector only adds.

---
 include/bitcoin/system/hash/sha/algorithm.hpp |  39 ---
 .../impl/hash/sha/algorithm_konstant.ipp      |   7 +-
 .../system/impl/hash/sha/algorithm_native.ipp | 313 +-----------------
 .../system/impl/hash/sha/algorithm_sigma.ipp  |   7 +-
 .../system/intrinsics/xcpu/defines.hpp        |  53 +--
 .../system/intrinsics/xcpu/functional_128.hpp |  90 +++--
 .../system/intrinsics/xcpu/functional_256.hpp |  63 ++--
 .../system/intrinsics/xcpu/functional_512.hpp |  55 +--
 8 files changed, 189 insertions(+), 438 deletions(-)

diff --git a/include/bitcoin/system/hash/sha/algorithm.hpp b/include/bitcoin/system/hash/sha/algorithm.hpp
index b8683277d0..5b25bfac1c 100644
--- a/include/bitcoin/system/hash/sha/algorithm.hpp
+++ b/include/bitcoin/system/hash/sha/algorithm.hpp
@@ -363,43 +363,6 @@ class algorithm
     /// Native SHA optimizations (single blocks).
     /// -----------------------------------------------------------------------
 
-#if defined(DISABLED)
-
-    template<size_t Round>
-    INLINE static void prepare_native(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;
-    static void schedule_native(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;
-
-    template <typename xWord>
-    INLINE static void schedule_native(xbuffer_t<xWord>& xbuffer) NOEXCEPT;
-    INLINE static void schedule_native(buffer_t& buffer) NOEXCEPT;
-
-    template<size_t Round, size_t Lane>
-    INLINE static void round_native(wstate_t<xint128_t>& state,
-        const wbuffer_t<xint128_t>& wk) NOEXCEPT;
-
-    INLINE static void shuffle(wstate_t<xint128_t>& wstate) NOEXCEPT;
-    INLINE static void unshuffle(wstate_t<xint128_t>& wstate) NOEXCEPT;
-    INLINE static void summarize_native(wstate_t<xint128_t>& out,
-        const wstate_t<xint128_t>& in) NOEXCEPT;
-
-    template <size_t Lane>
-    static void compress_native(wstate_t<xint128_t>& state,
-        const wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;
-
-    template <typename xWord, size_t Lane>
-    INLINE static void compress_native(xstate_t<xWord>& xstate,
-        const xbuffer_t<xWord>& xbuffer) NOEXCEPT;
-
-    template <typename xWord, size_t Lane>
-    INLINE static void compress_native(state_t& state,
-        const xbuffer_t<xWord>& xbuffer) NOEXCEPT;
-
-    template <size_t Lane>
-    INLINE static void compress_native(state_t& state,
-        const buffer_t& buffer) NOEXCEPT;
-
-#else // DISABLED
-
     INLINE static void shuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT;
     INLINE static void unshuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT;
     INLINE static void prepare(xint128_t& message0, xint128_t message1) NOEXCEPT;
@@ -412,8 +375,6 @@ class algorithm
 
     static void native_rounds(state_t& state, iblocks_t& blocks) NOEXCEPT;
 
-#endif // DISABLED
-
 public:
     /// Summary public values.
     /// -----------------------------------------------------------------------
diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp
index e62127a10c..26cf225d14 100644
--- a/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp
+++ b/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp
@@ -50,13 +50,12 @@ template<size_t Round, typename xWord>
 INLINE void CLASS::
 vector_konstant(wbuffer_t<xWord>& wbuffer) NOEXCEPT
 {
-    constexpr auto s = SHA::word_bits;
     constexpr auto lanes = capacity<xWord, word_t>;
     constexpr auto r = Round * lanes;
 
     if constexpr (lanes == 16)
     {
-        wbuffer[Round] = f::add<s>(wbuffer[Round], set<xWord>(
+        wbuffer[Round] = add<word_t>(wbuffer[Round], set<xWord>(
             K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3],
             K::get[r + 4], K::get[r + 5], K::get[r + 6], K::get[r + 7],
             K::get[r + 8], K::get[r + 9], K::get[r + 10], K::get[r + 11],
@@ -64,13 +63,13 @@ vector_konstant(wbuffer_t<xWord>& wbuffer) NOEXCEPT
     }
     else if constexpr (lanes == 8)
     {
-        wbuffer[Round] = f::add<s>(wbuffer[Round], set<xWord>(
+        wbuffer[Round] = add<word_t>(wbuffer[Round], set<xWord>(
             K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3],
             K::get[r + 4], K::get[r + 5], K::get[r + 6], K::get[r + 7]));
     }
     else if constexpr (lanes == 4)
     {
-        wbuffer[Round] = f::add<s>(wbuffer[Round], set<xWord>(
+        wbuffer[Round] = add<word_t>(wbuffer[Round], set<xWord>(
             K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3]));
     }
 }
diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp
index 0d5232c166..1c0f424001 100644
--- a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp
+++ b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp
@@ -21,298 +21,19 @@
 
 // Native (SHA-NI or NEON)
 // ============================================================================
-// Native does not change the buffer size (not expanded), just its "shape".
-// Four words are buffered into one xint128_t, resulting in 1/4 the buffer
-// array size and number of rounds. Four state words are  packed into each of
-// two state variables. This applies to sha160 and sha256, but sha512 native
-// is not supported.
-
-// The base buffer is already populated with proper endianness.
-// Input could be optimized using intrinsics (see comments in parse).
-// The unextended state vector is already output with proper endianness.
-// Output could also be optimized using intrinsics (see comments in parse).
+// The iterative method is used for sha native as it is an order of magnitude
+// more efficient and cannot benefit from vectorization.
 
 namespace libbitcoin {
 namespace system {
 namespace sha {
 
-#if defined(DISABLED)
+// TODO: intel sha160, arm sha160, arm sha256
 
-// schedule
+// intel sha256
 // ----------------------------------------------------------------------------
 // protected
 
-TEMPLATE
-template<size_t Round>
-INLINE void CLASS::
-prepare_native(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT
-{
-    if constexpr (SHA::strength == 160)
-    {
-        if constexpr (use_neon)
-        {
-        }
-        else if constexpr (use_shani)
-        {
-        }
-    }
-    else if constexpr (SHA::strength == 256)
-    {
-        if constexpr (use_neon)
-        {
-        }
-        else if constexpr (use_shani)
-        {
-            wbuffer[Round] = mm_sha256msg2_epu32
-            (
-                mm_add_epi32
-                (
-                    mm_alignr_epi8
-                    (
-                        wbuffer[Round - 1], wbuffer[Round - 2], SHA::word_bytes
-                    ),
-                    mm_sha256msg1_epu32
-                    (
-                        wbuffer[Round - 4], wbuffer[Round - 3]
-                    )
-                ),
-                wbuffer[Round - 1]
-            );
-        }
-    }
-}
-
-TEMPLATE
-void CLASS::
-schedule_native(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT
-{
-    prepare_native<4>(wbuffer);
-    prepare_native<5>(wbuffer);
-    prepare_native<6>(wbuffer);
-    prepare_native<7>(wbuffer);
-    prepare_native<8>(wbuffer);
-    prepare_native<9>(wbuffer);
-    prepare_native<10>(wbuffer);
-    prepare_native<11>(wbuffer);
-    prepare_native<12>(wbuffer);
-    prepare_native<13>(wbuffer);
-    prepare_native<14>(wbuffer);
-    prepare_native<15>(wbuffer);
-
-    if constexpr (SHA::rounds == 80)
-    {
-        prepare_native<16>(wbuffer);
-        prepare_native<17>(wbuffer);
-        prepare_native<18>(wbuffer);
-        prepare_native<19>(wbuffer);
-    }
-
-    konstant(array_cast<word_t>(wbuffer));
-}
-
-TEMPLATE
-INLINE void CLASS::
-schedule_native(buffer_t& buffer) NOEXCEPT
-{
-    // neon and sha160 not yet implemented, sha512 is not native.
-    if constexpr (SHA::strength == 256 && !use_neon)
-    {
-        schedule_native(array_cast<xint128_t>(buffer));
-    }
-    else
-    {
-        schedule_(buffer);
-    }
-}
-
-TEMPLATE
-template <typename xWord>
-INLINE void CLASS::
-schedule_native(xbuffer_t<xWord>& xbuffer) NOEXCEPT
-{
-    // Merkle extended buffer is not native dispatched.
-    schedule_(xbuffer);
-}
-
-// compression
-// ----------------------------------------------------------------------------
-// protected
-
-TEMPLATE
-template<size_t Round, size_t Lane>
-INLINE void CLASS::
-round_native(wstate_t<xint128_t>& state,
-    const wbuffer_t<xint128_t>& wk) NOEXCEPT
-{
-    if constexpr (SHA::strength == 160)
-    {
-        if constexpr (use_neon)
-        {
-        }
-        else if constexpr (use_shani)
-        {
-        }
-    }
-    else if constexpr (SHA::strength == 256)
-    {
-        if constexpr (use_neon)
-        {
-        }
-        else if constexpr (use_shani)
-        {
-            // Process wk[Round][0..1], [HGDC][FEBA] (initial state)
-            state[1] = mm_sha256rnds2_epu32(state[1], state[0], wk[Round]);
-
-            // Process wk[Round][2..3] (shifted down)
-            state[0] = mm_sha256rnds2_epu32(state[0], state[1],
-                mm_shuffle_epi32(wk[Round], 0x0e));
-        }
-    }
-}
-
-TEMPLATE
-INLINE void CLASS::
-summarize_native(wstate_t<xint128_t>& out,
-    const wstate_t<xint128_t>& in) NOEXCEPT
-{
-    if constexpr (SHA::strength == 160)
-    {
-        if constexpr (use_neon)
-        {
-        }
-        else if constexpr (use_shani)
-        {
-        }
-    }
-    else if constexpr (SHA::strength == 256)
-    {
-        if constexpr (use_neon)
-        {
-        }
-        else if constexpr (use_shani)
-        {
-            out[0] = mm_add_epi32(out[0], in[0]);
-            out[1] = mm_add_epi32(out[1], in[1]);
-        }
-    }
-}
-
-TEMPLATE
-INLINE void CLASS::
-shuffle(wstate_t<xint128_t>& wstate) NOEXCEPT
-{
-    // Change wstate to mm_sha256rnds2_epu32 expected form:
-    // [ABCD][EFGH] -> [FEBA][HGDC] (ordered low to high).
-    const auto t1 = mm_shuffle_epi32(wstate[0], 0xb1);
-    const auto t2 = mm_shuffle_epi32(wstate[1], 0x1b);
-    wstate[0] = mm_alignr_epi8(t1, t2, 0x08);
-    wstate[1] = mm_blend_epi16(t2, t1, 0xf0);
-}
-
-TEMPLATE
-INLINE void CLASS::
-unshuffle(wstate_t<xint128_t>& wstate) NOEXCEPT
-{
-    // Restore wstate to normal form:
-    // [FEBA][HGDC] -> [ABCD][EFGH] (ordered low to high).
-    const auto t1 = mm_shuffle_epi32(wstate[0], 0x1b);
-    const auto t2 = mm_shuffle_epi32(wstate[1], 0xb1);
-    wstate[0] = mm_blend_epi16(t1, t2, 0xf0);
-    wstate[1] = mm_alignr_epi8(t2, t1, 0x08);
-}
-
-TEMPLATE
-template <size_t Lane>
-void CLASS::
-compress_native(wstate_t<xint128_t>& wstate,
-    const wbuffer_t<xint128_t>& wbuffer) NOEXCEPT
-{ 
-    // Shuffle and unshuffle can be done outside of all blocks, but this would
-    // leave state in a non-normal form, so presently absorbing that cost.
-    shuffle(wstate);
-
-    // This is a copy.
-    const auto start = wstate;
-
-    round_native< 0, Lane>(wstate, wbuffer);
-    round_native< 1, Lane>(wstate, wbuffer);
-    round_native< 2, Lane>(wstate, wbuffer);
-    round_native< 3, Lane>(wstate, wbuffer);
-    round_native< 4, Lane>(wstate, wbuffer);
-    round_native< 5, Lane>(wstate, wbuffer);
-    round_native< 6, Lane>(wstate, wbuffer);
-    round_native< 7, Lane>(wstate, wbuffer);
-    round_native< 8, Lane>(wstate, wbuffer);
-    round_native< 9, Lane>(wstate, wbuffer);
-    round_native<10, Lane>(wstate, wbuffer);
-    round_native<11, Lane>(wstate, wbuffer);
-    round_native<12, Lane>(wstate, wbuffer);
-    round_native<13, Lane>(wstate, wbuffer);
-    round_native<14, Lane>(wstate, wbuffer);
-    round_native<15, Lane>(wstate, wbuffer);
-
-    if constexpr (SHA::rounds == 80)
-    {
-        round_native<16, Lane>(wstate, wbuffer);
-        round_native<17, Lane>(wstate, wbuffer);
-        round_native<18, Lane>(wstate, wbuffer);
-        round_native<19, Lane>(wstate, wbuffer);
-    }
-
-    // This is just a vectorized version of summarize().
-    summarize_native(wstate, start);
-
-    // See above comments on shuffle().
-    unshuffle(wstate);
-}
-
-TEMPLATE
-template <typename xWord, size_t Lane>
-INLINE void CLASS::
-compress_native(xstate_t<xWord>& xstate,
-    const xbuffer_t<xWord>& xbuffer) NOEXCEPT
-{
-    // Merkle extended state/buffer is not native dispatched.
-    compress_<Lane>(xstate, xbuffer);
-}
-
-TEMPLATE
-template <typename xWord, size_t Lane>
-INLINE void CLASS::
-compress_native(state_t& state, const xbuffer_t<xWord>& xbuffer) NOEXCEPT
-{
-    // Iterate extended buffer is not native dispatched.
-    compress_<Lane>(state, xbuffer);
-}
-
-TEMPLATE
-template <size_t Lane>
-INLINE void CLASS::
-compress_native(state_t& state, const buffer_t& buffer) NOEXCEPT
-{
-    // TODO: debug.
-    // TODO: sha160 state is too small to array cast into two xwords.
-    // neon and sha160 not yet implemented, sha512 is not native.
-    if constexpr (SHA::strength == 256 && !use_neon)
-    {
-        compress_native<Lane>(array_cast<xint128_t>(state),
-            array_cast<xint128_t>(buffer));
-    }
-    else
-    {
-        compress_<Lane>(state, buffer);
-    }
-}
-
-#else // DISABLED
-
-// rotating variables (no buffer)
-// ----------------------------------------------------------------------------
-// protected
-
-// The iterative method is used for sha native as it is an order of magnitude
-// more efficient and cannot benefit from vectorization.
-
 TEMPLATE
 INLINE void CLASS::
 shuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT
@@ -344,7 +65,7 @@ TEMPLATE
 INLINE void CLASS::
 prepare(xint128_t& message0, xint128_t message1, xint128_t& message2) NOEXCEPT
 {
-    message2 = mm_sha256msg2_epu32(f::add<SHA::word_bits>(message2,
+    message2 = mm_sha256msg2_epu32(add<word_t>(message2,
         mm_alignr_epi8(message1, message0, 4)), message1);
 }
 
@@ -353,12 +74,8 @@ template <size_t Round>
 INLINE void CLASS::
 round_4(xint128_t& state0, xint128_t& state1, xint128_t message) NOEXCEPT
 {
-    // TODO: evaluate static setter (local to Round).
-    ////static const auto k = set<xint128_t>(
-    ////    K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3]);
-
     constexpr auto r = Round * 4;
-    const auto wk = f::add<SHA::word_bits>(message, set<xint128_t>(
+    const auto wk = add<word_t>(message, set<xint128_t>(
         K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3]));
 
     state1 = mm_sha256rnds2_epu32(state1, state0, wk);
@@ -374,27 +91,27 @@ native_rounds(state_t& state, iblocks_t& blocks) NOEXCEPT
     auto lo = load(wstate[0]);
     auto hi = load(wstate[1]);
 
-    // shufle organizes state as expected by sha256rnds2.
+    // shuffle organizes state as expected by sha256rnds2.
     shuffle(lo, hi);
 
     while (!blocks.empty())
     {
         const auto start_lo = lo;
         const auto start_hi = hi;
-        auto& block = array_cast<xint128_t>(blocks.to_array());
+        const auto& wblock = array_cast<xint128_t>(blocks.to_array());
 
-        auto message0 = byteswap<uint32_t>(block[0]);
+        auto message0 = byteswap<uint32_t>(load(wblock[0]));
         round_4<0>(lo, hi, message0);
 
-        auto message1 = byteswap<uint32_t>(block[1]);
+        auto message1 = byteswap<uint32_t>(load(wblock[1]));
         round_4<1>(lo, hi, message1);
 
         prepare(message0, message1);
-        auto message2 = byteswap<uint32_t>(block[2]);
+        auto message2 = byteswap<uint32_t>(load(wblock[2]));
         round_4<2>(lo, hi, message2);
 
         prepare(message1, message2);
-        auto message3 = byteswap<uint32_t>(block[3]);
+        auto message3 = byteswap<uint32_t>(load(wblock[3]));
         round_4<3>(lo, hi, message3);
 
         prepare(message2, message3, message0);
@@ -443,8 +160,8 @@ native_rounds(state_t& state, iblocks_t& blocks) NOEXCEPT
         prepare(message1, message2, message3);
         round_4<15>(lo, hi, message3);
 
-        lo = f::add<SHA::word_bits>(lo, start_lo);
-        hi = f::add<SHA::word_bits>(hi, start_hi);
+        lo = add<word_t>(lo, start_lo);
+        hi = add<word_t>(hi, start_hi);
         blocks.advance();
     }
 
@@ -455,8 +172,6 @@ native_rounds(state_t& state, iblocks_t& blocks) NOEXCEPT
     store(wstate[1], hi);
 }
 
-#endif // DISABLED
-
 } // namespace sha
 } // namespace system
 } // namespace libbitcoin
diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp
index 9e95f50323..4425eea8eb 100644
--- a/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp
+++ b/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp
@@ -47,12 +47,11 @@ prepare_1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT
     constexpr auto r02 = Round - 2;
     constexpr auto r07 = Round - 7;
     constexpr auto r16 = Round - 16;
-    constexpr auto s = SHA::word_bits;
 
     // buffer[r07 + 7] is buffer[Round + 0], so sigma0 is limited to 8 lanes.
-    buffer[Round + Offset] = f::add<s>(
-        f::add<s>(buffer[r16 + Offset], get<word_t, Offset>(xsigma0)),
-        f::add<s>(buffer[r07 + Offset], sigma1(buffer[r02 + Offset])));
+    buffer[Round + Offset] = add<word_t>(
+        add<word_t>(buffer[r16 + Offset], get<word_t, Offset>(xsigma0)),
+        add<word_t>(buffer[r07 + Offset], sigma1(buffer[r02 + Offset])));
 }
 
 TEMPLATE
diff --git a/include/bitcoin/system/intrinsics/xcpu/defines.hpp b/include/bitcoin/system/intrinsics/xcpu/defines.hpp
index 000f655459..5a1f03b908 100644
--- a/include/bitcoin/system/intrinsics/xcpu/defines.hpp
+++ b/include/bitcoin/system/intrinsics/xcpu/defines.hpp
@@ -104,7 +104,6 @@ BC_POP_WARNING()
 #endif
 
 #if !defined(HAVE_SSE41)
-    #define mm_alignr_epi8(a, b, c) {}
     #define mm_and_si128(a, b)  (a)
     #define mm_or_si128(a, b)   (a)
     #define mm_xor_si128(a, b)  (a)
@@ -126,9 +125,8 @@ BC_POP_WARNING()
     #define mm_extract_epi64(a, Lane)   {}
     #define mm_shuffle_epi8(a, mask)    (a)
     #define mm_shuffle_epi32(a, mask)   (a)
-    #define mm_blend_epi16(a, b, mask)  (a)
-    #define mm_load_si128(a)            {}
-    #define mm_loadu_si128(a)           {}
+    #define mm_load_si128(a)            (a)
+    #define mm_loadu_si128(a)           (a)
     #define mm_store_si128(memory, a)
     #define mm_storeu_si128(memory, a)
     #define mm_set1_epi8(K)
@@ -140,7 +138,6 @@ BC_POP_WARNING()
     #define mm_set_epi16(x08, x07, x06, x05, x04, x03, x02, x01)
     #define mm_set_epi8(x16, x15, x14, x13, x12, x11, x10, x09, x08, x07, x06, x05, x04, x03, x02, x01)
 #else
-    #define mm_alignr_epi8(a, b, c)     _mm_alignr_epi8(a, b, c) // for native sha (128 only)
     #define mm_and_si128(a, b)          _mm_and_si128(a, b)
     #define mm_or_si128(a, b)           _mm_or_si128(a, b)
     #define mm_xor_si128(a, b)          _mm_xor_si128(a, b)
@@ -162,7 +159,6 @@ BC_POP_WARNING()
     #define mm_extract_epi64(a, Lane)   _mm_extract_epi64(a, Lane) // undefined for X32
     #define mm_shuffle_epi8(a, mask)    _mm_shuffle_epi8(a, mask)
     #define mm_shuffle_epi32(a, mask)   _mm_shuffle_epi32(a, mask)
-    #define mm_blend_epi16(a, b, mask)  _mm_blend_epi16(a, b, mask)
     #define mm_load_si128(a)            _mm_load_si128(a)
     #define mm_loadu_si128(a)           _mm_loadu_si128(a)
     #define mm_store_si128(memory, a)   _mm_store_si128(memory, a)
@@ -201,8 +197,8 @@ BC_POP_WARNING()
     #define mm256_extract_epi32(a, Lane)    {}
     #define mm256_extract_epi64(a, Lane)    {}
     #define mm256_shuffle_epi8(a, mask)     (a)
-    #define mm256_load_si256(a)             {}
-    #define mm256_loadu_si256(a)            {}
+    #define mm256_load_si256(a)             (a)
+    #define mm256_loadu_si256(a)            (a)
     #define mm256_store_si256(memory, a)    {}
     #define mm256_storeu_si256(memory, a)   {}
     #define mm256_set1_epi8(K)
@@ -273,8 +269,8 @@ BC_POP_WARNING()
     #define mm512_extract_epi32(a, Lane)    {}
     #define mm512_extract_epi64(a, Lane)    {}
     #define mm512_shuffle_epi8(a, mask)     (a)
-    #define mm512_load_si512(a)             {}
-    #define mm512_loadu_si512(a)            {}
+    #define mm512_load_si512(a)             (a)
+    #define mm512_loadu_si512(a)            (a)
     #define mm512_store_si512(memory, a)
     #define mm512_storeu_si512(memory, a)
     #define mm512_set1_epi8(K)
@@ -325,21 +321,30 @@ BC_POP_WARNING()
 #endif
 
 #if !defined(HAVE_SHANI)
-    #define mm_sha1msg1_epu32(a, b)             {}
-    #define mm_sha1msg2_epu32(a, b)             {}
-    #define mm_sha1rnds4_epu32(a, b, functor)   {}
-    #define mm_sha1nexte_epu32(a, b)            {}
-    #define mm_sha256msg1_epu32(a, b)           {}
-    #define mm_sha256msg2_epu32(a, b)           {}
-    #define mm_sha256rnds2_epu32(a, b, k)       (k)
+    #define mm_sha1msg1_epu32(a, b)         {}
+    #define mm_sha1msg2_epu32(a, b)         {}
+    #define mm_sha1rnds4_epu32(a, b, f)     {}
+    #define mm_sha1nexte_epu32(a, b)        {}
+    #define mm_sha256msg1_epu32(a, b)       {}
+    #define mm_sha256msg2_epu32(a, b)       {}
+    #define mm_sha256rnds2_epu32(a, b, k)   (k)
+
+    // supporting
+    #define mm_alignr_epi8(a, b, c)         (a)
+    #define mm_blend_epi16(a, b, mask)      (a)
 #else
-    #define mm_sha1msg1_epu32(a, b)             _mm_sha1msg1_epu32(a, b)
-    #define mm_sha1msg2_epu32(a, b)             _mm_sha1msg2_epu32(a, b)
-    #define mm_sha1rnds4_epu32(a, b, functor)   _mm_sha1rnds4_epu32(a, b, functor)
-    #define mm_sha1nexte_epu32(a, b)            _mm_sha1nexte_epu32(a, b)
-    #define mm_sha256msg1_epu32(a, b)           _mm_sha256msg1_epu32(a, b)
-    #define mm_sha256msg2_epu32(a, b)           _mm_sha256msg2_epu32(a, b)
-    #define mm_sha256rnds2_epu32(a, b, k)       _mm_sha256rnds2_epu32(a, b, k)
+    #define mm_sha1msg1_epu32(a, b)         _mm_sha1msg1_epu32(a, b)
+    #define mm_sha1msg2_epu32(a, b)         _mm_sha1msg2_epu32(a, b)
+    #define mm_sha1rnds4_epu32(a, b, f)     _mm_sha1rnds4_epu32(a, b, f)
+    #define mm_sha1nexte_epu32(a, b)        _mm_sha1nexte_epu32(a, b)
+    #define mm_sha256msg1_epu32(a, b)       _mm_sha256msg1_epu32(a, b)
+    #define mm_sha256msg2_epu32(a, b)       _mm_sha256msg2_epu32(a, b)
+    #define mm_sha256rnds2_epu32(a, b, k)   _mm_sha256rnds2_epu32(a, b, k)
+
+    // supporting   
+    #define mm_alignr_epi8(a, b, c)         _mm_alignr_epi8(a, b, c)
+    #define mm_blend_epi16(a, b, mask)      _mm_blend_epi16(a, b, mask)
+
 #endif
 
 #endif
diff --git a/include/bitcoin/system/intrinsics/xcpu/functional_128.hpp b/include/bitcoin/system/intrinsics/xcpu/functional_128.hpp
index 68a227ceb2..43dd6d4900 100644
--- a/include/bitcoin/system/intrinsics/xcpu/functional_128.hpp
+++ b/include/bitcoin/system/intrinsics/xcpu/functional_128.hpp
@@ -76,9 +76,9 @@ INLINE xint128_t shr(xint128_t a) NOEXCEPT
 
     if constexpr (S == bits<uint16_t>)
         return mm_srli_epi16(a, B);
-    else if constexpr (S == bits<uint32_t>)
+    if constexpr (S == bits<uint32_t>)
         return mm_srli_epi32(a, B);
-    else if constexpr (S == bits<uint64_t>)
+    if constexpr (S == bits<uint64_t>)
         return mm_srli_epi64(a, B);
 }
 
@@ -93,9 +93,9 @@ INLINE xint128_t shl(xint128_t a) NOEXCEPT
 
     if constexpr (S == bits<uint16_t>)
         return mm_slli_epi16(a, B);
-    else if constexpr (S == bits<uint32_t>)
+    if constexpr (S == bits<uint32_t>)
         return mm_slli_epi32(a, B);
-    else if constexpr (S == bits<uint64_t>)
+    if constexpr (S == bits<uint64_t>)
         return mm_slli_epi64(a, B);
 }
 
@@ -117,11 +117,11 @@ INLINE xint128_t add(xint128_t a, xint128_t b) NOEXCEPT
 {
     if constexpr (S == bits<uint8_t>)
         return mm_add_epi8(a, b);
-    else if constexpr (S == bits<uint16_t>)
+    if constexpr (S == bits<uint16_t>)
         return mm_add_epi16(a, b);
-    else if constexpr (S == bits<uint32_t>)
+    if constexpr (S == bits<uint32_t>)
         return mm_add_epi32(a, b);
-    else if constexpr (S == bits<uint64_t>)
+    if constexpr (S == bits<uint64_t>)
         return mm_add_epi64(a, b);
 }
 
@@ -131,22 +131,36 @@ INLINE xint128_t addc(xint128_t a) NOEXCEPT
 {
     if constexpr (S == bits<uint8_t>)
         return add<S>(a, mm_set1_epi8(K));
-    else if constexpr (S == bits<uint16_t>)
+    if constexpr (S == bits<uint16_t>)
         return add<S>(a, mm_set1_epi16(K));
-    else if constexpr (S == bits<uint32_t>)
+    if constexpr (S == bits<uint32_t>)
         return add<S>(a, mm_set1_epi32(K));
-    else if constexpr (S == bits<uint64_t>)
+    if constexpr (S == bits<uint64_t>)
         return add<S>(a, mm_set1_epi64x(K));
 }
 
 } // namespace f
 
-/// broadcast/get/set
+/// add/broadcast/get/set
 /// ---------------------------------------------------------------------------
 
 // SSE2
-template <typename xWord, typename Word,
-    if_same<xWord, xint128_t> = true, if_integral_integer<Word> = true>
+template <typename Word, if_integral_integer<Word> = true>
+INLINE xint128_t add(xint128_t a, xint128_t b) NOEXCEPT
+{
+    if constexpr (is_same_type<Word, uint8_t>)
+        return mm_add_epi8(a, b);
+    if constexpr (is_same_type<Word, uint16_t>)
+        return mm_add_epi16(a, b);
+    if constexpr (is_same_type<Word, uint32_t>)
+        return mm_add_epi32(a, b);
+    if constexpr (is_same_type<Word, uint64_t>)
+        return mm_add_epi64(a, b);
+}
+
+// SSE2
+template <typename xWord, typename Word, if_integral_integer<Word> = true,
+    if_same<xWord, xint128_t> = true>
 INLINE xint128_t broadcast(Word a) NOEXCEPT
 {
     // set1 broadcasts integer to all elements.
@@ -172,13 +186,13 @@ INLINE Word get(xint128_t a) NOEXCEPT
         return mm_extract_epi8(a, Lane);
 
     // SSE2
-    else if constexpr (is_same_type<Word, uint16_t>)
+    if constexpr (is_same_type<Word, uint16_t>)
         return mm_extract_epi16(a, Lane);
 
     // SSE4.1
-    else if constexpr (is_same_type<Word, uint32_t>)
+    if constexpr (is_same_type<Word, uint32_t>)
         return mm_extract_epi32(a, Lane);
-    else if constexpr (is_same_type<Word, uint64_t>)
+    if constexpr (is_same_type<Word, uint64_t>)
         return mm_extract_epi64(a, Lane);
 }
 
@@ -257,25 +271,26 @@ INLINE xint128_t byteswap(xint128_t a) NOEXCEPT
 
 /// load/store (from casted to loaded/stored)
 /// ---------------------------------------------------------------------------
+/// These have defined overrides for !HAVE_SSE41
 
-INLINE xint128_t load_aligned(const xint128_t& bytes) NOEXCEPT
+INLINE xint128_t load(const xint128_t& bytes) NOEXCEPT
 {
-    return mm_load_si128(&bytes);
+    return mm_loadu_si128(&bytes);
 }
 
-INLINE xint128_t load(const xint128_t& bytes) NOEXCEPT
+INLINE void store(xint128_t& bytes, xint128_t a) NOEXCEPT
 {
-    return mm_loadu_si128(&bytes);
+    mm_storeu_si128(&bytes, a);
 }
 
-INLINE void store_aligned(xint128_t& bytes, xint128_t a) NOEXCEPT
+INLINE xint128_t load_aligned(const xint128_t& bytes) NOEXCEPT
 {
-    mm_store_si128(&bytes, a);
+    return mm_load_si128(&bytes);
 }
 
-INLINE void store(xint128_t& bytes, xint128_t a) NOEXCEPT
+INLINE void store_aligned(xint128_t& bytes, xint128_t a) NOEXCEPT
 {
-    mm_storeu_si128(&bytes, a);
+    mm_store_si128(&bytes, a);
 }
 
 #else
@@ -283,6 +298,33 @@ INLINE void store(xint128_t& bytes, xint128_t a) NOEXCEPT
 // Symbol is defined but not usable as an integer.
 using xint128_t = std_array<uint8_t, bytes<128>>;
 
+template <typename Word, if_integral_integer<Word> = true>
+INLINE xint128_t add(xint128_t, xint128_t) NOEXCEPT
+{
+    return {};
+}
+
+template <typename xWord, if_same<xWord, xint128_t> = true>
+INLINE xint128_t set(uint32_t, uint32_t, uint32_t, uint32_t) NOEXCEPT
+{
+    return {};
+}
+
+template <typename Word, if_integral_integer<Word> = true>
+INLINE xint128_t byteswap(xint128_t) NOEXCEPT
+{
+    return {};
+}
+
+INLINE xint128_t load(const xint128_t&) NOEXCEPT
+{
+    return {};
+}
+
+INLINE void store(xint128_t&, xint128_t) NOEXCEPT
+{
+}
+
 #endif // HAVE_SSE41
 
 } // namespace system
diff --git a/include/bitcoin/system/intrinsics/xcpu/functional_256.hpp b/include/bitcoin/system/intrinsics/xcpu/functional_256.hpp
index e2c3711cb4..f385863ac7 100644
--- a/include/bitcoin/system/intrinsics/xcpu/functional_256.hpp
+++ b/include/bitcoin/system/intrinsics/xcpu/functional_256.hpp
@@ -75,9 +75,9 @@ INLINE xint256_t shr(xint256_t a) NOEXCEPT
     // AVX2
     if constexpr (S == bits<uint16_t>)
         return mm256_srli_epi16(a, B);
-    else if constexpr (S == bits<uint32_t>)
+    if constexpr (S == bits<uint32_t>)
         return mm256_srli_epi32(a, B);
-    else if constexpr (S == bits<uint64_t>)
+    if constexpr (S == bits<uint64_t>)
         return mm256_srli_epi64(a, B);
 }
 
@@ -92,9 +92,9 @@ INLINE xint256_t shl(xint256_t a) NOEXCEPT
     // AVX2
     if constexpr (S == bits<uint16_t>)
         return mm256_slli_epi16(a, B);
-    else if constexpr (S == bits<uint32_t>)
+    if constexpr (S == bits<uint32_t>)
         return mm256_slli_epi32(a, B);
-    else if constexpr (S == bits<uint64_t>)
+    if constexpr (S == bits<uint64_t>)
         return mm256_slli_epi64(a, B);
 }
 
@@ -116,11 +116,11 @@ INLINE xint256_t add(xint256_t a, xint256_t b) NOEXCEPT
 {
     if constexpr (S == bits<uint8_t>)
         return mm256_add_epi8(a, b);
-    else if constexpr (S == bits<uint16_t>)
+    if constexpr (S == bits<uint16_t>)
         return mm256_add_epi16(a, b);
-    else if constexpr (S == bits<uint32_t>)
+    if constexpr (S == bits<uint32_t>)
         return mm256_add_epi32(a, b);
-    else if constexpr (S == bits<uint64_t>)
+    if constexpr (S == bits<uint64_t>)
         return mm256_add_epi64(a, b);
 }
 
@@ -130,22 +130,36 @@ INLINE xint256_t addc(xint256_t a) NOEXCEPT
 {
     if constexpr (S == bits<uint8_t>)
         return add<S>(a, mm256_set1_epi8(K));
-    else if constexpr (S == bits<uint16_t>)
+    if constexpr (S == bits<uint16_t>)
         return add<S>(a, mm256_set1_epi16(K));
-    else if constexpr (S == bits<uint32_t>)
+    if constexpr (S == bits<uint32_t>)
         return add<S>(a, mm256_set1_epi32(K));
-    else if constexpr (S == bits<uint64_t>)
+    if constexpr (S == bits<uint64_t>)
         return add<S>(a, mm256_set1_epi64x(K));
 }
 
 } // namespace f
 
-/// broadcast/get/set
+/// add/broadcast/get/set
 /// ---------------------------------------------------------------------------
 
 // AVX
-template <typename xWord, typename Word,
-    if_same<xWord, xint256_t> = true, if_integral_integer<Word> = true>
+template <typename Word, if_integral_integer<Word> = true>
+INLINE xint256_t add(xint256_t a, xint256_t b) NOEXCEPT
+{
+    if constexpr (is_same_type<Word, uint8_t>)
+        return mm256_add_epi8(a, b);
+    if constexpr (is_same_type<Word, uint16_t>)
+        return mm256_add_epi16(a, b);
+    if constexpr (is_same_type<Word, uint32_t>)
+        return mm256_add_epi32(a, b);
+    if constexpr (is_same_type<Word, uint64_t>)
+        return mm256_add_epi64(a, b);
+}
+
+// AVX
+template <typename xWord, typename Word, if_integral_integer<Word> = true,
+    if_same<xWord, xint256_t> = true>
 INLINE xint256_t broadcast(Word a) NOEXCEPT
 {
     // set1 broadcasts integer to all elements.
@@ -169,13 +183,13 @@ INLINE Word get(xint256_t a) NOEXCEPT
     // AVX2
     if constexpr (is_same_type<Word, uint8_t>)
         return mm256_extract_epi8(a, Lane);
-    else if constexpr (is_same_type<Word, uint16_t>)
+    if constexpr (is_same_type<Word, uint16_t>)
         return mm256_extract_epi16(a, Lane);
 
     // AVX
-    else if constexpr (is_same_type<Word, uint32_t>)
+    if constexpr (is_same_type<Word, uint32_t>)
         return mm256_extract_epi32(a, Lane);
-    else if constexpr (is_same_type<Word, uint64_t>)
+    if constexpr (is_same_type<Word, uint64_t>)
         return mm256_extract_epi64(a, Lane);
 }
 
@@ -270,25 +284,26 @@ INLINE xint256_t byteswap(xint256_t a) NOEXCEPT
 
 /// load/store (from casted to loaded/stored)
 /// ---------------------------------------------------------------------------
+/// These have defined overrides for !HAVE_AVX2
 
-INLINE xint256_t load_aligned(const xint256_t& bytes) NOEXCEPT
+INLINE xint256_t load(const xint256_t& bytes) NOEXCEPT
 {
-    return mm256_load_si256(&bytes);
+    return mm256_loadu_si256(&bytes);
 }
 
-INLINE xint256_t load(const xint256_t& bytes) NOEXCEPT
+INLINE void store(xint256_t& bytes, xint256_t a) NOEXCEPT
 {
-    return mm256_loadu_si256(&bytes);
+    mm256_storeu_si256(&bytes, a);
 }
 
-INLINE void store_aligned(xint256_t& bytes, xint256_t a) NOEXCEPT
+INLINE xint256_t load_aligned(const xint256_t& bytes) NOEXCEPT
 {
-    mm256_store_si256(&bytes, a);
+    return mm256_load_si256(&bytes);
 }
 
-INLINE void store(xint256_t& bytes, xint256_t a) NOEXCEPT
+INLINE void store_aligned(xint256_t& bytes, xint256_t a) NOEXCEPT
 {
-    mm256_storeu_si256(&bytes, a);
+    mm256_store_si256(&bytes, a);
 }
 
 #else
diff --git a/include/bitcoin/system/intrinsics/xcpu/functional_512.hpp b/include/bitcoin/system/intrinsics/xcpu/functional_512.hpp
index 3308bd0362..53724cdbbd 100644
--- a/include/bitcoin/system/intrinsics/xcpu/functional_512.hpp
+++ b/include/bitcoin/system/intrinsics/xcpu/functional_512.hpp
@@ -83,9 +83,9 @@ INLINE xint512_t shr(xint512_t a) NOEXCEPT
         return mm512_srli_epi16(a, B);
 
     // AVX512F
-    else if constexpr (S == bits<uint32_t>)
+    if constexpr (S == bits<uint32_t>)
         return mm512_srli_epi32(a, B);
-    else if constexpr (S == bits<uint64_t>)
+    if constexpr (S == bits<uint64_t>)
         return mm512_srli_epi64(a, B);
 }
 
@@ -102,9 +102,9 @@ INLINE xint512_t shl(xint512_t a) NOEXCEPT
         return mm512_slli_epi16(a, B);
 
     // AVX512F
-    else if constexpr (S == bits<uint32_t>)
+    if constexpr (S == bits<uint32_t>)
         return mm512_slli_epi32(a, B);
-    else if constexpr (S == bits<uint64_t>)
+    if constexpr (S == bits<uint64_t>)
         return mm512_slli_epi64(a, B);
 }
 
@@ -126,13 +126,13 @@ INLINE xint512_t add(xint512_t a, xint512_t b) NOEXCEPT
     // AVX512BW
     if constexpr (S == bits<uint8_t>)
         return mm512_add_epi8(a, b);
-    else if constexpr (S == bits<uint16_t>)
+    if constexpr (S == bits<uint16_t>)
         return mm512_add_epi16(a, b);
 
     // AVX512F
-    else if constexpr (S == bits<uint32_t>)
+    if constexpr (S == bits<uint32_t>)
         return mm512_add_epi32(a, b);
-    else if constexpr (S == bits<uint64_t>)
+    if constexpr (S == bits<uint64_t>)
         return mm512_add_epi64(a, b);
 }
 
@@ -142,11 +142,11 @@ INLINE xint512_t addc(xint512_t a) NOEXCEPT
 {
     if constexpr (S == bits<uint8_t>)
         return add<S>(a, mm512_set1_epi8(K));
-    else if constexpr (S == bits<uint16_t>)
+    if constexpr (S == bits<uint16_t>)
         return add<S>(a, mm512_set1_epi16(K));
-    else if constexpr (S == bits<uint32_t>)
+    if constexpr (S == bits<uint32_t>)
         return add<S>(a, mm512_set1_epi32(K));
-    else if constexpr (S == bits<uint64_t>)
+    if constexpr (S == bits<uint64_t>)
         return add<S>(a, mm512_set1_epi64(K));
 }
 
@@ -156,8 +156,22 @@ INLINE xint512_t addc(xint512_t a) NOEXCEPT
 /// ---------------------------------------------------------------------------
 
 // AVX512F
-template <typename xWord, typename Word,
-    if_same<xWord, xint512_t> = true, if_integral_integer<Word> = true>
+template <typename Word, if_integral_integer<Word> = true>
+INLINE xint512_t add(xint512_t a, xint512_t b) NOEXCEPT
+{
+    if constexpr (is_same_type<Word, uint8_t>)
+        return mm512_add_epi8(a, b);    // AVX512BW (512 bit, not mm256).
+    if constexpr (is_same_type<Word, uint16_t>)
+        return mm512_add_epi16(a, b);   // AVX512BW
+    if constexpr (is_same_type<Word, uint32_t>)
+        return mm512_add_epi32(a, b);   // AVX512F
+    if constexpr (is_same_type<Word, uint64_t>)
+        return mm512_add_epi64(a, b);   // AVX512F
+}
+
+// AVX512F
+template <typename xWord, typename Word, if_integral_integer<Word> = true,
+    if_same<xWord, xint512_t> = true>
 INLINE xint512_t broadcast(Word a) NOEXCEPT
 {
     // set1 broadcasts integer to all elements.
@@ -310,25 +324,26 @@ INLINE xint512_t byteswap(xint512_t a) NOEXCEPT
 
 /// load/store (from casted to loaded/stored)
 /// ---------------------------------------------------------------------------
+/// These have defined overrides for !HAVE_AVX512
 
-INLINE xint512_t load_aligned(const xint512_t& bytes) NOEXCEPT
+INLINE xint512_t load(const xint512_t& bytes) NOEXCEPT
 {
-    return mm512_load_si512(&bytes);
+    return mm512_loadu_si512(&bytes);
 }
 
-INLINE xint512_t load(const xint512_t& bytes) NOEXCEPT
+INLINE void store(xint512_t& bytes, xint512_t a) NOEXCEPT
 {
-    return mm512_loadu_si512(&bytes);
+    mm512_storeu_si512(&bytes, a);
 }
 
-INLINE void store_aligned(xint512_t& bytes, xint512_t a) NOEXCEPT
+INLINE xint512_t load_aligned(const xint512_t& bytes) NOEXCEPT
 {
-    mm512_store_si512(&bytes, a);
+    return mm512_load_si512(&bytes);
 }
 
-INLINE void store(xint512_t& bytes, xint512_t a) NOEXCEPT
+INLINE void store_aligned(xint512_t& bytes, xint512_t a) NOEXCEPT
 {
-    mm512_storeu_si512(&bytes, a);
+    mm512_store_si512(&bytes, a);
 }
 
 #else

From bb1adcdfb9ccf8789d07bfb4aea9dcdea0f543b5 Mon Sep 17 00:00:00 2001
From: evoskuil <eric@voskuil.org>
Date: Thu, 5 Dec 2024 12:12:21 -0500
Subject: [PATCH 6/9] Delint.

---
 include/bitcoin/system/intrinsics/xcpu/defines.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/bitcoin/system/intrinsics/xcpu/defines.hpp b/include/bitcoin/system/intrinsics/xcpu/defines.hpp
index 5a1f03b908..be56f019f6 100644
--- a/include/bitcoin/system/intrinsics/xcpu/defines.hpp
+++ b/include/bitcoin/system/intrinsics/xcpu/defines.hpp
@@ -325,13 +325,13 @@ BC_POP_WARNING()
     #define mm_sha1msg2_epu32(a, b)         {}
     #define mm_sha1rnds4_epu32(a, b, f)     {}
     #define mm_sha1nexte_epu32(a, b)        {}
-    #define mm_sha256msg1_epu32(a, b)       {}
-    #define mm_sha256msg2_epu32(a, b)       {}
+    #define mm_sha256msg1_epu32(a, b)       (b)
+    #define mm_sha256msg2_epu32(a, b)       (b)
     #define mm_sha256rnds2_epu32(a, b, k)   (k)
 
     // supporting
-    #define mm_alignr_epi8(a, b, c)         (a)
-    #define mm_blend_epi16(a, b, mask)      (a)
+    #define mm_alignr_epi8(a, b, c)         (b)
+    #define mm_blend_epi16(a, b, mask)      (b)
 #else
     #define mm_sha1msg1_epu32(a, b)         _mm_sha1msg1_epu32(a, b)
     #define mm_sha1msg2_epu32(a, b)         _mm_sha1msg2_epu32(a, b)

From 2b4691637c5df20d3aed0931e83b41419293faac Mon Sep 17 00:00:00 2001
From: evoskuil <eric@voskuil.org>
Date: Thu, 5 Dec 2024 12:31:58 -0500
Subject: [PATCH 7/9] Disable neon (activating but not impl).

---
 include/bitcoin/system/have.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/bitcoin/system/have.hpp b/include/bitcoin/system/have.hpp
index b9897f0fb9..79da053996 100644
--- a/include/bitcoin/system/have.hpp
+++ b/include/bitcoin/system/have.hpp
@@ -110,12 +110,13 @@
     #define HAVE_XASSEMBLY
 #endif
 
+/// DISABLED
 /// ARM Neon intrinsics.
 #if defined(HAVE_ARM)
     // -march=armv8-a+crc+crypto [all]
     // -arch arm64 [apple] (also -isysroot to phone sdk)
     #if defined(HAVE_GNUC) || defined(__ARM_NEON) || defined(HAVE_MSC)
-        #define HAVE_NEON
+        ////#define HAVE_NEON
     #endif
 #endif
 

From 7fa9ec0a392db09885bf513c02d2ae6697c479da Mon Sep 17 00:00:00 2001
From: evoskuil <eric@voskuil.org>
Date: Thu, 5 Dec 2024 12:32:09 -0500
Subject: [PATCH 8/9] Comments.

---
 .../impl/hash/sha/algorithm_compress.ipp      | 29 -------------------
 1 file changed, 29 deletions(-)

diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp
index 32093848b3..6b9db05926 100644
--- a/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp
+++ b/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp
@@ -71,17 +71,6 @@ round(auto a, auto& b, auto c, auto d, auto& e, auto wk) NOEXCEPT
 
     e = /*a =*/ f::add<s>(f::add<s>(f::add<s>(f::rol<5, s>(a), fn(b, c, d)), e), wk);
     b = /*c =*/ f::rol<30, s>(b);
-
-    // SHA-NI
-    // Four rounds (total rounds 80/4).
-    // First round is add(e, w), then sha1nexte(e, w).
-    // fk is round-based enumeration implying f selection and k value.
-    //     e1 = sha1nexte(e0, w);
-    //     abcd = sha1rnds4(abcd, e0, fk);
-    // NEON
-    // f is implied by k in wk.
-    //     e1 = vsha1h(vgetq_lane(abcd, 0);
-    //     vsha1cq(abcd, e0, vaddq(w, k));
 }
 
 TEMPLATE
@@ -97,16 +86,6 @@ round(auto a, auto b, auto c, auto& d, auto e, auto f, auto g, auto& h,
     const auto t = f::add<s>(f::add<s>(f::add<s>(Sigma1(e), choice(e, f, g)), h), wk);
     d = /*e =*/    f::add<s>(d, t);
     h = /*a =*/    f::add<s>(f::add<s>(Sigma0(a), majority(a, b, c)), t);
-
-    // Each call is 2 rounds, s, w and k are 128 (4 words each, s1/s2 is 8 word state).
-    // SHA-NI
-    //     const auto value = add(w, k);
-    //     abcd = sha256rnds2(abcd, efgh, value);
-    //     efgh = sha256rnds2(efgh, abcd, shuffle(value));
-    // NEON
-    //     const auto value = vaddq(w, k);
-    //     abcd = vsha256hq(abcd, efgh, value);
-    //     efgh = vsha256h2q(efgh, abcd, value);
 }
 
 TEMPLATE
@@ -125,10 +104,6 @@ round(auto& state, const auto& wk) NOEXCEPT
             state[(SHA::rounds + 3 - Round) % SHA::state_words],
             state[(SHA::rounds + 4 - Round) % SHA::state_words], // a->e
             extract<word, Lane>(wk[Round]));
-
-        // SHA-NI/NEON
-        // State packs in 128 (one state variable), reduces above to 1 out[].
-        // Input value is 128 (w). Constants (k) statically initialized as 128.
     }
     else
     {
@@ -142,10 +117,6 @@ round(auto& state, const auto& wk) NOEXCEPT
             state[(SHA::rounds + 6 - Round) % SHA::state_words],
             state[(SHA::rounds + 7 - Round) % SHA::state_words], // a->h
             extract<word, Lane>(wk[Round]));
-
-        // SHA-NI/NEON
-        // Each element is 128 (vs. 32), reduces above to 2 out[] (s0/s1).
-        // Input value is 128 (w). Constants (k) statically initialized as 128.
     }
 }
 

From 99167b1f7a60d26837385376771b47ef78605a7d Mon Sep 17 00:00:00 2001
From: evoskuil <eric@voskuil.org>
Date: Thu, 5 Dec 2024 13:03:17 -0500
Subject: [PATCH 9/9] Delint.

---
 .../system/impl/hash/sha/algorithm_native.ipp        |  5 +++--
 include/bitcoin/system/intrinsics/xcpu/defines.hpp   | 11 ++++++-----
 .../system/intrinsics/xcpu/functional_128.hpp        | 12 ++++++------
 3 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp
index 1c0f424001..79f7a34898 100644
--- a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp
+++ b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp
@@ -63,9 +63,10 @@ prepare(xint128_t& message0, xint128_t message1) NOEXCEPT
 
 TEMPLATE
 INLINE void CLASS::
-prepare(xint128_t& message0, xint128_t message1, xint128_t& message2) NOEXCEPT
+prepare(xint128_t& SHANI_ONLY(message0), xint128_t message1,
+    xint128_t& message2) NOEXCEPT
 {
-    message2 = mm_sha256msg2_epu32(add<word_t>(message2,
+    message2 = mm_sha256msg2_epu32(mm_add_epi32(message2,
         mm_alignr_epi8(message1, message0, 4)), message1);
 }
 
diff --git a/include/bitcoin/system/intrinsics/xcpu/defines.hpp b/include/bitcoin/system/intrinsics/xcpu/defines.hpp
index be56f019f6..0d997d0960 100644
--- a/include/bitcoin/system/intrinsics/xcpu/defines.hpp
+++ b/include/bitcoin/system/intrinsics/xcpu/defines.hpp
@@ -328,10 +328,9 @@ BC_POP_WARNING()
     #define mm_sha256msg1_epu32(a, b)       (b)
     #define mm_sha256msg2_epu32(a, b)       (b)
     #define mm_sha256rnds2_epu32(a, b, k)   (k)
-
-    // supporting
-    #define mm_alignr_epi8(a, b, c)         (b)
-    #define mm_blend_epi16(a, b, mask)      (b)
+    #define mm_alignr_epi8(a, b, c)         (a)
+    #define mm_blend_epi16(a, b, mask)      (a)
+    #define SHANI_ONLY(a)
 #else
     #define mm_sha1msg1_epu32(a, b)         _mm_sha1msg1_epu32(a, b)
     #define mm_sha1msg2_epu32(a, b)         _mm_sha1msg2_epu32(a, b)
@@ -341,10 +340,12 @@ BC_POP_WARNING()
     #define mm_sha256msg2_epu32(a, b)       _mm_sha256msg2_epu32(a, b)
     #define mm_sha256rnds2_epu32(a, b, k)   _mm_sha256rnds2_epu32(a, b, k)
 
-    // supporting   
+    // supporting
     #define mm_alignr_epi8(a, b, c)         _mm_alignr_epi8(a, b, c)
     #define mm_blend_epi16(a, b, mask)      _mm_blend_epi16(a, b, mask)
 
+    // unused argument suppression
+    #define SHANI_ONLY(a) a
 #endif
 
 #endif
diff --git a/include/bitcoin/system/intrinsics/xcpu/functional_128.hpp b/include/bitcoin/system/intrinsics/xcpu/functional_128.hpp
index 43dd6d4900..0f841da915 100644
--- a/include/bitcoin/system/intrinsics/xcpu/functional_128.hpp
+++ b/include/bitcoin/system/intrinsics/xcpu/functional_128.hpp
@@ -299,9 +299,9 @@ INLINE void store_aligned(xint128_t& bytes, xint128_t a) NOEXCEPT
 using xint128_t = std_array<uint8_t, bytes<128>>;
 
 template <typename Word, if_integral_integer<Word> = true>
-INLINE xint128_t add(xint128_t, xint128_t) NOEXCEPT
+INLINE xint128_t add(xint128_t, xint128_t b) NOEXCEPT
 {
-    return {};
+    return b;
 }
 
 template <typename xWord, if_same<xWord, xint128_t> = true>
@@ -311,14 +311,14 @@ INLINE xint128_t set(uint32_t, uint32_t, uint32_t, uint32_t) NOEXCEPT
 }
 
 template <typename Word, if_integral_integer<Word> = true>
-INLINE xint128_t byteswap(xint128_t) NOEXCEPT
+INLINE xint128_t byteswap(xint128_t a) NOEXCEPT
 {
-    return {};
+    return a;
 }
 
-INLINE xint128_t load(const xint128_t&) NOEXCEPT
+INLINE xint128_t load(const xint128_t& a) NOEXCEPT
 {
-    return {};
+    return a;
 }
 
 INLINE void store(xint128_t&, xint128_t) NOEXCEPT