From 09a51274d8e75bdcd63b10513b093ae98e4a724c Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 23 Oct 2023 17:18:43 +0530 Subject: [PATCH 1/3] Organize GEMM stuff into separate files --- slimt/QMM.cc | 879 +------------------------------------ slimt/QMM.hh | 12 - slimt/qmm/Gemmology.inl.hh | 348 +++++++++++++++ slimt/qmm/Intgemm.inl.hh | 238 ++++++++++ slimt/qmm/Ruy.inl.hh | 272 ++++++++++++ 5 files changed, 870 insertions(+), 879 deletions(-) create mode 100644 slimt/qmm/Gemmology.inl.hh create mode 100644 slimt/qmm/Intgemm.inl.hh create mode 100644 slimt/qmm/Ruy.inl.hh diff --git a/slimt/QMM.cc b/slimt/QMM.cc index 34531859..e3e00e3d 100644 --- a/slimt/QMM.cc +++ b/slimt/QMM.cc @@ -6,890 +6,35 @@ #ifdef SLIMT_HAS_INTGEMM #include "intgemm/callbacks/configs.h" #include "intgemm/intgemm.h" + +namespace slimt::qmm::detail { +constexpr Provider kAutoProvider = Provider::Intgemm; +} +#include "slimt/qmm/Intgemm.inl.hh" #endif #ifdef SLIMT_HAS_RUY #include "ruy/ruy.h" +namespace slimt::qmm::detail { +constexpr Provider kAutoProvider = Provider::Ruy; +} +#include "slimt/qmm/Ruy.inl.hh" #endif #ifdef SLIMT_HAS_GEMMOLOGY #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-parameter" #include "gemmology/gemmology.h" - -#if defined(USE_AVX512) -#define GEMMOLOGY_SUPPORTED_ARCHS \ - xsimd::arch_list -#elif defined(USE_AVX2) -#define GEMMOLOGY_SUPPORTED_ARCHS \ - xsimd::arch_list -#elif defined(USE_SSSE3) -#define GEMMOLOGY_SUPPORTED_ARCHS xsimd::arch_list -#elif defined(USE_SSE2) -#define GEMMOLOGY_SUPPORTED_ARCHS xsimd::arch_list -#elif defined(USE_NEON) and defined(XSIMD_WITH_NEON64) -#define GEMMOLOGY_SUPPORTED_ARCHS xsimd::arch_list -#else -#error no supported architecture -#endif - -#pragma GCC diagnostic pop -#endif - -#include "slimt/Tensor.hh" - -#ifdef SLIMT_HAS_INTGEMM namespace slimt::qmm::detail { -template <> -Tensor affine_with_select( - Tensor& x, const Tensor& W, const Tensor& b, float a_quant, float b_quant, - const std::vector& indices, const std::string& name) { - // Naming is to simplify thinking with the intgemm API below. - Tensor& A = x; // NOLINT - const Tensor& B = W; // NOLINT - const Tensor& bias = b; - - size_t A_cols = A.dim(-1); // NOLINT - size_t B_cols = B.dim(-1); // NOLINT - size_t A_rows = A.size() / A_cols; // NOLINT - size_t B_rows = B.size() / B_cols; // NOLINT - - size_t width = A_cols; - // SLIMT_TRACE3(x.shape(), W.shape(), b.shape()); - - // Check widths are same, making matrix multiplication viable. - assert(A_cols == B_rows); - - // Prepare Activations (A). - Tensor prepared_A(Type::i8, A.shape(), "quantized_acts"); // NOLINT - intgemm::Int8Shift::PrepareA( // - A.data(), prepared_A.data(), // - a_quant, // - A_rows, width // - ); - - // Prepare bias - Tensor prepared_bias(Type::f32, bias.shape(), "prepared_bias"); - constexpr float kMax8bit = kInt8Maxf; - float a_alpha = kMax8bit / a_quant; - float b_alpha = kMax8bit / b_quant; - - float bias_unquant_multiplier = (-1.0F * (a_alpha * b_alpha)) / kMax8bit; - auto prepare_bias_callback = intgemm::callbacks::UnquantizeAndAddBiasAndWrite( - bias_unquant_multiplier, bias.data(), // - prepared_bias.data() // - ); - - intgemm::Int8Shift::PrepareBias( // - B.data(), // - width, B_cols, // - prepare_bias_callback // - ); - - // Select before multiply? 
- // NOLINTNEXTLINE - Tensor selected_B(Type::i8, Shape({width, indices.size()}), "selected_B"); - const uint32_t* indices_begin = indices.data(); - const uint32_t* indices_end = indices.data() + indices.size(); - - intgemm::Int8::SelectColumnsB(B.data(), selected_B.data(), - B_rows, indices_begin, indices_end); - - // Select bias accordingly. - Tensor selected_bias(Type::f32, Shape({indices.size()}), "selected_bias"); - auto* selected_bias_ptr = selected_bias.data(); - for (uint32_t index : indices) { - *(selected_bias_ptr) = *(prepared_bias.data() + index); - ++selected_bias_ptr; - } - - // Multiply y = A * B + bias (affine) - // Set y's shape replacing last dimension with the feature-dim B is projecting - // onto (B_cols). - Shape out_shape = x.shape(); - out_shape.set_dim(-1, indices.size()); - - Tensor y(Type::f32, out_shape, (name.empty() ? x.name() : name)); - size_t selected_B_cols = selected_B.dim(-1); // NOLINT - - float unquant_multiplier = 1.0F / (a_quant * b_quant); - auto multiply_callback = intgemm::callbacks::UnquantizeAndAddBiasAndWrite( - unquant_multiplier, selected_bias.data(), y.data()); - intgemm::Int8Shift::Multiply( // - prepared_A.data(), selected_B.data(), // - A_rows, width, selected_B_cols, // - multiply_callback // - ); - - return y; -} - -template <> -Tensor affine(Tensor& x, const Tensor& W, const Tensor& b, - float a_quant, float b_quant, - const std::string& name) { - // Naming is to simplify thinking with the intgemm API below. - Tensor& A = x; // NOLINT - const Tensor& B = W; // NOLINT - const Tensor& bias = b; - - size_t A_cols = A.dim(-1); // NOLINT - size_t B_cols = B.dim(-1); // NOLINT - size_t A_rows = A.size() / A_cols; // NOLINT - size_t B_rows = B.size() / B_cols; // NOLINT - - size_t width = A_cols; - // SLIMT_TRACE3(x.shape(), W.shape(), b.shape()); - - // Check widths are same, making matrix multiplication viable. - (void)B_rows; - assert(A_cols == B_rows); - - // Prepare Activations (A). - Tensor prepared_A(Type::i8, A.shape(), "quantized_acts"); // NOLINT - intgemm::Int8Shift::PrepareA( // - A.data(), prepared_A.data(), // - a_quant, // - A_rows, width // - ); - - // Prepare bias - Tensor prepared_bias(Type::f32, bias.shape(), "prepared_bias"); - float a_alpha = kInt8Maxf / a_quant; - float b_alpha = kInt8Maxf / b_quant; - - float bias_unquant_multiplier = (-1.0F * (a_alpha * b_alpha)) / kInt8Maxf; - auto prepare_bias_callback = intgemm::callbacks::UnquantizeAndAddBiasAndWrite( - bias_unquant_multiplier, bias.data(), // - prepared_bias.data() // - ); - - intgemm::Int8Shift::PrepareBias( // - B.data(), // - width, B_cols, // - prepare_bias_callback // - ); - - // Multiply y = A * B + bias (affine) - // Set y's shape replacing last dimension with the feature-dim B is projecting - // onto (B_cols). - Shape out_shape = x.shape(); - out_shape.set_dim(-1, B_cols); - - Tensor y(Type::f32, out_shape, (name.empty() ? x.name() : name)); - - float unquant_multiplier = 1.0F / (a_quant * b_quant); - auto multiply_callback = intgemm::callbacks::UnquantizeAndAddBiasAndWrite( - unquant_multiplier, prepared_bias.data(), y.data()); - intgemm::Int8Shift::Multiply( // - prepared_A.data(), B.data(), // - A_rows, width, B_cols, // - multiply_callback // - ); - - return y; -} - -template <> -Tensor dot(Tensor& x, const Tensor& W, float a_quant, - float b_quant, const std::string& name) { - // Naming is to simplify thinking with the intgemm API below. 
- Tensor& A = x; // NOLINT - const Tensor& B = W; // NOLINT - - size_t A_cols = A.dim(-1); // NOLINT - size_t B_cols = B.dim(-1); // NOLINT - size_t A_rows = A.size() / A_cols; // NOLINT - size_t B_rows = B.size() / B_cols; // NOLINT - - size_t width = A_cols; - // SLIMT_TRACE3(x.shape(), W.shape(), b.shape()); - - // Check widths are same, making matrix multiplication viable. - (void)B_rows; - assert(A_cols == B_rows); - - // Prepare Activations (A). - Tensor prepared_A(Type::i8, A.shape(), "quantized_acts"); // NOLINT - intgemm::Int8Shift::PrepareA( // - A.data(), prepared_A.data(), // - a_quant, // - A_rows, width // - ); - - // Prepare bias - - // Fake bias, all elements are zero. - Tensor bias(x.type(), Shape({1, B_cols}), "zero_bias"); - bias.fill_in_place(0.0F); - - Tensor prepared_bias(Type::f32, bias.shape(), "prepared_bias"); - float a_alpha = kInt8Maxf / a_quant; - float b_alpha = kInt8Maxf / b_quant; - - float bias_unquant_multiplier = (-1.0F * (a_alpha * b_alpha)) / kInt8Maxf; - auto prepare_bias_callback = intgemm::callbacks::UnquantizeAndAddBiasAndWrite( - bias_unquant_multiplier, bias.data(), // - prepared_bias.data() // - ); - - intgemm::Int8Shift::PrepareBias( // - B.data(), // - width, B_cols, // - prepare_bias_callback // - ); - - // - // Multiply y = A * B (dot) - // Set y's shape replacing last dimension with the feature-dim B is projecting - // onto (B_cols). - Shape out_shape = x.shape(); - out_shape.set_dim(-1, B_cols); - - Tensor y(Type::f32, out_shape, (name.empty() ? x.name() : name)); - - float unquant_multiplier = 1.0F / (a_quant * b_quant); - auto multiply_callback = intgemm::callbacks::UnquantizeAndAddBiasAndWrite( - unquant_multiplier, prepared_bias.data(), y.data()); - intgemm::Int8Shift::Multiply( // - prepared_A.data(), B.data(), // - A_rows, width, B_cols, // - multiply_callback // - ); - - return y; +constexpr Provider kAutoProvider = Provider::Gemmology; } - -template <> -void prepare_weight_transposed(const float* weights, - int8_t* prepared, - float quantization_multiplier, - size_t cols, size_t rows) { - intgemm::Int8::PrepareBTransposed(weights, prepared, quantization_multiplier, - cols, rows); -} - -template <> -void prepare_weight_quantized_transposed(const int8_t* input, - int8_t* output, - size_t rows, - size_t cols) { - intgemm::Int8::PrepareBQuantizedTransposed(input, output, rows, cols); -} -} // namespace slimt::qmm::detail - +#include "slimt/qmm/Gemmology.inl.hh" +#pragma GCC diagnostic pop #endif -#ifdef SLIMT_HAS_RUY -namespace slimt::qmm::detail { - -using Index = uint64_t; - -void quantize(const float* input, float scale, Index rows, Index width, - int8_t* output) { - const Index size = rows * width; - for (size_t i = 0; i < size; i++) { - // Round to nearest after multiplying with scale. - float value = roundf(scale * input[i]); - - // Since float can store bigger values, we threshold anything that's gone - // higher and can't fit in int8. - value = std::max(-kInt8Maxf, value); - value = std::min(kInt8Maxf, value); - - // Finally a static cast. 
- output[i] = static_cast(value); - }; -} - -template -void transpose(const Scalar* input, Index rows, Index cols, Scalar* output) { - for (size_t i = 0; i < rows; i++) { - for (size_t j = 0; j < cols; j++) { - output[j * rows + i] = input[i * cols + j]; - } - } -} - -void unquantize(const int32_t* input, float unquant_multiplier, Index rows_A, - Index cols_B, float* output) { - for (size_t i = 0; i < rows_A; i++) { - for (size_t j = 0; j < cols_B; j++) { - Index idx = i * cols_B + j; - output[idx] = (input[idx] * unquant_multiplier); - } - } -} - -void unquantizeAddBias(const int32_t* input, const float* input_bias_prepared, - float unquant_multiplier, Index rows_A, Index cols_B, - float* output) { - for (size_t i = 0; i < rows_A; i++) { - for (size_t j = 0; j < cols_B; j++) { - Index idx = i * cols_B + j; - output[idx] = (input[idx] * unquant_multiplier) + input_bias_prepared[j]; - } - } -} - -// Ruy. -template <> -Tensor affine(Tensor& x, const Tensor& W, const Tensor& b, - float a_quant, float b_quant, - const std::string& name) { - Tensor& A = x; // NOLINT - const Tensor& B = W; // NOLINT - const Tensor& bias = b; - - size_t A_cols = A.dim(-1); // NOLINT - size_t B_cols = B.dim(-1); // NOLINT - size_t A_rows = A.size() / A_cols; // NOLINT - size_t B_rows = B.size() / B_cols; // NOLINT - - size_t width = B_rows; - - (void)name; - // Prepare A: Quantize from f32 -> i8 - Tensor prepared_A(Type::i8, x.shape(), "prepared_A"); // NOLINT - - detail::quantize(x.data(), a_quant, A_rows, A_cols, - prepared_A.data()); - - ruy::Context context; - ruy::Matrix lhs; - ruy::MakeSimpleLayout(A_rows, width, ruy::Order::kRowMajor, - lhs.mutable_layout()); - lhs.set_data(prepared_A.data()); - - // PrepareB: ? - ruy::Matrix rhs; - ruy::MakeSimpleLayout(width, B_cols, ruy::Order::kColMajor, - rhs.mutable_layout()); - rhs.set_data(W.data()); - - // PrepareBias: ? - // Actualyl there is no need. - const Tensor& prepared_bias = bias; - - ruy::Matrix dst; - ruy::MakeSimpleLayout(A_rows, B_cols, ruy::Order::kRowMajor, - dst.mutable_layout()); - - Shape out_shape = x.shape(); - out_shape.set_dim(-1, B_cols); - Tensor AB(Type::i32, out_shape, name + "_out"); // NOLINT - dst.set_data(AB.data()); - - // Multiply C = AB; - // When Dst is int32, mul_params is unused. - ruy::MulParams mul_params; - ruy::Mul(lhs, rhs, mul_params, &context, &dst); - - // Unquantizes, then adds bias in a single statement on the output. 
- Tensor y(Type::f32, out_shape, name + "_out"); // NOLINT - float unquant_multiplier = 1.0F / (a_quant * b_quant); - detail::unquantizeAddBias(AB.data(), prepared_bias.data(), - unquant_multiplier, A_rows, B_cols, - y.data()); - return y; -} - -template <> -Tensor affine_with_select(Tensor& x, const Tensor& W, - const Tensor& b, float a_quant, - float b_quant, - const std::vector& indices, - const std::string& name) { - Tensor& A = x; // NOLINT - const Tensor& B = W; // NOLINT - const Tensor& bias = b; - - size_t A_cols = A.dim(-1); // NOLINT - size_t B_cols = B.dim(-1); // NOLINT - size_t A_rows = A.size() / A_cols; // NOLINT - size_t B_rows = B.size() / B_cols; // NOLINT - - size_t width = B_rows; - - (void)name; - // Prepare A: Quantize from f32 -> i8 - Tensor prepared_A(Type::i8, x.shape(), "prepared_A"); // NOLINT - - detail::quantize(x.data(), a_quant, A_rows, A_cols, - prepared_A.data()); - - ruy::Context context; - ruy::Matrix lhs; - ruy::MakeSimpleLayout(A_rows, width, ruy::Order::kRowMajor, - lhs.mutable_layout()); - lhs.set_data(prepared_A.data()); - - // PrepareB: Select - Tensor selected_B(Type::i8, Shape({width, indices.size()}), // NOLINT - "selected_B"); - - // SelectColumnsB, but inlined? - // B_prepared is expected to be col-major, for our implementation via ruy. If - // col-major we can memcpy the respective column entries as they're - // sequential. There are width = rows entries. - auto B_data = B.data(); // NOLINT - auto sB_data = selected_B.data(); // NOLINT - for (size_t c = 0; c < indices.size(); ++c) { - int8_t* sB_begin = &(sB_data[c * width]); // NOLINT - const int8_t* B_begin = &(B_data[indices[c] * width]); // NOLINT - std::memcpy(sB_begin, B_begin, width); - } - - ruy::Matrix rhs; - ruy::MakeSimpleLayout(width, indices.size(), ruy::Order::kColMajor, - rhs.mutable_layout()); - rhs.set_data(selected_B.data()); - - // Once again, bias needn't be prepared. But needs to be selected. - const Tensor& prepared_bias = bias; - Tensor selected_bias(Type::f32, Shape({indices.size()}), "selected_bias"); - auto* selected_bias_ptr = selected_bias.data(); - for (uint32_t index : indices) { - *(selected_bias_ptr) = *(prepared_bias.data() + index); - ++selected_bias_ptr; - } - - // Multiply C = A select(B); - // When Dst is int32, mul_params is unused. - size_t selected_B_cols = selected_B.dim(-1); // NOLINT - ruy::Matrix dst; - ruy::MakeSimpleLayout(A_rows, selected_B_cols, ruy::Order::kRowMajor, - dst.mutable_layout()); - - Shape out_shape = x.shape(); - out_shape.set_dim(-1, selected_B_cols); - - Tensor AB(Type::i32, out_shape, name + "_out"); // NOLINT - dst.set_data(AB.data()); - - ruy::MulParams mul_params; - ruy::Mul(lhs, rhs, mul_params, &context, &dst); - - // Unquantizes, then adds bias in a single statement on the output. 
- Tensor y(Type::f32, out_shape, name + "_out"); // NOLINT - float unquant_multiplier = 1.0F / (a_quant * b_quant); - detail::unquantizeAddBias(AB.data(), prepared_bias.data(), - unquant_multiplier, A_rows, selected_B_cols, - y.data()); - return y; -} - -template <> -Tensor dot(Tensor& x, const Tensor& W, float a_quant, - float b_quant, const std::string& name) { - Tensor& A = x; // NOLINT - const Tensor& B = W; // NOLINT - - size_t A_cols = A.dim(-1); // NOLINT - size_t B_cols = B.dim(-1); // NOLINT - size_t A_rows = A.size() / A_cols; // NOLINT - size_t B_rows = B.size() / B_cols; // NOLINT - - size_t width = B_rows; - - (void)name; - // Prepare A: Quantize from f32 -> i8 - Tensor prepared_A(Type::i8, x.shape(), "prepared_A"); // NOLINT - - detail::quantize(x.data(), a_quant, A_rows, A_cols, - prepared_A.data()); - - ruy::Context context; - ruy::Matrix lhs; - ruy::MakeSimpleLayout(A_rows, width, ruy::Order::kRowMajor, - lhs.mutable_layout()); - lhs.set_data(prepared_A.data()); - - // PrepareB: ? - ruy::Matrix rhs; - ruy::MakeSimpleLayout(width, B_cols, ruy::Order::kColMajor, - rhs.mutable_layout()); - rhs.set_data(W.data()); - - // PrepareBias: ? - // Actualyl there is no need. - ruy::Matrix dst; - ruy::MakeSimpleLayout(A_rows, B_cols, ruy::Order::kRowMajor, - dst.mutable_layout()); - - Shape out_shape = x.shape(); - out_shape.set_dim(-1, B_cols); - Tensor AB(Type::i32, out_shape, name + "_out"); // NOLINT - dst.set_data(AB.data()); - - // Multiply C = AB; - // When Dst is int32, mul_params is unused. - ruy::MulParams mul_params; - ruy::Mul(lhs, rhs, mul_params, &context, &dst); - - // Unquantizes, then adds bias in a single statement on the output. - Tensor y(Type::f32, out_shape, name + "_out"); // NOLINT - float unquant_multiplier = 1.0F / (a_quant * b_quant); - detail::unquantize(AB.data(), unquant_multiplier, A_rows, B_cols, - y.data()); - return y; -} - -template <> -void prepare_weight_transposed(const float* weights, - int8_t* prepared, - float quantization_multiplier, - size_t cols, size_t rows) { - detail::quantize(weights, quantization_multiplier, cols, rows, prepared); -} - -template <> -void prepare_weight_quantized_transposed(const int8_t* input, - int8_t* output, - size_t rows, - size_t cols) { - std::memcpy(output, input, - /*count=*/sizeof(int8_t) * (rows * cols)); -} -} // namespace slimt::qmm::detail -#endif // SLIMT_HAS_RUY - #ifdef SLIMT_HAS_GEMMOLOGY - -namespace gemmology { - -#ifdef USE_AVX512 -template struct Engine; -template void Engine::SelectColumnsB(const int8_t*, int8_t*, - size_t, const uint32_t*, - const uint32_t*); -template void Engine::Shift::Multiply( - const uint8_t*, const int8_t*, size_t, size_t, size_t, - gemmology::callbacks::UnquantizeAndAddBiasAndWrite); -template void Engine::Shift::PrepareBias( - const int8_t*, size_t, size_t, - gemmology::callbacks::UnquantizeAndAddBiasAndWrite); -#endif - -#ifdef USE_AVX2 -template struct Engine; -template void Engine::SelectColumnsB(const int8_t*, int8_t*, - size_t, const uint32_t*, - const uint32_t*); -template void Engine::Shift::Multiply( - const uint8_t*, const int8_t*, size_t, size_t, size_t, - gemmology::callbacks::UnquantizeAndAddBiasAndWrite); -template void Engine::Shift::PrepareBias( - const int8_t*, size_t, size_t, - gemmology::callbacks::UnquantizeAndAddBiasAndWrite); -#endif - -#ifdef USE_SSE2 -template struct Engine; -template void Engine::SelectColumnsB(const int8_t*, int8_t*, - size_t, const uint32_t*, - const uint32_t*); - -template void Engine::Shift::Multiply( - const uint8_t*, const 
int8_t*, size_t, size_t, size_t, - gemmology::callbacks::UnquantizeAndAddBiasAndWrite); -template void Engine::Shift::PrepareBias( - const int8_t*, size_t, size_t, - gemmology::callbacks::UnquantizeAndAddBiasAndWrite); #endif -#ifdef USE_SSSE3 -template struct Engine; -template void Engine::SelectColumnsB(const int8_t*, int8_t*, - size_t, const uint32_t*, - const uint32_t*); -template void Engine::Shift::Multiply( - const uint8_t*, const int8_t*, size_t, size_t, size_t, - gemmology::callbacks::UnquantizeAndAddBiasAndWrite); -template void Engine::Shift::PrepareBias( - const int8_t*, size_t, size_t, - gemmology::callbacks::UnquantizeAndAddBiasAndWrite); -#endif - -#ifdef USE_NEON -template struct Engine; -template void Engine::SelectColumnsB(const int8_t*, int8_t*, - size_t, const uint32_t*, - const uint32_t*); -template void Engine::Shift::Multiply( - const uint8_t*, const int8_t*, size_t, size_t, size_t, - gemmology::callbacks::UnquantizeAndAddBiasAndWrite); -template void Engine::Shift::PrepareBias( - const int8_t*, size_t, size_t, - gemmology::callbacks::UnquantizeAndAddBiasAndWrite); -#endif // USE_NEON - -} // namespace gemmology - -// Dispatch *at runtime* based on run-time hardware and compile-time -// architectures. -// -// FIXME: Ideally we would not run the dispatch code at each function call. -#define GEMMOLOGY_DISPATCH(FUNCTION) \ - xsimd::dispatch([](auto arch, auto... args) { \ - return gemmology::Engine::FUNCTION(args...); \ - }) - -namespace slimt::qmm::detail { - -template <> -Tensor affine_with_select( - Tensor& x, const Tensor& W, const Tensor& b, float a_quant, float b_quant, - const std::vector& indices, const std::string& name) { - // Naming is to simplify thinking with the gemmology API below. - Tensor& A = x; // NOLINT - const Tensor& B = W; // NOLINT - const Tensor& bias = b; - - size_t A_cols = A.dim(-1); // NOLINT - size_t B_cols = B.dim(-1); // NOLINT - size_t A_rows = A.size() / A_cols; // NOLINT - size_t B_rows = B.size() / B_cols; // NOLINT - - size_t width = A_cols; - // SLIMT_TRACE3(x.shape(), W.shape(), b.shape()); - - // Check widths are same, making matrix multiplication viable. - assert(A_cols == B_rows); - - // Prepare Activations (A). - Tensor prepared_A(Type::i8, A.shape(), "quantized_acts"); // NOLINT - auto PrepareA = GEMMOLOGY_DISPATCH(Shift::PrepareA); // NOLINT - PrepareA( // - A.data(), prepared_A.data(), // - a_quant, // - A_rows, width // - ); - - // Prepare bias - Tensor prepared_bias(Type::f32, bias.shape(), "prepared_bias"); - constexpr float kMax8bit = kInt8Maxf; - float a_alpha = kMax8bit / a_quant; - float b_alpha = kMax8bit / b_quant; - - float bias_unquant_multiplier = (-1.0F * (a_alpha * b_alpha)) / kMax8bit; - auto prepare_bias_callback = - gemmology::callbacks::UnquantizeAndAddBiasAndWrite( - bias_unquant_multiplier, bias.data(), // - prepared_bias.data() // - ); - - auto PrepareBias = GEMMOLOGY_DISPATCH(Shift::PrepareBias); // NOLINT - PrepareBias( // - B.data(), // - width, B_cols, // - prepare_bias_callback // - ); - - // Select before multiply? - // NOLINTNEXTLINE - Tensor selected_B(Type::i8, Shape({width, indices.size()}), "selected_B"); - const uint32_t* indices_begin = indices.data(); - const uint32_t* indices_end = indices.data() + indices.size(); - - auto SelectColumnsB = GEMMOLOGY_DISPATCH(SelectColumnsB); // NOLINT - SelectColumnsB(B.data(), selected_B.data(), B_rows, - indices_begin, indices_end); - - // Select bias accordingly. 
- Tensor selected_bias(Type::f32, Shape({indices.size()}), "selected_bias"); - auto* selected_bias_ptr = selected_bias.data(); - for (uint32_t index : indices) { - *(selected_bias_ptr) = *(prepared_bias.data() + index); - ++selected_bias_ptr; - } - - // Multiply y = A * B + bias (affine) - // Set y's shape replacing last dimension with the feature-dim B is projecting - // onto (B_cols). - Shape out_shape = x.shape(); - out_shape.set_dim(-1, indices.size()); - - Tensor y(Type::f32, out_shape, (name.empty() ? x.name() : name)); - size_t selected_B_cols = selected_B.dim(-1); // NOLINT - - float unquant_multiplier = 1.0F / (a_quant * b_quant); - auto multiply_callback = gemmology::callbacks::UnquantizeAndAddBiasAndWrite( - unquant_multiplier, selected_bias.data(), y.data()); - auto Multiply = GEMMOLOGY_DISPATCH(Shift::Multiply); // NOLINT - Multiply( // - prepared_A.data(), selected_B.data(), // - A_rows, width, selected_B_cols, // - multiply_callback // - ); - - return y; -} - -template <> -Tensor affine(Tensor& x, const Tensor& W, const Tensor& b, - float a_quant, float b_quant, - const std::string& name) { - // Naming is to simplify thinking with the gemmology API below. - Tensor& A = x; // NOLINT - const Tensor& B = W; // NOLINT - const Tensor& bias = b; - - size_t A_cols = A.dim(-1); // NOLINT - size_t B_cols = B.dim(-1); // NOLINT - size_t A_rows = A.size() / A_cols; // NOLINT - size_t B_rows = B.size() / B_cols; // NOLINT - - size_t width = A_cols; - // SLIMT_TRACE3(x.shape(), W.shape(), b.shape()); - - // Check widths are same, making matrix multiplication viable. - (void)B_rows; - assert(A_cols == B_rows); - - // Prepare Activations (A). - Tensor prepared_A(Type::i8, A.shape(), "quantized_acts"); // NOLINT - auto PrepareA = GEMMOLOGY_DISPATCH(Shift::PrepareA); // NOLINT - PrepareA( // - A.data(), prepared_A.data(), // - a_quant, // - A_rows, width // - ); - - // Prepare bias - Tensor prepared_bias(Type::f32, bias.shape(), "prepared_bias"); - float a_alpha = kInt8Maxf / a_quant; - float b_alpha = kInt8Maxf / b_quant; - - float bias_unquant_multiplier = (-1.0F * (a_alpha * b_alpha)) / kInt8Maxf; - auto prepare_bias_callback = - gemmology::callbacks::UnquantizeAndAddBiasAndWrite( - bias_unquant_multiplier, bias.data(), // - prepared_bias.data() // - ); - - auto PrepareBias = GEMMOLOGY_DISPATCH(Shift::PrepareBias); // NOLINT - PrepareBias( // - B.data(), // - width, B_cols, // - prepare_bias_callback // - ); - - // Multiply y = A * B + bias (affine) - // Set y's shape replacing last dimension with the feature-dim B is projecting - // onto (B_cols). - Shape out_shape = x.shape(); - out_shape.set_dim(-1, B_cols); - - Tensor y(Type::f32, out_shape, (name.empty() ? x.name() : name)); - - float unquant_multiplier = 1.0F / (a_quant * b_quant); - auto multiply_callback = gemmology::callbacks::UnquantizeAndAddBiasAndWrite( - unquant_multiplier, prepared_bias.data(), y.data()); - auto Multiply = GEMMOLOGY_DISPATCH(Shift::Multiply); // NOLINT - Multiply( // - prepared_A.data(), B.data(), // - A_rows, width, B_cols, // - multiply_callback // - ); - - return y; -} - -template <> -Tensor dot(Tensor& x, const Tensor& W, float a_quant, - float b_quant, const std::string& name) { - // Naming is to simplify thinking with the gemmology API below. 
- Tensor& A = x; // NOLINT - const Tensor& B = W; // NOLINT - - size_t A_cols = A.dim(-1); // NOLINT - size_t B_cols = B.dim(-1); // NOLINT - size_t A_rows = A.size() / A_cols; // NOLINT - size_t B_rows = B.size() / B_cols; // NOLINT - - size_t width = A_cols; - // SLIMT_TRACE3(x.shape(), W.shape(), b.shape()); - - // Check widths are same, making matrix multiplication viable. - (void)B_rows; - assert(A_cols == B_rows); - - // Prepare Activations (A). - Tensor prepared_A(Type::i8, A.shape(), "quantized_acts"); // NOLINT - auto PrepareA = GEMMOLOGY_DISPATCH(Shift::PrepareA); // NOLINT - PrepareA( // - A.data(), prepared_A.data(), // - a_quant, // - A_rows, width // - ); - - // Prepare bias - - // Fake bias, all elements are zero. - Tensor bias(x.type(), Shape({1, B_cols}), "zero_bias"); - bias.fill_in_place(0.0F); - - Tensor prepared_bias(Type::f32, bias.shape(), "prepared_bias"); - float a_alpha = kInt8Maxf / a_quant; - float b_alpha = kInt8Maxf / b_quant; - - float bias_unquant_multiplier = (-1.0F * (a_alpha * b_alpha)) / kInt8Maxf; - auto prepare_bias_callback = - gemmology::callbacks::UnquantizeAndAddBiasAndWrite( - bias_unquant_multiplier, bias.data(), // - prepared_bias.data() // - ); - - auto PrepareBias = GEMMOLOGY_DISPATCH(Shift::PrepareBias); // NOLINT - PrepareBias( // - B.data(), // - width, B_cols, // - prepare_bias_callback // - ); - - // - // Multiply y = A * B (dot) - // Set y's shape replacing last dimension with the feature-dim B is projecting - // onto (B_cols). - Shape out_shape = x.shape(); - out_shape.set_dim(-1, B_cols); - - Tensor y(Type::f32, out_shape, (name.empty() ? x.name() : name)); - - float unquant_multiplier = 1.0F / (a_quant * b_quant); - auto multiply_callback = gemmology::callbacks::UnquantizeAndAddBiasAndWrite( - unquant_multiplier, prepared_bias.data(), y.data()); - auto Multiply = GEMMOLOGY_DISPATCH(Shift::Multiply); // NOLINT - Multiply( // - prepared_A.data(), B.data(), // - A_rows, width, B_cols, // - multiply_callback // - ); - - return y; -} - -template <> -void prepare_weight_transposed( - const float* weights, int8_t* prepared, float quantization_multiplier, - size_t cols, size_t rows) { - auto PrepareBTransposed = GEMMOLOGY_DISPATCH(PrepareBTransposed); // NOLINT - PrepareBTransposed(weights, prepared, quantization_multiplier, cols, rows); -} - -template <> -void prepare_weight_quantized_transposed( - const int8_t* input, int8_t* output, size_t rows, size_t cols) { - // NOLINTNEXTLINE - auto PrepareBQuantizedTransposed = - GEMMOLOGY_DISPATCH(PrepareBQuantizedTransposed); - PrepareBQuantizedTransposed(input, output, rows, cols); -} - -} // namespace slimt::qmm::detail -#endif // SLIMT_HAS_GEMMOLOGY - namespace slimt::qmm { Tensor affine(Tensor& x, const Tensor& W, const Tensor& b, float a_quant, float b_quant, const std::string& name) { diff --git a/slimt/QMM.hh b/slimt/QMM.hh index 8df05b6e..b9c969d1 100644 --- a/slimt/QMM.hh +++ b/slimt/QMM.hh @@ -43,18 +43,6 @@ template void prepare_weight_quantized_transposed(const int8_t* input, int8_t* output, size_t rows, size_t cols); -#ifdef SLIMT_HAS_INTGEMM -constexpr Provider kAutoProvider = Provider::Intgemm; -#endif - -#ifdef SLIMT_HAS_RUY -constexpr Provider kAutoProvider = Provider::Ruy; -#endif - -#ifdef SLIMT_HAS_GEMMOLOGY -constexpr Provider kAutoProvider = Provider::Gemmology; -#endif - } // namespace detail Tensor affine(Tensor& x, const Tensor& W, const Tensor& b, float a_quant, diff --git a/slimt/qmm/Gemmology.inl.hh b/slimt/qmm/Gemmology.inl.hh new file mode 100644 index 
00000000..db122209 --- /dev/null +++ b/slimt/qmm/Gemmology.inl.hh @@ -0,0 +1,348 @@ +#if defined(USE_AVX512) +#define GEMMOLOGY_SUPPORTED_ARCHS \ + xsimd::arch_list +#elif defined(USE_AVX2) +#define GEMMOLOGY_SUPPORTED_ARCHS \ + xsimd::arch_list +#elif defined(USE_SSSE3) +#define GEMMOLOGY_SUPPORTED_ARCHS xsimd::arch_list +#elif defined(USE_SSE2) +#define GEMMOLOGY_SUPPORTED_ARCHS xsimd::arch_list +#elif defined(USE_NEON) and defined(XSIMD_WITH_NEON64) +#define GEMMOLOGY_SUPPORTED_ARCHS xsimd::arch_list +#else +#error no supported architecture +#endif + +namespace gemmology { + +#ifdef USE_AVX512 +template struct Engine; +template void Engine::SelectColumnsB(const int8_t*, int8_t*, + size_t, const uint32_t*, + const uint32_t*); +template void Engine::Shift::Multiply( + const uint8_t*, const int8_t*, size_t, size_t, size_t, + gemmology::callbacks::UnquantizeAndAddBiasAndWrite); +template void Engine::Shift::PrepareBias( + const int8_t*, size_t, size_t, + gemmology::callbacks::UnquantizeAndAddBiasAndWrite); +#endif + +#ifdef USE_AVX2 +template struct Engine; +template void Engine::SelectColumnsB(const int8_t*, int8_t*, + size_t, const uint32_t*, + const uint32_t*); +template void Engine::Shift::Multiply( + const uint8_t*, const int8_t*, size_t, size_t, size_t, + gemmology::callbacks::UnquantizeAndAddBiasAndWrite); +template void Engine::Shift::PrepareBias( + const int8_t*, size_t, size_t, + gemmology::callbacks::UnquantizeAndAddBiasAndWrite); +#endif + +#ifdef USE_SSE2 +template struct Engine; +template void Engine::SelectColumnsB(const int8_t*, int8_t*, + size_t, const uint32_t*, + const uint32_t*); + +template void Engine::Shift::Multiply( + const uint8_t*, const int8_t*, size_t, size_t, size_t, + gemmology::callbacks::UnquantizeAndAddBiasAndWrite); +template void Engine::Shift::PrepareBias( + const int8_t*, size_t, size_t, + gemmology::callbacks::UnquantizeAndAddBiasAndWrite); +#endif + +#ifdef USE_SSSE3 +template struct Engine; +template void Engine::SelectColumnsB(const int8_t*, int8_t*, + size_t, const uint32_t*, + const uint32_t*); +template void Engine::Shift::Multiply( + const uint8_t*, const int8_t*, size_t, size_t, size_t, + gemmology::callbacks::UnquantizeAndAddBiasAndWrite); +template void Engine::Shift::PrepareBias( + const int8_t*, size_t, size_t, + gemmology::callbacks::UnquantizeAndAddBiasAndWrite); +#endif + +#ifdef USE_NEON +template struct Engine; +template void Engine::SelectColumnsB(const int8_t*, int8_t*, + size_t, const uint32_t*, + const uint32_t*); +template void Engine::Shift::Multiply( + const uint8_t*, const int8_t*, size_t, size_t, size_t, + gemmology::callbacks::UnquantizeAndAddBiasAndWrite); +template void Engine::Shift::PrepareBias( + const int8_t*, size_t, size_t, + gemmology::callbacks::UnquantizeAndAddBiasAndWrite); +#endif // USE_NEON + +} // namespace gemmology + +// Dispatch *at runtime* based on run-time hardware and compile-time +// architectures. +// +// FIXME: Ideally we would not run the dispatch code at each function call. +#define GEMMOLOGY_DISPATCH(FUNCTION) \ + xsimd::dispatch([](auto arch, auto... args) { \ + return gemmology::Engine::FUNCTION(args...); \ + }) + +namespace slimt::qmm::detail { + +template <> +Tensor affine_with_select( + Tensor& x, const Tensor& W, const Tensor& b, float a_quant, float b_quant, + const std::vector& indices, const std::string& name) { + // Naming is to simplify thinking with the gemmology API below. 
+ Tensor& A = x; // NOLINT + const Tensor& B = W; // NOLINT + const Tensor& bias = b; + + size_t A_cols = A.dim(-1); // NOLINT + size_t B_cols = B.dim(-1); // NOLINT + size_t A_rows = A.size() / A_cols; // NOLINT + size_t B_rows = B.size() / B_cols; // NOLINT + + size_t width = A_cols; + // SLIMT_TRACE3(x.shape(), W.shape(), b.shape()); + + // Check widths are same, making matrix multiplication viable. + assert(A_cols == B_rows); + + // Prepare Activations (A). + Tensor prepared_A(Type::i8, A.shape(), "quantized_acts"); // NOLINT + auto PrepareA = GEMMOLOGY_DISPATCH(Shift::PrepareA); // NOLINT + PrepareA( // + A.data(), prepared_A.data(), // + a_quant, // + A_rows, width // + ); + + // Prepare bias + Tensor prepared_bias(Type::f32, bias.shape(), "prepared_bias"); + constexpr float kMax8bit = kInt8Maxf; + float a_alpha = kMax8bit / a_quant; + float b_alpha = kMax8bit / b_quant; + + float bias_unquant_multiplier = (-1.0F * (a_alpha * b_alpha)) / kMax8bit; + auto prepare_bias_callback = + gemmology::callbacks::UnquantizeAndAddBiasAndWrite( + bias_unquant_multiplier, bias.data(), // + prepared_bias.data() // + ); + + auto PrepareBias = GEMMOLOGY_DISPATCH(Shift::PrepareBias); // NOLINT + PrepareBias( // + B.data(), // + width, B_cols, // + prepare_bias_callback // + ); + + // Select before multiply? + // NOLINTNEXTLINE + Tensor selected_B(Type::i8, Shape({width, indices.size()}), "selected_B"); + const uint32_t* indices_begin = indices.data(); + const uint32_t* indices_end = indices.data() + indices.size(); + + auto SelectColumnsB = GEMMOLOGY_DISPATCH(SelectColumnsB); // NOLINT + SelectColumnsB(B.data(), selected_B.data(), B_rows, + indices_begin, indices_end); + + // Select bias accordingly. + Tensor selected_bias(Type::f32, Shape({indices.size()}), "selected_bias"); + auto* selected_bias_ptr = selected_bias.data(); + for (uint32_t index : indices) { + *(selected_bias_ptr) = *(prepared_bias.data() + index); + ++selected_bias_ptr; + } + + // Multiply y = A * B + bias (affine) + // Set y's shape replacing last dimension with the feature-dim B is projecting + // onto (B_cols). + Shape out_shape = x.shape(); + out_shape.set_dim(-1, indices.size()); + + Tensor y(Type::f32, out_shape, (name.empty() ? x.name() : name)); + size_t selected_B_cols = selected_B.dim(-1); // NOLINT + + float unquant_multiplier = 1.0F / (a_quant * b_quant); + auto multiply_callback = gemmology::callbacks::UnquantizeAndAddBiasAndWrite( + unquant_multiplier, selected_bias.data(), y.data()); + auto Multiply = GEMMOLOGY_DISPATCH(Shift::Multiply); // NOLINT + Multiply( // + prepared_A.data(), selected_B.data(), // + A_rows, width, selected_B_cols, // + multiply_callback // + ); + + return y; +} + +template <> +Tensor affine(Tensor& x, const Tensor& W, const Tensor& b, + float a_quant, float b_quant, + const std::string& name) { + // Naming is to simplify thinking with the gemmology API below. + Tensor& A = x; // NOLINT + const Tensor& B = W; // NOLINT + const Tensor& bias = b; + + size_t A_cols = A.dim(-1); // NOLINT + size_t B_cols = B.dim(-1); // NOLINT + size_t A_rows = A.size() / A_cols; // NOLINT + size_t B_rows = B.size() / B_cols; // NOLINT + + size_t width = A_cols; + // SLIMT_TRACE3(x.shape(), W.shape(), b.shape()); + + // Check widths are same, making matrix multiplication viable. + (void)B_rows; + assert(A_cols == B_rows); + + // Prepare Activations (A). 
+ Tensor prepared_A(Type::i8, A.shape(), "quantized_acts"); // NOLINT + auto PrepareA = GEMMOLOGY_DISPATCH(Shift::PrepareA); // NOLINT + PrepareA( // + A.data(), prepared_A.data(), // + a_quant, // + A_rows, width // + ); + + // Prepare bias + Tensor prepared_bias(Type::f32, bias.shape(), "prepared_bias"); + float a_alpha = kInt8Maxf / a_quant; + float b_alpha = kInt8Maxf / b_quant; + + float bias_unquant_multiplier = (-1.0F * (a_alpha * b_alpha)) / kInt8Maxf; + auto prepare_bias_callback = + gemmology::callbacks::UnquantizeAndAddBiasAndWrite( + bias_unquant_multiplier, bias.data(), // + prepared_bias.data() // + ); + + auto PrepareBias = GEMMOLOGY_DISPATCH(Shift::PrepareBias); // NOLINT + PrepareBias( // + B.data(), // + width, B_cols, // + prepare_bias_callback // + ); + + // Multiply y = A * B + bias (affine) + // Set y's shape replacing last dimension with the feature-dim B is projecting + // onto (B_cols). + Shape out_shape = x.shape(); + out_shape.set_dim(-1, B_cols); + + Tensor y(Type::f32, out_shape, (name.empty() ? x.name() : name)); + + float unquant_multiplier = 1.0F / (a_quant * b_quant); + auto multiply_callback = gemmology::callbacks::UnquantizeAndAddBiasAndWrite( + unquant_multiplier, prepared_bias.data(), y.data()); + auto Multiply = GEMMOLOGY_DISPATCH(Shift::Multiply); // NOLINT + Multiply( // + prepared_A.data(), B.data(), // + A_rows, width, B_cols, // + multiply_callback // + ); + + return y; +} + +template <> +Tensor dot(Tensor& x, const Tensor& W, float a_quant, + float b_quant, const std::string& name) { + // Naming is to simplify thinking with the gemmology API below. + Tensor& A = x; // NOLINT + const Tensor& B = W; // NOLINT + + size_t A_cols = A.dim(-1); // NOLINT + size_t B_cols = B.dim(-1); // NOLINT + size_t A_rows = A.size() / A_cols; // NOLINT + size_t B_rows = B.size() / B_cols; // NOLINT + + size_t width = A_cols; + // SLIMT_TRACE3(x.shape(), W.shape(), b.shape()); + + // Check widths are same, making matrix multiplication viable. + (void)B_rows; + assert(A_cols == B_rows); + + // Prepare Activations (A). + Tensor prepared_A(Type::i8, A.shape(), "quantized_acts"); // NOLINT + auto PrepareA = GEMMOLOGY_DISPATCH(Shift::PrepareA); // NOLINT + PrepareA( // + A.data(), prepared_A.data(), // + a_quant, // + A_rows, width // + ); + + // Prepare bias + + // Fake bias, all elements are zero. + Tensor bias(x.type(), Shape({1, B_cols}), "zero_bias"); + bias.fill_in_place(0.0F); + + Tensor prepared_bias(Type::f32, bias.shape(), "prepared_bias"); + float a_alpha = kInt8Maxf / a_quant; + float b_alpha = kInt8Maxf / b_quant; + + float bias_unquant_multiplier = (-1.0F * (a_alpha * b_alpha)) / kInt8Maxf; + auto prepare_bias_callback = + gemmology::callbacks::UnquantizeAndAddBiasAndWrite( + bias_unquant_multiplier, bias.data(), // + prepared_bias.data() // + ); + + auto PrepareBias = GEMMOLOGY_DISPATCH(Shift::PrepareBias); // NOLINT + PrepareBias( // + B.data(), // + width, B_cols, // + prepare_bias_callback // + ); + + // + // Multiply y = A * B (dot) + // Set y's shape replacing last dimension with the feature-dim B is projecting + // onto (B_cols). + Shape out_shape = x.shape(); + out_shape.set_dim(-1, B_cols); + + Tensor y(Type::f32, out_shape, (name.empty() ? 
x.name() : name)); + + float unquant_multiplier = 1.0F / (a_quant * b_quant); + auto multiply_callback = gemmology::callbacks::UnquantizeAndAddBiasAndWrite( + unquant_multiplier, prepared_bias.data(), y.data()); + auto Multiply = GEMMOLOGY_DISPATCH(Shift::Multiply); // NOLINT + Multiply( // + prepared_A.data(), B.data(), // + A_rows, width, B_cols, // + multiply_callback // + ); + + return y; +} + +template <> +void prepare_weight_transposed( + const float* weights, int8_t* prepared, float quantization_multiplier, + size_t cols, size_t rows) { + auto PrepareBTransposed = GEMMOLOGY_DISPATCH(PrepareBTransposed); // NOLINT + PrepareBTransposed(weights, prepared, quantization_multiplier, cols, rows); +} + +template <> +void prepare_weight_quantized_transposed( + const int8_t* input, int8_t* output, size_t rows, size_t cols) { + // NOLINTNEXTLINE + auto PrepareBQuantizedTransposed = + GEMMOLOGY_DISPATCH(PrepareBQuantizedTransposed); + PrepareBQuantizedTransposed(input, output, rows, cols); +} + +} // namespace slimt::qmm::detail diff --git a/slimt/qmm/Intgemm.inl.hh b/slimt/qmm/Intgemm.inl.hh new file mode 100644 index 00000000..b8e1e98f --- /dev/null +++ b/slimt/qmm/Intgemm.inl.hh @@ -0,0 +1,238 @@ +namespace slimt::qmm::detail { +template <> +Tensor affine_with_select( + Tensor& x, const Tensor& W, const Tensor& b, float a_quant, float b_quant, + const std::vector& indices, const std::string& name) { + // Naming is to simplify thinking with the intgemm API below. + Tensor& A = x; // NOLINT + const Tensor& B = W; // NOLINT + const Tensor& bias = b; + + size_t A_cols = A.dim(-1); // NOLINT + size_t B_cols = B.dim(-1); // NOLINT + size_t A_rows = A.size() / A_cols; // NOLINT + size_t B_rows = B.size() / B_cols; // NOLINT + + size_t width = A_cols; + // SLIMT_TRACE3(x.shape(), W.shape(), b.shape()); + + // Check widths are same, making matrix multiplication viable. + assert(A_cols == B_rows); + + // Prepare Activations (A). + Tensor prepared_A(Type::i8, A.shape(), "quantized_acts"); // NOLINT + intgemm::Int8Shift::PrepareA( // + A.data(), prepared_A.data(), // + a_quant, // + A_rows, width // + ); + + // Prepare bias + Tensor prepared_bias(Type::f32, bias.shape(), "prepared_bias"); + constexpr float kMax8bit = kInt8Maxf; + float a_alpha = kMax8bit / a_quant; + float b_alpha = kMax8bit / b_quant; + + float bias_unquant_multiplier = (-1.0F * (a_alpha * b_alpha)) / kMax8bit; + auto prepare_bias_callback = intgemm::callbacks::UnquantizeAndAddBiasAndWrite( + bias_unquant_multiplier, bias.data(), // + prepared_bias.data() // + ); + + intgemm::Int8Shift::PrepareBias( // + B.data(), // + width, B_cols, // + prepare_bias_callback // + ); + + // Select before multiply? + // NOLINTNEXTLINE + Tensor selected_B(Type::i8, Shape({width, indices.size()}), "selected_B"); + const uint32_t* indices_begin = indices.data(); + const uint32_t* indices_end = indices.data() + indices.size(); + + intgemm::Int8::SelectColumnsB(B.data(), selected_B.data(), + B_rows, indices_begin, indices_end); + + // Select bias accordingly. + Tensor selected_bias(Type::f32, Shape({indices.size()}), "selected_bias"); + auto* selected_bias_ptr = selected_bias.data(); + for (uint32_t index : indices) { + *(selected_bias_ptr) = *(prepared_bias.data() + index); + ++selected_bias_ptr; + } + + // Multiply y = A * B + bias (affine) + // Set y's shape replacing last dimension with the feature-dim B is projecting + // onto (B_cols). 
+ Shape out_shape = x.shape(); + out_shape.set_dim(-1, indices.size()); + + Tensor y(Type::f32, out_shape, (name.empty() ? x.name() : name)); + size_t selected_B_cols = selected_B.dim(-1); // NOLINT + + float unquant_multiplier = 1.0F / (a_quant * b_quant); + auto multiply_callback = intgemm::callbacks::UnquantizeAndAddBiasAndWrite( + unquant_multiplier, selected_bias.data(), y.data()); + intgemm::Int8Shift::Multiply( // + prepared_A.data(), selected_B.data(), // + A_rows, width, selected_B_cols, // + multiply_callback // + ); + + return y; +} + +template <> +Tensor affine(Tensor& x, const Tensor& W, const Tensor& b, + float a_quant, float b_quant, + const std::string& name) { + // Naming is to simplify thinking with the intgemm API below. + Tensor& A = x; // NOLINT + const Tensor& B = W; // NOLINT + const Tensor& bias = b; + + size_t A_cols = A.dim(-1); // NOLINT + size_t B_cols = B.dim(-1); // NOLINT + size_t A_rows = A.size() / A_cols; // NOLINT + size_t B_rows = B.size() / B_cols; // NOLINT + + size_t width = A_cols; + // SLIMT_TRACE3(x.shape(), W.shape(), b.shape()); + + // Check widths are same, making matrix multiplication viable. + (void)B_rows; + assert(A_cols == B_rows); + + // Prepare Activations (A). + Tensor prepared_A(Type::i8, A.shape(), "quantized_acts"); // NOLINT + intgemm::Int8Shift::PrepareA( // + A.data(), prepared_A.data(), // + a_quant, // + A_rows, width // + ); + + // Prepare bias + Tensor prepared_bias(Type::f32, bias.shape(), "prepared_bias"); + float a_alpha = kInt8Maxf / a_quant; + float b_alpha = kInt8Maxf / b_quant; + + float bias_unquant_multiplier = (-1.0F * (a_alpha * b_alpha)) / kInt8Maxf; + auto prepare_bias_callback = intgemm::callbacks::UnquantizeAndAddBiasAndWrite( + bias_unquant_multiplier, bias.data(), // + prepared_bias.data() // + ); + + intgemm::Int8Shift::PrepareBias( // + B.data(), // + width, B_cols, // + prepare_bias_callback // + ); + + // Multiply y = A * B + bias (affine) + // Set y's shape replacing last dimension with the feature-dim B is projecting + // onto (B_cols). + Shape out_shape = x.shape(); + out_shape.set_dim(-1, B_cols); + + Tensor y(Type::f32, out_shape, (name.empty() ? x.name() : name)); + + float unquant_multiplier = 1.0F / (a_quant * b_quant); + auto multiply_callback = intgemm::callbacks::UnquantizeAndAddBiasAndWrite( + unquant_multiplier, prepared_bias.data(), y.data()); + intgemm::Int8Shift::Multiply( // + prepared_A.data(), B.data(), // + A_rows, width, B_cols, // + multiply_callback // + ); + + return y; +} + +template <> +Tensor dot(Tensor& x, const Tensor& W, float a_quant, + float b_quant, const std::string& name) { + // Naming is to simplify thinking with the intgemm API below. + Tensor& A = x; // NOLINT + const Tensor& B = W; // NOLINT + + size_t A_cols = A.dim(-1); // NOLINT + size_t B_cols = B.dim(-1); // NOLINT + size_t A_rows = A.size() / A_cols; // NOLINT + size_t B_rows = B.size() / B_cols; // NOLINT + + size_t width = A_cols; + // SLIMT_TRACE3(x.shape(), W.shape(), b.shape()); + + // Check widths are same, making matrix multiplication viable. + (void)B_rows; + assert(A_cols == B_rows); + + // Prepare Activations (A). + Tensor prepared_A(Type::i8, A.shape(), "quantized_acts"); // NOLINT + intgemm::Int8Shift::PrepareA( // + A.data(), prepared_A.data(), // + a_quant, // + A_rows, width // + ); + + // Prepare bias + + // Fake bias, all elements are zero. 
+ Tensor bias(x.type(), Shape({1, B_cols}), "zero_bias"); + bias.fill_in_place(0.0F); + + Tensor prepared_bias(Type::f32, bias.shape(), "prepared_bias"); + float a_alpha = kInt8Maxf / a_quant; + float b_alpha = kInt8Maxf / b_quant; + + float bias_unquant_multiplier = (-1.0F * (a_alpha * b_alpha)) / kInt8Maxf; + auto prepare_bias_callback = intgemm::callbacks::UnquantizeAndAddBiasAndWrite( + bias_unquant_multiplier, bias.data(), // + prepared_bias.data() // + ); + + intgemm::Int8Shift::PrepareBias( // + B.data(), // + width, B_cols, // + prepare_bias_callback // + ); + + // + // Multiply y = A * B (dot) + // Set y's shape replacing last dimension with the feature-dim B is projecting + // onto (B_cols). + Shape out_shape = x.shape(); + out_shape.set_dim(-1, B_cols); + + Tensor y(Type::f32, out_shape, (name.empty() ? x.name() : name)); + + float unquant_multiplier = 1.0F / (a_quant * b_quant); + auto multiply_callback = intgemm::callbacks::UnquantizeAndAddBiasAndWrite( + unquant_multiplier, prepared_bias.data(), y.data()); + intgemm::Int8Shift::Multiply( // + prepared_A.data(), B.data(), // + A_rows, width, B_cols, // + multiply_callback // + ); + + return y; +} + +template <> +void prepare_weight_transposed(const float* weights, + int8_t* prepared, + float quantization_multiplier, + size_t cols, size_t rows) { + intgemm::Int8::PrepareBTransposed(weights, prepared, quantization_multiplier, + cols, rows); +} + +template <> +void prepare_weight_quantized_transposed(const int8_t* input, + int8_t* output, + size_t rows, + size_t cols) { + intgemm::Int8::PrepareBQuantizedTransposed(input, output, rows, cols); +} +} // namespace slimt::qmm::detail diff --git a/slimt/qmm/Ruy.inl.hh b/slimt/qmm/Ruy.inl.hh new file mode 100644 index 00000000..ce195957 --- /dev/null +++ b/slimt/qmm/Ruy.inl.hh @@ -0,0 +1,272 @@ + +namespace slimt::qmm::detail { + +using Index = uint64_t; + +void quantize(const float* input, float scale, Index rows, Index width, + int8_t* output) { + const Index size = rows * width; + for (size_t i = 0; i < size; i++) { + // Round to nearest after multiplying with scale. + float value = roundf(scale * input[i]); + + // Since float can store bigger values, we threshold anything that's gone + // higher and can't fit in int8. + value = std::max(-kInt8Maxf, value); + value = std::min(kInt8Maxf, value); + + // Finally a static cast. + output[i] = static_cast(value); + }; +} + +template +void transpose(const Scalar* input, Index rows, Index cols, Scalar* output) { + for (size_t i = 0; i < rows; i++) { + for (size_t j = 0; j < cols; j++) { + output[j * rows + i] = input[i * cols + j]; + } + } +} + +void unquantize(const int32_t* input, float unquant_multiplier, Index rows_A, + Index cols_B, float* output) { + for (size_t i = 0; i < rows_A; i++) { + for (size_t j = 0; j < cols_B; j++) { + Index idx = i * cols_B + j; + output[idx] = (input[idx] * unquant_multiplier); + } + } +} + +void unquantizeAddBias(const int32_t* input, const float* input_bias_prepared, + float unquant_multiplier, Index rows_A, Index cols_B, + float* output) { + for (size_t i = 0; i < rows_A; i++) { + for (size_t j = 0; j < cols_B; j++) { + Index idx = i * cols_B + j; + output[idx] = (input[idx] * unquant_multiplier) + input_bias_prepared[j]; + } + } +} + +// Ruy. 
+template <> +Tensor affine(Tensor& x, const Tensor& W, const Tensor& b, + float a_quant, float b_quant, + const std::string& name) { + Tensor& A = x; // NOLINT + const Tensor& B = W; // NOLINT + const Tensor& bias = b; + + size_t A_cols = A.dim(-1); // NOLINT + size_t B_cols = B.dim(-1); // NOLINT + size_t A_rows = A.size() / A_cols; // NOLINT + size_t B_rows = B.size() / B_cols; // NOLINT + + size_t width = B_rows; + + (void)name; + // Prepare A: Quantize from f32 -> i8 + Tensor prepared_A(Type::i8, x.shape(), "prepared_A"); // NOLINT + + detail::quantize(x.data(), a_quant, A_rows, A_cols, + prepared_A.data()); + + ruy::Context context; + ruy::Matrix lhs; + ruy::MakeSimpleLayout(A_rows, width, ruy::Order::kRowMajor, + lhs.mutable_layout()); + lhs.set_data(prepared_A.data()); + + // PrepareB: ? + ruy::Matrix rhs; + ruy::MakeSimpleLayout(width, B_cols, ruy::Order::kColMajor, + rhs.mutable_layout()); + rhs.set_data(W.data()); + + // PrepareBias: ? + // Actualyl there is no need. + const Tensor& prepared_bias = bias; + + ruy::Matrix dst; + ruy::MakeSimpleLayout(A_rows, B_cols, ruy::Order::kRowMajor, + dst.mutable_layout()); + + Shape out_shape = x.shape(); + out_shape.set_dim(-1, B_cols); + Tensor AB(Type::i32, out_shape, name + "_out"); // NOLINT + dst.set_data(AB.data()); + + // Multiply C = AB; + // When Dst is int32, mul_params is unused. + ruy::MulParams mul_params; + ruy::Mul(lhs, rhs, mul_params, &context, &dst); + + // Unquantizes, then adds bias in a single statement on the output. + Tensor y(Type::f32, out_shape, name + "_out"); // NOLINT + float unquant_multiplier = 1.0F / (a_quant * b_quant); + detail::unquantizeAddBias(AB.data(), prepared_bias.data(), + unquant_multiplier, A_rows, B_cols, + y.data()); + return y; +} + +template <> +Tensor affine_with_select(Tensor& x, const Tensor& W, + const Tensor& b, float a_quant, + float b_quant, + const std::vector& indices, + const std::string& name) { + Tensor& A = x; // NOLINT + const Tensor& B = W; // NOLINT + const Tensor& bias = b; + + size_t A_cols = A.dim(-1); // NOLINT + size_t B_cols = B.dim(-1); // NOLINT + size_t A_rows = A.size() / A_cols; // NOLINT + size_t B_rows = B.size() / B_cols; // NOLINT + + size_t width = B_rows; + + (void)name; + // Prepare A: Quantize from f32 -> i8 + Tensor prepared_A(Type::i8, x.shape(), "prepared_A"); // NOLINT + + detail::quantize(x.data(), a_quant, A_rows, A_cols, + prepared_A.data()); + + ruy::Context context; + ruy::Matrix lhs; + ruy::MakeSimpleLayout(A_rows, width, ruy::Order::kRowMajor, + lhs.mutable_layout()); + lhs.set_data(prepared_A.data()); + + // PrepareB: Select + Tensor selected_B(Type::i8, Shape({width, indices.size()}), // NOLINT + "selected_B"); + + // SelectColumnsB, but inlined? + // B_prepared is expected to be col-major, for our implementation via ruy. If + // col-major we can memcpy the respective column entries as they're + // sequential. There are width = rows entries. + auto B_data = B.data(); // NOLINT + auto sB_data = selected_B.data(); // NOLINT + for (size_t c = 0; c < indices.size(); ++c) { + int8_t* sB_begin = &(sB_data[c * width]); // NOLINT + const int8_t* B_begin = &(B_data[indices[c] * width]); // NOLINT + std::memcpy(sB_begin, B_begin, width); + } + + ruy::Matrix rhs; + ruy::MakeSimpleLayout(width, indices.size(), ruy::Order::kColMajor, + rhs.mutable_layout()); + rhs.set_data(selected_B.data()); + + // Once again, bias needn't be prepared. But needs to be selected. 
+ const Tensor& prepared_bias = bias; + Tensor selected_bias(Type::f32, Shape({indices.size()}), "selected_bias"); + auto* selected_bias_ptr = selected_bias.data(); + for (uint32_t index : indices) { + *(selected_bias_ptr) = *(prepared_bias.data() + index); + ++selected_bias_ptr; + } + + // Multiply C = A select(B); + // When Dst is int32, mul_params is unused. + size_t selected_B_cols = selected_B.dim(-1); // NOLINT + ruy::Matrix dst; + ruy::MakeSimpleLayout(A_rows, selected_B_cols, ruy::Order::kRowMajor, + dst.mutable_layout()); + + Shape out_shape = x.shape(); + out_shape.set_dim(-1, selected_B_cols); + + Tensor AB(Type::i32, out_shape, name + "_out"); // NOLINT + dst.set_data(AB.data()); + + ruy::MulParams mul_params; + ruy::Mul(lhs, rhs, mul_params, &context, &dst); + + // Unquantizes, then adds bias in a single statement on the output. + Tensor y(Type::f32, out_shape, name + "_out"); // NOLINT + float unquant_multiplier = 1.0F / (a_quant * b_quant); + detail::unquantizeAddBias(AB.data(), prepared_bias.data(), + unquant_multiplier, A_rows, selected_B_cols, + y.data()); + return y; +} + +template <> +Tensor dot(Tensor& x, const Tensor& W, float a_quant, + float b_quant, const std::string& name) { + Tensor& A = x; // NOLINT + const Tensor& B = W; // NOLINT + + size_t A_cols = A.dim(-1); // NOLINT + size_t B_cols = B.dim(-1); // NOLINT + size_t A_rows = A.size() / A_cols; // NOLINT + size_t B_rows = B.size() / B_cols; // NOLINT + + size_t width = B_rows; + + (void)name; + // Prepare A: Quantize from f32 -> i8 + Tensor prepared_A(Type::i8, x.shape(), "prepared_A"); // NOLINT + + detail::quantize(x.data(), a_quant, A_rows, A_cols, + prepared_A.data()); + + ruy::Context context; + ruy::Matrix lhs; + ruy::MakeSimpleLayout(A_rows, width, ruy::Order::kRowMajor, + lhs.mutable_layout()); + lhs.set_data(prepared_A.data()); + + // PrepareB: ? + ruy::Matrix rhs; + ruy::MakeSimpleLayout(width, B_cols, ruy::Order::kColMajor, + rhs.mutable_layout()); + rhs.set_data(W.data()); + + // PrepareBias: ? + // Actualyl there is no need. + ruy::Matrix dst; + ruy::MakeSimpleLayout(A_rows, B_cols, ruy::Order::kRowMajor, + dst.mutable_layout()); + + Shape out_shape = x.shape(); + out_shape.set_dim(-1, B_cols); + Tensor AB(Type::i32, out_shape, name + "_out"); // NOLINT + dst.set_data(AB.data()); + + // Multiply C = AB; + // When Dst is int32, mul_params is unused. + ruy::MulParams mul_params; + ruy::Mul(lhs, rhs, mul_params, &context, &dst); + + // Unquantizes, then adds bias in a single statement on the output. 
+ Tensor y(Type::f32, out_shape, name + "_out"); // NOLINT + float unquant_multiplier = 1.0F / (a_quant * b_quant); + detail::unquantize(AB.data(), unquant_multiplier, A_rows, B_cols, + y.data()); + return y; +} + +template <> +void prepare_weight_transposed(const float* weights, + int8_t* prepared, + float quantization_multiplier, + size_t cols, size_t rows) { + detail::quantize(weights, quantization_multiplier, cols, rows, prepared); +} + +template <> +void prepare_weight_quantized_transposed(const int8_t* input, + int8_t* output, + size_t rows, + size_t cols) { + std::memcpy(output, input, + /*count=*/sizeof(int8_t) * (rows * cols)); +} +} // namespace slimt::qmm::detail From c23346ff673dc7dfb394609963f81683cb4d1270 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 23 Oct 2023 17:21:36 +0530 Subject: [PATCH 2/3] Rename these to .cc --- slimt/QMM.cc | 9 ++++++--- slimt/qmm/{Gemmology.inl.hh => Gemmology.inl.cc} | 0 slimt/qmm/{Intgemm.inl.hh => Intgemm.inl.cc} | 0 slimt/qmm/{Ruy.inl.hh => Ruy.inl.cc} | 0 4 files changed, 6 insertions(+), 3 deletions(-) rename slimt/qmm/{Gemmology.inl.hh => Gemmology.inl.cc} (100%) rename slimt/qmm/{Intgemm.inl.hh => Intgemm.inl.cc} (100%) rename slimt/qmm/{Ruy.inl.hh => Ruy.inl.cc} (100%) diff --git a/slimt/QMM.cc b/slimt/QMM.cc index e3e00e3d..ab087cee 100644 --- a/slimt/QMM.cc +++ b/slimt/QMM.cc @@ -10,7 +10,8 @@ namespace slimt::qmm::detail { constexpr Provider kAutoProvider = Provider::Intgemm; } -#include "slimt/qmm/Intgemm.inl.hh" +// NOLINTNEXTLINE: The C++ file inclusion is intended. +#include "slimt/qmm/Intgemm.inl.cc" #endif #ifdef SLIMT_HAS_RUY @@ -18,7 +19,8 @@ constexpr Provider kAutoProvider = Provider::Intgemm; namespace slimt::qmm::detail { constexpr Provider kAutoProvider = Provider::Ruy; } -#include "slimt/qmm/Ruy.inl.hh" +// NOLINTNEXTLINE: The C++ file inclusion is intended. +#include "slimt/qmm/Ruy.inl.cc" #endif #ifdef SLIMT_HAS_GEMMOLOGY @@ -28,7 +30,8 @@ constexpr Provider kAutoProvider = Provider::Ruy; namespace slimt::qmm::detail { constexpr Provider kAutoProvider = Provider::Gemmology; } -#include "slimt/qmm/Gemmology.inl.hh" +// NOLINTNEXTLINE: The C++ file inclusion is intended. 
+#include "slimt/qmm/Gemmology.inl.cc" #pragma GCC diagnostic pop #endif diff --git a/slimt/qmm/Gemmology.inl.hh b/slimt/qmm/Gemmology.inl.cc similarity index 100% rename from slimt/qmm/Gemmology.inl.hh rename to slimt/qmm/Gemmology.inl.cc diff --git a/slimt/qmm/Intgemm.inl.hh b/slimt/qmm/Intgemm.inl.cc similarity index 100% rename from slimt/qmm/Intgemm.inl.hh rename to slimt/qmm/Intgemm.inl.cc diff --git a/slimt/qmm/Ruy.inl.hh b/slimt/qmm/Ruy.inl.cc similarity index 100% rename from slimt/qmm/Ruy.inl.hh rename to slimt/qmm/Ruy.inl.cc From 919e3b8e5f2077a7dcf0dd03eb54c48a41728986 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 23 Oct 2023 17:26:45 +0530 Subject: [PATCH 3/3] Reorder --- slimt/qmm/Gemmology.inl.cc | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/slimt/qmm/Gemmology.inl.cc b/slimt/qmm/Gemmology.inl.cc index db122209..151af863 100644 --- a/slimt/qmm/Gemmology.inl.cc +++ b/slimt/qmm/Gemmology.inl.cc @@ -27,7 +27,7 @@ template void Engine::Shift::Multiply( template void Engine::Shift::PrepareBias( const int8_t*, size_t, size_t, gemmology::callbacks::UnquantizeAndAddBiasAndWrite); -#endif +#endif // USE_AVX512 #ifdef USE_AVX2 template struct Engine; @@ -40,7 +40,20 @@ template void Engine::Shift::Multiply( template void Engine::Shift::PrepareBias( const int8_t*, size_t, size_t, gemmology::callbacks::UnquantizeAndAddBiasAndWrite); -#endif +#endif // USE_AVX2 + +#ifdef USE_SSSE3 +template struct Engine; +template void Engine::SelectColumnsB(const int8_t*, int8_t*, + size_t, const uint32_t*, + const uint32_t*); +template void Engine::Shift::Multiply( + const uint8_t*, const int8_t*, size_t, size_t, size_t, + gemmology::callbacks::UnquantizeAndAddBiasAndWrite); +template void Engine::Shift::PrepareBias( + const int8_t*, size_t, size_t, + gemmology::callbacks::UnquantizeAndAddBiasAndWrite); +#endif // USE_SSSE3 #ifdef USE_SSE2 template struct Engine; @@ -54,20 +67,7 @@ template void Engine::Shift::Multiply( template void Engine::Shift::PrepareBias( const int8_t*, size_t, size_t, gemmology::callbacks::UnquantizeAndAddBiasAndWrite); -#endif - -#ifdef USE_SSSE3 -template struct Engine; -template void Engine::SelectColumnsB(const int8_t*, int8_t*, - size_t, const uint32_t*, - const uint32_t*); -template void Engine::Shift::Multiply( - const uint8_t*, const int8_t*, size_t, size_t, size_t, - gemmology::callbacks::UnquantizeAndAddBiasAndWrite); -template void Engine::Shift::PrepareBias( - const int8_t*, size_t, size_t, - gemmology::callbacks::UnquantizeAndAddBiasAndWrite); -#endif +#endif // USE_SSE2 #ifdef USE_NEON template struct Engine;