diff --git a/test/crypto/test_modular_arithmetic.cpp b/test/crypto/test_modular_arithmetic.cpp index 54257ca..b0f3338 100644 --- a/test/crypto/test_modular_arithmetic.cpp +++ b/test/crypto/test_modular_arithmetic.cpp @@ -138,77 +138,161 @@ BOOST_AUTO_TEST_CASE(modexp_benchmarking) try { return result; }; - static constexpr unsigned int num_trials = 100; // 10000 + static constexpr unsigned int num_trials = 10; // 10000 static_assert(num_trials > 0); - static constexpr unsigned int start_num_bytes = 128; // 64 - static constexpr unsigned int end_num_bytes = 256; // 512 - static constexpr unsigned int delta_num_bytes = 128; // 64 + static constexpr unsigned int bit_calc_limit = 101; // 120 - static_assert(start_num_bytes <= end_num_bytes); - static_assert(delta_num_bytes > 0); - static_assert((end_num_bytes - start_num_bytes) % delta_num_bytes == 0); + static constexpr unsigned int start_num_bytes = 1; + static constexpr unsigned int end_num_bytes = 1 << ((bit_calc_limit + 7)/8); - static constexpr unsigned num_slots = (end_num_bytes - start_num_bytes) / delta_num_bytes + 1; + static_assert(start_num_bytes <= end_num_bytes); struct statistics { - int64_t min_time_ns; - int64_t max_time_ns; - int64_t avg_time_ns; + unsigned int modulus_bit_size; // bit size of modulus and base + unsigned int exponent_bit_size; // bit size of exponent + int64_t min_time_ns; + int64_t max_time_ns; + int64_t avg_time_ns; + }; + + std::vector stats; + + auto ceil_log2 = [](uint32_t n) -> uint32_t { + if (n <= 1) { + return 0; + } + return 32 - __builtin_clz(n - 1); }; - std::vector stats(num_slots); + BOOST_CHECK(ceil_log2(0) == 0); + BOOST_CHECK(ceil_log2(1) == 0); + BOOST_CHECK(ceil_log2(2) == 1); + BOOST_CHECK(ceil_log2(3) == 2); + BOOST_CHECK(ceil_log2(4) == 2); + BOOST_CHECK(ceil_log2(5) == 3); + BOOST_CHECK(ceil_log2(15) == 4); + BOOST_CHECK(ceil_log2(16) == 4); + BOOST_CHECK(ceil_log2(17) == 5); + + for (unsigned int n = start_num_bytes; n <= end_num_bytes; n *= 2) { + unsigned int bit_calc = 8 * ceil_log2(n); + for (unsigned int exponent_num_bytes = 1; + exponent_num_bytes <= 2*n && bit_calc <= bit_calc_limit; + exponent_num_bytes *= 2, bit_calc += 5) + { + int64_t min_duration_ns = std::numeric_limits::max(); + int64_t max_duration_ns = 0; + int64_t total_duration_ns = 0; - for (unsigned int n = start_num_bytes, slot = 0; n <= end_num_bytes; n += delta_num_bytes, ++slot) { - int64_t min_duration_ns = std::numeric_limits::max(); - int64_t max_duration_ns = 0; - int64_t total_duration_ns = 0; + for (unsigned int trial = 0; trial < num_trials; ++trial) { + auto base = generate_random_bytes(r, n); + auto exponent = generate_random_bytes(r, exponent_num_bytes); + auto modulus = generate_random_bytes(r, n); - for (unsigned int trial = 0; trial < num_trials; ++trial) { - auto base = generate_random_bytes(r, n); - auto exponent = generate_random_bytes(r, n); - auto modulus = generate_random_bytes(r, n); + auto start_time = std::chrono::steady_clock::now(); - auto start_time = std::chrono::steady_clock::now(); + auto res = fc::modexp(base, exponent, modulus); - auto res = fc::modexp(base, exponent, modulus); + auto end_time = std::chrono::steady_clock::now(); - auto end_time = std::chrono::steady_clock::now(); + int64_t duration_ns = std::chrono::duration_cast(end_time - start_time).count(); - int64_t duration_ns = std::chrono::duration_cast(end_time - start_time).count(); + //ilog("(${base})^(${exp}) % ${mod} = ${result} [took ${duration} ns]", + // ("base", base)("exp", exponent)("mod", modulus)("result", std::get(res))("duration", duration_ns) + // ); - //ilog("(${base})^(${exp}) % ${mod} = ${result}", - // ("base", base)("exp", exponent)("mod", modulus)("result", std::get(res)) - // ); + min_duration_ns = std::min(min_duration_ns, duration_ns); + max_duration_ns = std::max(max_duration_ns, duration_ns); + total_duration_ns += duration_ns; + } - //ilog("slot ${slot}: mod_exp took ${duration} ns", ("slot", slot)("duration", duration_ns)); + stats.push_back(statistics{ + .modulus_bit_size = n * 8, + .exponent_bit_size = exponent_num_bytes * 8, + .min_time_ns = min_duration_ns, + .max_time_ns = max_duration_ns, + .avg_time_ns = (total_duration_ns / num_trials), + }); - min_duration_ns = std::min(min_duration_ns, duration_ns); - max_duration_ns = std::max(max_duration_ns, duration_ns); - total_duration_ns += duration_ns; - } + const auto& stat = stats.back(); - stats[slot] = statistics{ - .min_time_ns = min_duration_ns, - .max_time_ns = max_duration_ns, - .avg_time_ns = (total_duration_ns / num_trials), - }; + ilog("Completed random runs of mod_exp with ${bit_width}-bit width base and modulus values and " + "${exp_bit_width}-bit width exponent values. " + "Min time: ${min} ns; Average time: ${avg} ns; Max time: ${max} ns.", + ("bit_width", stat.modulus_bit_size)("exp_bit_width", stat.exponent_bit_size) + ("min", stat.min_time_ns)("avg", stat.avg_time_ns)("max", stat.max_time_ns) + ); - ilog("Completed random runs of mod_exp with ${bit_width}-bit width values. Min time: ${min} ns; Average time: ${avg} ns; Max time: ${max} ns.", - ("bit_width", n*8)("min", stats[slot].min_time_ns)("avg", stats[slot].avg_time_ns)("max", stats[slot].max_time_ns) - ); + } } - // Running the above benchmark (using commented values for num_trials and *_num_bytes) with a release build on an AMD 3.4 GHz CPU - // provides average durations for executing mod_exp for increasing bit sizes for the value. - - // For example: with 512-bit values, the average duration is approximately 40 microseconds; with 1024-bit values, the average duration - // is approximately 260 microseconds; with 2048-bit values, the average duration is approximately 2 milliseconds; and, with 4096-bit - // values, the average duration is approximately 14 milliseconds. + std::string stats_output = "Table (in csv format) summarizing statistics from runs:\n"; + stats_output += "Modulus/Base Bit Size,Exponent Bit Size,Average Time (ns)\n"; + for (const auto& stat : stats) { + stats_output += std::to_string(stat.modulus_bit_size); + stats_output += ','; + stats_output += std::to_string(stat.exponent_bit_size); + stats_output += ','; + stats_output += std::to_string(stat.avg_time_ns); + stats_output += '\n'; + } - // It appears that a model of the average time that scales quadratically with the bit size fits the empirically generated data well. - // TODO: See if theoretical analysis of the modular exponentiation algorithm also justifies quadratic scaling. + ilog(stats_output); + + // Running the above benchmark (using commented values for num_trials and bit_calc_limit) with a release build on + // an AMD 3.4 GHz CPU provides average durations for executing mod_exp for varying bit sizes for the values + // (but with base and modulus bit sizes kept equal to one another). + + // Holding the base/modulus bit size constant and increasing the exponent bit size shows a linear relationship with increasing bit + // size on the average time to execute the modular exponentiation. The slope of the best fit line to the empirical data appears + // to scale super-linearly with base/modulus size. A quadratic (degree 2) fit works okay, but it appears that a better fit is to + // model the slope of the linear relationship between average time and exponent bit size as a the base/modulus bit size taken to + // the 1.6 power and then scaled by some constant. + + // Holding the exponent bit size constant and increasing the base/modulus bit size shows a super-linear relationship with + // increasing bit size on the average time to execute the modular exponentiation. A quadratic relationship works pretty well + // but perhaps a fractional exponent between 1 and 2 (e.g. 1.6) would work well as well. + + // What is particularly revealing is plotting the average time with respect to some combination of the bit sizes of base/modulus and + // exponent. If the independent variable is the product of the exponent bit size and the base/modulus bit size, the correlation is + // not great. Even if the independent variable is the product of the exponent bit size and the base/modulus bit size taken to some power, + // the correlation is still not great. + // It seems that trying to capture all the data using a model like that breaks down when the exponent bit size is greater than the + // base/modulus bit size. + // If we filter out all the data points where the exponent bit size is greater than the base/modulus bit size, and then choose as + // then independent variable the product of the exponent bit size and the base/modulus bit size taken to some power, then we get + // a pretty good linear correlation when a power of 1.6 is chosen. + + // TODO: See if theoretical analysis of the modular exponentiation algorithm also justifies these scaling properties. + + // Example results for average time: + // | Modulus/Base Bit Size | Exponent Bit Size | Average Time (ns) | + // | --------------------- | ----------------- | ----------------- | + // | 2048 | 32 | 33826 | + // | 2048 | 256 | 250067 | + // | 2048 | 2048 | 1891095 | + // | 4096 | 32 | 129181 | + // | 4096 | 256 | 954024 | + // | 4096 | 2048 | 7205115 | + // | 8192 | 32 | 347938 | + // | 8192 | 256 | 2503652 | + // | 8192 | 2048 | 19199775 | + + // The empirical results show that the average time stays well below 5 ms if the exponent bit size does not exceed the + // modulus/base bit size and the product of the exponent bit size and the + // (modulus/base bit size)^1.6 does not exceed 500,000,000. + // Another way of satisfying that constraint is to require that the 5*ceil(log2(exponent bit size)) + 8*ceil(log2(modulus bit size)) + // be less than or equal to 5*floor(log2(500000000)) = 140. + // Or equivalently, assuming the bit sizes are multiples of 8: + // 5*ceil(log2(exponent bit size/8)) + 8*ceil(log2(modulus bit size/8)) <= 101. + + // Take, as an example, a 8192-bit modulus/base and a 128-bit exponent (which on average took 1.29 ms). + // 5*ceil(log2(128)) + 8*ceil(log2(8192)) = 5*7 + 8*13 = 139 which is less than the limit of 140. + // + // On the other hand, consider a 4096-bit modulus/base and a 1024-bit exponent (which on average took 3.69 ms). + // 5*ceil(log2(1024)) + 8*ceil(log2(4096)) = 5*10 + 8*12 = 146 which is greater than the limit of 140. } FC_LOG_AND_RETHROW();