Skip to content

Commit

Permalink
Changes to modexp_benchmarking test case.
Browse files Browse the repository at this point in the history
Benchmarking now tries to vary the bit size of the exponent
separately from the bit size for the base and modulus.

Part of work required for eosnetworkfoundation/mandel#747.
  • Loading branch information
arhag committed Aug 5, 2022
1 parent 1289831 commit 159336e
Showing 1 changed file with 132 additions and 48 deletions.
180 changes: 132 additions & 48 deletions test/crypto/test_modular_arithmetic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,77 +138,161 @@ BOOST_AUTO_TEST_CASE(modexp_benchmarking) try {
return result;
};

static constexpr unsigned int num_trials = 100; // 10000
static constexpr unsigned int num_trials = 10; // 10000

static_assert(num_trials > 0);

static constexpr unsigned int start_num_bytes = 128; // 64
static constexpr unsigned int end_num_bytes = 256; // 512
static constexpr unsigned int delta_num_bytes = 128; // 64
static constexpr unsigned int bit_calc_limit = 101; // 120

static_assert(start_num_bytes <= end_num_bytes);
static_assert(delta_num_bytes > 0);
static_assert((end_num_bytes - start_num_bytes) % delta_num_bytes == 0);
static constexpr unsigned int start_num_bytes = 1;
static constexpr unsigned int end_num_bytes = 1 << ((bit_calc_limit + 7)/8);

static constexpr unsigned num_slots = (end_num_bytes - start_num_bytes) / delta_num_bytes + 1;
static_assert(start_num_bytes <= end_num_bytes);

// Timing summary recorded for one benchmark configuration, i.e. one pairing of a
// base/modulus size with an exponent size.
// Member order matters: the benchmark fills this struct with designated initializers,
// which must name members in declaration order.
struct statistics {
   unsigned int modulus_bit_size;  // bit size of modulus and base
   unsigned int exponent_bit_size; // bit size of exponent
   int64_t      min_time_ns;       // shortest single-trial duration, in nanoseconds
   int64_t      max_time_ns;       // longest single-trial duration, in nanoseconds
   int64_t      avg_time_ns;       // summed trial durations divided by the trial count (integer division)
};

std::vector<statistics> stats;

// Computes ceil(log2(n)): the smallest k for which 2^k >= n.
// Both n == 0 and n == 1 yield 0.
auto ceil_log2 = [](uint32_t n) -> uint32_t {
   // For n >= 2 the result is the number of significant bits in (n - 1);
   // for n <= 1 there is nothing to count.
   uint32_t remaining = (n > 1) ? (n - 1) : 0;
   uint32_t bit_count = 0;
   while (remaining != 0) {
      remaining >>= 1;
      ++bit_count;
   }
   return bit_count;
};

std::vector<statistics> stats(num_slots);
BOOST_CHECK(ceil_log2(0) == 0);
BOOST_CHECK(ceil_log2(1) == 0);
BOOST_CHECK(ceil_log2(2) == 1);
BOOST_CHECK(ceil_log2(3) == 2);
BOOST_CHECK(ceil_log2(4) == 2);
BOOST_CHECK(ceil_log2(5) == 3);
BOOST_CHECK(ceil_log2(15) == 4);
BOOST_CHECK(ceil_log2(16) == 4);
BOOST_CHECK(ceil_log2(17) == 5);

for (unsigned int n = start_num_bytes; n <= end_num_bytes; n *= 2) {
unsigned int bit_calc = 8 * ceil_log2(n);
for (unsigned int exponent_num_bytes = 1;
exponent_num_bytes <= 2*n && bit_calc <= bit_calc_limit;
exponent_num_bytes *= 2, bit_calc += 5)
{
int64_t min_duration_ns = std::numeric_limits<int64_t>::max();
int64_t max_duration_ns = 0;
int64_t total_duration_ns = 0;

for (unsigned int n = start_num_bytes, slot = 0; n <= end_num_bytes; n += delta_num_bytes, ++slot) {
int64_t min_duration_ns = std::numeric_limits<int64_t>::max();
int64_t max_duration_ns = 0;
int64_t total_duration_ns = 0;
for (unsigned int trial = 0; trial < num_trials; ++trial) {
auto base = generate_random_bytes(r, n);
auto exponent = generate_random_bytes(r, exponent_num_bytes);
auto modulus = generate_random_bytes(r, n);

for (unsigned int trial = 0; trial < num_trials; ++trial) {
auto base = generate_random_bytes(r, n);
auto exponent = generate_random_bytes(r, n);
auto modulus = generate_random_bytes(r, n);
auto start_time = std::chrono::steady_clock::now();

auto start_time = std::chrono::steady_clock::now();
auto res = fc::modexp(base, exponent, modulus);

auto res = fc::modexp(base, exponent, modulus);
auto end_time = std::chrono::steady_clock::now();

auto end_time = std::chrono::steady_clock::now();
int64_t duration_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - start_time).count();

int64_t duration_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - start_time).count();
//ilog("(${base})^(${exp}) % ${mod} = ${result} [took ${duration} ns]",
// ("base", base)("exp", exponent)("mod", modulus)("result", std::get<bytes>(res))("duration", duration_ns)
// );

//ilog("(${base})^(${exp}) % ${mod} = ${result}",
// ("base", base)("exp", exponent)("mod", modulus)("result", std::get<bytes>(res))
// );
min_duration_ns = std::min(min_duration_ns, duration_ns);
max_duration_ns = std::max(max_duration_ns, duration_ns);
total_duration_ns += duration_ns;
}

//ilog("slot ${slot}: mod_exp took ${duration} ns", ("slot", slot)("duration", duration_ns));
stats.push_back(statistics{
.modulus_bit_size = n * 8,
.exponent_bit_size = exponent_num_bytes * 8,
.min_time_ns = min_duration_ns,
.max_time_ns = max_duration_ns,
.avg_time_ns = (total_duration_ns / num_trials),
});

min_duration_ns = std::min(min_duration_ns, duration_ns);
max_duration_ns = std::max(max_duration_ns, duration_ns);
total_duration_ns += duration_ns;
}
const auto& stat = stats.back();

stats[slot] = statistics{
.min_time_ns = min_duration_ns,
.max_time_ns = max_duration_ns,
.avg_time_ns = (total_duration_ns / num_trials),
};
ilog("Completed random runs of mod_exp with ${bit_width}-bit width base and modulus values and "
"${exp_bit_width}-bit width exponent values. "
"Min time: ${min} ns; Average time: ${avg} ns; Max time: ${max} ns.",
("bit_width", stat.modulus_bit_size)("exp_bit_width", stat.exponent_bit_size)
("min", stat.min_time_ns)("avg", stat.avg_time_ns)("max", stat.max_time_ns)
);

ilog("Completed random runs of mod_exp with ${bit_width}-bit width values. Min time: ${min} ns; Average time: ${avg} ns; Max time: ${max} ns.",
("bit_width", n*8)("min", stats[slot].min_time_ns)("avg", stats[slot].avg_time_ns)("max", stats[slot].max_time_ns)
);
}
}

// Running the above benchmark (using commented values for num_trials and *_num_bytes) with a release build on an AMD 3.4 GHz CPU
// provides average durations for executing mod_exp for increasing bit sizes for the value.

// For example: with 512-bit values, the average duration is approximately 40 microseconds; with 1024-bit values, the average duration
// is approximately 260 microseconds; with 2048-bit values, the average duration is approximately 2 milliseconds; and, with 4096-bit
// values, the average duration is approximately 14 milliseconds.
std::string stats_output = "Table (in csv format) summarizing statistics from runs:\n";
stats_output += "Modulus/Base Bit Size,Exponent Bit Size,Average Time (ns)\n";
for (const auto& stat : stats) {
stats_output += std::to_string(stat.modulus_bit_size);
stats_output += ',';
stats_output += std::to_string(stat.exponent_bit_size);
stats_output += ',';
stats_output += std::to_string(stat.avg_time_ns);
stats_output += '\n';
}

// It appears that a model of the average time that scales quadratically with the bit size fits the empirically generated data well.
// TODO: See if theoretical analysis of the modular exponentiation algorithm also justifies quadratic scaling.
ilog(stats_output);

// Running the above benchmark (using commented values for num_trials and bit_calc_limit) with a release build on
// an AMD 3.4 GHz CPU provides average durations for executing mod_exp for varying bit sizes for the values
// (but with base and modulus bit sizes kept equal to one another).

// Holding the base/modulus bit size constant and increasing the exponent bit size shows a linear relationship with increasing bit
// size on the average time to execute the modular exponentiation. The slope of the best fit line to the empirical data appears
// to scale super-linearly with base/modulus size. A quadratic (degree 2) fit works okay, but it appears that a better fit is to
// model the slope of the linear relationship between average time and exponent bit size as the base/modulus bit size taken to
// the 1.6 power and then scaled by some constant.

// Holding the exponent bit size constant and increasing the base/modulus bit size shows a super-linear relationship with
// increasing bit size on the average time to execute the modular exponentiation. A quadratic relationship works pretty well
// but perhaps a fractional exponent between 1 and 2 (e.g. 1.6) would also work well.

// What is particularly revealing is plotting the average time with respect to some combination of the bit sizes of base/modulus and
// exponent. If the independent variable is the product of the exponent bit size and the base/modulus bit size, the correlation is
// not great. Even if the independent variable is the product of the exponent bit size and the base/modulus bit size taken to some power,
// the correlation is still not great.
// It seems that trying to capture all the data using a model like that breaks down when the exponent bit size is greater than the
// base/modulus bit size.
// If we filter out all the data points where the exponent bit size is greater than the base/modulus bit size, and then choose as
// the independent variable the product of the exponent bit size and the base/modulus bit size taken to some power, then we get
// a pretty good linear correlation when a power of 1.6 is chosen.

// TODO: See if theoretical analysis of the modular exponentiation algorithm also justifies these scaling properties.

// Example results for average time:
// | Modulus/Base Bit Size | Exponent Bit Size | Average Time (ns) |
// | --------------------- | ----------------- | ----------------- |
// | 2048 | 32 | 33826 |
// | 2048 | 256 | 250067 |
// | 2048 | 2048 | 1891095 |
// | 4096 | 32 | 129181 |
// | 4096 | 256 | 954024 |
// | 4096 | 2048 | 7205115 |
// | 8192 | 32 | 347938 |
// | 8192 | 256 | 2503652 |
// | 8192 | 2048 | 19199775 |

// The empirical results show that the average time stays well below 5 ms if the exponent bit size does not exceed the
// modulus/base bit size and the product of the exponent bit size and the
// (modulus/base bit size)^1.6 does not exceed 500,000,000.
// Another way of satisfying that constraint is to require that 5*ceil(log2(exponent bit size)) + 8*ceil(log2(modulus bit size))
// be less than or equal to 5*floor(log2(500000000)) = 140.
// Or equivalently, assuming the bit sizes are multiples of 8:
// 5*ceil(log2(exponent bit size/8)) + 8*ceil(log2(modulus bit size/8)) <= 101.

// Take, as an example, an 8192-bit modulus/base and a 128-bit exponent (which on average took 1.29 ms).
// 5*ceil(log2(128)) + 8*ceil(log2(8192)) = 5*7 + 8*13 = 139 which is less than the limit of 140.
//
// On the other hand, consider a 4096-bit modulus/base and a 1024-bit exponent (which on average took 3.69 ms).
// 5*ceil(log2(1024)) + 8*ceil(log2(4096)) = 5*10 + 8*12 = 146 which is greater than the limit of 140.

} FC_LOG_AND_RETHROW();

Expand Down

0 comments on commit 159336e

Please sign in to comment.