From b2ab8cf4130a579a9bd8fa7008d1f4cae7f953a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20W=C3=B3jcik?= <3044353+pwojcikdev@users.noreply.github.com> Date: Wed, 31 May 2023 14:42:46 +0200 Subject: [PATCH] Blockprocessor stall workaround (#4240) --- nano/core_test/blockprocessor.cpp | 29 +++++++++++++++++++++++++++++ nano/core_test/toml.cpp | 6 ++++++ nano/node/blocking_observer.cpp | 10 ++++++++++ nano/node/blocking_observer.hpp | 1 + nano/node/blockprocessor.cpp | 11 ++++++++++- nano/node/node.cpp | 2 +- nano/node/nodeconfig.cpp | 7 +++++++ nano/node/nodeconfig.hpp | 3 +++ 8 files changed, 67 insertions(+), 2 deletions(-) diff --git a/nano/core_test/blockprocessor.cpp b/nano/core_test/blockprocessor.cpp index 497aed8f9d..749f223e18 100644 --- a/nano/core_test/blockprocessor.cpp +++ b/nano/core_test/blockprocessor.cpp @@ -38,4 +38,33 @@ TEST (block_processor, broadcast_block_on_arrival) node1->process_active (send1); // Checks whether the block was broadcast. ASSERT_TIMELY (5s, node2->ledger.block_or_pruned_exists (send1->hash ())); +} + +TEST (block_processor, add_blocking_invalid_block) +{ + nano::test::system system; + + nano::node_config config = system.default_config (); + config.block_process_timeout = std::chrono::seconds{ 1 }; + auto & node = *system.add_node (config); + + nano::state_block_builder builder; + auto send1 = builder.make_block () + .account (nano::dev::genesis_key.pub) + .previous (nano::dev::genesis->hash ()) + .representative (nano::dev::genesis_key.pub) + .balance (nano::dev::constants.genesis_amount - nano::Gxrb_ratio) + .link (nano::dev::genesis_key.pub) + .sign (nano::dev::genesis_key.prv, nano::dev::genesis_key.pub) + .work (*system.work.generate (nano::dev::genesis->hash ())) + .build_shared (); + + send1->signature.clear (); + + auto background = std::async (std::launch::async, [&] () { + return node.process_local (send1); + }); + + ASSERT_TIMELY (5s, background.wait_for (std::chrono::seconds (0)) == std::future_status::ready); + ASSERT_FALSE (background.get ().has_value ()); } \ No newline at end of file diff --git a/nano/core_test/toml.cpp b/nano/core_test/toml.cpp index f9a96cf999..eb3f60b46d 100644 --- a/nano/core_test/toml.cpp +++ b/nano/core_test/toml.cpp @@ -155,6 +155,7 @@ TEST (toml, daemon_config_deserialize_defaults) ASSERT_EQ (conf.node.bootstrap_bandwidth_limit, defaults.node.bootstrap_bandwidth_limit); ASSERT_EQ (conf.node.bootstrap_bandwidth_burst_ratio, defaults.node.bootstrap_bandwidth_burst_ratio); ASSERT_EQ (conf.node.block_processor_batch_max_time, defaults.node.block_processor_batch_max_time); + ASSERT_EQ (conf.node.block_process_timeout, defaults.node.block_process_timeout); ASSERT_EQ (conf.node.bootstrap_connections, defaults.node.bootstrap_connections); ASSERT_EQ (conf.node.bootstrap_connections_max, defaults.node.bootstrap_connections_max); ASSERT_EQ (conf.node.bootstrap_initiator_threads, defaults.node.bootstrap_initiator_threads); @@ -169,6 +170,7 @@ TEST (toml, daemon_config_deserialize_defaults) ASSERT_EQ (conf.node.io_threads, defaults.node.io_threads); ASSERT_EQ (conf.node.max_work_generate_multiplier, defaults.node.max_work_generate_multiplier); ASSERT_EQ (conf.node.network_threads, defaults.node.network_threads); + ASSERT_EQ (conf.node.background_threads, defaults.node.background_threads); ASSERT_EQ (conf.node.secondary_work_peers, defaults.node.secondary_work_peers); ASSERT_EQ (conf.node.online_weight_minimum, defaults.node.online_weight_minimum); ASSERT_EQ (conf.node.rep_crawler_weight_minimum, defaults.node.rep_crawler_weight_minimum); @@ -403,6 +405,7 @@ TEST (toml, daemon_config_deserialize_no_defaults) bootstrap_bandwidth_limit = 999 bootstrap_bandwidth_burst_ratio = 999.9 block_processor_batch_max_time = 999 + block_process_timeout = 999 bootstrap_connections = 999 bootstrap_connections_max = 999 bootstrap_initiator_threads = 999 @@ -417,6 +420,7 @@ TEST (toml, daemon_config_deserialize_no_defaults) io_threads = 999 lmdb_max_dbs = 999 network_threads = 999 + background_threads = 999 online_weight_minimum = "999" rep_crawler_weight_minimum = "999" election_hint_weight_percent = 19 @@ -578,6 +582,7 @@ TEST (toml, daemon_config_deserialize_no_defaults) ASSERT_NE (conf.node.bootstrap_bandwidth_limit, defaults.node.bootstrap_bandwidth_limit); ASSERT_NE (conf.node.bootstrap_bandwidth_burst_ratio, defaults.node.bootstrap_bandwidth_burst_ratio); ASSERT_NE (conf.node.block_processor_batch_max_time, defaults.node.block_processor_batch_max_time); + ASSERT_NE (conf.node.block_process_timeout, defaults.node.block_process_timeout); ASSERT_NE (conf.node.bootstrap_connections, defaults.node.bootstrap_connections); ASSERT_NE (conf.node.bootstrap_connections_max, defaults.node.bootstrap_connections_max); ASSERT_NE (conf.node.bootstrap_initiator_threads, defaults.node.bootstrap_initiator_threads); @@ -593,6 +598,7 @@ TEST (toml, daemon_config_deserialize_no_defaults) ASSERT_NE (conf.node.max_work_generate_multiplier, defaults.node.max_work_generate_multiplier); ASSERT_NE (conf.node.frontiers_confirmation, defaults.node.frontiers_confirmation); ASSERT_NE (conf.node.network_threads, defaults.node.network_threads); + ASSERT_NE (conf.node.background_threads, defaults.node.background_threads); ASSERT_NE (conf.node.secondary_work_peers, defaults.node.secondary_work_peers); ASSERT_NE (conf.node.max_pruning_age, defaults.node.max_pruning_age); ASSERT_NE (conf.node.max_pruning_depth, defaults.node.max_pruning_depth); diff --git a/nano/node/blocking_observer.cpp b/nano/node/blocking_observer.cpp index 809f0b11db..339b09b84b 100644 --- a/nano/node/blocking_observer.cpp +++ b/nano/node/blocking_observer.cpp @@ -50,3 +50,13 @@ bool nano::blocking_observer::exists (std::shared_ptr block) auto existing = blocking.find (block); return existing != blocking.end (); } + +void nano::blocking_observer::erase (std::shared_ptr block) +{ + nano::lock_guard lock{ mutex }; + auto existing = blocking.find (block); + if (existing != blocking.end ()) + { + blocking.erase (existing); + } +} \ No newline at end of file diff --git a/nano/node/blocking_observer.hpp b/nano/node/blocking_observer.hpp index f4b01b2239..bd97141224 100644 --- a/nano/node/blocking_observer.hpp +++ b/nano/node/blocking_observer.hpp @@ -22,6 +22,7 @@ class blocking_observer void observe (nano::process_return const & result, std::shared_ptr block); [[nodiscard]] std::future insert (std::shared_ptr block); bool exists (std::shared_ptr block); + void erase (std::shared_ptr block); private: std::unordered_multimap, std::promise> blocking; diff --git a/nano/node/blockprocessor.cpp b/nano/node/blockprocessor.cpp index 6be01128a7..7fabb21e10 100644 --- a/nano/node/blockprocessor.cpp +++ b/nano/node/blockprocessor.cpp @@ -106,7 +106,16 @@ std::optional nano::block_processor::add_blocking (std::sh std::optional result; try { - result = future.get (); + auto status = future.wait_for (node.config.block_process_timeout); + debug_assert (status != std::future_status::deferred); + if (status == std::future_status::ready) + { + result = future.get (); + } + else + { + blocking.erase (block); + } } catch (std::future_error const &) { diff --git a/nano/node/node.cpp b/nano/node/node.cpp index f9970ffd95..5b3142dc00 100644 --- a/nano/node/node.cpp +++ b/nano/node/node.cpp @@ -148,7 +148,7 @@ nano::node::node (boost::asio::io_context & io_ctx_a, boost::filesystem::path co config (config_a), network_params{ config.network_params }, stats (config.stats_config), - workers (std::max (3u, config.io_threads / 4), nano::thread_role::name::worker), + workers{ config.background_threads, nano::thread_role::name::worker }, bootstrap_workers{ config.bootstrap_serving_threads, nano::thread_role::name::bootstrap_worker }, flags (flags_a), work (work_a), diff --git a/nano/node/nodeconfig.cpp b/nano/node/nodeconfig.cpp index bbc819391a..529d6fa14e 100644 --- a/nano/node/nodeconfig.cpp +++ b/nano/node/nodeconfig.cpp @@ -94,6 +94,7 @@ nano::error nano::node_config::serialize_toml (nano::tomlconfig & toml) const toml.put ("io_threads", io_threads, "Number of threads dedicated to I/O operations. Defaults to the number of CPU threads, and at least 4.\ntype:uint64"); toml.put ("network_threads", network_threads, "Number of threads dedicated to processing network messages. Defaults to the number of CPU threads, and at least 4.\ntype:uint64"); toml.put ("work_threads", work_threads, "Number of threads dedicated to CPU generated work. Defaults to all available CPU threads.\ntype:uint64"); + toml.put ("background_threads", background_threads, "Number of threads dedicated to background node work, including handling of RPC requests. Defaults to all available CPU threads.\ntype:uint64"); toml.put ("signature_checker_threads", signature_checker_threads, "Number of additional threads dedicated to signature verification. Defaults to number of CPU threads / 2.\ntype:uint64"); toml.put ("enable_voting", enable_voting, "Enable or disable voting. Enabling this option requires additional system resources, namely increased CPU, bandwidth and disk usage.\ntype:bool"); toml.put ("bootstrap_connections", bootstrap_connections, "Number of outbound bootstrap connections. Must be a power of 2. Defaults to 4.\nWarning: a larger amount of connections may use substantially more system memory.\ntype:uint64"); @@ -102,6 +103,7 @@ nano::error nano::node_config::serialize_toml (nano::tomlconfig & toml) const toml.put ("bootstrap_serving_threads", bootstrap_serving_threads, "Number of threads dedicated to serving bootstrap data to other peers. Defaults to half the number of CPU threads, and at least 2.\ntype:uint64"); toml.put ("bootstrap_frontier_request_count", bootstrap_frontier_request_count, "Number frontiers per bootstrap frontier request. Defaults to 1048576.\ntype:uint32,[1024..4294967295]"); toml.put ("block_processor_batch_max_time", block_processor_batch_max_time.count (), "The maximum time the block processor can continuously process blocks for.\ntype:milliseconds"); + toml.put ("block_process_timeout", block_process_timeout.count (), "Time to wait for block processing result.\ntype:seconds"); toml.put ("allow_local_peers", allow_local_peers, "Enable or disable local host peering.\ntype:bool"); toml.put ("vote_minimum", vote_minimum.to_string_dec (), "Local representatives do not vote if the delegated weight is under this threshold. Saves on system resources.\ntype:string,amount,raw"); toml.put ("vote_generator_delay", vote_generator_delay.count (), "Delay before votes are sent to allow for efficient bundling of hashes in votes.\ntype:milliseconds"); @@ -339,6 +341,10 @@ nano::error nano::node_config::deserialize_toml (nano::tomlconfig & toml) toml.get ("block_processor_batch_max_time", block_processor_batch_max_time_l); block_processor_batch_max_time = std::chrono::milliseconds (block_processor_batch_max_time_l); + auto block_process_timeout_l = block_process_timeout.count (); + toml.get ("block_process_timeout", block_process_timeout_l); + block_process_timeout = std::chrono::seconds{ block_process_timeout_l }; + auto unchecked_cutoff_time_l = static_cast (unchecked_cutoff_time.count ()); toml.get ("unchecked_cutoff_time", unchecked_cutoff_time_l); unchecked_cutoff_time = std::chrono::seconds (unchecked_cutoff_time_l); @@ -360,6 +366,7 @@ nano::error nano::node_config::deserialize_toml (nano::tomlconfig & toml) toml.get ("io_threads", io_threads); toml.get ("work_threads", work_threads); toml.get ("network_threads", network_threads); + toml.get ("background_threads", background_threads); toml.get ("bootstrap_connections", bootstrap_connections); toml.get ("bootstrap_connections_max", bootstrap_connections_max); toml.get ("bootstrap_initiator_threads", bootstrap_initiator_threads); diff --git a/nano/node/nodeconfig.hpp b/nano/node/nodeconfig.hpp index df95f41664..1c23d57d91 100644 --- a/nano/node/nodeconfig.hpp +++ b/nano/node/nodeconfig.hpp @@ -64,6 +64,7 @@ class node_config unsigned io_threads{ std::max (4u, nano::hardware_concurrency ()) }; unsigned network_threads{ std::max (4u, nano::hardware_concurrency ()) }; unsigned work_threads{ std::max (4u, nano::hardware_concurrency ()) }; + unsigned background_threads{ std::max (4u, nano::hardware_concurrency ()) }; /* Use half available threads on the system for signature checking. The calling thread does checks as well, so these are extra worker threads */ unsigned signature_checker_threads{ std::max (2u, nano::hardware_concurrency () / 2) }; bool enable_voting{ false }; @@ -84,6 +85,8 @@ class node_config std::string external_address; uint16_t external_port{ 0 }; std::chrono::milliseconds block_processor_batch_max_time{ std::chrono::milliseconds (500) }; + /** Time to wait for block processing result */ + std::chrono::seconds block_process_timeout{ 15 }; std::chrono::seconds unchecked_cutoff_time{ std::chrono::seconds (4 * 60 * 60) }; // 4 hours /** Timeout for initiated async operations */ std::chrono::seconds tcp_io_timeout{ (network_params.network.is_dev_network () && !is_sanitizer_build ()) ? std::chrono::seconds (5) : std::chrono::seconds (15) };