Setting api to sync when hint is latency in benchmark_app (openvinotoolkit#29060)

### Details:
Currently, benchmark_app defaults to async mode even when the performance
hint is latency. So when benchmark_app runs on a dual-socket platform with
the latency hint, the OpenVINO runtime creates a stream on the socket that
benchmark_app is running on. However, the OS may schedule the benchmark_app
thread onto the other socket during inference, and performance drops.

After changing the default setting to sync mode, the benchmark_app
thread becomes one of the inference threads. This reduces cross-socket
switching, leading to more stable performance results.

 - *Setting api to sync when hint is latency*
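
For reference, a minimal Python sketch of the new default-selection behavior (the helper name `resolve_api_type` is hypothetical; the logic mirrors the `main.cpp` and `main.py` changes in this commit):

```python
def resolve_api_type(api_type: str, perf_hint: str) -> str:
    """Resolve the API mode when the user did not pass -api explicitly."""
    if api_type == "":  # the new empty default means "not set by the user"
        return "sync" if perf_hint == "latency" else "async"
    return api_type  # an explicit -api value always wins


assert resolve_api_type("", "latency") == "sync"
assert resolve_api_type("", "throughput") == "async"
assert resolve_api_type("async", "latency") == "async"
```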

### Tickets:
 - *CVS-154111*

---------

Co-authored-by: Wanglei Shen <[email protected]>
sunxiaoxia2022 and wangleis authored Feb 25, 2025
1 parent c3d0954 commit 9980e86
Showing 6 changed files with 16 additions and 7 deletions.
@@ -382,7 +382,7 @@ available options and parameters:
-t TIME, --time TIME Optional. Time in seconds to execute topology.
-api {sync,async}, --api_type {sync,async}
- Optional. Enable using sync/async API. Default value is async.
+ Optional. Enable using sync/async API. When hint is throughput, default value is async. When hint is latency, default value is sync.
Input shapes:
@@ -557,7 +557,7 @@ available options and parameters:
-c <absolute_path> Required for GPU custom kernels. Absolute path to an .xml file with the kernels description.
-cache_dir <path> Optional. Enables caching of loaded models to specified directory. List of devices which support caching is shown at the end of this message.
-load_from_file Optional. Loads model from file directly without read_model. All CNNNetwork options (like re-shape) will be ignored
- -api <sync/async> Optional. Enable Sync/Async API. Default value is "async".
+ -api <sync/async> Optional. Enable Sync/Async API. When hint is throughput, default value is "async". When hint is latency, default value is "sync".
-nireq <integer> Optional. Number of infer requests. Default value is determined automatically for device.
-nstreams <integer> Optional. Number of streams to use for inference on the CPU or GPU devices (for HETERO and MULTI device cases use format <dev1>:<nstreams1>, <dev2>:<nstreams2> or just <nstreams>). Default value is determined automatically for a device.Please note that although the automatic selection usually provides a reasonable performance, it still may be non - optimal for some cases, especially for very small models. See sample's README for more details. Also, using nstreams>1 is inherently throughput-oriented option, while for the best-latency estimations the number of streams should be set to 1.
-inference_only Optional. Measure only inference stage. Default option for static models. Dynamic models are measured in full mode which includes inputs setup stage, inference only mode available for them with single input data shape only. To enable full mode for static models pass "false" value to this argument: ex. "-inference_only=false".
6 changes: 4 additions & 2 deletions samples/cpp/benchmark_app/benchmark_app.hpp
@@ -98,7 +98,9 @@ static const char layout_message[] =
"For example, \"input1[NCHW],input2[NC]\" or \"[NCHW]\" in case of one input size.";

/// @brief message for execution mode
- static const char api_message[] = "Optional. Enable Sync/Async API. Default value is \"async\".";
+ static const char api_message[] =
+     "Optional. Enable Sync/Async API. When hint is throughput, default value is \"async\". "
+     "When hint is latency, default value is \"sync\".";

/// @brief message for #streams for CPU inference
static const char infer_num_streams_message[] =
@@ -303,7 +305,7 @@ DEFINE_string(cache_dir, "", cache_dir_message);
DEFINE_bool(load_from_file, false, load_from_file_message);

/// @brief Define execution mode
DEFINE_string(api, "async", api_message);
DEFINE_string(api, "", api_message);

/// @brief Number of infer requests in parallel
DEFINE_uint64(nireq, 0, infer_requests_count_message);
3 changes: 3 additions & 0 deletions samples/cpp/benchmark_app/main.cpp
@@ -58,6 +58,9 @@ bool parse_and_check_command_line(int argc, char* argv[]) {
show_usage();
throw std::logic_error("The percentile value is incorrect. The applicable values range is [1, 100].");
}
+ if (FLAGS_api == "") {
+     FLAGS_api = FLAGS_hint == "latency" ? "sync" : "async";
+ }
if (FLAGS_api != "async" && FLAGS_api != "sync") {
throw std::logic_error("Incorrect API. Please set -api option to `sync` or `async` value.");
}
2 changes: 1 addition & 1 deletion tools/benchmark_tool/openvino/tools/benchmark/benchmark.py
@@ -16,7 +16,7 @@ def percentile(values, percent):

class Benchmark:
def __init__(self, device: str, number_infer_requests: int = 0, number_iterations: int = None,
- duration_seconds: int = None, api_type: str = 'async', inference_only = None,
+ duration_seconds: int = None, api_type: str = '', inference_only = None,
maximum_inference_rate: float = 0):
self.device = device
self.core = Core()
3 changes: 3 additions & 0 deletions tools/benchmark_tool/openvino/tools/benchmark/main.py
@@ -49,6 +49,9 @@ def arg_not_empty(arg_value,empty_value):
raise Exception("Cannot set precision for a compiled model. " \
"Please re-compile your model with required precision.")

+ if args.api_type == "":
+     args.api_type = "sync" if args.perf_hint == "latency" else "async"

if args.api_type == "sync":
if args.time == 0 and (args.number_infer_requests > args.number_iterations):
raise Exception("Number of infer requests should be less than or equal to number of iterations in sync mode.")
5 changes: 3 additions & 2 deletions tools/benchmark_tool/openvino/tools/benchmark/parameters.py
@@ -108,8 +108,9 @@ def parse_args():
help="Optional. Enable model caching to specified directory")
advs.add_argument('-lfile', '--load_from_file', required=False, nargs='?', default=argparse.SUPPRESS,
help="Optional. Loads model from file directly without read_model.")
- args.add_argument('-api', '--api_type', type=str, required=False, default='async', choices=['sync', 'async'],
-                   help='Optional. Enable using sync/async API. Default value is async.')
+ args.add_argument('-api', '--api_type', type=str, required=False, default='', choices=['sync', 'async'],
+                   help='Optional. Enable using sync/async API. When hint is throughput, default value is async. '
+                        'When hint is latency, default value is sync.')
advs.add_argument('-nireq', '--number_infer_requests', type=check_positive, required=False, default=0,
help='Optional. Number of infer requests. Default value is determined automatically for device.')
advs.add_argument('-nstreams', '--number_streams', type=str, required=False, default=None,
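
A side note on the empty default above (an illustrative sketch, not part of the commit): argparse validates `choices` only for values supplied on the command line, so `default=''` is accepted even though it is not in `['sync', 'async']`, and main.py later resolves the empty value from the hint. A minimal reproduction, assuming the `-hint/--perf_hint` flag from the tool:

```python
import argparse

# Only the flag names from the diff are real; the rest is illustrative.
parser = argparse.ArgumentParser()
parser.add_argument('-api', '--api_type', type=str, required=False, default='',
                    choices=['sync', 'async'])
parser.add_argument('-hint', '--perf_hint', type=str, required=False,
                    default='throughput')

args = parser.parse_args(['-hint', 'latency'])
assert args.api_type == ''  # defaults bypass the choices check
args.api_type = "sync" if args.perf_hint == "latency" else "async"
assert args.api_type == "sync"
```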
