diff --git a/evals/benchmark/README.md b/evals/benchmark/README.md
index 349eb319..8a3ef5f3 100644
--- a/evals/benchmark/README.md
+++ b/evals/benchmark/README.md
@@ -34,7 +34,15 @@ pip install -r ../../requirements.txt
 
 1 Define the test cases and configurations in the benchmark.yaml file.
 
-2 Run the benchmark script:
+2 Temporarily increase the file descriptor limit before running the test:
+
+```bash
+ulimit -n 100000
+```
+
+This command increases the maximum number of file descriptors (which represent open files, network connections, etc.) that a single process can use. Many systems default to a conservative limit, such as 1024, which may not be sufficient for high-concurrency applications or large-scale load testing. Raising the limit ensures the process can handle a larger number of open connections or files without errors caused by running out of file descriptors.
+
+3 Run the benchmark script:
 
 ```bash
 python benchmark.py
diff --git a/evals/benchmark/stresscli/commands/config.ini b/evals/benchmark/stresscli/commands/config.ini
index 4bf288db..e954fd13 100644
--- a/evals/benchmark/stresscli/commands/config.ini
+++ b/evals/benchmark/stresscli/commands/config.ini
@@ -8,11 +8,13 @@ RPS = RPS:\s+([\d.]+)
 Input_Tokens_per_Second = Input Tokens per Second:\s+([\d.]+)
 Output_Tokens_per_Second = Output Tokens per Second:\s+([\d.]+)
 End_to_End_latency_P50 = End to End latency\(ms\),\s+P50:\s+([\d.]+)
-End_to_End_latency_P99 = End to End latency\(ms\),\s+P50:[\s\d.,]+P99:\s+([\d.]+)
-End_to_End_latency_Avg = End to End latency\(ms\),\s+P50:[\s\d.,]+P99:\s+[\s\d.,]+Avg:\s+([\d.]+)
+End_to_End_latency_P90 = End to End latency\(ms\),\s+P50:[\s\d.,]+P90:\s+([\d.]+)
+End_to_End_latency_P99 = End to End latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+([\d.]+)
+End_to_End_latency_Avg = End to End latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+[\s\d.,]+Avg:\s+([\d.]+)
 First_token_latency_P50 = First token latency\(ms\),\s+P50:\s+([\d.]+)
-First_token_latency_P99 = First token latency\(ms\),\s+P50:[\s\d.,]+P99:\s+([\d.]+)
-First_token_latency_Avg = First token latency\(ms\),\s+P50:[\s\d.,]+P99:\s+[\s\d.,]+Avg:\s+([\d.]+)
+First_token_latency_P90 = First token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+([\d.]+)
+First_token_latency_P99 = First token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+([\d.]+)
+First_token_latency_Avg = First token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+[\s\d.,]+Avg:\s+([\d.]+)
 Average_Next_token_latency = Average Next token latency\(ms\):\s+([\d.]+)
 Average_token_latency = Average token latency\(ms\)\s+:\s+([\d.]+)
 locust_num_requests = \"num_requests\":\s+(\d+)
diff --git a/evals/benchmark/stresscli/commands/load_test.py b/evals/benchmark/stresscli/commands/load_test.py
index 229749d8..a6e635a9 100644
--- a/evals/benchmark/stresscli/commands/load_test.py
+++ b/evals/benchmark/stresscli/commands/load_test.py
@@ -115,6 +115,9 @@ def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, in
     os.makedirs(end_output_folder, exist_ok=True)
     metrics_output = os.path.join(output_folder, f"{index}_metrics.json")
 
+    spawn_rate = 100 if runspec["users"] > 100 else runspec["users"]
+    processes = 10 if runspec["max_requests"] > 2000 else 5 if runspec["max_requests"] > 1000 else 2
+
     cmd = [
         "locust",
         "--locustfile",
@@ -126,11 +129,11 @@ def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, in
         "--users",
         str(runspec["users"]),
         "--spawn-rate",
-        str(runspec["users"]),
+        str(spawn_rate),
         "--max-request",
         str(runspec["max_requests"]),
         "--processes",
-        str(runspec["processes"]),
+        str(processes),
         "--bench-target",
         str(runspec["bench-target"]),
         "--llm-model",
diff --git a/evals/benchmark/stresscli/locust/aistress.py b/evals/benchmark/stresscli/locust/aistress.py
index 19f96a43..a41f19df 100644
--- a/evals/benchmark/stresscli/locust/aistress.py
+++ b/evals/benchmark/stresscli/locust/aistress.py
@@ -27,7 +27,7 @@ def _(parser):
         help="Stop the benchmark If exceed this request",
     )
     parser.add_argument(
-        "--http-timeout", type=int, env_var="HTTP_TIMEOUT", default=3000, help="Http timeout before receive response"
+        "--http-timeout", type=int, env_var="HTTP_TIMEOUT", default=120000, help="Http timeout before receive response"
     )
     parser.add_argument(
         "--bench-target",
diff --git a/evals/benchmark/stresscli/locust/tokenresponse.py b/evals/benchmark/stresscli/locust/tokenresponse.py
index 483cd333..75943915 100644
--- a/evals/benchmark/stresscli/locust/tokenresponse.py
+++ b/evals/benchmark/stresscli/locust/tokenresponse.py
@@ -62,8 +62,8 @@ def staticsOutput(environment, reqlist):
         "Succeed Response: {} (Total {}, {:.1%} Success), Duration: {:.2f}s, Input Tokens: {},"
         " Output Tokens: {}, RPS: {:.2f}, Input Tokens per Second: {:.2f}, Output Tokens per Second: {:.2f}"
     )
-    e2e_msg = "End to End latency(ms), P50: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
-    first_msg = "First token latency(ms), P50: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
+    e2e_msg = "End to End latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
+    first_msg = "First token latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
     next_msg = "Average Next token latency(ms): {:.2f}"
     average_msg = "Average token latency(ms) : {:.2f}"
     console_logger.warning("\n=================Total statistics=====================")
@@ -92,12 +92,20 @@ def staticsOutput(environment, reqlist):
         )
     )
     console_logger.warning(
-        e2e_msg.format(numpy.percentile(e2e_lat, 50), numpy.percentile(e2e_lat, 99), numpy.average(e2e_lat))
+        e2e_msg.format(
+            numpy.percentile(e2e_lat, 50),
+            numpy.percentile(e2e_lat, 90),
+            numpy.percentile(e2e_lat, 99),
+            numpy.average(e2e_lat),
+        )
     )
     if tokens_output != 0:
         console_logger.warning(
             first_msg.format(
-                numpy.percentile(first_token, 50), numpy.percentile(first_token, 99), numpy.average(first_token)
+                numpy.percentile(first_token, 50),
+                numpy.percentile(first_token, 90),
+                numpy.percentile(first_token, 99),
+                numpy.average(first_token),
             )
         )
     console_logger.warning(next_msg.format(numpy.average(next_token)))
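
As a quick sanity check of the updated parsing rules, the snippet below applies the new End_to_End_latency_P90 and P99 patterns from config.ini to a log line in the new format produced by staticsOutput. It is an illustrative sketch only; the latency values in the sample line are made up.

```python
import re

# Sample line in the new "P50, P90, P99, Avg" format emitted by tokenresponse.py
# (values are invented for illustration).
line = "End to End latency(ms), P50: 812.33, P90: 1045.10, P99: 1290.55, Avg: 870.42"

# Patterns copied from the updated config.ini entries.
p90 = r"End to End latency\(ms\),\s+P50:[\s\d.,]+P90:\s+([\d.]+)"
p99 = r"End to End latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+([\d.]+)"

print(re.search(p90, line).group(1))  # -> 1045.10
print(re.search(p99, line).group(1))  # -> 1290.55
```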