feat: add --use_python_runtime and --enable_cuda_graph args to the perf run script (#3397)
zewenli98 authored Feb 24, 2025
1 parent b0464ca commit b43c4c2
Showing 1 changed file with 96 additions and 25 deletions.
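In API terms, the two new flags map onto existing Torch-TensorRT features. The sketch below shows roughly what each flag toggles; the toy model, input, and precision are illustrative placeholders, not part of this commit:

import torch
import torch_tensorrt as torchtrt

# Placeholder model and input, for illustration only.
model = torch.nn.Linear(16, 16).eval().cuda()
inputs = [torch.randn(1, 16).cuda()]

# --use_python_runtime forwards this compile setting (False by default):
trt_model = torchtrt.compile(
    model,
    ir="dynamo",
    inputs=inputs,
    enabled_precisions={torch.float32},
    use_python_runtime=True,
)

# --enable_cuda_graph wraps inference in the CUDA Graphs runtime,
# the same context-manager pattern the diff adds around record_perf:
with torchtrt.runtime.enable_cudagraphs(trt_model) as cudagraphs_module:
    out = cudagraphs_module(*inputs)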
121 changes: 96 additions & 25 deletions tools/perf/perf_run.py
@@ -175,11 +175,17 @@ def run_ts_trt(model, input_tensors, params, precision, batch_size):
"inputs": input_tensors,
"enabled_precisions": {precision_to_dtype(precision)},
"truncate_long_and_double": params.get("truncate", False),
"use_python_runtime": params.get("use_python_runtime", False),
}

if precision == "int8":
compile_settings.update({"calib": params.get("calibration_cache")})

if params.get("enable_cuda_graph", False):
logging.warning(
f"Torchscript backend doesn't support CUDA Graphs. `--enable_cuda_graph` will be ignored."
)

start_compile = timeit.default_timer()
model = torchtrt.compile(model, ir="ts", **compile_settings)
end_compile = timeit.default_timer()
@@ -217,19 +223,34 @@ def run_hf_dynamo(model, input_tensors, params, precision, batch_size):
         inputs=input_tensors,
         enabled_precisions={precision_to_dtype(precision)},
         truncate_double=params.get("truncate", False),
+        use_python_runtime=params.get("use_python_runtime", False),
     )
     end_compile = timeit.default_timer()
     compile_time_s = end_compile - start_compile
-    record_llm_perf(
-        trt_model,
-        "Dynamo",
-        input_tensors,
-        precision,
-        osl,
-        batch_size,
-        iters,
-        compile_time_s,
-    )
+
+    if params.get("enable_cuda_graph", False):
+        with torchtrt.runtime.enable_cudagraphs(trt_model) as cudagraphs_module:
+            record_llm_perf(
+                cudagraphs_module,
+                "Dynamo",
+                input_tensors,
+                precision,
+                osl,
+                batch_size,
+                iters,
+                compile_time_s,
+            )
+    else:
+        record_llm_perf(
+            trt_model,
+            "Dynamo",
+            input_tensors,
+            precision,
+            osl,
+            batch_size,
+            iters,
+            compile_time_s,
+        )
 
 
 @run_with_try_except
@@ -262,14 +283,27 @@ def run_dynamo(model, input_tensors, params, precision, batch_size):
         ),
         cache_built_engines=params.get("cache_built_engines", False),
         reuse_cached_engines=params.get("reuse_cached_engines", False),
+        use_python_runtime=params.get("use_python_runtime", False),
     )
     end_compile = timeit.default_timer()
     compile_time_s = end_compile - start_compile
     iters = params.get("iterations", 20)
 
-    record_perf(
-        model, "Dynamo", input_tensors, precision, iters, batch_size, compile_time_s
-    )
+    if params.get("enable_cuda_graph", False):
+        with torchtrt.runtime.enable_cudagraphs(model) as cudagraphs_module:
+            record_perf(
+                cudagraphs_module,
+                "Dynamo",
+                input_tensors,
+                precision,
+                iters,
+                batch_size,
+                compile_time_s,
+            )
+    else:
+        record_perf(
+            model, "Dynamo", input_tensors, precision, iters, batch_size, compile_time_s
+        )
 
 
 @run_with_try_except
@@ -292,6 +326,7 @@ def run_torch_compile(model, input_tensors, params, precision, batch_size):
"enabled_precisions": {precision_to_dtype(precision)},
"truncate": params.get("truncate", False),
"min_block_size": params.get("min_block_size", 1),
"use_python_runtime": params.get("use_python_runtime", False),
}
start_compile = timeit.default_timer()
model = torch.compile(model, backend="tensorrt", dynamic=None, options=compile_spec)
@@ -300,15 +335,27 @@ def run_torch_compile(model, input_tensors, params, precision, batch_size):
     compile_time_s = end_compile - start_compile
     iters = params.get("iterations", 20)
 
-    record_perf(
-        model,
-        "torch_compile",
-        input_tensors,
-        precision,
-        iters,
-        batch_size,
-        compile_time_s,
-    )
+    if params.get("enable_cuda_graph", False):
+        with torchtrt.runtime.enable_cudagraphs(model) as cudagraphs_module:
+            record_perf(
+                cudagraphs_module,
+                "torch_compile",
+                input_tensors,
+                precision,
+                iters,
+                batch_size,
+                compile_time_s,
+            )
+    else:
+        record_perf(
+            model,
+            "torch_compile",
+            input_tensors,
+            precision,
+            iters,
+            batch_size,
+            compile_time_s,
+        )
 
 
 @run_with_try_except
@@ -320,9 +367,13 @@ def run_hf_inductor(model, input_tensors, params, precision, batch_size):
     # Mark dynamic shapes for input sequence
     input_seq = input_tensors[0]
     torch._dynamo.mark_dynamic(input_seq, 1, min=1, max=osl)
+    mode = "max-autotune"
+    if params.get("enable_cuda_graph", False):
+        mode = "reduce-overhead"
+
     start_compile = timeit.default_timer()
     # Compile the model
-    model = torch.compile(model, backend="inductor", dynamic=None, mode="max-autotune")
+    model = torch.compile(model, backend="inductor", dynamic=None, mode=mode)
     model(input_seq)
     end_compile = timeit.default_timer()
     compile_time_s = end_compile - start_compile
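The Inductor paths take a different route: Inductor has no use_python_runtime analogue, so --enable_cuda_graph is honored by switching torch.compile's mode from "max-autotune" to "reduce-overhead", the mode PyTorch documents as using CUDA Graphs. The run_inductor hunk below applies the same switch. A standalone sketch of the idea, with a placeholder model:

import torch

# Placeholder model and input, for illustration only.
model = torch.nn.Linear(16, 16).eval().cuda()
example = torch.randn(1, 16).cuda()

# "reduce-overhead" asks Inductor to capture CUDA Graphs; the script keeps
# "max-autotune" when --enable_cuda_graph is not passed.
compiled = torch.compile(model, backend="inductor", dynamic=None, mode="reduce-overhead")
compiled(example)  # first call triggers compilation and graph capture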
@@ -356,15 +407,25 @@ def run_inductor(model, input_tensors, params, precision, batch_size):
if params["is_text_llm"]:
return run_hf_inductor(model, input_tensors, params, precision, batch_size)

mode = "max-autotune"
if params.get("enable_cuda_graph", False):
mode = "reduce-overhead"

start_compile = timeit.default_timer()
model = torch.compile(model, backend="inductor", dynamic=None, mode="max-autotune")
model = torch.compile(model, backend="inductor", dynamic=None, mode=mode)
model(*input_tensors)
end_compile = timeit.default_timer()
compile_time_s = end_compile - start_compile
iters = params.get("iterations", 20)

record_perf(
model, "inductor", input_tensors, precision, iters, batch_size, compile_time_s
model,
"inductor",
input_tensors,
precision,
iters,
batch_size,
compile_time_s,
)


@@ -587,6 +648,16 @@ def run(
action="store_true",
help="Boolean flag to determine if the user provided model is a TRT engine or not",
)
arg_parser.add_argument(
"--use_python_runtime",
action="store_true",
help="Whether to use Python runtime or not. Using C++ runtime by default",
)
arg_parser.add_argument(
"--enable_cuda_graph",
action="store_true",
help="Whether to enable CUDA Graph. It is not used by default",
)
arg_parser.add_argument(
"--report",
type=str,
Expand Down
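Both new options are plain store_true flags: they default to False and land in the params dict that the run_* functions query via params.get(...). A minimal standalone sketch of that behavior:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--use_python_runtime", action="store_true")
parser.add_argument("--enable_cuda_graph", action="store_true")

# e.g. a run that only requests CUDA Graphs:
params = vars(parser.parse_args(["--enable_cuda_graph"]))
assert params["use_python_runtime"] is False
assert params["enable_cuda_graph"] is True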
