From 4969446bf028a1d92e7be5e99c1a0dcb83e71780 Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Thu, 26 Dec 2024 19:23:00 +0000 Subject: [PATCH] update a3mega nccl plugin to 1.0.7 and rxdm to 1.0.13_1 --- modules/compute/gke-node-pool/gpu_direct.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/compute/gke-node-pool/gpu_direct.tf b/modules/compute/gke-node-pool/gpu_direct.tf index 9403ea34fc..8f0e67a9b2 100644 --- a/modules/compute/gke-node-pool/gpu_direct.tf +++ b/modules/compute/gke-node-pool/gpu_direct.tf @@ -43,11 +43,11 @@ locals { "a3-megagpu-8g" = { # Manifest to be installed for enabling TCPXO on a3-megagpu-8g machines gpu_direct_manifests = [ - "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/gpudirect-tcpxo/nccl-tcpxo-installer.yaml", # nccl_plugin v1.0.4 for tcpxo - "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/nri_device_injector/nri-device-injector.yaml", # nri_plugin + "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/b324ec8994aa98ca320438dd2d01ff6d7f9165bb/gpudirect-tcpxo/nccl-tcpxo-installer.yaml", # nccl_plugin v1.0.7 for tcpxo + "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/b324ec8994aa98ca320438dd2d01ff6d7f9165bb/nri_device_injector/nri-device-injector.yaml", # nri_plugin ] updated_workload_path = replace(local.workload_path_tcpxo, ".yaml", "-tcpxo.yaml") - rxdm_version = "v1.0.10" # matching nccl-tcpxo-installer version v1.0.4 + rxdm_version = "v1.0.13_1" # matching nccl-tcpxo-installer version v1.0.7 min_additional_networks = 8 major_minor_version_acceptable_map = { "1.28" = "1.28.9-gke.1250000"